From 94aed5069cc35782db2311211ca48714c301ded3 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 20 Apr 2021 08:10:02 +0000 Subject: [PATCH 001/166] new cuda framework --- .../cuda/cuda_best_split_finder.cpp | 0 src/treelearner/cuda/cuda_data_splitter.cpp | 0 .../cuda/cuda_histogram_constructor.cpp | 0 .../cuda/cuda_histogram_constructor.hpp | 113 +++++++++++ .../cuda/cuda_leaf_splits_init.cpp | 42 +++++ src/treelearner/cuda/cuda_leaf_splits_init.cu | 50 +++++ src/treelearner/cuda/cuda_leaf_splits_init.hu | 24 +++ .../cuda/new_cuda_tree_learner.cpp | 177 ++++++++++++++++++ .../cuda/new_cuda_tree_learner.hpp | 81 ++++++++ 9 files changed, 487 insertions(+) create mode 100644 src/treelearner/cuda/cuda_best_split_finder.cpp create mode 100644 src/treelearner/cuda/cuda_data_splitter.cpp create mode 100644 src/treelearner/cuda/cuda_histogram_constructor.cpp create mode 100644 src/treelearner/cuda/cuda_histogram_constructor.hpp create mode 100644 src/treelearner/cuda/cuda_leaf_splits_init.cpp create mode 100644 src/treelearner/cuda/cuda_leaf_splits_init.cu create mode 100644 src/treelearner/cuda/cuda_leaf_splits_init.hu create mode 100644 src/treelearner/cuda/new_cuda_tree_learner.cpp create mode 100644 src/treelearner/cuda/new_cuda_tree_learner.hpp diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/src/treelearner/cuda/cuda_data_splitter.cpp b/src/treelearner/cuda/cuda_data_splitter.cpp new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp new file mode 100644 index 000000000000..11fabdb1eea0 --- /dev/null +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -0,0 +1,113 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ +#ifndef LIGHTGBM_NEW_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ +#define LIGHTGBM_NEW_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ + +#ifdef USE_CUDA + +#include + +#include + +namespace LightGBM { + +class CUDAHistogramConstructor { + public: + CUDAHistogramConstructor(const std::vector& feature_group_ids, + const Dataset* train_data, const int max_num_leaves, + hist_t* cuda_hist); + + void Init(); + + void PushOneData(const uint32_t feature_bin_value, const int feature_group_id, const data_size_t data_index); + + void FinishLoad(); + + void ConstructHistogramForLeaf(const int* smaller_leaf_index, const int* larger_leaf_index); + + hist_t* cuda_hist() { return cuda_hist_; } + + private: + // data on CPU, stored in row-wise style + std::vector cpu_data_; + std::vector feature_group_bin_offsets; + uint8_t* cuda_data_; + const data_size_t num_data_; + hist_t* cuda_hist_; +}; + +class CUDALeafSplitsInit { + public: + CUDALeafSplitsInit(const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t num_data); + + void Init(); + + const double* smaller_leaf_sum_gradients() { return smaller_leaf_sum_gradients_; } + + const double* smaller_leaf_sum_hessians() { return smaller_leaf_sum_hessians_; } + + const double* larger_leaf_sum_gradients() { return larger_leaf_sum_gradients_; } + + const double* larger_leaf_sum_gradients() { return larger_leaf_sum_hessians_; } + + const int* smaller_leaf_index() { return smaller_leaf_index_; } + + const int* larger_leaf_index() { return larger_leaf_index_; } + + protected: + const score_t* cuda_gradients_; + const score_t* cuda_hessians_; + double* smaller_leaf_sum_gradients_; + double* smaller_leaf_sum_hessians_; + double* larger_leaf_sum_gradients_; + double* larger_leaf_sum_hessians_; + int* smaller_leaf_index_; + int* larger_leaf_index_; + + int num_cuda_blocks_; + const int num_data_; +}; + +class CUDABestSplitFinder { + public: + CUDABestSplitFinder(const hist_t* cuda_hist, const Dataset* train_data, + const std::vector& feature_group_ids, const int max_num_leaves); + + void FindBestSplitsForLeaf(const int* leaf_id); + + void FindBestFromAllSplits(); + + int* best_leaf() { return cuda_best_leaf_; } + + int* best_split_feature_index() { return cuda_best_split_feature_index_; } + + int* best_split_threshold() { return cuda_best_split_threshold_; } + + private: + int* cuda_leaf_best_split_feature_index_; + int* cuda_leaf_best_split_threshold_; + double* cuda_leaf_best_split_gain_; + + int* cuda_best_leaf_; + int* cuda_best_split_feature_index_; + int* cuda_best_split_threshold_; +}; + +class CUDADataSplitter { + public: + CUDADataSplitter(const data_size_t* data_indices, const data_size_t num_data); + + void Init(); + + void Split(const int* leaf_id, const int* best_split_feature, const int* best_split_threshold); + + Tree* GetCPUTree(); +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_NEW_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ diff --git a/src/treelearner/cuda/cuda_leaf_splits_init.cpp b/src/treelearner/cuda/cuda_leaf_splits_init.cpp new file mode 100644 index 000000000000..c9176d389eca --- /dev/null +++ b/src/treelearner/cuda/cuda_leaf_splits_init.cpp @@ -0,0 +1,42 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifdef USE_CUDA + +#include "cuda_histogram_constructor.hpp" +#include "cuda_leaf_splits_init.hu" + +#include + +namespace LightGBM { + +CUDALeafSplitsInit::CUDALeafSplitsInit(const score_t* cuda_gradients, + const score_t* cuda_hessians, const data_size_t num_data): +cuda_gradients_(cuda_gradients), cuda_hessians_(cuda_hessians), num_data_(num_data) { + +} + +void CUDALeafSplitsInit::Init() { + num_cuda_blocks_ = 256; + + CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + CUDASUCCESS_OR_FATAL(cudaMalloc(&smaller_leaf_sum_gradients_, num_cuda_blocks_)); + CUDASUCCESS_OR_FATAL(cudaMalloc(&smaller_leaf_sum_hessians_, num_cuda_blocks_)); + + const int num_data_per_blocks = (num_data_ + num_cuda_blocks_ - 1) / num_cuda_blocks_; + + CUDALeafSplitsInitKernel1<<>>( + cuda_gradients_, cuda_hessians_, num_data_, smaller_leaf_sum_gradients_, + smaller_leaf_sum_hessians_); + + CUDALeafSplitsInitKernel2<<>>( + cuda_gradients_, cuda_hessians_, num_data_, smaller_leaf_sum_gradients_, + smaller_leaf_sum_hessians_); +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_leaf_splits_init.cu b/src/treelearner/cuda/cuda_leaf_splits_init.cu new file mode 100644 index 000000000000..9427a1bc0466 --- /dev/null +++ b/src/treelearner/cuda/cuda_leaf_splits_init.cu @@ -0,0 +1,50 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA + +#include "cuda_leaf_splits_init.hu" + +namespace LightGBM { + +__global__ void CUDALeafSplitsInitKernel1(const score_t* cuda_gradients, const score_t* cuda_hessians, + const data_size_t num_data, double* grad_sum_out, double* hess_sum_out) { + extern __shared__ score_t shared_gradients[blockDim.x]; + extern __shared__ score_t shared_hessians[blockDim.x]; + double sum_gradient = 0.0f; + double sum_hessian = 0.0f; + const unsigned int tid = threadIdx.x; + const unsigned i = blockIdx.x * blockDim.x + tid; + if (i < static_cast(num_data)) { + shared_gradients[tid] = cuda_gradients[i]; + shared_hessians[tid] = cuda_hessians[i]; + } + for (unsigned int s = 1; s < blockDim.x; s *= 2) { + if (tid % (2 * s) == 0) { + shared_gradients[tid] += shared_gradients[tid + s]; + shared_hessians[tid] += shared_hessians[tid + s]; + } + __syncthreads(); + } + if (tid == 0) { + grad_sum_out[blockIdx.x] = shared_gradients[0]; + hess_sum_out[blockIdx.x] = shared_hessians[0]; + } +} + +__global__ void CUDALeafSplitsInitKernel2(const score_t* cuda_gradients, const score_t* cuda_hessians, + const data_size_t num_data, double* grad_sum_out, double* hess_sum_out) { + if (threadIdx.x == 0) { + for (unsigned int i = 1; i < blockDim.x; ++i) { + grad_sum_out[0] += grad_sum_out[i]; + hess_sum_out[0] += hess_sum_out[i]; + } + } +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_leaf_splits_init.hu b/src/treelearner/cuda/cuda_leaf_splits_init.hu new file mode 100644 index 000000000000..fdd8ed94e076 --- /dev/null +++ b/src/treelearner/cuda/cuda_leaf_splits_init.hu @@ -0,0 +1,24 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ +#ifndef LIGHTGBM_CUDA_LEAF_SPLITS_INIT_HU_ +#define LIGHTGBM_CUDA_LEAF_SPLITS_INIT_HU_ + +#ifdef USE_CUDA + +#include + +namespace LightGBM { + +__global__ void CUDALeafSplitsInitKernel1(const score_t* cuda_gradients, const score_t* cuda_hessians, + const data_size_t num_data, double* grad_sum_out, double* hess_sum_out); + +__global__ void CUDALeafSplitsInitKernel2(const score_t* cuda_gradients, const score_t* cuda_hessians, + const data_size_t num_data, double* grad_sum_out, double* hess_sum_out); + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_CUDA_LEAF_SPLITS_INIT_HU_ diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp new file mode 100644 index 000000000000..d04e990d1673 --- /dev/null +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -0,0 +1,177 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA + +#include "new_cuda_tree_learner.hpp" + +#include +#include + +namespace LightGBM { + +NewCUDATreeLearner::NewCUDATreeLearner(const Config* config): SerialTreeLearner(config) { + +} + +NewCUDATreeLearner::~NewCUDATreeLearner() {} + +void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { + SerialTreeLearner::Init(train_data, is_constant_hessian); + int num_total_gpus = 0; + CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&num_total_gpus)); + num_gpus_ = config_->num_gpu > num_total_gpus ? num_total_gpus : config_->num_gpu; + num_threads_ = OMP_NUM_THREADS(); + + AllocateFeatureTasks(); + AllocateCUDAMemory(); + + CreateCUDAHistogramConstructor(); +} + +void NewCUDATreeLearner::BeforeTrain() { + SerialTreeLearner::BeforeTrain(); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (int device_id = 0; device_id < num_gpus_; ++device_id) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + device_leaf_splits_initializer_[device_id]->Init(); + } +} + +void NewCUDATreeLearner::AllocateFeatureTasks() { + device_feature_groups_.resize(num_gpus_); + device_num_total_bins_.resize(num_gpus_, 0); + const int num_feature_groups = train_data_->num_feature_groups(); + const int num_feature_groups_per_device = (num_feature_groups + num_gpus_ - 1) / num_gpus_; + for (int device_id = 0; device_id < num_gpus_; ++device_id) { + device_feature_groups_[device_id].clear(); + const int device_feature_group_start = device_id * num_feature_groups_per_device; + const int device_feature_group_end = std::min(device_feature_group_start + num_feature_groups_per_device, num_feature_groups); + int& num_total_bin = device_num_total_bins_[device_id]; + num_total_bin = 0; + for (int group_id = device_feature_group_start; group_id < device_feature_group_end; ++group_id) { + device_feature_groups_.emplace_back(group_id); + num_total_bin += train_data_->FeatureGroupNumBin(group_id); + } + } +} + +void NewCUDATreeLearner::AllocateCUDAMemory() { + device_data_indices_.resize(num_gpus_, nullptr); + device_gradients_.resize(num_gpus_, nullptr); + if (config_->is_constant_hessian) { + device_hessians_.resize(num_gpus_, nullptr); + } + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (int device_id = 0; device_id < num_gpus_; ++device_id) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + if (device_data_indices_[device_id] != nullptr) { + CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); + } + 
CUDASUCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), num_data_)); + if (device_gradients_[device_id] != nullptr) { + CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id])); + } + CUDASUCESS_OR_FATAL(cudaMalloc(&(device_gradients_[device_id]), num_data_)); + if (config_->is_constant_hessian) { + if (device_hessians_[device_id] != nullptr) { + CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id])); + } + CUDASUCESS_OR_FATAL(cudaMalloc(&(device_hessians_[device_id]), num_data_)); + } + } +} + +void NewCUDATreeLearner::CreateCUDAHistogramConstructors() { + device_histogram_constructors_.resize(num_gpus_); + device_leaf_splits_initializers_.resize(num_gpus_); + device_best_split_finders_.resize(num_gpus_); + device_splitters_.ressize(num_gpus_); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (int device_id = 0; device_id < num_gpus_; ++device_id) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + device_leaf_splits_initializers_[device_id].reset( + new CUDALeafSplitsInit(device_gradients_[device_id], device_hessians_[device_id])); + device_histogram_constructors_[device_id].reset( + new CUDAHistogramConstructor(device_feature_groups_[device_id], + train_data_, config_->num_leaves, device_histograms_[device_id]))); + device_best_split_finders_[device_id].reset( + new CUDABestSplitFinder(device_histogram_constructors_[device_id]->cuda_hist(), + train_data_, device_feature_groups_[device_id], config_->num_leaves)); + device_splitters_[device_id].reset( + new CUDADataSplitter(device_data_indices_[device_id], num_data_)); + } + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (int device_id = 0; device_id < num_gpus_; ++device_id) { + device_leaf_splits_initializers_[device_id]->Init(); + device_histogram_constructors_[device_id]->Init(); + } + PushDataIntoDeviceHistogramConstructors(); +} + +void NewCUDATreeLearner::PushDataIntoDeviceHistogramConstructors() { + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (int device_id = 0; device_id < num_gpus_; ++device_id) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + CUDAHistogramConstructor* cuda_histogram_constructor = device_histogram_constructors_[device_id].get(); + for (int group_id : device_feature_groups_[device_id]) { + BinIterator* iter = train_data_->FeatureGroupIterator(group_id); + iter->Reset(0); + for (const data_size_t data_index = 0; data_index < num_data_; ++data_index) { + const uint32_t bin = static_cast(iter->RawGet(data_index)); + cuda_histogram_constructor->PushOneData(bin, group_id, data_index); + } + } + // call finish load to tranfer data from CPU to GPU + cuda_histogram_constructor->FinishLoad(); + } +} + +void NewCUDATreeLearner::FindBestSplits(const Tree* tree) { + std::vector is_feature_used(num_features_, 1); + ConstructHistograms(is_feature_used, true); + FindBestSplitsFromHistograms(is_feature_used, true, tree); +} + +void NewCUDATreeLearner::ConstructHistograms(const std::vector& /*is_feature_used*/, + bool /*use_subtract*/) { + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (int device_id = 0; device_id < num_gpus_; ++device_id) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + device_histogram_constructors_[device_id]->ConstructHistogramForLeaf( + device_leaf_splits_initializers_[device_id]->smaller_leaf_index(), + device_leaf_splits_initializers_[device_id]->larger_leaf_index()); + } +} + +void NewCUDATreeLearner::FindBestSplitsFromHistograms(const std::vector& 
/*is_feature_used*/, + bool /*use_subtract*/, const Tree* /*tree*/) { + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (int device_id = 0; device_id < num_gpus_; ++device_id) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + device_best_split_finders_[device_id]->FindBestSplitsForLeaf( + device_leaf_splits_initializers_[device_id]->smaller_leaf_index()); + device_best_split_finders_[device_id]->FindBestSplitsForLeaf( + device_leaf_splits_initializers_[device_id]->larger_leaf_index()); + device_best_split_finders_[device_id]->FindBestFromAllSplits(); + } +} + +void NewCUDATreeLearner::Split(Tree* /*tree*/, int /*best_leaf*/, + int* /*left_leaf*/, int* /*right_leaf*/) { + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (int device_id = 0; device_id < num_gpus_; ++device_id) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + device_splitters_[device_id]->Split( + device_best_split_finders_[device_id]->best_leaf(), + device_best_split_finders_[device_id]->best_split_feature_index(), + device_best_split_finders_[device_id]->best_split_threshold()); + } +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp new file mode 100644 index 000000000000..91af7f5cf33e --- /dev/null +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -0,0 +1,81 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ +#ifndef LIGHTGBM_NEW_CUDA_TREE_LEARNER_HPP_ +#define LIGHTGBM_NEW_CUDA_TREE_LEARNER_HPP_ + +#ifdef USE_CUDA + +#include "../serial_tree_learner.h" +#include "cuda_histogram_constructor.hpp" + +namespace LightGBM { + +class NewCUDATreeLearner: public SerialTreelearner { + public: + explicit NewCUDATreeLearner(const Config* config); + + ~NewCUDATreeLearner(); + + void Init(const Dataset* train_data, bool is_constant_hessian) override; + + void BeforeTrain() override; + + void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; + + Tree* Train(const score_t* gradients, const score_t *hessians, bool is_first_tree) override; + + void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override; + + protected: + void AllocateFeatureTasks(); + + void AllocateCUDAMemory(); + + void CreateCUDAHistogramConstructors(); + + void PushDataIntoDeviceHistogramConstructors(); + + void FindBestSplits(const Tree* tree) override; + + void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; + + void FindBestSplitsFromHistograms(const std::vector& is_feature_used, bool use_subtract, const Tree* tree) override; + + void Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) override; + + // number of GPUs + int num_gpus_; + // number of threads on CPU + int num_threads_; + + // feature groups allocated to each device + std::vector> device_feature_groups_; + // number of total bins of feature groups allocated to each device + std::vector device_num_total_bins_; + // number of maximum work groups per device + std::vector device_num_workgroups_; + + // full data indices on CUDA devices, as the data indices of data_partition_ in CPU version + std::vector device_data_indices_; + // gradient values on CUDA devices + std::vector device_gradients_; + // hessian values on CUDA devices + std::vector 
device_hessians_; + + // device leaf splits initializer + std::vector> device_leaf_splits_initializers_; + // device histogram constructors + std::vector> device_histogram_constructors_; + // device best split finder + std::vector> device_best_split_finders_; + // device splitter + std::vector> device_splitters_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_NEW_CUDA_TREE_LEARNER_HPP_ From 18df6b22ae726d890e504d0e67ba2d5ff480bb84 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 22 Apr 2021 11:05:14 +0000 Subject: [PATCH 002/166] add histogram construction kernel --- CMakeLists.txt | 11 +++ .../cuda/cuda_best_split_finder.cpp | 18 ++++ src/treelearner/cuda/cuda_data_splitter.cpp | 56 ++++++++++++ src/treelearner/cuda/cuda_data_splitter.cu | 27 ++++++ src/treelearner/cuda/cuda_data_splitter.hpp | 56 ++++++++++++ .../cuda/cuda_histogram_constructor.cpp | 87 ++++++++++++++++++ .../cuda/cuda_histogram_constructor.cu | 90 ++++++++++++++++++ .../cuda/cuda_histogram_constructor.hpp | 68 ++++---------- .../cuda/cuda_leaf_splits_init.cpp | 52 +++++++---- src/treelearner/cuda/cuda_leaf_splits_init.cu | 38 +++++--- .../cuda/cuda_leaf_splits_init.hpp | 63 +++++++++++++ src/treelearner/cuda/cuda_leaf_splits_init.hu | 24 ----- .../cuda/new_cuda_tree_learner.cpp | 91 +++++++++++++++---- .../cuda/new_cuda_tree_learner.hpp | 17 ++-- src/treelearner/cuda/new_cuda_utils.cpp | 20 ++++ src/treelearner/cuda/new_cuda_utils.hpp | 53 +++++++++++ src/treelearner/tree_learner.cpp | 3 +- 17 files changed, 645 insertions(+), 129 deletions(-) create mode 100644 src/treelearner/cuda/cuda_data_splitter.cu create mode 100644 src/treelearner/cuda/cuda_data_splitter.hpp create mode 100644 src/treelearner/cuda/cuda_histogram_constructor.cu create mode 100644 src/treelearner/cuda/cuda_leaf_splits_init.hpp delete mode 100644 src/treelearner/cuda/cuda_leaf_splits_init.hu create mode 100644 src/treelearner/cuda/new_cuda_utils.cpp create mode 100644 src/treelearner/cuda/new_cuda_utils.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 41fb21f5e54c..51516a9ba938 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -229,6 +229,15 @@ if(USE_CUDA) add_histogram("${hsize}" "-fulldata_sp_const" "True" "1" "${FULLDATA_DEFINES}") add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}") endforeach() + + add_library(leaf_splits_init OBJECT src/treelearner/cuda/cuda_leaf_splits_init.cu) + set_target_properties(leaf_splits_init PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + + add_library(data_splitter OBJECT src/treelearner/cuda/cuda_data_splitter.cu) + set_target_properties(data_splitter PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + + add_library(histogram_constructor OBJECT src/treelearner/cuda/cuda_histogram_constructor.cu) + set_target_properties(histogram_constructor PROPERTIES CUDA_SEPARABLE_COMPILATION ON) endif(USE_CUDA) if(USE_HDFS) @@ -340,6 +349,8 @@ file(GLOB SOURCES src/treelearner/*.cpp if(USE_CUDA) src/treelearner/*.cu + src/treelearner/cuda/*.cpp + src/treelearner/cuda/*.cu endif(USE_CUDA) ) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index e69de29bb2d1..72e52adfbc95 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -0,0 +1,18 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#include "cuda_histogram_constructor.hpp" + +namespace LightGBM { + +CUDABestSplitFinder::CUDABestSplitFinder(const hist_t* /*cuda_hist*/, const Dataset* /*train_data*/, + const std::vector& /*feature_group_ids*/, const int /*max_num_leaves*/) {} + +void CUDABestSplitFinder::FindBestSplitsForLeaf(const int* /*leaf_id*/) {} + +void CUDABestSplitFinder::FindBestFromAllSplits() {} + +} // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_data_splitter.cpp b/src/treelearner/cuda/cuda_data_splitter.cpp index e69de29bb2d1..c61c13b2b847 100644 --- a/src/treelearner/cuda/cuda_data_splitter.cpp +++ b/src/treelearner/cuda/cuda_data_splitter.cpp @@ -0,0 +1,56 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#include "cuda_data_splitter.hpp" + +namespace LightGBM { + +CUDADataSplitter::CUDADataSplitter(const data_size_t num_data, const int max_num_leaves): + num_data_(num_data), max_num_leaves_(max_num_leaves) {} + +void CUDADataSplitter::Init() { + // allocate GPU memory + AllocateCUDAMemory(static_cast(num_data_), &cuda_data_indices_); + + AllocateCUDAMemory(static_cast(max_num_leaves_), &cuda_leaf_num_data_offsets_); + + AllocateCUDAMemory(1, &cuda_num_data_); + CopyFromHostToCUDADevice(cuda_num_data_, &num_data_, 1); + + AllocateCUDAMemory(1, &cuda_max_num_leaves_); + CopyFromHostToCUDADevice(cuda_max_num_leaves_, &max_num_leaves_, 1); + + AllocateCUDAMemory(static_cast(max_num_leaves_), &cuda_leaf_num_data_offsets_); + AllocateCUDAMemory(static_cast(max_num_leaves_), &cuda_leaf_num_data_); +} + +void CUDADataSplitter::BeforeTrain(const data_size_t* data_indices) { + if (data_indices == nullptr) { + // no bagging + LaunchFillDataIndicesBeforeTrain(); + SynchronizeCUDADevice(); + data_indices_.resize(num_data_); + CopyFromCUDADeviceToHost(data_indices_.data(), cuda_data_indices_, static_cast(num_data_)); + for (int i = 0; i < 100; ++i) { + Log::Warning("data_indices_[%d] = %d", i, data_indices_[i]); + Log::Warning("data_indices_[end - %d] = %d", i, data_indices_[num_data_ - 1 - i]); + } + SetCUDAMemory(cuda_leaf_num_data_offsets_, 0, max_num_leaves_); + SetCUDAMemory(cuda_leaf_num_data_, 0, max_num_leaves_); + SetCUDAMemory(cuda_leaf_num_data_, num_data_, 1); + } else { + Log::Fatal("bagging is not supported by GPU"); + } +} + +void CUDADataSplitter::Split(const int* /*leaf_id*/, + const int* /*best_split_feature*/, + const int* /*best_split_threshold*/) {} + +Tree* CUDADataSplitter::GetCPUTree() {} + + +} // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_data_splitter.cu b/src/treelearner/cuda/cuda_data_splitter.cu new file mode 100644 index 000000000000..4bb599cb055a --- /dev/null +++ b/src/treelearner/cuda/cuda_data_splitter.cu @@ -0,0 +1,27 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#include "cuda_data_splitter.hpp" + +#define FILL_INDICES_BLOCK_SIZE (1024) + +namespace LightGBM { + +__global__ void FillDataIndicesBeforeTrainKernel(const data_size_t* cuda_num_data, + data_size_t* data_indices) { + const data_size_t num_data_ref = *cuda_num_data; + const unsigned int data_index = threadIdx.x + blockIdx.x * blockDim.x; + if (data_index < num_data_ref) { + data_indices[data_index] = data_index; + } +} + +void CUDADataSplitter::LaunchFillDataIndicesBeforeTrain() { + const int num_blocks = (num_data_ + FILL_INDICES_BLOCK_SIZE - 1) / FILL_INDICES_BLOCK_SIZE; + FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_); +} + +} // namespace LightGBM \ No newline at end of file diff --git a/src/treelearner/cuda/cuda_data_splitter.hpp b/src/treelearner/cuda/cuda_data_splitter.hpp new file mode 100644 index 000000000000..0abc728b5b1b --- /dev/null +++ b/src/treelearner/cuda/cuda_data_splitter.hpp @@ -0,0 +1,56 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ +#ifndef LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ +#define LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ + +#ifdef USE_CUDA + +#include +#include +#include "new_cuda_utils.hpp" + +namespace LightGBM { + +class CUDADataSplitter { + public: + CUDADataSplitter(const data_size_t num_data, const int max_num_leaves); + + void Init(); + + void BeforeTrain(const data_size_t* data_indices); + + void Split(const int* leaf_id, const int* best_split_feature, const int* best_split_threshold); + + Tree* GetCPUTree(); + + const data_size_t* data_indices() { return cuda_data_indices_; } + + const data_size_t* leaf_num_data_offsets() { return cuda_leaf_num_data_offsets_; } + + const data_size_t* leaf_num_data() { return cuda_leaf_num_data_; } + + private: + // kernel launch functions + void LaunchFillDataIndicesBeforeTrain(); + + // CPU + const data_size_t num_data_; + std::vector data_indices_; + const int max_num_leaves_; + + // GPU + data_size_t* cuda_data_indices_; + data_size_t* cuda_leaf_num_data_offsets_; + data_size_t* cuda_leaf_num_data_; + + data_size_t* cuda_num_data_; + int* cuda_max_num_leaves_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ \ No newline at end of file diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index e69de29bb2d1..19d16edd7a36 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -0,0 +1,87 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifdef USE_CUDA + +#include "cuda_histogram_constructor.hpp" + +namespace LightGBM { + +CUDAHistogramConstructor::CUDAHistogramConstructor(const std::vector& feature_group_ids, + const Dataset* train_data, const int max_num_leaves, + hist_t* cuda_hist): num_data_(train_data->num_data()), + num_feature_groups_(feature_group_ids.size()), + max_num_leaves_(max_num_leaves) { + int offset = 0; + for (size_t i = 0; i < feature_group_ids.size(); ++i) { + const int group_id = feature_group_ids[i]; + feature_group_bin_offsets_.emplace_back(offset); + offset += train_data->FeatureGroupNumBin(group_id); + } + feature_group_bin_offsets_.emplace_back(offset); + num_total_bin_ = offset; + cuda_hist_ = cuda_hist; +} + +void CUDAHistogramConstructor::Init() { + // allocate CPU memory + cpu_data_.resize(num_data_ * num_feature_groups_, 0); + // allocate GPU memory + void* cuda_data_ptr = nullptr; + CUDASUCCESS_OR_FATAL(cudaMalloc(&cuda_data_ptr, num_data_ * num_feature_groups_ * sizeof(uint8_t))); + cuda_data_ = reinterpret_cast(cuda_data_ptr); + + void* cuda_hist_ptr = nullptr; + CUDASUCCESS_OR_FATAL(cudaMalloc(&cuda_hist_ptr, num_total_bin_ * max_num_leaves_ * sizeof(double))); + cuda_hist_ = reinterpret_cast(cuda_hist_ptr); + + void* cuda_num_total_bin_ptr = nullptr; + CUDASUCCESS_OR_FATAL(cudaMalloc(&cuda_num_total_bin_ptr, sizeof(int))); + cuda_num_total_bin_ = reinterpret_cast(cuda_num_total_bin_ptr); + CUDASUCCESS_OR_FATAL(cudaMemcpy(cuda_num_total_bin_ptr, reinterpret_cast(&num_total_bin_), sizeof(int), cudaMemcpyHostToDevice)); + + AllocateCUDAMemory(1, &cuda_num_feature_groups_); + CopyFromHostToCUDADevice(cuda_num_feature_groups_, &num_feature_groups_, 1); +} + +void CUDAHistogramConstructor::PushOneData(const uint32_t feature_bin_value, + const int feature_group_id, + const data_size_t data_index) { + const uint8_t feature_bin_value_uint8 = static_cast(feature_bin_value); + const size_t index = static_cast(data_index) * static_cast(num_feature_groups_) + + static_cast(feature_group_id); + cpu_data_[index] = feature_bin_value_uint8; +} + +void CUDAHistogramConstructor::FinishLoad() { + // copy CPU data to GPU + void* cuda_data_ptr = reinterpret_cast(cuda_data_); + const void* cpu_data_ptr = reinterpret_cast(cpu_data_.data()); + CUDASUCCESS_OR_FATAL(cudaMemcpy(cuda_data_ptr, cpu_data_ptr, sizeof(uint8_t) * num_data_ * num_feature_groups_, cudaMemcpyHostToDevice)); +} + +void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* smaller_leaf_index, const int* /*larger_leaf_index*/, + const data_size_t* num_data_in_leaf, const data_size_t* leaf_data_offset, const data_size_t* data_indices_ptr, + const score_t* cuda_gradients, const score_t* cuda_hessians) { + LaunchConstructHistogramKernel(smaller_leaf_index, num_data_in_leaf, leaf_data_offset, data_indices_ptr, + cuda_gradients, cuda_hessians); + SynchronizeCUDADevice(); + PrintLastCUDAError(); + Log::Warning("histogram construction finished"); + Log::Warning("num_total_bin_ = %d", num_total_bin_); + Log::Warning("max_num_leaves_ = %d", max_num_leaves_); + std::vector cpu_hist(200, 0.0f); + CopyFromCUDADeviceToHost(cpu_hist.data(), cuda_hist_, 200); + for (int i = 0; i < 100; ++i) { + Log::Warning("bin %d grad %f hess %f", i, cpu_hist[2 * i], cpu_hist[2 * i + 1]); + } +} + + + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu new file mode 100644 index 000000000000..276fd0b33812 --- /dev/null +++ 
b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -0,0 +1,90 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA + +#include "cuda_histogram_constructor.hpp" + +namespace LightGBM { + +#define SHRAE_HIST_SIZE (6144) +#define NUM_DATA_PER_THREAD (1600) +#define NUM_FEATURE_PER_THREAD_GROUP (12) + +__global__ void CUDAConstructHistogramKernel(const int* leaf_index, + const score_t* cuda_gradients, const score_t* cuda_hessians, + const data_size_t* data_indices_ptr, hist_t* feature_histogram, const int* num_feature_groups, + const int* leaf_num_data_offset, const uint8_t* data, const data_size_t* num_data_in_leaf) { + const unsigned int threadIdx_x = threadIdx.x; + if (threadIdx_x == 0) { + printf("CUDAConstructHistogramKernel step 0\n"); + } + const int num_feature_groups_ref = *num_feature_groups; + if (threadIdx_x == 0) { + printf("CUDAConstructHistogramKernel step 1\n"); + } + const int leaf_index_ref = *leaf_index; + if (threadIdx_x == 0) { + printf("CUDAConstructHistogramKernel step 2\n"); + } + const int num_data_in_smaller_leaf_ref = *(num_data_in_leaf + leaf_index_ref); + if (threadIdx_x == 0) { + printf("CUDAConstructHistogramKernel step 3\n"); + } + const int leaf_num_data_in_smaller_leaf_ref = *(leaf_num_data_offset + leaf_index_ref); + printf("num_feature_groups_ref = %d", num_feature_groups_ref); + printf("leaf_index_ref = %d", leaf_index_ref); + printf("num_data_in_smaller_leaf_ref = %d", num_data_in_smaller_leaf_ref); + printf("leaf_num_data_in_smaller_leaf_ref = %d", leaf_num_data_in_smaller_leaf_ref); + const data_size_t* data_indices_in_smaller_leaf = data_indices_ptr + leaf_num_data_in_smaller_leaf_ref; + //__shared__ double shared_hist[SHRAE_HIST_SIZE]; // 256 * 24, can use 12 features + const unsigned int threadIdx_y = threadIdx.y; + const unsigned int blockIdx_x = blockIdx.x; + const unsigned int blockDim_x = blockDim.x; + const unsigned int offset = threadIdx_x % 2; + if (threadIdx_x < 24) { + const int feature_group_index = threadIdx_x / 2 + blockIdx_x * blockDim_x / 8 * 3; + const data_size_t start = threadIdx_y * NUM_DATA_PER_THREAD; + const data_size_t end = start + NUM_DATA_PER_THREAD > num_data_in_smaller_leaf_ref ? 
+ num_data_in_smaller_leaf_ref : start + NUM_DATA_PER_THREAD; + if (offset == 0) { + // handle gradient + for (data_size_t i = start; i < end; ++i) { + const score_t gradient = cuda_gradients[i]; + const data_size_t data_index = data_indices_in_smaller_leaf[i]; + const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + feature_group_index]); + feature_histogram[bin << 1] += gradient; + } + } else { + // handle hessian + for (data_size_t i = start; i < end; ++i) { + const score_t hessian = cuda_hessians[i]; + const data_size_t data_index = data_indices_in_smaller_leaf[i]; + const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + feature_group_index]); + feature_histogram[(bin << 1) + 1] += hessian; + } + } + } +} + +void CUDAHistogramConstructor::LaunchConstructHistogramKernel( + const int* smaller_leaf_index, const data_size_t* num_data_in_leaf, const data_size_t* leaf_num_data_offset, + const data_size_t* data_indices_ptr, const score_t* cuda_gradients, const score_t* cuda_hessians) { + const int block_dim_x = 32; + const int block_dim_y = 1024 / block_dim_x; + const int grid_dim_y = ((num_data_ + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + block_dim_y - 1) / block_dim_y; + const int grid_dim_x = (static_cast(num_feature_groups_ + NUM_FEATURE_PER_THREAD_GROUP - 1) / NUM_FEATURE_PER_THREAD_GROUP); + Log::Warning("block_dim_x = %d, block_dim_y = %d", block_dim_x, block_dim_y); + Log::Warning("gid_dim_x = %d, grid_dim_y = %d", grid_dim_x, grid_dim_y); + dim3 grid_dim(grid_dim_x, grid_dim_y); + dim3 block_dim(block_dim_x, block_dim_y); + CUDAConstructHistogramKernel<<>>(smaller_leaf_index, cuda_gradients, cuda_hessians, + data_indices_ptr, cuda_hist_, cuda_num_feature_groups_, leaf_num_data_offset, cuda_data_, num_data_in_leaf); +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 11fabdb1eea0..600fde1aee42 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -3,12 +3,15 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. 
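//
// Illustrative sketch (not part of the patch): CUDAConstructHistogramKernel above accumulates
// gradients and hessians into interleaved histogram slots, (bin << 1) for the gradient and
// (bin << 1) + 1 for the hessian, reading the row-wise uint8_t bin matrix indexed as
// data[data_index * num_feature_groups + feature_group_index]. The standalone program below
// shows that accumulation pattern in isolation; the kernel name, the sizes, and the use of
// atomicAdd on double (requires sm_60 or newer) are assumptions made for this example, not
// the patch's exact scheme.
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

__global__ void SketchHistogramKernel(const float* gradients, const float* hessians,
                                      const unsigned char* bins, int num_data,
                                      double* histogram /* 2 doubles per bin */) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num_data) {
    const unsigned int bin = bins[i];
    // interleaved layout: even slot = gradient sum, odd slot = hessian sum
    atomicAdd(histogram + (bin << 1), static_cast<double>(gradients[i]));
    atomicAdd(histogram + (bin << 1) + 1, static_cast<double>(hessians[i]));
  }
}

int main() {
  const int num_data = 1 << 16, num_bins = 256;
  std::vector<float> grad(num_data, 1.0f), hess(num_data, 0.5f);
  std::vector<unsigned char> bins(num_data);
  for (int i = 0; i < num_data; ++i) bins[i] = static_cast<unsigned char>(i % num_bins);
  float *d_grad, *d_hess; unsigned char* d_bins; double* d_hist;
  cudaMalloc(&d_grad, num_data * sizeof(float));
  cudaMalloc(&d_hess, num_data * sizeof(float));
  cudaMalloc(&d_bins, num_data);
  cudaMalloc(&d_hist, 2 * num_bins * sizeof(double));
  cudaMemcpy(d_grad, grad.data(), num_data * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_hess, hess.data(), num_data * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_bins, bins.data(), num_data, cudaMemcpyHostToDevice);
  cudaMemset(d_hist, 0, 2 * num_bins * sizeof(double));
  SketchHistogramKernel<<<(num_data + 255) / 256, 256>>>(d_grad, d_hess, d_bins, num_data, d_hist);
  cudaDeviceSynchronize();
  std::vector<double> hist(2 * num_bins);
  cudaMemcpy(hist.data(), d_hist, 2 * num_bins * sizeof(double), cudaMemcpyDeviceToHost);
  printf("bin 0: grad %f hess %f\n", hist[0], hist[1]);  // expect 256 and 128
  cudaFree(d_grad); cudaFree(d_hess); cudaFree(d_bins); cudaFree(d_hist);
  return 0;
}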
*/ -#ifndef LIGHTGBM_NEW_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ -#define LIGHTGBM_NEW_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ +#ifndef LIGHTGBM_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ +#define LIGHTGBM_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ #ifdef USE_CUDA #include +#include + +#include "new_cuda_utils.hpp" #include @@ -26,49 +29,29 @@ class CUDAHistogramConstructor { void FinishLoad(); - void ConstructHistogramForLeaf(const int* smaller_leaf_index, const int* larger_leaf_index); + void ConstructHistogramForLeaf(const int* smaller_leaf_index, const int* larger_leaf_index, + const data_size_t* num_data_in_smaller_leaf, const int* smaller_leaf_data_offset, const data_size_t* data_indices_ptr, + const score_t* cuda_gradients, const score_t* cuda_hessians); + + void LaunchConstructHistogramKernel( + const int* smaller_leaf_index, const data_size_t* num_data_in_leaf, const data_size_t* leaf_num_data_offset, + const data_size_t* data_indices_ptr, const score_t* cuda_gradients, const score_t* cuda_hessians); hist_t* cuda_hist() { return cuda_hist_; } private: // data on CPU, stored in row-wise style std::vector cpu_data_; - std::vector feature_group_bin_offsets; + std::vector feature_group_bin_offsets_; uint8_t* cuda_data_; + uint32_t cuda_feature_group_bin_offsets_; const data_size_t num_data_; hist_t* cuda_hist_; -}; - -class CUDALeafSplitsInit { - public: - CUDALeafSplitsInit(const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t num_data); - - void Init(); - - const double* smaller_leaf_sum_gradients() { return smaller_leaf_sum_gradients_; } - - const double* smaller_leaf_sum_hessians() { return smaller_leaf_sum_hessians_; } - - const double* larger_leaf_sum_gradients() { return larger_leaf_sum_gradients_; } - - const double* larger_leaf_sum_gradients() { return larger_leaf_sum_hessians_; } - - const int* smaller_leaf_index() { return smaller_leaf_index_; } - - const int* larger_leaf_index() { return larger_leaf_index_; } - - protected: - const score_t* cuda_gradients_; - const score_t* cuda_hessians_; - double* smaller_leaf_sum_gradients_; - double* smaller_leaf_sum_hessians_; - double* larger_leaf_sum_gradients_; - double* larger_leaf_sum_hessians_; - int* smaller_leaf_index_; - int* larger_leaf_index_; - - int num_cuda_blocks_; - const int num_data_; + int num_total_bin_; + int* cuda_num_total_bin_; + int num_feature_groups_; + int* cuda_num_feature_groups_; + const int max_num_leaves_; }; class CUDABestSplitFinder { @@ -96,18 +79,7 @@ class CUDABestSplitFinder { int* cuda_best_split_threshold_; }; -class CUDADataSplitter { - public: - CUDADataSplitter(const data_size_t* data_indices, const data_size_t num_data); - - void Init(); - - void Split(const int* leaf_id, const int* best_split_feature, const int* best_split_threshold); - - Tree* GetCPUTree(); -}; - } // namespace LightGBM #endif // USE_CUDA -#endif // LIGHTGBM_NEW_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ +#endif // LIGHTGBM_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ diff --git a/src/treelearner/cuda/cuda_leaf_splits_init.cpp b/src/treelearner/cuda/cuda_leaf_splits_init.cpp index c9176d389eca..a64309303735 100644 --- a/src/treelearner/cuda/cuda_leaf_splits_init.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits_init.cpp @@ -6,35 +6,51 @@ #ifdef USE_CUDA -#include "cuda_histogram_constructor.hpp" -#include "cuda_leaf_splits_init.hu" +#include "cuda_leaf_splits_init.hpp" -#include +#include namespace LightGBM { CUDALeafSplitsInit::CUDALeafSplitsInit(const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t num_data): 
-cuda_gradients_(cuda_gradients), cuda_hessians_(cuda_hessians), num_data_(num_data) { - -} +cuda_gradients_(cuda_gradients), cuda_hessians_(cuda_hessians), num_data_(num_data) {} void CUDALeafSplitsInit::Init() { - num_cuda_blocks_ = 256; + num_blocks_ = (num_data_ + INIT_SUM_BLOCK_SIZE - 1) / INIT_SUM_BLOCK_SIZE; CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); - CUDASUCCESS_OR_FATAL(cudaMalloc(&smaller_leaf_sum_gradients_, num_cuda_blocks_)); - CUDASUCCESS_OR_FATAL(cudaMalloc(&smaller_leaf_sum_hessians_, num_cuda_blocks_)); - - const int num_data_per_blocks = (num_data_ + num_cuda_blocks_ - 1) / num_cuda_blocks_; - - CUDALeafSplitsInitKernel1<<>>( - cuda_gradients_, cuda_hessians_, num_data_, smaller_leaf_sum_gradients_, - smaller_leaf_sum_hessians_); + void* smaller_leaf_sum_gradients_ptr = nullptr; + void* smaller_leaf_sum_hessians_ptr = nullptr; + CUDASUCCESS_OR_FATAL(cudaMalloc(&smaller_leaf_sum_gradients_ptr, num_blocks_ * sizeof(double))); + CUDASUCCESS_OR_FATAL(cudaMalloc(&smaller_leaf_sum_hessians_ptr, num_blocks_ * sizeof(double))); + CUDASUCCESS_OR_FATAL(cudaMemset(smaller_leaf_sum_gradients_ptr, 0, num_blocks_ * sizeof(double))); + CUDASUCCESS_OR_FATAL(cudaMemset(smaller_leaf_sum_hessians_ptr, 0, num_blocks_ * sizeof(double))); + smaller_leaf_sum_gradients_ = reinterpret_cast(smaller_leaf_sum_gradients_ptr); + smaller_leaf_sum_hessians_ = reinterpret_cast(smaller_leaf_sum_hessians_ptr); + + void* cuda_num_data_ptr = nullptr; + CUDASUCCESS_OR_FATAL(cudaMalloc(&cuda_num_data_ptr, sizeof(int))); + cuda_num_data_ = reinterpret_cast(cuda_num_data_ptr); + const void* num_data_ptr = reinterpret_cast(&num_data_); + CUDASUCCESS_OR_FATAL(cudaMemcpy(cuda_num_data_ptr, num_data_ptr, sizeof(int), cudaMemcpyHostToDevice)); +} - CUDALeafSplitsInitKernel2<<>>( - cuda_gradients_, cuda_hessians_, num_data_, smaller_leaf_sum_gradients_, - smaller_leaf_sum_hessians_); +void CUDALeafSplitsInit::Compute() { + LaunchLeafSplitsInit(num_blocks_, INIT_SUM_BLOCK_SIZE, + cuda_gradients_, cuda_hessians_, cuda_num_data_, + smaller_leaf_sum_gradients_, smaller_leaf_sum_hessians_); + Log::Warning(cudaGetErrorName(cudaGetLastError())); + CUDASUCCESS_OR_FATAL(cudaDeviceSynchronize()); + + const void* smaller_leaf_sum_gradients_ptr = reinterpret_cast(smaller_leaf_sum_gradients_); + const void* smaller_leaf_sum_hessians_ptr = reinterpret_cast(smaller_leaf_sum_hessians_); + void* host_smaller_leaf_sum_gradients_ptr = reinterpret_cast(&host_smaller_leaf_sum_gradients_); + void* host_smaller_leaf_sum_hessians_ptr = reinterpret_cast(&host_smaller_leaf_sum_hessians_); + CUDASUCCESS_OR_FATAL(cudaMemcpy(host_smaller_leaf_sum_gradients_ptr, smaller_leaf_sum_gradients_ptr, sizeof(double), cudaMemcpyDeviceToHost)); + CUDASUCCESS_OR_FATAL(cudaMemcpy(host_smaller_leaf_sum_hessians_ptr, smaller_leaf_sum_hessians_ptr, sizeof(double), cudaMemcpyDeviceToHost)); + Log::Warning("host_smaller_leaf_sum_gradients_ = %f", host_smaller_leaf_sum_gradients_); + Log::Warning("host_smaller_leaf_sum_hessians_ = %f", host_smaller_leaf_sum_hessians_); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_leaf_splits_init.cu b/src/treelearner/cuda/cuda_leaf_splits_init.cu index 9427a1bc0466..dc056ab3ae7c 100644 --- a/src/treelearner/cuda/cuda_leaf_splits_init.cu +++ b/src/treelearner/cuda/cuda_leaf_splits_init.cu @@ -6,24 +6,25 @@ #ifdef USE_CUDA -#include "cuda_leaf_splits_init.hu" +#include "cuda_leaf_splits_init.hpp" namespace LightGBM { -__global__ void CUDALeafSplitsInitKernel1(const score_t* cuda_gradients, const score_t* 
cuda_hessians, - const data_size_t num_data, double* grad_sum_out, double* hess_sum_out) { - extern __shared__ score_t shared_gradients[blockDim.x]; - extern __shared__ score_t shared_hessians[blockDim.x]; - double sum_gradient = 0.0f; - double sum_hessian = 0.0f; +__global__ void CUDALeafSplitsInitKernel1(const float* cuda_gradients, const float* cuda_hessians, + const data_size_t* num_data, double* grad_sum_out, double* hess_sum_out) { + __shared__ float shared_gradients[INIT_SUM_BLOCK_SIZE]; + __shared__ float shared_hessians[INIT_SUM_BLOCK_SIZE]; const unsigned int tid = threadIdx.x; - const unsigned i = blockIdx.x * blockDim.x + tid; - if (i < static_cast(num_data)) { + const unsigned int i = blockIdx.x * blockDim.x + tid; + if (i < static_cast(*num_data)) { shared_gradients[tid] = cuda_gradients[i]; shared_hessians[tid] = cuda_hessians[i]; + } else { + shared_gradients[tid] = 0.0f; + shared_hessians[tid] = 0.0f; } for (unsigned int s = 1; s < blockDim.x; s *= 2) { - if (tid % (2 * s) == 0) { + if (tid % (2 * s) == 0 && (tid + s) < INIT_SUM_BLOCK_SIZE) { shared_gradients[tid] += shared_gradients[tid + s]; shared_hessians[tid] += shared_hessians[tid + s]; } @@ -36,15 +37,26 @@ __global__ void CUDALeafSplitsInitKernel1(const score_t* cuda_gradients, const s } __global__ void CUDALeafSplitsInitKernel2(const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t num_data, double* grad_sum_out, double* hess_sum_out) { - if (threadIdx.x == 0) { - for (unsigned int i = 1; i < blockDim.x; ++i) { + const data_size_t* num_data, double* grad_sum_out, double* hess_sum_out) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + for (unsigned int i = 1; i < gridDim.x; ++i) { grad_sum_out[0] += grad_sum_out[i]; hess_sum_out[0] += hess_sum_out[i]; } } } +void CUDALeafSplitsInit::LaunchLeafSplitsInit(const int num_blocks, const int init_sum_block_size, + const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t* num_data, + double* smaller_leaf_sum_gradients, double* smaller_leaf_sum_hessians) { + CUDALeafSplitsInitKernel1<<>>( + cuda_gradients, cuda_hessians, num_data, smaller_leaf_sum_gradients, + smaller_leaf_sum_hessians); + CUDALeafSplitsInitKernel2<<>>( + cuda_gradients, cuda_hessians, num_data, smaller_leaf_sum_gradients, + smaller_leaf_sum_hessians); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_leaf_splits_init.hpp b/src/treelearner/cuda/cuda_leaf_splits_init.hpp new file mode 100644 index 000000000000..16ae57033a1f --- /dev/null +++ b/src/treelearner/cuda/cuda_leaf_splits_init.hpp @@ -0,0 +1,63 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
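//
// Illustrative sketch (not part of the patch): CUDALeafSplitsInitKernel1 above performs a
// per-block shared-memory tree reduction of the gradients and hessians, writing one partial
// sum per block, and CUDALeafSplitsInitKernel2 then folds those partial sums into element 0.
// The standalone program below shows the same two-stage pattern on a single array; the names,
// the block size, and the use of double in shared memory are choices made for this example,
// not the patch's exact configuration.
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

#define SKETCH_BLOCK_SIZE 1024

__global__ void BlockSumKernel(const float* values, int num_values, double* block_sums) {
  __shared__ double shared[SKETCH_BLOCK_SIZE];
  const unsigned int tid = threadIdx.x;
  const unsigned int i = blockIdx.x * blockDim.x + tid;
  // out-of-range threads contribute zero so every shared slot is initialized
  shared[tid] = (i < static_cast<unsigned int>(num_values)) ? values[i] : 0.0;
  __syncthreads();
  // tree reduction with doubling stride, mirroring the tid % (2 * s) scheme above
  for (unsigned int s = 1; s < blockDim.x; s *= 2) {
    if (tid % (2 * s) == 0 && tid + s < SKETCH_BLOCK_SIZE) {
      shared[tid] += shared[tid + s];
    }
    __syncthreads();
  }
  if (tid == 0) block_sums[blockIdx.x] = shared[0];
}

__global__ void FoldBlockSumsKernel(double* block_sums, int num_blocks) {
  // a single thread folds the per-block partial sums, as in Kernel2
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    for (int i = 1; i < num_blocks; ++i) block_sums[0] += block_sums[i];
  }
}

int main() {
  const int n = 1 << 20;
  std::vector<float> values(n, 0.25f);  // expected total: 262144
  const int num_blocks = (n + SKETCH_BLOCK_SIZE - 1) / SKETCH_BLOCK_SIZE;
  float* d_values; double* d_block_sums;
  cudaMalloc(&d_values, n * sizeof(float));
  cudaMalloc(&d_block_sums, num_blocks * sizeof(double));
  cudaMemcpy(d_values, values.data(), n * sizeof(float), cudaMemcpyHostToDevice);
  BlockSumKernel<<<num_blocks, SKETCH_BLOCK_SIZE>>>(d_values, n, d_block_sums);
  FoldBlockSumsKernel<<<1, 1>>>(d_block_sums, num_blocks);
  double total = 0.0;
  cudaMemcpy(&total, d_block_sums, sizeof(double), cudaMemcpyDeviceToHost);
  printf("sum = %f\n", total);
  cudaFree(d_values); cudaFree(d_block_sums);
  return 0;
}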
+ */ +#ifndef LIGHTGBM_CUDA_LEAF_SPLITS_INIT_HPP_ +#define LIGHTGBM_CUDA_LEAF_SPLITS_INIT_HPP_ + +#ifdef USE_CUDA + +#include +#include +#include + +#define INIT_SUM_BLOCK_SIZE (1024) + +namespace LightGBM { + +class CUDALeafSplitsInit { + public: + CUDALeafSplitsInit(const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t num_data); + + void Init(); + + void Compute(); + + const double* smaller_leaf_sum_gradients() { return smaller_leaf_sum_gradients_; } + + const double* smaller_leaf_sum_hessians() { return smaller_leaf_sum_hessians_; } + + const double* larger_leaf_sum_gradients() { return larger_leaf_sum_gradients_; } + + const double* larger_leaf_sum_hessians() { return larger_leaf_sum_hessians_; } + + const int* smaller_leaf_index() { return smaller_leaf_index_; } + + const int* larger_leaf_index() { return larger_leaf_index_; } + + void LaunchLeafSplitsInit(const int num_blocks, const int init_sum_block_size, + const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t* num_data, + double* smaller_leaf_sum_gradients, double* smaller_leaf_sum_hessians); + + protected: + const score_t* cuda_gradients_; + const score_t* cuda_hessians_; + double* smaller_leaf_sum_gradients_; + double* smaller_leaf_sum_hessians_; + double host_smaller_leaf_sum_gradients_; + double host_smaller_leaf_sum_hessians_; + double* larger_leaf_sum_gradients_; + double* larger_leaf_sum_hessians_; + int* smaller_leaf_index_; + int* larger_leaf_index_; + int* cuda_num_data_; + + const int num_data_; + int num_blocks_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_CUDA_LEAF_SPLITS_INIT_HPP_ diff --git a/src/treelearner/cuda/cuda_leaf_splits_init.hu b/src/treelearner/cuda/cuda_leaf_splits_init.hu deleted file mode 100644 index fdd8ed94e076..000000000000 --- a/src/treelearner/cuda/cuda_leaf_splits_init.hu +++ /dev/null @@ -1,24 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ -#ifndef LIGHTGBM_CUDA_LEAF_SPLITS_INIT_HU_ -#define LIGHTGBM_CUDA_LEAF_SPLITS_INIT_HU_ - -#ifdef USE_CUDA - -#include - -namespace LightGBM { - -__global__ void CUDALeafSplitsInitKernel1(const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t num_data, double* grad_sum_out, double* hess_sum_out); - -__global__ void CUDALeafSplitsInitKernel2(const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t num_data, double* grad_sum_out, double* hess_sum_out); - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_CUDA_LEAF_SPLITS_INIT_HU_ diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index d04e990d1673..aaee5764ee2a 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -21,15 +21,21 @@ NewCUDATreeLearner::~NewCUDATreeLearner() {} void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { SerialTreeLearner::Init(train_data, is_constant_hessian); + Log::Warning("NewCUDATreeLearner::Init step 1"); int num_total_gpus = 0; CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&num_total_gpus)); - num_gpus_ = config_->num_gpu > num_total_gpus ? num_total_gpus : config_->num_gpu; + Log::Warning("NewCUDATreeLearner::Init step 2"); + num_gpus_ = 1;//config_->num_gpu > num_total_gpus ? 
num_total_gpus : config_->num_gpu; num_threads_ = OMP_NUM_THREADS(); + Log::Warning("NewCUDATreeLearner::Init step 3"); AllocateFeatureTasks(); - AllocateCUDAMemory(); + Log::Warning("NewCUDATreeLearner::Init step 4"); + AllocateCUDAMemory(is_constant_hessian); + Log::Warning("NewCUDATreeLearner::Init step 5"); - CreateCUDAHistogramConstructor(); + CreateCUDAHistogramConstructors(); + Log::Warning("NewCUDATreeLearner::Init step 6"); } void NewCUDATreeLearner::BeforeTrain() { @@ -37,7 +43,7 @@ void NewCUDATreeLearner::BeforeTrain() { #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int device_id = 0; device_id < num_gpus_; ++device_id) { CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - device_leaf_splits_initializer_[device_id]->Init(); + device_leaf_splits_initializers_[device_id]->Init(); } } @@ -53,63 +59,89 @@ void NewCUDATreeLearner::AllocateFeatureTasks() { int& num_total_bin = device_num_total_bins_[device_id]; num_total_bin = 0; for (int group_id = device_feature_group_start; group_id < device_feature_group_end; ++group_id) { - device_feature_groups_.emplace_back(group_id); + device_feature_groups_[device_id].emplace_back(group_id); num_total_bin += train_data_->FeatureGroupNumBin(group_id); } } } -void NewCUDATreeLearner::AllocateCUDAMemory() { +void NewCUDATreeLearner::AllocateCUDAMemory(const bool is_constant_hessian) { device_data_indices_.resize(num_gpus_, nullptr); device_gradients_.resize(num_gpus_, nullptr); - if (config_->is_constant_hessian) { + if (!is_constant_hessian) { device_hessians_.resize(num_gpus_, nullptr); } + device_histograms_.resize(num_gpus_, nullptr); + const int num_total_bin_from_dataset = train_data_->NumTotalBin(); + const int num_total_bin_from_share_states = share_state_->num_hist_total_bin(); + const int num_total_bin = std::max(num_total_bin_from_dataset, num_total_bin_from_share_states); #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int device_id = 0; device_id < num_gpus_; ++device_id) { CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); if (device_data_indices_[device_id] != nullptr) { CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); } - CUDASUCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), num_data_)); + void* data_indices_ptr = reinterpret_cast(device_data_indices_[device_id]); + CUDASUCCESS_OR_FATAL(cudaMalloc(&data_indices_ptr, num_data_ * sizeof(data_size_t))); + device_data_indices_[device_id] = reinterpret_cast(data_indices_ptr); if (device_gradients_[device_id] != nullptr) { CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id])); } - CUDASUCESS_OR_FATAL(cudaMalloc(&(device_gradients_[device_id]), num_data_)); - if (config_->is_constant_hessian) { + void* gradients_ptr = reinterpret_cast(device_gradients_[device_id]); + CUDASUCCESS_OR_FATAL(cudaMalloc(&gradients_ptr, num_data_ * sizeof(float))); + device_gradients_[device_id] = reinterpret_cast(gradients_ptr); + if (!is_constant_hessian) { if (device_hessians_[device_id] != nullptr) { CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id])); } - CUDASUCESS_OR_FATAL(cudaMalloc(&(device_hessians_[device_id]), num_data_)); + void* hessians_ptr = reinterpret_cast(device_hessians_[device_id]); + CUDASUCCESS_OR_FATAL(cudaMalloc(&hessians_ptr, num_data_ * sizeof(float))); + device_hessians_[device_id] = reinterpret_cast(hessians_ptr); + } + if (device_histograms_[device_id] != nullptr) { + CUDASUCCESS_OR_FATAL(cudaFree(device_histograms_[device_id])); } + void* histograms_ptr = 
reinterpret_cast(device_histograms_[device_id]); + CUDASUCCESS_OR_FATAL(cudaMalloc(&histograms_ptr, num_total_bin * 2 * sizeof(double))); + device_histograms_[device_id] = reinterpret_cast(histograms_ptr); } } void NewCUDATreeLearner::CreateCUDAHistogramConstructors() { + Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors num_gpus_ = %d", num_gpus_); device_histogram_constructors_.resize(num_gpus_); device_leaf_splits_initializers_.resize(num_gpus_); device_best_split_finders_.resize(num_gpus_); - device_splitters_.ressize(num_gpus_); + device_splitters_.resize(num_gpus_); #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int device_id = 0; device_id < num_gpus_; ++device_id) { CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); + Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 1", num_gpus_); device_leaf_splits_initializers_[device_id].reset( - new CUDALeafSplitsInit(device_gradients_[device_id], device_hessians_[device_id])); + new CUDALeafSplitsInit(device_gradients_[device_id], device_hessians_[device_id], num_data_)); + Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 2", num_gpus_); device_histogram_constructors_[device_id].reset( new CUDAHistogramConstructor(device_feature_groups_[device_id], - train_data_, config_->num_leaves, device_histograms_[device_id]))); + train_data_, config_->num_leaves, device_histograms_[device_id])); + Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 3", num_gpus_); device_best_split_finders_[device_id].reset( new CUDABestSplitFinder(device_histogram_constructors_[device_id]->cuda_hist(), train_data_, device_feature_groups_[device_id], config_->num_leaves)); + Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 4", num_gpus_); device_splitters_[device_id].reset( - new CUDADataSplitter(device_data_indices_[device_id], num_data_)); + new CUDADataSplitter(num_data_, config_->num_leaves)); + Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 5", num_gpus_); } #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int device_id = 0; device_id < num_gpus_; ++device_id) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); device_leaf_splits_initializers_[device_id]->Init(); device_histogram_constructors_[device_id]->Init(); + device_splitters_[device_id]->Init(); } + Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 6", num_gpus_); PushDataIntoDeviceHistogramConstructors(); + Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 7", num_gpus_); } void NewCUDATreeLearner::PushDataIntoDeviceHistogramConstructors() { @@ -120,7 +152,7 @@ void NewCUDATreeLearner::PushDataIntoDeviceHistogramConstructors() { for (int group_id : device_feature_groups_[device_id]) { BinIterator* iter = train_data_->FeatureGroupIterator(group_id); iter->Reset(0); - for (const data_size_t data_index = 0; data_index < num_data_; ++data_index) { + for (data_size_t data_index = 0; data_index < num_data_; ++data_index) { const uint32_t bin = static_cast(iter->RawGet(data_index)); cuda_histogram_constructor->PushOneData(bin, group_id, data_index); } @@ -141,9 +173,7 @@ void NewCUDATreeLearner::ConstructHistograms(const std::vector& /*is_fea #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int device_id = 0; device_id < num_gpus_; ++device_id) { CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - device_histogram_constructors_[device_id]->ConstructHistogramForLeaf( - 
device_leaf_splits_initializers_[device_id]->smaller_leaf_index(), - device_leaf_splits_initializers_[device_id]->larger_leaf_index()); + } } @@ -172,6 +202,29 @@ void NewCUDATreeLearner::Split(Tree* /*tree*/, int /*best_leaf*/, } } +Tree* NewCUDATreeLearner::Train(const score_t* gradients, + const score_t *hessians, bool /*is_first_tree*/) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + CUDASUCCESS_OR_FATAL(cudaMemcpy(device_gradients_[0], gradients, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice)); + CUDASUCCESS_OR_FATAL(cudaMemcpy(device_hessians_[0], hessians, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice)); + Log::Warning("before initialization of leaf splits"); + device_leaf_splits_initializers_[0]->Compute(); + Log::Warning("after initialization of leaf splits"); + device_splitters_[0]->BeforeTrain(nullptr); + Log::Warning("after initialization of data indices"); + device_histogram_constructors_[0]->ConstructHistogramForLeaf(device_leaf_splits_initializers_[0]->smaller_leaf_index(), + device_leaf_splits_initializers_[0]->larger_leaf_index(), + device_splitters_[0]->leaf_num_data(), device_splitters_[0]->leaf_num_data_offsets(), + device_splitters_[0]->data_indices(), device_gradients_[0], device_hessians_[0]); + Log::Warning("after construction of root histograms"); +} + +void NewCUDATreeLearner::ResetTrainingData(const Dataset* /*train_data*/, + bool /*is_constant_hessian*/) {} + +void NewCUDATreeLearner::SetBaggingData(const Dataset* /*subset*/, + const data_size_t* /*used_indices*/, data_size_t /*num_data*/) {} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index 91af7f5cf33e..db093fb6b028 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -9,11 +9,13 @@ #ifdef USE_CUDA #include "../serial_tree_learner.h" +#include "cuda_leaf_splits_init.hpp" #include "cuda_histogram_constructor.hpp" +#include "cuda_data_splitter.hpp" namespace LightGBM { -class NewCUDATreeLearner: public SerialTreelearner { +class NewCUDATreeLearner: public SerialTreeLearner { public: explicit NewCUDATreeLearner(const Config* config); @@ -21,10 +23,9 @@ class NewCUDATreeLearner: public SerialTreelearner { void Init(const Dataset* train_data, bool is_constant_hessian) override; - void BeforeTrain() override; + void ResetTrainingData(const Dataset* train_data, + bool is_constant_hessian) override; - void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; - Tree* Train(const score_t* gradients, const score_t *hessians, bool is_first_tree) override; void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override; @@ -32,7 +33,7 @@ class NewCUDATreeLearner: public SerialTreelearner { protected: void AllocateFeatureTasks(); - void AllocateCUDAMemory(); + void AllocateCUDAMemory(const bool is_constant_hessian); void CreateCUDAHistogramConstructors(); @@ -46,6 +47,8 @@ class NewCUDATreeLearner: public SerialTreelearner { void Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) override; + void BeforeTrain() override; + // number of GPUs int num_gpus_; // number of threads on CPU @@ -59,11 +62,13 @@ class NewCUDATreeLearner: public SerialTreelearner { std::vector device_num_workgroups_; // full data indices on CUDA devices, as the data indices of data_partition_ in CPU version - std::vector device_data_indices_; + std::vector 
device_data_indices_; // gradient values on CUDA devices std::vector device_gradients_; // hessian values on CUDA devices std::vector device_hessians_; + // histogram storage on CUDA devices + std::vector device_histograms_; // device leaf splits initializer std::vector> device_leaf_splits_initializers_; diff --git a/src/treelearner/cuda/new_cuda_utils.cpp b/src/treelearner/cuda/new_cuda_utils.cpp new file mode 100644 index 000000000000..58c38d5e6ae9 --- /dev/null +++ b/src/treelearner/cuda/new_cuda_utils.cpp @@ -0,0 +1,20 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#include "new_cuda_utils.hpp" + +namespace LightGBM { + +void SynchronizeCUDADevice() { + CUDASUCCESS_OR_FATAL(cudaDeviceSynchronize()); +} + +void PrintLastCUDAError() { + const char* error_name = cudaGetErrorName(cudaGetLastError()); + Log::Warning(error_name); +} + +} // namespace LightGBM diff --git a/src/treelearner/cuda/new_cuda_utils.hpp b/src/treelearner/cuda/new_cuda_utils.hpp new file mode 100644 index 000000000000..ccb4a4aa02c7 --- /dev/null +++ b/src/treelearner/cuda/new_cuda_utils.hpp @@ -0,0 +1,53 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifndef LIGHTGBM_NEW_CUDA_UTILS_HPP_ +#define LIGHTGBM_NEW_CUDA_UTILS_HPP_ + +#ifdef USE_CUDA + +#include +#include + +namespace LightGBM { + +template +void AllocateCUDAMemory(size_t size, T** out_ptr) { + void* tmp_ptr = nullptr; + CUDASUCCESS_OR_FATAL(cudaMalloc(&tmp_ptr, size * sizeof(T))); + *out_ptr = reinterpret_cast(tmp_ptr); +} + +template +void CopyFromHostToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyHostToDevice)); +} + +template +void CopyFromCUDADeviceToHost(T* dst_ptr, const T* src_ptr, size_t size) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost)); +} + +void SynchronizeCUDADevice(); + +template +void SetCUDAMemory(T* dst_ptr, int value, size_t size) { + CUDASUCCESS_OR_FATAL(cudaMemset(reinterpret_cast(dst_ptr), value, size)); +} + +void PrintLastCUDAError(); + +} // namespace LightGBM + +#endif // USE_CUDA + +#endif // LIGHTGBM_NEW_CUDA_UTILS_HPP_ diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index ed13f646c388..44bc6eeda36b 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -9,6 +9,7 @@ #include "linear_tree_learner.h" #include "parallel_tree_learner.h" #include "serial_tree_learner.h" +#include "cuda/new_cuda_tree_learner.hpp" namespace LightGBM { @@ -40,7 +41,7 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con } } else if (device_type == std::string("cuda")) { if (learner_type == std::string("serial")) { - return new CUDATreeLearner(config); + return new NewCUDATreeLearner(config); } else if (learner_type == std::string("feature")) { return new FeatureParallelTreeLearner(config); } else if (learner_type == std::string("data")) { From 
9b21d2b8e64f517f455919c832267699340ddcef Mon Sep 17 00:00:00 2001
From: Yu Shi
Date: Thu, 29 Apr 2021 06:36:51 +0000
Subject: [PATCH 003/166] before removing multi-gpu

---
 .../cuda/cuda_best_split_finder.cpp           |  69 +++-
 .../cuda/cuda_best_split_finder.cu            | 326 ++++++++++++++++++
 .../cuda/cuda_best_split_finder.hpp           |  96 ++++++
 src/treelearner/cuda/cuda_data_splitter.cpp   |  10 +-
 src/treelearner/cuda/cuda_data_splitter.hpp   |   2 +
 .../cuda/cuda_histogram_constructor.cpp       |  60 +++-
 .../cuda/cuda_histogram_constructor.cu        | 132 +++++--
 .../cuda/cuda_histogram_constructor.hpp       |  43 +--
 src/treelearner/cuda/cuda_leaf_splits.cpp     |  54 +++
 src/treelearner/cuda/cuda_leaf_splits.cu      |  59 ++++
 src/treelearner/cuda/cuda_leaf_splits.hpp     |  57 +++
 .../cuda/cuda_leaf_splits_init.cpp            |  14 +-
 src/treelearner/cuda/cuda_leaf_splits_init.cu |  47 +--
 .../cuda/cuda_leaf_splits_init.hpp            |  12 +-
 .../cuda/new_cuda_tree_learner.cpp            |  17 +-
 .../cuda/new_cuda_tree_learner.hpp            |   7 +-
 src/treelearner/cuda/new_cuda_utils.hpp       |  11 +-
 src/treelearner/serial_tree_learner.cpp       |   3 +
 18 files changed, 908 insertions(+), 111 deletions(-)
 create mode 100644 src/treelearner/cuda/cuda_best_split_finder.cu
 create mode 100644 src/treelearner/cuda/cuda_best_split_finder.hpp
 create mode 100644 src/treelearner/cuda/cuda_leaf_splits.cpp
 create mode 100644 src/treelearner/cuda/cuda_leaf_splits.cu
 create mode 100644 src/treelearner/cuda/cuda_leaf_splits.hpp

diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp
index 72e52adfbc95..c38892535ff5 100644
--- a/src/treelearner/cuda/cuda_best_split_finder.cpp
+++ b/src/treelearner/cuda/cuda_best_split_finder.cpp
@@ -4,14 +4,75 @@
  * license information.
  */
 
-#include "cuda_histogram_constructor.hpp"
+#include "cuda_best_split_finder.hpp"
+#include "cuda_leaf_splits_init.hpp"
 
 namespace LightGBM {
 
-CUDABestSplitFinder::CUDABestSplitFinder(const hist_t* /*cuda_hist*/, const Dataset* /*train_data*/,
-  const std::vector& /*feature_group_ids*/, const int /*max_num_leaves*/) {}
+CUDABestSplitFinder::CUDABestSplitFinder(const hist_t* cuda_hist, const Dataset* train_data,
+  const std::vector& feature_hist_offsets, const int max_num_leaves,
+  const double lambda_l1, const data_size_t min_data_in_leaf, const double min_sum_hessian_in_leaf,
+  const double min_gain_to_split):
+cuda_hist_(cuda_hist), num_features_(train_data->num_features()), max_num_leaves_(max_num_leaves),
+feature_hist_offsets_(feature_hist_offsets), num_total_bin_(feature_hist_offsets.back()), lambda_l1_(lambda_l1),
+min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf), min_gain_to_split_(min_gain_to_split) {
+  feature_missing_type_.resize(num_features_);
+  feature_mfb_offsets_.resize(num_features_);
+  feature_default_bins_.resize(num_features_);
+  for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) {
+    const BinMapper* bin_mapper = train_data->FeatureBinMapper(inner_feature_index);
+    const MissingType missing_type = bin_mapper->missing_type();
+    feature_missing_type_[inner_feature_index] = missing_type;
+    feature_mfb_offsets_[inner_feature_index] = static_cast(bin_mapper->GetMostFreqBin() == 0);
+    feature_default_bins_[inner_feature_index] = bin_mapper->GetDefaultBin();
+  }
+}
 
-void CUDABestSplitFinder::FindBestSplitsForLeaf(const int* /*leaf_id*/) {}
+void CUDABestSplitFinder::Init() {
+  AllocateCUDAMemory(feature_hist_offsets_.size(), &cuda_feature_hist_offsets_);
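// A minimal sketch (hypothetical names and values) of the allocate-then-copy
// pattern used throughout this Init(), assuming the AllocateCUDAMemory /
// CopyFromHostToCUDADevice / SynchronizeCUDADevice helpers introduced in
// new_cuda_utils.hpp in the previous patch; sizes are given in elements and
// the helpers multiply by sizeof(T) internally.
void UploadOffsetsSketch() {
  std::vector<uint32_t> host_offsets = {0, 16, 48, 64};  // per-feature histogram offsets (illustrative)
  uint32_t* cuda_offsets = nullptr;
  AllocateCUDAMemory(host_offsets.size(), &cuda_offsets);                             // cudaMalloc under the hood
  CopyFromHostToCUDADevice(cuda_offsets, host_offsets.data(), host_offsets.size());   // cudaMemcpyHostToDevice
  SynchronizeCUDADevice();  // cudaDeviceSynchronize(); useful when timing or debugging the upload
}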
+  CopyFromHostToCUDADevice(cuda_feature_hist_offsets_, feature_hist_offsets_.data(), feature_hist_offsets_.size());
+
+  AllocateCUDAMemory(feature_mfb_offsets_.size(), &cuda_feature_mfb_offsets_);
+  CopyFromHostToCUDADevice(cuda_feature_mfb_offsets_, feature_mfb_offsets_.data(), feature_mfb_offsets_.size());
+
+  AllocateCUDAMemory(feature_default_bins_.size(), &cuda_feature_default_bins_);
+  CopyFromHostToCUDADevice(cuda_feature_default_bins_, feature_default_bins_.data(), feature_default_bins_.size());
+
+  AllocateCUDAMemory(1, &cuda_num_total_bin_);
+  CopyFromHostToCUDADevice(cuda_num_total_bin_, &num_total_bin_, 1);
+
+  AllocateCUDAMemory(num_features_, &cuda_feature_missing_type_);
+
+  AllocateCUDAMemory(1, &cuda_lambda_l1_);
+  CopyFromHostToCUDADevice(cuda_lambda_l1_, &lambda_l1_, 1);
+
+  AllocateCUDAMemory(num_total_bin_ * 2, &prefix_sum_hist_left_);
+  AllocateCUDAMemory(num_total_bin_ * 2, &prefix_sum_hist_right_);
+
+  // * 2 for smaller and larger leaves, * 2 for default left or not
+  const size_t feature_best_split_info_buffer_size = static_cast(num_features_) * 4;
+  AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_feature_);
+  AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_default_left_);
+  AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_gain_);
+  AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_sum_gradient_);
+  AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_sum_hessian_);
+  AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_count_);
+  AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_sum_gradient_);
+  AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_sum_hessian_);
+  AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_count_);
+
+  AllocateCUDAMemory(1, &cuda_min_data_in_leaf_);
+  CopyFromHostToCUDADevice(cuda_min_data_in_leaf_, &min_data_in_leaf_, 1);
+  AllocateCUDAMemory(1, &cuda_min_sum_hessian_in_leaf_);
+  CopyFromHostToCUDADevice(cuda_min_sum_hessian_in_leaf_, &min_sum_hessian_in_leaf_, 1);
+  AllocateCUDAMemory(1, &cuda_min_gain_to_split_);
+  CopyFromHostToCUDADevice(cuda_min_gain_to_split_, &min_gain_to_split_, 1);
+}
+
+void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplitsInit* smaller_leaf_splits,
+  const CUDALeafSplitsInit* larger_leaf_splits) {
+  LaunchFindBestSplitsForLeafKernel(smaller_leaf_id, larger_leaf_id, parent_gain);
+}
 
 void CUDABestSplitFinder::FindBestFromAllSplits() {}
 
diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu
new file mode 100644
index 000000000000..fdaac8caa8d0
--- /dev/null
+++ b/src/treelearner/cuda/cuda_best_split_finder.cu
@@ -0,0 +1,326 @@
+/*!
+ * Copyright (c) 2021 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for
+ * license information.
+ */ + +#include "cuda_best_split_finder.hpp" + +namespace LightGBM { + +__device__ double ThresholdL1(double s, double l1) { + const double reg_s = fmax(0.0, fabs(s) - l1); + if (s >= 0.0f) { + return reg_s; + } else { + return -reg_s; + } +} + +__device__ double CalculateSplittedLeafOutput(double sum_gradients, + double sum_hessians, double l1, const bool use_l1, + double l2) { + double ret; + if (use_l1) { + ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2); + } else { + ret = -sum_gradients / (sum_hessians + l2); + } + return ret; +} + +__device__ double GetLeafGainGivenOutput(double sum_gradients, + double sum_hessians, double l1, const bool use_l1, + double l2, double output) { + if (use_l1) { + const double sg_l1 = ThresholdL1(sum_gradients, l1); + return -(2.0 * sg_l1 * output + (sum_hessians + l2) * output * output); + } else { + return -(2.0 * sum_gradients * output + + (sum_hessians + l2) * output * output); + } +} + +__device__ double GetLeafGain(double sum_gradients, double sum_hessians, + double l1, const bool use_l1, double l2) { + if (use_l1) { + const double sg_l1 = ThresholdL1(sum_gradients, l1); + return (sg_l1 * sg_l1) / (sum_hessians + l2); + } else { + return (sum_gradients * sum_gradients) / (sum_hessians + l2); + } +} + +__device__ double GetSplitGains(double sum_left_gradients, + double sum_left_hessians, + double sum_right_gradients, + double sum_right_hessians, + double l1, const bool use_l1, double l2) { + return GetLeafGain(sum_left_gradients, + sum_left_hessians, + l1, use_l1, l2) + + GetLeafGain(sum_right_gradients, + sum_right_hessians, + l1, use_l1, l2); +} + +__device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, + const uint32_t feature_num_bin, const uint8_t feature_mfb_offset, + const uint32_t feature_default_bin, const uint8_t feature_missing_type, + const double lambda_l1, const double parent_gain, const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const double sum_gradients, const double sum_hessians, const data_size_t num_data, + const bool reverse, const bool skip_default_bin, const bool na_as_missing, + // output parameters + double* output_gain, + uint8_t* output_default_left, + double* output_left_sum_gradients, + double* output_left_sum_hessians, + data_size_t* output_left_num_data, + double* output_right_sum_gradients, + double* output_right_sum_hessians, + data_size_t* output_right_num_data) { + + double best_sum_left_gradient = NAN; + double best_sum_left_hessian = NAN; + double best_gain = kMinScore; + data_size_t best_left_count = 0; + uint32_t best_threshold = feature_num_bin; + const double cnt_factor = num_data / sum_hessian; + const bool use_l1 = lambda_l1 > 0.0f; + + if (reverse) { + double sum_right_gradient = 0.0f; + double sum_right_hessian = kEpsilon; + data_size_t right_count = 0; + + int t = feature_num_bin - 1 - feature_mfb_offset - NA_AS_MISSING; + const int t_end = 1 - feature_mfb_offset; + + // from right to left, and we don't need data in bin0 + for (; t >= t_end; --t) { + // need to skip default bin + if (skip_default_bin) { + if ((t + feature_mfb_offset) == static_cast(feature_default_bin)) { + continue; + } + } + const auto grad = GET_GRAD(feature_hist_ptr, t); + const auto hess = GET_HESS(feature_hist_ptr, t); + data_size_t cnt = + static_cast(Common::RoundInt(hess * cnt_factor)); + sum_right_gradient += grad; + sum_right_hessian += hess; + right_count += cnt; + // if data not enough, or sum hessian too small + if (right_count < min_data_in_leaf || + 
sum_right_hessian < min_sum_hessian_in_leaf) { + continue; + } + data_size_t left_count = num_data - right_count; + // if data not enough + if (left_count < min_data_in_leaf) { + break; + } + + double sum_left_hessian = sum_hessian - sum_right_hessian; + // if sum hessian too small + if (sum_left_hessian < min_sum_hessian_in_leaf) { + break; + } + + double sum_left_gradient = sum_gradient - sum_right_gradient; + + // current split gain + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, + lambda_l2); + // gain with split is worse than without split + if (current_gain <= min_gain_shift) { + continue; + } + + // better split point + if (current_gain > best_gain) { + best_left_count = left_count; + best_sum_left_gradient = sum_left_gradient; + best_sum_left_hessian = sum_left_hessian; + // left is <= threshold, right is > threshold. so this is t-1 + best_threshold = static_cast(t - 1 + feature_mfb_offset); + best_gain = current_gain; + } + } + } else { + double sum_left_gradient = 0.0f; + double sum_left_hessian = kEpsilon; + data_size_t left_count = 0; + + int t = 0; + const int t_end = feature_num_bin - 2 - feature_mfb_offset; + + if (na_as_missing) { + if (feature_mfb_offset == 1) { + sum_left_gradient = sum_gradient; + sum_left_hessian = sum_hessian - kEpsilon; + left_count = num_data; + for (int i = 0; i < feature_num_bin - feature_mfb_offset; ++i) { + const auto grad = GET_GRAD(feature_hist_ptr, i); + const auto hess = GET_HESS(feature_hist_ptr, i); + data_size_t cnt = + static_cast(Common::RoundInt(hess * cnt_factor)); + sum_left_gradient -= grad; + sum_left_hessian -= hess; + left_count -= cnt; + } + t = -1; + } + } + + for (; t <= t_end; ++t) { + if (skip_default_bin) { + if ((t + feature_mfb_offset) == static_cast(feature_default_bin)) { + continue; + } + } + if (t >= 0) { + sum_left_gradient += GET_GRAD(feature_hist_ptr, t); + const hist_t* hess = GET_HESS(feature_hist_ptr, t); + sum_left_hessian += hess; + left_count += static_cast( + Common::RoundInt(hess * cnt_factor)); + } + // if data not enough, or sum hessian too small + if (left_count < min_data_in_leaf || + sum_left_hessian < min_sum_hessian_in_leaf) { + continue; + } + data_size_t right_count = num_data - left_count; + // if data not enough + if (right_count < min_data_in_leaf) { + break; + } + + double sum_right_hessian = sum_hessian - sum_left_hessian; + // if sum hessian too small + if (sum_right_hessian < min_sum_hessian_in_leaf) { + break; + } + + double sum_right_gradient = sum_gradient - sum_left_gradient; + + // current split gain + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, + lambda_l2); + // gain with split is worse than without split + if (current_gain <= min_gain_shift) { + continue; + } + + // better split point + if (current_gain > best_gain) { + best_left_count = left_count; + best_sum_left_gradient = sum_left_gradient; + best_sum_left_hessian = sum_left_hessian; + best_threshold = static_cast(t + feature_mfb_offset); + best_gain = current_gain; + } + } + } +} + +__global__ void FindBestSplitsForLeafKernel(const hist_t* leaf_hist_ptr, + const uint32_t* feature_hist_offsets, const uint8_t* feature_mfb_offsets, const uint32_t* feature_default_bins, + const uint8_t* feature_missing_types, const double* lambda_l1, const int* smaller_leaf_id, + const int* larger_leaf_id, const double* smaller_leaf_gain, const double* larger_leaf_gain, 
const double* sum_gradients_in_smaller_leaf, + const double* sum_hessians_in_smaller_leaf, const data_size_t* num_data_in_smaller_leaf, + const double* sum_gradients_in_larger_leaf, const double* sum_hessians_in_larger_leaf, + const data_size_t* num_data_in_larger_leaf, const data_size_t* min_data_in_leaf, + const double* min_sum_hessian_in_leaf, const double* min_gain_to_split, + // output + uint8_t* cuda_best_split_default_left, double* cuda_best_split_gain, double* cuda_best_split_left_sum_gradient, + double* cuda_best_split_left_sum_hessian, data_size_t* cuda_best_split_left_count, + double* cuda_best_split_right_sum_gradient, double* cuda_best_split_right_sum_hessian, + data_size_t* cuda_best_split_right_count) { + const unsigned int num_features = blockDim.x / 2; + const unsigned int inner_feature_index = blockIdx.x % num_features; + const unsigned int threadIdx = threadIdx.x; + const unsigned int global_threadIdx = threadIdx + blockIdx.x * blockDim.x; + const bool reverse = threadIdx == 0 ? true : false; + const bool smaller_or_larger_leaf = static_cast(blockIdx.x / num_features); + const int num_bin = feature_hist_offsets[inner_feature_index + 1] - feature_hist_offsets[inner_feature_index]; + const uint8_t missing_type = feature_missing_type[inner_feature_index]; + const int leaf_index = smaller_or_larger ? *smaller_leaf_id : *larger_leaf_id; + const double parent_gain = smaller_or_larger ? *smaller_leaf_gain : *larger_leaf_gain; + const double sum_gradients = smaller_or_larger ? *sum_gradients_in_smaller_leaf : *sum_gradients_in_larger_leaf; + const double sum_hessians = smaller_or_larger ? *sum_hessians_in_smaller_leaf : *sum_hessians_in_larger_leaf; + const double num_data_in_leaf = smaller_or_larger ? *num_data_in_smaller_leaf : *num_data_in_larger_leaf; + double* out_left_sum_gradients = cuda_best_split_left_sum_gradient + global_threadIdx; + double* out_left_sum_hessians = cuda_best_split_left_sum_hessian + global_threadIdx; + double* out_right_sum_gradients = cuda_best_split_right_sum_gradient + global_threadIdx; + double* out_right_sum_hessians = cuda_best_split_right_sum_hessian + global_threadIdx; + data_size_t* out_left_num_data = cuda_best_split_left_count + global_threadIdx; + data_size_t* out_right_num_data = cuda_best_split_right_count + global_threadIdx; + uint8_t* out_default_left = cuda_best_split_default_left + global_threadIdx; + double* out_gain = cuda_best_split_gain + global_threadIdx; + if (num_bin > 2 && missing_type != 0) { + if (missing_type == 1) { + FindBestSplitsForLeafKernelInner(leaf_hist_ptr + leaf_index, + num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], + feature_missing_types[inner_feature_index], *lambda_l1, *parent_gain, + *min_data_in_leaf, *min_sum_hessian_in_leaf, sum_gradients, sum_hessians, + num_data_in_leaf, reverse, true, false, out_gain, out_default_left, + out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, + out_right_sum_gradients, out_right_sum_hessians, out_right_num_data); + } else { + FindBestSplitsForLeafKernelInner(leaf_hist_ptr + leaf_index, + num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], + feature_missing_types[inner_feature_index], *lambda_l1, *parent_gain, + *min_data_in_leaf, *min_sum_hessian_in_leaf, sum_gradients, sum_hessians, + num_data_in_leaf, reverse, false, true, out_gain, out_default_left, + out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, + out_right_sum_gradients, out_right_sum_hessians, 
out_right_num_data); + } + } else { + if (reverse) { + FindBestSplitsForLeafKernelInner(leaf_hist_ptr + leaf_index, + num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], + feature_missing_types[inner_feature_index], *lambda_l1, *parent_gain, + *min_data_in_leaf, *min_sum_hessian_in_leaf, sum_gradients, sum_hessians, + num_data_in_leaf, reverse, true, false, out_gain, out_default_left, + out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, + out_right_sum_gradients, out_right_sum_hessians, out_right_num_data); + } + if (missing_type == 2) { + *out_default_left = 0; + } + } +} + +void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel(const int* smaller_leaf_id, const int* larger_leaf_id, + const double* smaller_leaf_gain, const double* larger_leaf_gain, const double* sum_gradients_in_smaller_leaf, + const double* sum_hessians_in_smaller_leaf, const data_size_t* num_data_in_smaller_leaf, + const double* sum_gradients_in_larger_leaf, const double* sum_hessians_in_larger_leaf, + const data_size_t* num_data_in_larger_leaf, const data_size_t* min_data_in_leaf, + const double* min_sum_hessian_in_leaf) { + const int leaf_id_ref = *leaf_id; + const int num_total_bin_ref = *num_total_bin_; + // * 2 for smaller and larger leaves, * 2 for split direction + const int num_blocks = num_features_ * 2; + FindBestSplitsForLeafKernel<<>>(cuda_hist_, cuda_feature_hist_offsets_, + cuda_feature_mfb_offsets_, cuda_feature_default_bins_, + cuda_feature_missing_type_, cuda_lambda_l1_, + smaller_leaf_id, larger_leaf_id, smaller_leaf_gain, larger_leaf_gain, + sum_gradients_in_smaller_leaf, sum_hessians_in_smaller_leaf, num_data_in_smaller_leaf, + sum_gradients_in_larger_leaf, sum_hessians_in_larger_leaf, num_data_in_larger_leaf, + cuda_min_data_in_leaf_, cuda_min_sum_hessian_in_leaf_, cuda_min_gain_to_split, + + cuda_best_split_default_left_, cuda_best_split_gain_, + cuda_best_split_left_sum_gradient_, cuda_best_split_left_sum_hessian_, + cuda_best_split_left_count_, cuda_best_split_right_sum_gradient_, + cuda_best_split_right_sum_hessian_, cuda_best_split_right_count_); +} + +} // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp new file mode 100644 index 000000000000..50cd7e7c7fb1 --- /dev/null +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -0,0 +1,96 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ +#ifndef LIGHTGBM_CUDA_BEST_SPLIT_FINDER_HPP_ +#define LIGHTGBM_CUDA_BEST_SPLIT_FINDER_HPP_ + +#include "new_cuda_utils.hpp" + +#include +#include + +#include + +#ifdef USE_CUDA + +namespace LightGBM { + +class CUDABestSplitFinder { + public: + CUDABestSplitFinder(const hist_t* cuda_hist, const Dataset* train_data, + const std::vector& feature_hist_offsets, const int max_num_leaves); + + void Init(); + + void FindBestSplitsForLeaf(const int* smaller_leaf_id, const int* larger_leaf_id, const double* parent_gain); + + void FindBestFromAllSplits(); + + int* best_leaf() { return cuda_best_leaf_; } + + int* best_split_feature_index() { return cuda_best_split_feature_index_; } + + int* best_split_threshold() { return cuda_best_split_threshold_; } + + private: + void LaunchFindBestSplitsForLeafKernel(const int* smaller_leaf_id, const int* larger_leaf_id, const double* parent_gain); + + int* cuda_leaf_best_split_feature_index_; + int* cuda_leaf_best_split_threshold_; + double* cuda_leaf_best_split_gain_; + + int* cuda_best_leaf_; + int* cuda_best_split_feature_index_; + int* cuda_best_split_threshold_; + + double* cuda_leaf_best_split_gain_; + int* cuda_leaf_best_split_feature_; + int* cuda_leaf_best_split_threshold_; + + int* cuda_best_split_feature_; + uint8_t* cuda_best_split_default_left_; + double* cuda_best_split_gain_; + double* cuda_best_split_left_sum_gradient_; + double* cuda_best_split_left_sum_hessian_; + data_size_t* cuda_best_split_left_count_; + double* cuda_best_split_right_sum_gradient_; + double* cuda_best_split_right_sum_hessian_; + data_size_t* cuda_best_split_right_count_; + + const hist_t* cuda_hist_; + hist_t* prefix_sum_hist_left_; + hist_t* prefix_sum_hist_right_; + const int num_features_; + const int max_num_leaves_; + const int num_total_bin_; + + int* cuda_num_total_bin_; + + std::vector feature_hist_offsets_; + std::vector feature_mfb_offsets_; + std::vector feature_default_bins_; + + // None --> 0, Zero --> 1, NaN --> 2 + std::vector feature_missing_type_; + const double lambda_l1_; + const data_size_t min_data_in_leaf_; + const double min_sum_hessian_in_leaf_; + const double min_gain_to_split_; + + uint32_t* cuda_feature_hist_offsets_; + uint8_t* cuda_feature_mfb_offsets_; + uint32_t* cuda_feature_default_bins_; + uint8_t* cuda_feature_missing_type_; + double* cuda_lambda_l1_; + data_size_t* cuda_min_data_in_leaf_; + double* cuda_min_sum_hessian_in_leaf_; + double* cuda_min_gain_to_split_; +}; + + +} + +#endif // USE_CUDA +#endif // LIGHTGBM_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ diff --git a/src/treelearner/cuda/cuda_data_splitter.cpp b/src/treelearner/cuda/cuda_data_splitter.cpp index c61c13b2b847..bf00496daa9a 100644 --- a/src/treelearner/cuda/cuda_data_splitter.cpp +++ b/src/treelearner/cuda/cuda_data_splitter.cpp @@ -34,13 +34,17 @@ void CUDADataSplitter::BeforeTrain(const data_size_t* data_indices) { SynchronizeCUDADevice(); data_indices_.resize(num_data_); CopyFromCUDADeviceToHost(data_indices_.data(), cuda_data_indices_, static_cast(num_data_)); - for (int i = 0; i < 100; ++i) { + /*for (int i = 0; i < 100; ++i) { Log::Warning("data_indices_[%d] = %d", i, data_indices_[i]); Log::Warning("data_indices_[end - %d] = %d", i, data_indices_[num_data_ - 1 - i]); - } + }*/ SetCUDAMemory(cuda_leaf_num_data_offsets_, 0, max_num_leaves_); SetCUDAMemory(cuda_leaf_num_data_, 0, max_num_leaves_); - SetCUDAMemory(cuda_leaf_num_data_, num_data_, 1); + //Log::Warning("num_data_ = %d", num_data_); + CopyFromHostToCUDADevice(cuda_leaf_num_data_, &num_data_, 1); + data_size_t 
root_leaf_num_data = 0; + CopyFromCUDADeviceToHost(&root_leaf_num_data, cuda_leaf_num_data_, 1); + //Log::Warning("root_leaf_num_data = %d", root_leaf_num_data); } else { Log::Fatal("bagging is not supported by GPU"); } diff --git a/src/treelearner/cuda/cuda_data_splitter.hpp b/src/treelearner/cuda/cuda_data_splitter.hpp index 0abc728b5b1b..5610e6ca3306 100644 --- a/src/treelearner/cuda/cuda_data_splitter.hpp +++ b/src/treelearner/cuda/cuda_data_splitter.hpp @@ -36,6 +36,8 @@ class CUDADataSplitter { // kernel launch functions void LaunchFillDataIndicesBeforeTrain(); + void LaunchSplitKernel(const int* leaf_id, const int* best_split_feature, const int* best_split_threshold); + // CPU const data_size_t num_data_; std::vector data_indices_; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 19d16edd7a36..544689b2addb 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -8,6 +8,8 @@ #include "cuda_histogram_constructor.hpp" +#include + namespace LightGBM { CUDAHistogramConstructor::CUDAHistogramConstructor(const std::vector& feature_group_ids, @@ -16,12 +18,19 @@ CUDAHistogramConstructor::CUDAHistogramConstructor(const std::vector& featu num_feature_groups_(feature_group_ids.size()), max_num_leaves_(max_num_leaves) { int offset = 0; + int col_group_offset = 0; for (size_t i = 0; i < feature_group_ids.size(); ++i) { const int group_id = feature_group_ids[i]; feature_group_bin_offsets_.emplace_back(offset); + feature_group_bin_offsets_by_col_groups_.emplace_back(col_group_offset); offset += train_data->FeatureGroupNumBin(group_id); + col_group_offset += train_data->FeatureGroupNumBin(group_id); + if ((i + 1) % NUM_FEATURE_PER_THREAD_GROUP == 0) { + col_group_offset = 0; + } } feature_group_bin_offsets_.emplace_back(offset); + feature_group_bin_offsets_by_col_groups_.emplace_back(col_group_offset); num_total_bin_ = offset; cuda_hist_ = cuda_hist; } @@ -45,6 +54,21 @@ void CUDAHistogramConstructor::Init() { AllocateCUDAMemory(1, &cuda_num_feature_groups_); CopyFromHostToCUDADevice(cuda_num_feature_groups_, &num_feature_groups_, 1); + + AllocateCUDAMemory(feature_group_bin_offsets_.size(), &cuda_feature_group_bin_offsets_); + CopyFromHostToCUDADevice(cuda_feature_group_bin_offsets_, + feature_group_bin_offsets_.data(), + feature_group_bin_offsets_.size()); + AllocateCUDAMemory(feature_group_bin_offsets_by_col_groups_.size(), &cuda_feature_group_bin_offsets_by_col_groups_); + CopyFromHostToCUDADevice(cuda_feature_group_bin_offsets_by_col_groups_, + feature_group_bin_offsets_by_col_groups_.data(), + feature_group_bin_offsets_by_col_groups_.size()); + /*for (size_t i = 0; i < feature_group_bin_offsets_.size(); ++i) { + Log::Warning("feature_group_bin_offsets_[%d] = %d", i, feature_group_bin_offsets_[i]); + } + for (size_t i = 0; i < feature_group_bin_offsets_by_col_groups_.size(); ++i) { + Log::Warning("feature_group_bin_offsets_by_col_groups_[%d] = %d", i, feature_group_bin_offsets_by_col_groups_[i]); + }*/ } void CUDAHistogramConstructor::PushOneData(const uint32_t feature_bin_value, @@ -65,19 +89,37 @@ void CUDAHistogramConstructor::FinishLoad() { void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* smaller_leaf_index, const int* /*larger_leaf_index*/, const data_size_t* num_data_in_leaf, const data_size_t* leaf_data_offset, const data_size_t* data_indices_ptr, - const score_t* cuda_gradients, const score_t* cuda_hessians) { + const 
score_t* cuda_gradients, const score_t* cuda_hessians, const score_t* cuda_gradients_and_hessians) { + //auto start = std::chrono::steady_clock::now(); + + /*for (size_t i = 0; i < feature_group_bin_offsets_.size(); ++i) { + Log::Warning("feature_group_bin_offsets_[%d] = %d", i, feature_group_bin_offsets_[i]); + }*/ + AllocateCUDAMemory(num_data_, &cuda_int_gradients_); + AllocateCUDAMemory(num_data_, &cuda_int_hessians_); + AllocateCUDAMemory(num_data_, &cuda_int_gradients_and_hessians_); + SetCUDAMemory(cuda_int_gradients_, 3, num_data_); + SetCUDAMemory(cuda_int_hessians_, 3, num_data_); + SetCUDAMemory(cuda_int_gradients_and_hessians_, 3, num_data_); + /*for (size_t i = 0; i < feature_group_bin_offsets_by_col_groups_.size(); ++i) { + Log::Warning("feature_group_bin_offsets_by_col_groups_[%d] = %d", i, feature_group_bin_offsets_by_col_groups_[i]); + }*/ + auto start = std::chrono::steady_clock::now(); LaunchConstructHistogramKernel(smaller_leaf_index, num_data_in_leaf, leaf_data_offset, data_indices_ptr, - cuda_gradients, cuda_hessians); + cuda_gradients, cuda_hessians, cuda_gradients_and_hessians); SynchronizeCUDADevice(); + auto end = std::chrono::steady_clock::now(); + double duration = (static_cast>(end - start)).count(); + Log::Warning("LaunchConstructHistogramKernel time %f", duration); PrintLastCUDAError(); - Log::Warning("histogram construction finished"); - Log::Warning("num_total_bin_ = %d", num_total_bin_); - Log::Warning("max_num_leaves_ = %d", max_num_leaves_); - std::vector cpu_hist(200, 0.0f); - CopyFromCUDADeviceToHost(cpu_hist.data(), cuda_hist_, 200); - for (int i = 0; i < 100; ++i) { + //Log::Warning("histogram construction finished"); + //Log::Warning("num_total_bin_ = %d", num_total_bin_); + //Log::Warning("max_num_leaves_ = %d", max_num_leaves_); + std::vector cpu_hist(6143 * 2, 0.0f); + CopyFromCUDADeviceToHost(cpu_hist.data(), cuda_hist_, 6143 * 2); + /*for (int i = 0; i < 6143; ++i) { Log::Warning("bin %d grad %f hess %f", i, cpu_hist[2 * i], cpu_hist[2 * i + 1]); - } + }*/ } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 276fd0b33812..6226470ff2d5 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -10,71 +10,140 @@ namespace LightGBM { -#define SHRAE_HIST_SIZE (6144) -#define NUM_DATA_PER_THREAD (1600) -#define NUM_FEATURE_PER_THREAD_GROUP (12) - __global__ void CUDAConstructHistogramKernel(const int* leaf_index, const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t* data_indices_ptr, hist_t* feature_histogram, const int* num_feature_groups, - const int* leaf_num_data_offset, const uint8_t* data, const data_size_t* num_data_in_leaf) { + const int* leaf_num_data_offset, const uint8_t* data, const data_size_t* num_data_in_leaf, + const uint32_t* feature_group_offsets_by_col_group, + const uint32_t* feature_group_offsets, + const score_t* cuda_gradients_and_hessians, + const int8_t* cuda_int_gradients, + const int8_t* cuda_int_hessians, + const int32_t* cuda_int_gradients_and_hessians) { const unsigned int threadIdx_x = threadIdx.x; - if (threadIdx_x == 0) { + /*if (threadIdx_x == 0) { printf("CUDAConstructHistogramKernel step 0\n"); - } + }*/ const int num_feature_groups_ref = *num_feature_groups; - if (threadIdx_x == 0) { + /*if (threadIdx_x == 0) { printf("CUDAConstructHistogramKernel step 1\n"); - } + }*/ const int leaf_index_ref = *leaf_index; - if (threadIdx_x == 0) { + /*if (threadIdx_x 
== 0) { printf("CUDAConstructHistogramKernel step 2\n"); - } + }*/ const int num_data_in_smaller_leaf_ref = *(num_data_in_leaf + leaf_index_ref); - if (threadIdx_x == 0) { + /*if (threadIdx_x == 0) { printf("CUDAConstructHistogramKernel step 3\n"); - } + }*/ const int leaf_num_data_in_smaller_leaf_ref = *(leaf_num_data_offset + leaf_index_ref); - printf("num_feature_groups_ref = %d", num_feature_groups_ref); - printf("leaf_index_ref = %d", leaf_index_ref); - printf("num_data_in_smaller_leaf_ref = %d", num_data_in_smaller_leaf_ref); - printf("leaf_num_data_in_smaller_leaf_ref = %d", leaf_num_data_in_smaller_leaf_ref); + /*printf("num_feature_groups_ref = %d\n", num_feature_groups_ref); + printf("leaf_index_ref = %d\n", leaf_index_ref); + printf("num_data_in_smaller_leaf_ref = %d\n", num_data_in_smaller_leaf_ref); + printf("leaf_num_data_in_smaller_leaf_ref = %d\n", leaf_num_data_in_smaller_leaf_ref);*/ const data_size_t* data_indices_in_smaller_leaf = data_indices_ptr + leaf_num_data_in_smaller_leaf_ref; - //__shared__ double shared_hist[SHRAE_HIST_SIZE]; // 256 * 24, can use 12 features + __shared__ float shared_hist[SHRAE_HIST_SIZE]; // 256 * 24 * 2, can use 24 features + //__shared__ int32_t shared_int_hist[SHRAE_HIST_SIZE]; + //uint32_t bin_offset = feature_group_offsets[blockIdx.x * 12]; + //const uint32_t next_feature_group_start = (blockIdx.x + 1) * 12; + //const uint32_t next_col_group_first_feature = next_feature_group_start > 28 ? 28 : next_feature_group_start; + uint32_t num_bins_in_col_group = feature_group_offsets[blockDim.x]; + const uint32_t num_items_per_thread = (2 * num_bins_in_col_group + NUM_THRADS_PER_BLOCK - 1) / NUM_THRADS_PER_BLOCK; + const int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + const uint32_t thread_start = thread_idx * num_items_per_thread; + const uint32_t thread_end = thread_start + num_items_per_thread > num_bins_in_col_group * 2 ? + num_bins_in_col_group * 2 : thread_start + num_items_per_thread; + for (uint32_t i = thread_start; i < thread_end; ++i) { + shared_hist[i] = 0.0f; + //shared_int_hist[i] = 0; + } + __syncthreads(); + + /*if (blockIdx.x == 0 && threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.y == 0) { + printf("num_data_in_leaf = %d\n", num_data_in_smaller_leaf_ref); + printf("num_feature_groups_ref = %d", num_feature_groups_ref); + }*/ const unsigned int threadIdx_y = threadIdx.y; const unsigned int blockIdx_x = blockIdx.x; + const unsigned int blockIdx_y = blockIdx.y; const unsigned int blockDim_x = blockDim.x; const unsigned int offset = threadIdx_x % 2; - if (threadIdx_x < 24) { - const int feature_group_index = threadIdx_x / 2 + blockIdx_x * blockDim_x / 8 * 3; - const data_size_t start = threadIdx_y * NUM_DATA_PER_THREAD; + //if ((threadIdx_x < 24 && blockIdx_x < 2) || (threadIdx_x < 8 && blockIdx_x == 2)) { + //const int feature_group_index = threadIdx_x / 2 + blockIdx_x * blockDim_x / 8 * 3; + /*if (feature_group_index >= 28) { + printf("error feature_group_index = %d\n", feature_group_index); + }*/ + const data_size_t start = threadIdx_y * NUM_DATA_PER_THREAD + blockIdx_y * blockDim.y * NUM_DATA_PER_THREAD; const data_size_t end = start + NUM_DATA_PER_THREAD > num_data_in_smaller_leaf_ref ? 
num_data_in_smaller_leaf_ref : start + NUM_DATA_PER_THREAD; - if (offset == 0) { + /*if (offset == 0) { // handle gradient for (data_size_t i = start; i < end; ++i) { const score_t gradient = cuda_gradients[i]; const data_size_t data_index = data_indices_in_smaller_leaf[i]; - const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + feature_group_index]); - feature_histogram[bin << 1] += gradient; + if (data_index != i) { + printf("error data_index = %d vs i = %d", data_index, i); + } + const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + feature_group_index]) + + feature_group_offsets_by_col_group[feature_group_index]; + //shared_hist[bin << 1] += gradient; + atomicAdd_system(shared_hist + (bin << 1), gradient); } } else { // handle hessian for (data_size_t i = start; i < end; ++i) { const score_t hessian = cuda_hessians[i]; const data_size_t data_index = data_indices_in_smaller_leaf[i]; - const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + feature_group_index]); - feature_histogram[(bin << 1) + 1] += hessian; + const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + feature_group_index]) + + feature_group_offsets_by_col_group[feature_group_index]; + //shared_hist[(bin << 1) + 1] += hessian; + atomicAdd_system(shared_hist + ((bin << 1) + 1), hessian); } + }*/ + for (data_size_t i = start; i < end; ++i) { + const score_t grad = cuda_gradients[i]; + const score_t hess = cuda_hessians[i]; + const data_size_t data_index = data_indices_in_smaller_leaf[i]; + const uint32_t bin = static_cast(data[i * num_feature_groups_ref + threadIdx_x]) + + feature_group_offsets[threadIdx_x]; + const uint32_t pos = bin << 1; + float* pos_ptr = shared_hist + pos; + atomicAdd_system(pos_ptr, grad); + atomicAdd_system(pos_ptr + 1, hess); + //const int32_t grad = static_cast(cuda_int_gradients[i]); + //const int32_t hess = static_cast(cuda_int_hessians[i]); + /*const int32_t grad_and_hess = cuda_int_gradients_and_hessians[i]; + const data_size_t data_index = data_indices_in_smaller_leaf[i]; + const uint32_t bin = static_cast(data[i * num_feature_groups_ref + threadIdx_x]) + + feature_group_offsets[threadIdx_x]; + //const uint32_t pos = bin << 1; + int32_t* pos_ptr = shared_int_hist + bin; + atomicAdd_system(pos_ptr, grad_and_hess);*/ + //atomicAdd_system(pos_ptr + 1, hess); } + //} + __syncthreads(); + /*uint32_t bin_offset = feature_group_offsets[blockIdx.x * 12]; + const uint32_t next_feature_group_start = (blockIdx.x + 1) * 12; + const uint32_t next_col_group_first_feature = next_feature_group_start > 28 ? 28 : next_feature_group_start; + uint32_t num_bins_in_col_group = feature_group_offsets[next_col_group_first_feature] - bin_offset; + const uint32_t num_items_per_thread = (2 * num_bins_in_col_group + 1023) / 1024; + const int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + const uint32_t thread_start = thread_idx * num_items_per_thread; + const uint32_t thread_end = thread_start + num_items_per_thread > num_bins_in_col_group * 2 ? 
+ num_bins_in_col_group * 2 : thread_start + num_items_per_thread;*/ + for (uint32_t i = thread_start; i < thread_end; ++i) { + //feature_histogram[i + bin_offset * 2] += shared_hist[thread_idx]; + atomicAdd_system(feature_histogram + i, shared_hist[i]); } } void CUDAHistogramConstructor::LaunchConstructHistogramKernel( const int* smaller_leaf_index, const data_size_t* num_data_in_leaf, const data_size_t* leaf_num_data_offset, - const data_size_t* data_indices_ptr, const score_t* cuda_gradients, const score_t* cuda_hessians) { - const int block_dim_x = 32; - const int block_dim_y = 1024 / block_dim_x; + const data_size_t* data_indices_ptr, const score_t* cuda_gradients, const score_t* cuda_hessians, + const score_t* cuda_gradients_and_hessians) { + const int block_dim_x = 28; + const int block_dim_y = NUM_THRADS_PER_BLOCK / block_dim_x; const int grid_dim_y = ((num_data_ + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + block_dim_y - 1) / block_dim_y; const int grid_dim_x = (static_cast(num_feature_groups_ + NUM_FEATURE_PER_THREAD_GROUP - 1) / NUM_FEATURE_PER_THREAD_GROUP); Log::Warning("block_dim_x = %d, block_dim_y = %d", block_dim_x, block_dim_y); @@ -82,7 +151,10 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); CUDAConstructHistogramKernel<<>>(smaller_leaf_index, cuda_gradients, cuda_hessians, - data_indices_ptr, cuda_hist_, cuda_num_feature_groups_, leaf_num_data_offset, cuda_data_, num_data_in_leaf); + data_indices_ptr, cuda_hist_, cuda_num_feature_groups_, leaf_num_data_offset, cuda_data_, num_data_in_leaf, + cuda_feature_group_bin_offsets_by_col_groups_, + cuda_feature_group_bin_offsets_, cuda_gradients_and_hessians, cuda_int_gradients_, cuda_int_hessians_, + cuda_int_gradients_and_hessians_); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 600fde1aee42..d4725f677255 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -17,6 +17,12 @@ namespace LightGBM { + +#define SHRAE_HIST_SIZE (6144 * 2) +#define NUM_DATA_PER_THREAD (400) +#define NUM_THRADS_PER_BLOCK (504) +#define NUM_FEATURE_PER_THREAD_GROUP (28) + class CUDAHistogramConstructor { public: CUDAHistogramConstructor(const std::vector& feature_group_ids, @@ -31,11 +37,12 @@ class CUDAHistogramConstructor { void ConstructHistogramForLeaf(const int* smaller_leaf_index, const int* larger_leaf_index, const data_size_t* num_data_in_smaller_leaf, const int* smaller_leaf_data_offset, const data_size_t* data_indices_ptr, - const score_t* cuda_gradients, const score_t* cuda_hessians); + const score_t* cuda_gradients, const score_t* cuda_hessians, const score_t* cuda_gradients_and_hessians); void LaunchConstructHistogramKernel( const int* smaller_leaf_index, const data_size_t* num_data_in_leaf, const data_size_t* leaf_num_data_offset, - const data_size_t* data_indices_ptr, const score_t* cuda_gradients, const score_t* cuda_hessians); + const data_size_t* data_indices_ptr, const score_t* cuda_gradients, const score_t* cuda_hessians, + const score_t* cuda_gradients_and_hessians); hist_t* cuda_hist() { return cuda_hist_; } @@ -43,42 +50,22 @@ class CUDAHistogramConstructor { // data on CPU, stored in row-wise style std::vector cpu_data_; std::vector feature_group_bin_offsets_; + std::vector feature_group_bin_offsets_by_col_groups_; uint8_t* cuda_data_; - uint32_t 
cuda_feature_group_bin_offsets_; + uint32_t* cuda_feature_group_bin_offsets_; + uint32_t* cuda_feature_group_bin_offsets_by_col_groups_; const data_size_t num_data_; hist_t* cuda_hist_; int num_total_bin_; int* cuda_num_total_bin_; int num_feature_groups_; int* cuda_num_feature_groups_; + int8_t* cuda_int_gradients_; + int8_t* cuda_int_hessians_; + int32_t* cuda_int_gradients_and_hessians_; const int max_num_leaves_; }; -class CUDABestSplitFinder { - public: - CUDABestSplitFinder(const hist_t* cuda_hist, const Dataset* train_data, - const std::vector& feature_group_ids, const int max_num_leaves); - - void FindBestSplitsForLeaf(const int* leaf_id); - - void FindBestFromAllSplits(); - - int* best_leaf() { return cuda_best_leaf_; } - - int* best_split_feature_index() { return cuda_best_split_feature_index_; } - - int* best_split_threshold() { return cuda_best_split_threshold_; } - - private: - int* cuda_leaf_best_split_feature_index_; - int* cuda_leaf_best_split_threshold_; - double* cuda_leaf_best_split_gain_; - - int* cuda_best_leaf_; - int* cuda_best_split_feature_index_; - int* cuda_best_split_threshold_; -}; - } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp new file mode 100644 index 000000000000..5776ebd54866 --- /dev/null +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -0,0 +1,54 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#include "cuda_leaf_splits.hpp" + +namespace LightGBM { + +CUDALeafSplits::CUDALeafSplits(const data_size_t num_data, + const score_t* cuda_gradients, const score_t* cuda_hessians, + const int* cuda_num_data): num_data_(num_data) { + cuda_sum_of_gradients_ = nullptr; + cuda_sum_of_hessians_ = nullptr; + cuda_num_data_in_leaf_ = nullptr; + cuda_gain_ = nullptr; + cuda_leaf_value_ = nullptr; + + cuda_gradients_ = cuda_gradients; + cuda_hessians_ = cuda_hessians; + cuda_data_indices_in_leaf_ = nullptr; + cuda_num_data_ = cuda_num_data; +} + +void CUDALeafSplits::Init() { + num_blocks_init_from_gradients_ = (num_data_ + INIT_SUM_BLOCK_SIZE_LEAF_SPLITS - 1) / INIT_SUM_BLOCK_SIZE_LEAF_SPLITS; + + // allocate more memory for sum reduction in CUDA + // only the first element records the final sum + AllocateCUDAMemory(num_blocks_init_from_gradients_, &cuda_sum_of_gradients_); + AllocateCUDAMemory(num_blocks_init_from_gradients_, &cuda_sum_of_hessians_); + + AllocateCUDAMemory(1, &cuda_num_data_in_leaf_); + AllocateCUDAMemory(1, &cuda_gain_); + AllocateCUDAMemory(1, &cuda_leaf_value_); +} + +void CUDALeafSplits::InitValues(const double* cuda_sum_of_gradients, const double* cuda_sum_of_hessians, + const data_size_t* cuda_num_data_in_leaf, const data_size_t* cuda_data_indices_in_leaf, + const double* cuda_gain, const double* cuda_leaf_value) { + CopyFromCUDADeviceToCUDADevice(cuda_sum_of_gradients_, cuda_sum_of_gradients, 1); + CopyFromCUDADeviceToCUDADevice(cuda_sum_of_hessians_, cuda_sum_of_hessians, 1); + CopyFromCUDADeviceToCUDADevice(cuda_num_data_in_leaf_, cuda_num_data_in_leaf, 1); + cuda_data_indices_in_leaf_ = cuda_data_indices_in_leaf; + CopyFromCUDADeviceToCUDADevice(cuda_gain_, cuda_gain, 1); + CopyFromCUDADeviceToCUDADevice(cuda_leaf_value_, cuda_leaf_value, 1); +} + +void CUDALeafSplits::InitValues() { + +} + +} // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu 
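// The cuda_leaf_splits.cu file added below (and the reworked
// cuda_leaf_splits_init.cu later in this patch) use the classic two-stage sum
// reduction: kernel 1 lets each thread accumulate a small strided chunk of the
// gradients/hessians, tree-reduces the per-thread partials in shared memory to
// one partial sum per block, and kernel 2 folds the per-block partials into
// element 0. A condensed, standalone sketch of that pattern; BlockSumKernel,
// CombineKernel, and block_sums are hypothetical names, not part of this patch.
__global__ void BlockSumKernel(const float* values, int num_values, double* block_sums) {
  __shared__ double shared[256];          // launched as <<<num_blocks, 256>>>
  const int tid = threadIdx.x;
  const int base = (blockIdx.x * blockDim.x + tid) * 4;  // 4 values per thread
  double local = 0.0;
  for (int j = 0; j < 4; ++j) {
    const int idx = base + j;
    if (idx < num_values) local += values[idx];
  }
  shared[tid] = local;
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {  // tree reduction within the block
    if (tid < s) shared[tid] += shared[tid + s];
    __syncthreads();
  }
  if (tid == 0) block_sums[blockIdx.x] = shared[0];
}

__global__ void CombineKernel(double* block_sums, int num_blocks) {
  // a single thread folds the per-block partial sums into block_sums[0]
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    for (int i = 1; i < num_blocks; ++i) block_sums[0] += block_sums[i];
  }
}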
new file mode 100644 index 000000000000..65318d41de6c --- /dev/null +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -0,0 +1,59 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#include "cuda_leaf_splits.hpp" + +namespace LightGBM { + +__global__ void CUDAInitValuesKernel1(const score_t* cuda_gradients, const score_t* cuda_hessians, + const data_size_t* cuda_num_data, double* cuda_sum_of_gradients, double* cuda_sum_of_hessians) { + __shared__ score_t shared_gradients[NUM_THRADS_PER_BLOCK_LEAF_SPLITS]; + __shared__ score_t shared_hessians[NUM_THRADS_PER_BLOCK_LEAF_SPLITS]; + const unsigned int tid = threadIdx.x; + const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD; + const unsigned int num_data_ref = static_cast(*cuda_num_data); + shared_gradients[tid] = 0.0f; + shared_hessians[tid] = 0.0f; + __syncthreads(); + for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD; ++j) { + if (i + j < num_data_ref) { + shared_gradients[tid] += cuda_gradients[i + j]; + shared_hessians[tid] += cuda_hessians[i + j]; + } + } + __syncthreads(); + for (unsigned int s = 1; s < blockDim.x; s *= 2) { + if (tid % (2 * s) == 0 && (tid + s) < NUM_THRADS_PER_BLOCK_SPLITS_INIT) { + shared_gradients[tid] += shared_gradients[tid + s]; + shared_hessians[tid] += shared_hessians[tid + s]; + } + __syncthreads(); + } + if (tid == 0) { + cuda_sum_of_gradients[blockIdx.x] += shared_gradients[0]; + cuda_sum_of_hessians[blockIdx.x] += shared_hessians[0]; + } +} + +__global__ void CUDAInitValuesKernel2(double* cuda_sum_of_gradients, double* cuda_sum_of_hessians) { + for (unsigned int i = 1; i < gridDim.x; ++i) { + cuda_sum_of_gradients[0] += cuda_sum_of_gradients[i]; + cuda_sum_of_hessians[0] += cuda_sum_of_hessians[i]; + } +} + +void CUDALeafSplits::LaunchInitValuesKernal() { + CUDAInitValuesKernel1<<>>( + cuda_gradients_, cuda_hessians_, cuda_num_data_, smaller_leaf_sum_gradients_, + smaller_leaf_sum_hessians_); + CopyFromCUDADeviceToCUDADevice(cuda_num_data_in_leaf_, cuda_num_data_, 1); + SynchronizeCUDADevice(); + CUDAInitValuesKernel2<<>>( + smaller_leaf_sum_gradients_, smaller_leaf_sum_hessians_); + SynchronizeCUDADevice(); +} + +} // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp new file mode 100644 index 000000000000..ae065be81d8c --- /dev/null +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -0,0 +1,57 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ +#ifndef LIGHTGBM_CUDA_LEAF_SPLITS_HPP_ +#define LIGHTGBM_CUDA_LEAF_SPLITS_HPP_ + +#ifdef USE_CUDA + +#include +#include +#include "new_cuda_utils.hpp" + +#define INIT_SUM_BLOCK_SIZE_LEAF_SPLITS (6144) +#define NUM_THRADS_PER_BLOCK_LEAF_SPLITS (1024) +#define NUM_DATA_THREAD_ADD_LEAF_SPLITS (6) + +namespace LightGBM { + +class CUDALeafSplits { + public: + CUDALeafSplits(); + + void Init(); + + void InitValues(const double* cuda_sum_of_gradients, const double* cuda_sum_of_hessians, + const data_size_t* cuda_num_data_in_leaf, const data_size_t* cuda_data_indices_in_leaf, + const double* cuda_gain, const double* cuda_leaf_value); + + void InitValues(); + + private: + void LaunchInitValuesKernal(); + + // Host memory + const int num_data_; + int num_blocks_init_from_gradients_; + + // CUDA memory, held by this object + double* cuda_sum_of_gradients_; + double* cuda_sum_of_hessians_; + data_size_t* cuda_num_data_in_leaf_; + double* cuda_gain_; + double* cuda_leaf_value_; + + // CUDA memory, held by other object + const data_size_t* cuda_data_indices_in_leaf_; + const score_t* cuda_gradients_; + const score_t* cuda_hessians_; + const int* cuda_num_data_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_CUDA_LEAF_SPLITS_HPP_ diff --git a/src/treelearner/cuda/cuda_leaf_splits_init.cpp b/src/treelearner/cuda/cuda_leaf_splits_init.cpp index a64309303735..f20e2ff41881 100644 --- a/src/treelearner/cuda/cuda_leaf_splits_init.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits_init.cpp @@ -34,12 +34,20 @@ void CUDALeafSplitsInit::Init() { cuda_num_data_ = reinterpret_cast(cuda_num_data_ptr); const void* num_data_ptr = reinterpret_cast(&num_data_); CUDASUCCESS_OR_FATAL(cudaMemcpy(cuda_num_data_ptr, num_data_ptr, sizeof(int), cudaMemcpyHostToDevice)); + + AllocateCUDAMemory(1, &smaller_leaf_index_); + AllocateCUDAMemory(1, &larger_leaf_index_); + AllocateCUDAMemory(1, &cuda_num_data_); + CopyFromHostToCUDADevice(cuda_num_data_, &num_data_, 1); } void CUDALeafSplitsInit::Compute() { - LaunchLeafSplitsInit(num_blocks_, INIT_SUM_BLOCK_SIZE, - cuda_gradients_, cuda_hessians_, cuda_num_data_, - smaller_leaf_sum_gradients_, smaller_leaf_sum_hessians_); + auto start = std::chrono::steady_clock::now(); + LaunchLeafSplitsInit(); + SynchronizeCUDADevice(); + auto end = std::chrono::steady_clock::now(); + double duration = (static_cast>(end - start)).count(); + Log::Warning("LaunchLeafSplitsInit time %f", duration); Log::Warning(cudaGetErrorName(cudaGetLastError())); CUDASUCCESS_OR_FATAL(cudaDeviceSynchronize()); diff --git a/src/treelearner/cuda/cuda_leaf_splits_init.cu b/src/treelearner/cuda/cuda_leaf_splits_init.cu index dc056ab3ae7c..e1bcf931ffb1 100644 --- a/src/treelearner/cuda/cuda_leaf_splits_init.cu +++ b/src/treelearner/cuda/cuda_leaf_splits_init.cu @@ -12,32 +12,35 @@ namespace LightGBM { __global__ void CUDALeafSplitsInitKernel1(const float* cuda_gradients, const float* cuda_hessians, const data_size_t* num_data, double* grad_sum_out, double* hess_sum_out) { - __shared__ float shared_gradients[INIT_SUM_BLOCK_SIZE]; - __shared__ float shared_hessians[INIT_SUM_BLOCK_SIZE]; + __shared__ float shared_gradients[NUM_THRADS_PER_BLOCK_SPLITS_INIT]; + __shared__ float shared_hessians[NUM_THRADS_PER_BLOCK_SPLITS_INIT]; const unsigned int tid = threadIdx.x; - const unsigned int i = blockIdx.x * blockDim.x + tid; - if (i < static_cast(*num_data)) { - shared_gradients[tid] = cuda_gradients[i]; - shared_hessians[tid] = cuda_hessians[i]; - } else { - shared_gradients[tid] = 0.0f; - 
shared_hessians[tid] = 0.0f; + const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD; + const unsigned int num_data_ref = static_cast(*num_data); + shared_gradients[tid] = 0.0f; + shared_hessians[tid] = 0.0f; + __syncthreads(); + for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD; ++j) { + if (i + j < num_data_ref) { + shared_gradients[tid] += cuda_gradients[i + j]; + shared_hessians[tid] += cuda_hessians[i + j]; + } } + __syncthreads(); for (unsigned int s = 1; s < blockDim.x; s *= 2) { - if (tid % (2 * s) == 0 && (tid + s) < INIT_SUM_BLOCK_SIZE) { + if (tid % (2 * s) == 0 && (tid + s) < NUM_THRADS_PER_BLOCK_SPLITS_INIT) { shared_gradients[tid] += shared_gradients[tid + s]; shared_hessians[tid] += shared_hessians[tid + s]; } __syncthreads(); } if (tid == 0) { - grad_sum_out[blockIdx.x] = shared_gradients[0]; - hess_sum_out[blockIdx.x] = shared_hessians[0]; + grad_sum_out[blockIdx.x] += shared_gradients[0]; + hess_sum_out[blockIdx.x] += shared_hessians[0]; } } -__global__ void CUDALeafSplitsInitKernel2(const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t* num_data, double* grad_sum_out, double* hess_sum_out) { +__global__ void CUDALeafSplitsInitKernel2(double* grad_sum_out, double* hess_sum_out) { if (threadIdx.x == 0 && blockIdx.x == 0) { for (unsigned int i = 1; i < gridDim.x; ++i) { grad_sum_out[0] += grad_sum_out[i]; @@ -46,15 +49,13 @@ __global__ void CUDALeafSplitsInitKernel2(const score_t* cuda_gradients, const s } } -void CUDALeafSplitsInit::LaunchLeafSplitsInit(const int num_blocks, const int init_sum_block_size, - const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t* num_data, - double* smaller_leaf_sum_gradients, double* smaller_leaf_sum_hessians) { - CUDALeafSplitsInitKernel1<<>>( - cuda_gradients, cuda_hessians, num_data, smaller_leaf_sum_gradients, - smaller_leaf_sum_hessians); - CUDALeafSplitsInitKernel2<<>>( - cuda_gradients, cuda_hessians, num_data, smaller_leaf_sum_gradients, - smaller_leaf_sum_hessians); +void CUDALeafSplitsInit::LaunchLeafSplitsInit() { + CUDALeafSplitsInitKernel1<<>>( + cuda_gradients_, cuda_hessians_, cuda_num_data_, smaller_leaf_sum_gradients_, + smaller_leaf_sum_hessians_); + SynchronizeCUDADevice(); + CUDALeafSplitsInitKernel2<<>>( + smaller_leaf_sum_gradients_, smaller_leaf_sum_hessians_); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_leaf_splits_init.hpp b/src/treelearner/cuda/cuda_leaf_splits_init.hpp index 16ae57033a1f..26b1fc64c8b5 100644 --- a/src/treelearner/cuda/cuda_leaf_splits_init.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits_init.hpp @@ -9,10 +9,12 @@ #ifdef USE_CUDA #include -#include #include +#include "new_cuda_utils.hpp" -#define INIT_SUM_BLOCK_SIZE (1024) +#define INIT_SUM_BLOCK_SIZE (6144) +#define NUM_THRADS_PER_BLOCK_SPLITS_INIT (1024) +#define NUM_DATA_THREAD_ADD (6) namespace LightGBM { @@ -36,9 +38,9 @@ class CUDALeafSplitsInit { const int* larger_leaf_index() { return larger_leaf_index_; } - void LaunchLeafSplitsInit(const int num_blocks, const int init_sum_block_size, - const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t* num_data, - double* smaller_leaf_sum_gradients, double* smaller_leaf_sum_hessians); + const double + + void LaunchLeafSplitsInit(); protected: const score_t* cuda_gradients_; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index aaee5764ee2a..838d338e9deb 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ 
b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -28,10 +28,10 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia num_gpus_ = 1;//config_->num_gpu > num_total_gpus ? num_total_gpus : config_->num_gpu; num_threads_ = OMP_NUM_THREADS(); Log::Warning("NewCUDATreeLearner::Init step 3"); - + gradients_and_hessians_.resize(2 * num_data_); AllocateFeatureTasks(); Log::Warning("NewCUDATreeLearner::Init step 4"); - AllocateCUDAMemory(is_constant_hessian); + AllocateMemory(is_constant_hessian); Log::Warning("NewCUDATreeLearner::Init step 5"); CreateCUDAHistogramConstructors(); @@ -65,9 +65,10 @@ void NewCUDATreeLearner::AllocateFeatureTasks() { } } -void NewCUDATreeLearner::AllocateCUDAMemory(const bool is_constant_hessian) { +void NewCUDATreeLearner::AllocateMemory(const bool is_constant_hessian) { device_data_indices_.resize(num_gpus_, nullptr); device_gradients_.resize(num_gpus_, nullptr); + device_gradients_and_hessians_.resize(num_gpus_, nullptr); if (!is_constant_hessian) { device_hessians_.resize(num_gpus_, nullptr); } @@ -90,6 +91,7 @@ void NewCUDATreeLearner::AllocateCUDAMemory(const bool is_constant_hessian) { void* gradients_ptr = reinterpret_cast(device_gradients_[device_id]); CUDASUCCESS_OR_FATAL(cudaMalloc(&gradients_ptr, num_data_ * sizeof(float))); device_gradients_[device_id] = reinterpret_cast(gradients_ptr); + AllocateCUDAMemory(2 * num_data_ * sizeof(score_t), &device_gradients_and_hessians_[device_id]); if (!is_constant_hessian) { if (device_hessians_[device_id] != nullptr) { CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id])); @@ -154,6 +156,7 @@ void NewCUDATreeLearner::PushDataIntoDeviceHistogramConstructors() { iter->Reset(0); for (data_size_t data_index = 0; data_index < num_data_; ++data_index) { const uint32_t bin = static_cast(iter->RawGet(data_index)); + CHECK_LE(bin, 255); cuda_histogram_constructor->PushOneData(bin, group_id, data_index); } } @@ -207,6 +210,12 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); CUDASUCCESS_OR_FATAL(cudaMemcpy(device_gradients_[0], gradients, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice)); CUDASUCCESS_OR_FATAL(cudaMemcpy(device_hessians_[0], hessians, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice)); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (data_size_t i = 0; i < num_data_; ++i) { + gradients_and_hessians_[2 * i] = gradients[i]; + gradients_and_hessians_[2 * i + 1] = hessians[i]; + } + CopyFromHostToCUDADevice(device_gradients_and_hessians_[0], gradients_and_hessians_.data(), 2 * static_cast(num_data_)); Log::Warning("before initialization of leaf splits"); device_leaf_splits_initializers_[0]->Compute(); Log::Warning("after initialization of leaf splits"); @@ -215,7 +224,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, device_histogram_constructors_[0]->ConstructHistogramForLeaf(device_leaf_splits_initializers_[0]->smaller_leaf_index(), device_leaf_splits_initializers_[0]->larger_leaf_index(), device_splitters_[0]->leaf_num_data(), device_splitters_[0]->leaf_num_data_offsets(), - device_splitters_[0]->data_indices(), device_gradients_[0], device_hessians_[0]); + device_splitters_[0]->data_indices(), device_gradients_[0], device_hessians_[0], device_gradients_and_hessians_[0]); Log::Warning("after construction of root histograms"); } diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index db093fb6b028..2bb2003f1dc7 100644 --- 
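// The Train() hunk above packs gradients and hessians into a single
// interleaved host buffer ([g0, h0, g1, h1, ...]) so that one host-to-device
// copy moves both arrays, matching the 2 * num_data device allocation added
// in AllocateMemory(). A hedged, self-contained illustration of that layout;
// PackAndUpload and its parameters are illustrative names, not part of the
// patch:

#include <cuda_runtime.h>
#include <vector>

void PackAndUpload(const float* gradients, const float* hessians, int num_data,
                   float* device_grad_hess /* holds 2 * num_data floats */) {
  std::vector<float> packed(2 * static_cast<size_t>(num_data));
  #pragma omp parallel for schedule(static)
  for (int i = 0; i < num_data; ++i) {
    packed[2 * i] = gradients[i];      // even slots hold gradients
    packed[2 * i + 1] = hessians[i];   // odd slots hold hessians
  }
  cudaMemcpy(device_grad_hess, packed.data(), packed.size() * sizeof(float),
             cudaMemcpyHostToDevice);
}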
a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -12,6 +12,7 @@ #include "cuda_leaf_splits_init.hpp" #include "cuda_histogram_constructor.hpp" #include "cuda_data_splitter.hpp" +#include "cuda_best_split_finder.hpp" namespace LightGBM { @@ -33,7 +34,7 @@ class NewCUDATreeLearner: public SerialTreeLearner { protected: void AllocateFeatureTasks(); - void AllocateCUDAMemory(const bool is_constant_hessian); + void AllocateMemory(const bool is_constant_hessian); void CreateCUDAHistogramConstructors(); @@ -53,6 +54,8 @@ class NewCUDATreeLearner: public SerialTreeLearner { int num_gpus_; // number of threads on CPU int num_threads_; + // gradient and hessian values packed togather + std::vector gradients_and_hessians_; // feature groups allocated to each device std::vector> device_feature_groups_; @@ -67,6 +70,8 @@ class NewCUDATreeLearner: public SerialTreeLearner { std::vector device_gradients_; // hessian values on CUDA devices std::vector device_hessians_; + // gradient and hessian values in CUDA devices + std::vector device_gradients_and_hessians_; // histogram storage on CUDA devices std::vector device_histograms_; diff --git a/src/treelearner/cuda/new_cuda_utils.hpp b/src/treelearner/cuda/new_cuda_utils.hpp index ccb4a4aa02c7..a7cb6a981d1a 100644 --- a/src/treelearner/cuda/new_cuda_utils.hpp +++ b/src/treelearner/cuda/new_cuda_utils.hpp @@ -10,6 +10,7 @@ #ifdef USE_CUDA #include +#include #include namespace LightGBM { @@ -37,11 +38,19 @@ void CopyFromCUDADeviceToHost(T* dst_ptr, const T* src_ptr, size_t size) { CUDASUCCESS_OR_FATAL(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost)); } +template +void CopyFromCUDADeviceToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice)); +} + void SynchronizeCUDADevice(); template void SetCUDAMemory(T* dst_ptr, int value, size_t size) { - CUDASUCCESS_OR_FATAL(cudaMemset(reinterpret_cast(dst_ptr), value, size)); + CUDASUCCESS_OR_FATAL(cudaMemset(reinterpret_cast(dst_ptr), value, size * sizeof(T))); } void PrintLastCUDAError(); diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 8b1725a64992..e063cc9d46b3 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -359,6 +359,9 @@ void SerialTreeLearner::ConstructHistograms( smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), ptr_smaller_leaf_hist_data); + for (int i = 0; i < 100; ++i) { + Log::Warning("bin %d grad %f hess %f", i, ptr_smaller_leaf_hist_data[2 * i], ptr_smaller_leaf_hist_data[2 * i + 1]); + } if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf hist_t* ptr_larger_leaf_hist_data = From 634a4f1265f3368e3884e805cfe8b22eb706e31c Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 29 Apr 2021 13:44:53 +0000 Subject: [PATCH 004/166] new cuda framework --- .../cuda/cuda_best_split_finder.cpp | 14 +- .../cuda/cuda_best_split_finder.cu | 33 +++-- .../cuda/cuda_best_split_finder.hpp | 18 +-- .../cuda/cuda_centralized_info.cpp | 32 +++++ .../cuda/cuda_centralized_info.hpp | 68 +++++++++ src/treelearner/cuda/cuda_data_partition.cpp | 48 +++++++ ...ata_splitter.cu 
=> cuda_data_partition.cu} | 14 +- src/treelearner/cuda/cuda_data_partition.hpp | 68 +++++++++ src/treelearner/cuda/cuda_data_splitter.cpp | 60 -------- src/treelearner/cuda/cuda_data_splitter.hpp | 58 -------- .../cuda/cuda_histogram_constructor.cpp | 121 ++++++---------- .../cuda/cuda_histogram_constructor.cu | 135 +++--------------- .../cuda/cuda_histogram_constructor.hpp | 67 +++++---- src/treelearner/cuda/cuda_leaf_splits.cpp | 18 ++- src/treelearner/cuda/cuda_leaf_splits.cu | 28 ++-- src/treelearner/cuda/cuda_leaf_splits.hpp | 21 +++ .../cuda/cuda_leaf_splits_init.cpp | 66 --------- src/treelearner/cuda/cuda_leaf_splits_init.cu | 63 -------- .../cuda/cuda_leaf_splits_init.hpp | 65 --------- .../cuda/new_cuda_tree_learner.cpp | 111 +++++++------- .../cuda/new_cuda_tree_learner.hpp | 47 ++++-- src/treelearner/cuda/new_cuda_utils.cpp | 4 + src/treelearner/cuda/new_cuda_utils.hpp | 12 ++ 23 files changed, 518 insertions(+), 653 deletions(-) create mode 100644 src/treelearner/cuda/cuda_centralized_info.cpp create mode 100644 src/treelearner/cuda/cuda_centralized_info.hpp create mode 100644 src/treelearner/cuda/cuda_data_partition.cpp rename src/treelearner/cuda/{cuda_data_splitter.cu => cuda_data_partition.cu} (62%) create mode 100644 src/treelearner/cuda/cuda_data_partition.hpp delete mode 100644 src/treelearner/cuda/cuda_data_splitter.cpp delete mode 100644 src/treelearner/cuda/cuda_data_splitter.hpp delete mode 100644 src/treelearner/cuda/cuda_leaf_splits_init.cpp delete mode 100644 src/treelearner/cuda/cuda_leaf_splits_init.cu delete mode 100644 src/treelearner/cuda/cuda_leaf_splits_init.hpp diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index c38892535ff5..091ee6c8ace9 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -4,16 +4,18 @@ * license information. */ +#ifdef USE_CUDA + #include "cuda_best_split_finder.hpp" -#include "cuda_leaf_splits_init.hpp" +#include "cuda_leaf_splits.hpp" namespace LightGBM { - +/* CUDABestSplitFinder::CUDABestSplitFinder(const hist_t* cuda_hist, const Dataset* train_data, - const std::vector& feature_hist_offsets, const int max_num_leaves, + const std::vector& feature_hist_offsets, const int num_leaves, const double lambda_l1, const data_size_t min_data_in_leaf, const double min_sum_hessian_in_leaf, const double min_gain_to_split): -cuda_hist_(cuda_hist), num_features_(train_data->num_features()), max_num_leaves_(max_num_leaves), +cuda_hist_(cuda_hist), num_features_(train_data->num_features()), num_leaves_(num_leaves), feature_hist_offsets_(feature_hist_offsets), num_total_bin_(feature_hist_offsets.back()), lambda_l1(lambda_l1_), min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf), min_gain_to_split_(min_gain_to_split) { feature_missing_type_.resize(num_features_); @@ -75,5 +77,7 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplitsInit* smalle } void CUDABestSplitFinder::FindBestFromAllSplits() {} - +*/ } // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index fdaac8caa8d0..257af429b2cc 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -4,10 +4,12 @@ * license information. 
*/ +#ifdef USE_CUDA + #include "cuda_best_split_finder.hpp" namespace LightGBM { - +/* __device__ double ThresholdL1(double s, double l1) { const double reg_s = fmax(0.0, fabs(s) - l1); if (s >= 0.0f) { @@ -67,8 +69,8 @@ __device__ double GetSplitGains(double sum_left_gradients, __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, const uint32_t feature_num_bin, const uint8_t feature_mfb_offset, const uint32_t feature_default_bin, const uint8_t feature_missing_type, - const double lambda_l1, const double parent_gain, const data_size_t min_data_in_leaf, - const double min_sum_hessian_in_leaf, + const double lambda_l1, const double lambda_l2, const double parent_gain, const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, const double min_gain_to_split, const double sum_gradients, const double sum_hessians, const data_size_t num_data, const bool reverse, const bool skip_default_bin, const bool na_as_missing, // output parameters @@ -86,15 +88,16 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, double best_gain = kMinScore; data_size_t best_left_count = 0; uint32_t best_threshold = feature_num_bin; - const double cnt_factor = num_data / sum_hessian; + const double cnt_factor = num_data / sum_hessians; const bool use_l1 = lambda_l1 > 0.0f; + const double min_gain_shift = parent_gain + min_gain_to_split; if (reverse) { double sum_right_gradient = 0.0f; double sum_right_hessian = kEpsilon; data_size_t right_count = 0; - int t = feature_num_bin - 1 - feature_mfb_offset - NA_AS_MISSING; + int t = feature_num_bin - 1 - feature_mfb_offset - na_as_missing; const int t_end = 1 - feature_mfb_offset; // from right to left, and we don't need data in bin0 @@ -129,7 +132,7 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, break; } - double sum_left_gradient = sum_gradient - sum_right_gradient; + double sum_left_gradient = sum_gradients - sum_right_gradient; // current split gain double current_gain = GetSplitGains( @@ -233,7 +236,7 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, __global__ void FindBestSplitsForLeafKernel(const hist_t* leaf_hist_ptr, const uint32_t* feature_hist_offsets, const uint8_t* feature_mfb_offsets, const uint32_t* feature_default_bins, - const uint8_t* feature_missing_types, const double* lambda_l1, const int* smaller_leaf_id, + const uint8_t* feature_missing_types, const double* lambda_l1, const double* lambda_l2, const int* smaller_leaf_id, const int* larger_leaf_id, const double* smaller_leaf_gain, const double* larger_leaf_gain, const double* sum_gradients_in_smaller_leaf, const double* sum_hessians_in_smaller_leaf, const data_size_t* num_data_in_smaller_leaf, const double* sum_gradients_in_larger_leaf, const double* sum_hessians_in_larger_leaf, @@ -269,16 +272,16 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* leaf_hist_ptr, if (missing_type == 1) { FindBestSplitsForLeafKernelInner(leaf_hist_ptr + leaf_index, num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], - feature_missing_types[inner_feature_index], *lambda_l1, *parent_gain, - *min_data_in_leaf, *min_sum_hessian_in_leaf, sum_gradients, sum_hessians, + feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, *parent_gain, + *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, num_data_in_leaf, reverse, true, false, out_gain, out_default_left, out_left_sum_gradients, 
out_left_sum_hessians, out_left_num_data, out_right_sum_gradients, out_right_sum_hessians, out_right_num_data); } else { FindBestSplitsForLeafKernelInner(leaf_hist_ptr + leaf_index, num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], - feature_missing_types[inner_feature_index], *lambda_l1, *parent_gain, - *min_data_in_leaf, *min_sum_hessian_in_leaf, sum_gradients, sum_hessians, + feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, *parent_gain, + *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, num_data_in_leaf, reverse, false, true, out_gain, out_default_left, out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_right_sum_gradients, out_right_sum_hessians, out_right_num_data); @@ -287,8 +290,8 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* leaf_hist_ptr, if (reverse) { FindBestSplitsForLeafKernelInner(leaf_hist_ptr + leaf_index, num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], - feature_missing_types[inner_feature_index], *lambda_l1, *parent_gain, - *min_data_in_leaf, *min_sum_hessian_in_leaf, sum_gradients, sum_hessians, + feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, *parent_gain, + *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, num_data_in_leaf, reverse, true, false, out_gain, out_default_left, out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_right_sum_gradients, out_right_sum_hessians, out_right_num_data); @@ -322,5 +325,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel(const int* smaller_l cuda_best_split_left_count_, cuda_best_split_right_sum_gradient_, cuda_best_split_right_sum_hessian_, cuda_best_split_right_count_); } - +*/ } // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 50cd7e7c7fb1..95e58f381dd3 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -3,9 +3,12 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. 
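// ThresholdL1 and GetSplitGains above follow LightGBM's usual regularized
// gain: a leaf with gradient sum G and hessian sum H scores
// ThresholdL1(G, l1)^2 / (H + l2), and a split is judged by the sum of its
// two children's scores relative to the parent. A host-side sketch of that
// arithmetic, under the assumption that the device code uses the standard
// formulation; the helper names here are illustrative:

#include <algorithm>
#include <cmath>

inline double ThresholdL1Host(double s, double l1) {
  const double reg_s = std::max(0.0, std::fabs(s) - l1);
  return s >= 0.0 ? reg_s : -reg_s;
}

inline double LeafGain(double sum_grad, double sum_hess, double l1, double l2) {
  const double g = ThresholdL1Host(sum_grad, l1);
  return g * g / (sum_hess + l2);
}

inline double SplitGainImprovement(double g_left, double h_left,
                                   double g_right, double h_right,
                                   double parent_gain, double l1, double l2) {
  // Compared against parent_gain + min_gain_to_split when selecting a split.
  return LeafGain(g_left, h_left, l1, l2) +
         LeafGain(g_right, h_right, l1, l2) - parent_gain;
}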
*/ + #ifndef LIGHTGBM_CUDA_BEST_SPLIT_FINDER_HPP_ #define LIGHTGBM_CUDA_BEST_SPLIT_FINDER_HPP_ +#ifdef USE_CUDA + #include "new_cuda_utils.hpp" #include @@ -13,14 +16,12 @@ #include -#ifdef USE_CUDA - namespace LightGBM { class CUDABestSplitFinder { - public: +/* public: CUDABestSplitFinder(const hist_t* cuda_hist, const Dataset* train_data, - const std::vector& feature_hist_offsets, const int max_num_leaves); + const std::vector& feature_hist_offsets, const int num_leaves); void Init(); @@ -37,10 +38,6 @@ class CUDABestSplitFinder { private: void LaunchFindBestSplitsForLeafKernel(const int* smaller_leaf_id, const int* larger_leaf_id, const double* parent_gain); - int* cuda_leaf_best_split_feature_index_; - int* cuda_leaf_best_split_threshold_; - double* cuda_leaf_best_split_gain_; - int* cuda_best_leaf_; int* cuda_best_split_feature_index_; int* cuda_best_split_threshold_; @@ -63,7 +60,7 @@ class CUDABestSplitFinder { hist_t* prefix_sum_hist_left_; hist_t* prefix_sum_hist_right_; const int num_features_; - const int max_num_leaves_; + const int num_leaves_; const int num_total_bin_; int* cuda_num_total_bin_; @@ -86,10 +83,9 @@ class CUDABestSplitFinder { double* cuda_lambda_l1_; data_size_t* cuda_min_data_in_leaf_; double* cuda_min_sum_hessian_in_leaf_; - double* cuda_min_gain_to_split_; + double* cuda_min_gain_to_split_;*/ }; - } #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_centralized_info.cpp b/src/treelearner/cuda/cuda_centralized_info.cpp new file mode 100644 index 000000000000..45998445c4ac --- /dev/null +++ b/src/treelearner/cuda/cuda_centralized_info.cpp @@ -0,0 +1,32 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA + +#include "cuda_centralized_info.hpp" + +namespace LightGBM { + +CUDACentralizedInfo::CUDACentralizedInfo(const data_size_t num_data, const int num_leaves, const int num_features): +num_data_(num_data), num_leaves_(num_leaves), num_features_(num_features) {} + +void CUDACentralizedInfo::Init() { + InitCUDAMemoryFromHostMemory(&cuda_num_data_, &num_data_, 1); + InitCUDAMemoryFromHostMemory(&cuda_num_leaves_, &num_leaves_, 1); + InitCUDAMemoryFromHostMemory(&cuda_num_features_, &num_features_, 1); + + AllocateCUDAMemory(static_cast(num_data_), &cuda_gradients_); + AllocateCUDAMemory(static_cast(num_data_), &cuda_hessians_); +} + +void CUDACentralizedInfo::BeforeTrain(const score_t* gradients, const score_t* hessians) { + CopyFromHostToCUDADevice(cuda_gradients_, gradients, static_cast(num_data_)); + CopyFromHostToCUDADevice(cuda_hessians_, hessians, static_cast(num_data_)); +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_centralized_info.hpp b/src/treelearner/cuda/cuda_centralized_info.hpp new file mode 100644 index 000000000000..97844e2cb19b --- /dev/null +++ b/src/treelearner/cuda/cuda_centralized_info.hpp @@ -0,0 +1,68 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifndef LIGHTGBM_CUDA_CENTRALIZED_INFO_HPP_ +#define LIGHTGBM_CUDA_CENTRALIZED_INFO_HPP_ + +#ifdef USE_CUDA + +#include +#include +#include "new_cuda_utils.hpp" + +namespace LightGBM { + +// maintina centralized information for tree training +// these information are shared by various cuda objects in tree training +class CUDACentralizedInfo { + public: + CUDACentralizedInfo(const data_size_t num_data, const int num_leaves, const int num_features); + + void Init(); + + void BeforeTrain(const score_t* gradients, const score_t* hessians); + + const data_size_t* cuda_num_data() const { return cuda_num_data_; } + + const int* cuda_num_leaves() const { return cuda_num_leaves_; } + + const int* cuda_num_features() const { return cuda_num_features_; } + + const score_t* cuda_gradients() const { return cuda_gradients_; } + + const score_t* cuda_hessians() const { return cuda_hessians_; } + + void Test() { + data_size_t test_num_data = 0; + int test_num_leaves = 0; + int test_num_features = 0; + + CopyFromCUDADeviceToHost(&test_num_data, cuda_num_data_, 1); + CopyFromCUDADeviceToHost(&test_num_leaves, cuda_num_leaves_, 1); + CopyFromCUDADeviceToHost(&test_num_features, cuda_num_features_, 1); + Log::Warning("CUDACentralizedInfo::Test test_num_data = %d", test_num_data); + Log::Warning("CUDACentralizedInfo::Test test_num_leaves = %d", test_num_leaves); + Log::Warning("CUDACentralizedInfo::Test test_num_features = %d", test_num_features); + } + + private: + // Host memory + const data_size_t num_data_; + const int num_leaves_; + const int num_features_; + + // CUDA memory, held by this object + data_size_t* cuda_num_data_; + int* cuda_num_leaves_; + int* cuda_num_features_; + score_t* cuda_gradients_; + score_t* cuda_hessians_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_CUDA_CENTRALIZED_INFO_HPP_ diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp new file mode 100644 index 000000000000..77317d512068 --- /dev/null +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -0,0 +1,48 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
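// CUDACentralizedInfo above keeps a single device-resident copy of the values
// shared by every tree-training component (num_data, num_leaves, num_features,
// gradients, hessians) and hands out const pointers to them, so components no
// longer allocate their own duplicates. A hedged sketch of the
// "allocate once, copy the host value to the device" helper this pattern rests
// on; the wrapper name and signature are illustrative, error handling omitted:

#include <cuda_runtime.h>
#include <cstddef>

template <typename T>
void InitDeviceBufferFromHost(T** device_ptr, const T* host_ptr, size_t count) {
  cudaMalloc(reinterpret_cast<void**>(device_ptr), count * sizeof(T));
  cudaMemcpy(*device_ptr, host_ptr, count * sizeof(T), cudaMemcpyHostToDevice);
}

// Usage in the same spirit as the constructor above:
//   InitDeviceBufferFromHost(&cuda_num_data, &num_data, 1);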
+ */ + +#ifdef USE_CUDA + +#include "cuda_data_partition.hpp" + +namespace LightGBM { + +CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_leaves, + const data_size_t* cuda_num_data, const int* cuda_num_leaves): + num_data_(num_data), num_leaves_(num_leaves) { + cuda_num_data_ = cuda_num_data; + cuda_num_leaves_ = cuda_num_leaves; +} + +void CUDADataPartition::Init() { + // allocate CUDA memory + AllocateCUDAMemory(static_cast(num_data_), &cuda_data_indices_); + AllocateCUDAMemory(static_cast(num_leaves_) + 1, &cuda_leaf_num_data_offsets_); +} + +void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { + if (data_indices == nullptr) { + // no bagging + LaunchFillDataIndicesBeforeTrain(); + SetCUDAMemory(cuda_leaf_num_data_offsets_, 0, static_cast(num_leaves_) + 1); + SynchronizeCUDADevice(); + CopyFromCUDADeviceToCUDADevice(cuda_leaf_num_data_offsets_ + 1, cuda_num_data_, 1); + SynchronizeCUDADevice(); + } else { + Log::Fatal("bagging is not supported by GPU"); + } +} + +void CUDADataPartition::Split(const int* /*leaf_id*/, + const int* /*best_split_feature*/, + const int* /*best_split_threshold*/) {} + +Tree* CUDADataPartition::GetCPUTree() {} + + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_splitter.cu b/src/treelearner/cuda/cuda_data_partition.cu similarity index 62% rename from src/treelearner/cuda/cuda_data_splitter.cu rename to src/treelearner/cuda/cuda_data_partition.cu index 4bb599cb055a..5f30391f0f41 100644 --- a/src/treelearner/cuda/cuda_data_splitter.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -4,9 +4,9 @@ * license information. */ -#include "cuda_data_splitter.hpp" +#ifdef USE_CUDA -#define FILL_INDICES_BLOCK_SIZE (1024) +#include "cuda_data_partition.hpp" namespace LightGBM { @@ -19,9 +19,11 @@ __global__ void FillDataIndicesBeforeTrainKernel(const data_size_t* cuda_num_dat } } -void CUDADataSplitter::LaunchFillDataIndicesBeforeTrain() { - const int num_blocks = (num_data_ + FILL_INDICES_BLOCK_SIZE - 1) / FILL_INDICES_BLOCK_SIZE; - FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_); +void CUDADataPartition::LaunchFillDataIndicesBeforeTrain() { + const int num_blocks = (num_data_ + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; + FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_); } -} // namespace LightGBM \ No newline at end of file +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp new file mode 100644 index 000000000000..e13925ffb183 --- /dev/null +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -0,0 +1,68 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
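// CUDADataPartition::BeforeTrain() above starts every iteration with the
// identity permutation in cuda_data_indices_ (all rows belong to the root
// leaf) when bagging is off, filled by one thread per data index. A minimal
// sketch of that fill kernel, assuming a 1024-thread block as in
// FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; FillIdentity is an illustrative
// name, not part of the patch:

#include <cuda_runtime.h>

__global__ void FillIdentity(int num_data, int* data_indices) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num_data) {
    data_indices[i] = i;  // row i starts out assigned to the root leaf
  }
}

// Launch shape: FillIdentity<<<(num_data + 1023) / 1024, 1024>>>(num_data, indices);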
+ */ +#ifndef LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ +#define LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ + +#ifdef USE_CUDA + +#include +#include +#include "new_cuda_utils.hpp" + +#define FILL_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) + +namespace LightGBM { + +class CUDADataPartition { + public: + CUDADataPartition(const data_size_t num_data, const int num_leaves, + const data_size_t* cuda_num_data, const int* cuda_num_leaves); + + void Init(); + + void BeforeTrain(const data_size_t* data_indices); + + void Split(const int* leaf_id, const int* best_split_feature, const int* best_split_threshold); + + Tree* GetCPUTree(); + + const data_size_t* cuda_leaf_num_data_offsets() { return cuda_leaf_num_data_offsets_; } + + void Test() { + PrintLastCUDAError(); + std::vector test_data_indices(num_data_, -1); + CopyFromCUDADeviceToHost(test_data_indices.data(), cuda_data_indices_, static_cast(num_data_)); + for (data_size_t i = 0; i < num_data_; ++i) { + CHECK_EQ(i, test_data_indices[i]); + } + Log::Warning("CUDADataPartition::Test Pass"); + } + + const data_size_t* cuda_leaf_num_data_offsets() const { return cuda_leaf_num_data_offsets_; } + + private: + // kernel launch functions + void LaunchFillDataIndicesBeforeTrain(); + + void LaunchSplitKernel(const int* leaf_id, const int* best_split_feature, const int* best_split_threshold); + + // Host memory + const data_size_t num_data_; + const int num_leaves_; + + // CUDA memory, held by this object + data_size_t* cuda_data_indices_; + data_size_t* cuda_leaf_num_data_offsets_; + + // CUDA memory, held by other object + const data_size_t* cuda_num_data_; + const int* cuda_num_leaves_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ diff --git a/src/treelearner/cuda/cuda_data_splitter.cpp b/src/treelearner/cuda/cuda_data_splitter.cpp deleted file mode 100644 index bf00496daa9a..000000000000 --- a/src/treelearner/cuda/cuda_data_splitter.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
- */ - -#include "cuda_data_splitter.hpp" - -namespace LightGBM { - -CUDADataSplitter::CUDADataSplitter(const data_size_t num_data, const int max_num_leaves): - num_data_(num_data), max_num_leaves_(max_num_leaves) {} - -void CUDADataSplitter::Init() { - // allocate GPU memory - AllocateCUDAMemory(static_cast(num_data_), &cuda_data_indices_); - - AllocateCUDAMemory(static_cast(max_num_leaves_), &cuda_leaf_num_data_offsets_); - - AllocateCUDAMemory(1, &cuda_num_data_); - CopyFromHostToCUDADevice(cuda_num_data_, &num_data_, 1); - - AllocateCUDAMemory(1, &cuda_max_num_leaves_); - CopyFromHostToCUDADevice(cuda_max_num_leaves_, &max_num_leaves_, 1); - - AllocateCUDAMemory(static_cast(max_num_leaves_), &cuda_leaf_num_data_offsets_); - AllocateCUDAMemory(static_cast(max_num_leaves_), &cuda_leaf_num_data_); -} - -void CUDADataSplitter::BeforeTrain(const data_size_t* data_indices) { - if (data_indices == nullptr) { - // no bagging - LaunchFillDataIndicesBeforeTrain(); - SynchronizeCUDADevice(); - data_indices_.resize(num_data_); - CopyFromCUDADeviceToHost(data_indices_.data(), cuda_data_indices_, static_cast(num_data_)); - /*for (int i = 0; i < 100; ++i) { - Log::Warning("data_indices_[%d] = %d", i, data_indices_[i]); - Log::Warning("data_indices_[end - %d] = %d", i, data_indices_[num_data_ - 1 - i]); - }*/ - SetCUDAMemory(cuda_leaf_num_data_offsets_, 0, max_num_leaves_); - SetCUDAMemory(cuda_leaf_num_data_, 0, max_num_leaves_); - //Log::Warning("num_data_ = %d", num_data_); - CopyFromHostToCUDADevice(cuda_leaf_num_data_, &num_data_, 1); - data_size_t root_leaf_num_data = 0; - CopyFromCUDADeviceToHost(&root_leaf_num_data, cuda_leaf_num_data_, 1); - //Log::Warning("root_leaf_num_data = %d", root_leaf_num_data); - } else { - Log::Fatal("bagging is not supported by GPU"); - } -} - -void CUDADataSplitter::Split(const int* /*leaf_id*/, - const int* /*best_split_feature*/, - const int* /*best_split_threshold*/) {} - -Tree* CUDADataSplitter::GetCPUTree() {} - - -} // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_data_splitter.hpp b/src/treelearner/cuda/cuda_data_splitter.hpp deleted file mode 100644 index 5610e6ca3306..000000000000 --- a/src/treelearner/cuda/cuda_data_splitter.hpp +++ /dev/null @@ -1,58 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
- */ -#ifndef LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ -#define LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ - -#ifdef USE_CUDA - -#include -#include -#include "new_cuda_utils.hpp" - -namespace LightGBM { - -class CUDADataSplitter { - public: - CUDADataSplitter(const data_size_t num_data, const int max_num_leaves); - - void Init(); - - void BeforeTrain(const data_size_t* data_indices); - - void Split(const int* leaf_id, const int* best_split_feature, const int* best_split_threshold); - - Tree* GetCPUTree(); - - const data_size_t* data_indices() { return cuda_data_indices_; } - - const data_size_t* leaf_num_data_offsets() { return cuda_leaf_num_data_offsets_; } - - const data_size_t* leaf_num_data() { return cuda_leaf_num_data_; } - - private: - // kernel launch functions - void LaunchFillDataIndicesBeforeTrain(); - - void LaunchSplitKernel(const int* leaf_id, const int* best_split_feature, const int* best_split_threshold); - - // CPU - const data_size_t num_data_; - std::vector data_indices_; - const int max_num_leaves_; - - // GPU - data_size_t* cuda_data_indices_; - data_size_t* cuda_leaf_num_data_offsets_; - data_size_t* cuda_leaf_num_data_; - - data_size_t* cuda_num_data_; - int* cuda_max_num_leaves_; -}; - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ \ No newline at end of file diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 544689b2addb..b0b73cb0159e 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -12,63 +12,52 @@ namespace LightGBM { -CUDAHistogramConstructor::CUDAHistogramConstructor(const std::vector& feature_group_ids, - const Dataset* train_data, const int max_num_leaves, - hist_t* cuda_hist): num_data_(train_data->num_data()), - num_feature_groups_(feature_group_ids.size()), - max_num_leaves_(max_num_leaves) { +CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, + const int num_leaves, const int num_threads, + const score_t* cuda_gradients, const score_t* cuda_hessians): num_data_(train_data->num_data()), + num_features_(train_data->num_features()), num_leaves_(num_leaves), + num_feature_groups_(train_data->num_feature_groups()), num_threads_(num_threads), + cuda_gradients_(cuda_gradients), cuda_hessians_(cuda_hessians) { int offset = 0; - int col_group_offset = 0; - for (size_t i = 0; i < feature_group_ids.size(); ++i) { - const int group_id = feature_group_ids[i]; + for (int group_id = 0; group_id < train_data->num_feature_groups(); ++group_id) { feature_group_bin_offsets_.emplace_back(offset); - feature_group_bin_offsets_by_col_groups_.emplace_back(col_group_offset); offset += train_data->FeatureGroupNumBin(group_id); - col_group_offset += train_data->FeatureGroupNumBin(group_id); - if ((i + 1) % NUM_FEATURE_PER_THREAD_GROUP == 0) { - col_group_offset = 0; - } } feature_group_bin_offsets_.emplace_back(offset); - feature_group_bin_offsets_by_col_groups_.emplace_back(col_group_offset); num_total_bin_ = offset; - cuda_hist_ = cuda_hist; } -void CUDAHistogramConstructor::Init() { +void CUDAHistogramConstructor::Init(const Dataset* train_data) { // allocate CPU memory - cpu_data_.resize(num_data_ * num_feature_groups_, 0); + data_.resize(num_data_ * num_feature_groups_, 0); // allocate GPU memory - void* cuda_data_ptr = nullptr; - CUDASUCCESS_OR_FATAL(cudaMalloc(&cuda_data_ptr, num_data_ * num_feature_groups_ * sizeof(uint8_t))); - cuda_data_ = reinterpret_cast(cuda_data_ptr); 
- - void* cuda_hist_ptr = nullptr; - CUDASUCCESS_OR_FATAL(cudaMalloc(&cuda_hist_ptr, num_total_bin_ * max_num_leaves_ * sizeof(double))); - cuda_hist_ = reinterpret_cast(cuda_hist_ptr); - - void* cuda_num_total_bin_ptr = nullptr; - CUDASUCCESS_OR_FATAL(cudaMalloc(&cuda_num_total_bin_ptr, sizeof(int))); - cuda_num_total_bin_ = reinterpret_cast(cuda_num_total_bin_ptr); - CUDASUCCESS_OR_FATAL(cudaMemcpy(cuda_num_total_bin_ptr, reinterpret_cast(&num_total_bin_), sizeof(int), cudaMemcpyHostToDevice)); - - AllocateCUDAMemory(1, &cuda_num_feature_groups_); - CopyFromHostToCUDADevice(cuda_num_feature_groups_, &num_feature_groups_, 1); - - AllocateCUDAMemory(feature_group_bin_offsets_.size(), &cuda_feature_group_bin_offsets_); - CopyFromHostToCUDADevice(cuda_feature_group_bin_offsets_, - feature_group_bin_offsets_.data(), - feature_group_bin_offsets_.size()); - AllocateCUDAMemory(feature_group_bin_offsets_by_col_groups_.size(), &cuda_feature_group_bin_offsets_by_col_groups_); - CopyFromHostToCUDADevice(cuda_feature_group_bin_offsets_by_col_groups_, - feature_group_bin_offsets_by_col_groups_.data(), - feature_group_bin_offsets_by_col_groups_.size()); - /*for (size_t i = 0; i < feature_group_bin_offsets_.size(); ++i) { - Log::Warning("feature_group_bin_offsets_[%d] = %d", i, feature_group_bin_offsets_[i]); + AllocateCUDAMemory(num_feature_groups_ * num_data_, &cuda_data_); + + AllocateCUDAMemory(num_total_bin_ * 2 * num_leaves_, &cuda_hist_); + + InitCUDAMemoryFromHostMemory(&cuda_num_total_bin_, &num_total_bin_, 1); + + InitCUDAMemoryFromHostMemory(&cuda_num_feature_groups_, &num_feature_groups_, 1); + + InitCUDAMemoryFromHostMemory(&cuda_feature_group_bin_offsets_, + feature_group_bin_offsets_.data(), feature_group_bin_offsets_.size()); + + InitCUDAData(train_data); +} + +void CUDAHistogramConstructor::InitCUDAData(const Dataset* train_data) { + std::vector> bin_iterators(num_feature_groups_); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (int group_id = 0; group_id < num_feature_groups_; ++group_id) { + bin_iterators[group_id].reset(train_data->FeatureGroupIterator(group_id)); + bin_iterators[group_id]->Reset(0); + for (data_size_t data_index = 0; data_index < num_data_; ++data_index) { + const uint32_t bin = static_cast(bin_iterators[group_id]->RawGet(data_index)); + PushOneData(bin, group_id, data_index); + } } - for (size_t i = 0; i < feature_group_bin_offsets_by_col_groups_.size(); ++i) { - Log::Warning("feature_group_bin_offsets_by_col_groups_[%d] = %d", i, feature_group_bin_offsets_by_col_groups_[i]); - }*/ + CopyFromHostToCUDADevice(cuda_data_, data_.data(), data_.size()); + SynchronizeCUDADevice(); } void CUDAHistogramConstructor::PushOneData(const uint32_t feature_bin_value, @@ -77,53 +66,23 @@ void CUDAHistogramConstructor::PushOneData(const uint32_t feature_bin_value, const uint8_t feature_bin_value_uint8 = static_cast(feature_bin_value); const size_t index = static_cast(data_index) * static_cast(num_feature_groups_) + static_cast(feature_group_id); - cpu_data_[index] = feature_bin_value_uint8; + data_[index] = feature_bin_value_uint8; } -void CUDAHistogramConstructor::FinishLoad() { - // copy CPU data to GPU - void* cuda_data_ptr = reinterpret_cast(cuda_data_); - const void* cpu_data_ptr = reinterpret_cast(cpu_data_.data()); - CUDASUCCESS_OR_FATAL(cudaMemcpy(cuda_data_ptr, cpu_data_ptr, sizeof(uint8_t) * num_data_ * num_feature_groups_, cudaMemcpyHostToDevice)); -} - -void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* smaller_leaf_index, 
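// InitCUDAData()/PushOneData() above flatten the per-group bin values into a
// row-major uint8_t matrix: the row is the data index and the column is the
// feature group, so the histogram kernel can walk one row per data point. A
// short sketch of that indexing convention; RowMajorIndex is an illustrative
// helper, not part of the patch:

#include <cstddef>
#include <cstdint>

inline size_t RowMajorIndex(int data_index, int group_id, int num_feature_groups) {
  return static_cast<size_t>(data_index) * static_cast<size_t>(num_feature_groups) +
         static_cast<size_t>(group_id);
}

// data_[RowMajorIndex(i, g, num_groups)] holds the bin of feature group g for
// row i, truncated to uint8_t (hence the CHECK_LE(bin, 255) guard earlier).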
const int* /*larger_leaf_index*/, - const data_size_t* num_data_in_leaf, const data_size_t* leaf_data_offset, const data_size_t* data_indices_ptr, - const score_t* cuda_gradients, const score_t* cuda_hessians, const score_t* cuda_gradients_and_hessians) { - //auto start = std::chrono::steady_clock::now(); - - /*for (size_t i = 0; i < feature_group_bin_offsets_.size(); ++i) { - Log::Warning("feature_group_bin_offsets_[%d] = %d", i, feature_group_bin_offsets_[i]); - }*/ - AllocateCUDAMemory(num_data_, &cuda_int_gradients_); - AllocateCUDAMemory(num_data_, &cuda_int_hessians_); - AllocateCUDAMemory(num_data_, &cuda_int_gradients_and_hessians_); - SetCUDAMemory(cuda_int_gradients_, 3, num_data_); - SetCUDAMemory(cuda_int_hessians_, 3, num_data_); - SetCUDAMemory(cuda_int_gradients_and_hessians_, 3, num_data_); - /*for (size_t i = 0; i < feature_group_bin_offsets_by_col_groups_.size(); ++i) { - Log::Warning("feature_group_bin_offsets_by_col_groups_[%d] = %d", i, feature_group_bin_offsets_by_col_groups_[i]); - }*/ +void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const int* /*cuda_larger_leaf_index*/, + const data_size_t* cuda_data_indices_in_smaller_leaf, const data_size_t* /*cuda_data_indices_in_larger_leaf*/, + const data_size_t* cuda_leaf_num_data_offsets) { auto start = std::chrono::steady_clock::now(); - LaunchConstructHistogramKernel(smaller_leaf_index, num_data_in_leaf, leaf_data_offset, data_indices_ptr, - cuda_gradients, cuda_hessians, cuda_gradients_and_hessians); + LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_leaf_num_data_offsets, cuda_data_indices_in_smaller_leaf); SynchronizeCUDADevice(); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); Log::Warning("LaunchConstructHistogramKernel time %f", duration); PrintLastCUDAError(); - //Log::Warning("histogram construction finished"); - //Log::Warning("num_total_bin_ = %d", num_total_bin_); - //Log::Warning("max_num_leaves_ = %d", max_num_leaves_); std::vector cpu_hist(6143 * 2, 0.0f); CopyFromCUDADeviceToHost(cpu_hist.data(), cuda_hist_, 6143 * 2); - /*for (int i = 0; i < 6143; ++i) { - Log::Warning("bin %d grad %f hess %f", i, cpu_hist[2 * i], cpu_hist[2 * i + 1]); - }*/ } - - } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 6226470ff2d5..54b0aff10b08 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -13,40 +13,12 @@ namespace LightGBM { __global__ void CUDAConstructHistogramKernel(const int* leaf_index, const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t* data_indices_ptr, hist_t* feature_histogram, const int* num_feature_groups, - const int* leaf_num_data_offset, const uint8_t* data, const data_size_t* num_data_in_leaf, - const uint32_t* feature_group_offsets_by_col_group, - const uint32_t* feature_group_offsets, - const score_t* cuda_gradients_and_hessians, - const int8_t* cuda_int_gradients, - const int8_t* cuda_int_hessians, - const int32_t* cuda_int_gradients_and_hessians) { + const data_size_t* leaf_num_data_offsets, const uint8_t* data, const uint32_t* feature_group_offsets) { const unsigned int threadIdx_x = threadIdx.x; - /*if (threadIdx_x == 0) { - printf("CUDAConstructHistogramKernel step 0\n"); - }*/ const int num_feature_groups_ref = *num_feature_groups; - /*if (threadIdx_x == 0) { - 
printf("CUDAConstructHistogramKernel step 1\n"); - }*/ const int leaf_index_ref = *leaf_index; - /*if (threadIdx_x == 0) { - printf("CUDAConstructHistogramKernel step 2\n"); - }*/ - const int num_data_in_smaller_leaf_ref = *(num_data_in_leaf + leaf_index_ref); - /*if (threadIdx_x == 0) { - printf("CUDAConstructHistogramKernel step 3\n"); - }*/ - const int leaf_num_data_in_smaller_leaf_ref = *(leaf_num_data_offset + leaf_index_ref); - /*printf("num_feature_groups_ref = %d\n", num_feature_groups_ref); - printf("leaf_index_ref = %d\n", leaf_index_ref); - printf("num_data_in_smaller_leaf_ref = %d\n", num_data_in_smaller_leaf_ref); - printf("leaf_num_data_in_smaller_leaf_ref = %d\n", leaf_num_data_in_smaller_leaf_ref);*/ - const data_size_t* data_indices_in_smaller_leaf = data_indices_ptr + leaf_num_data_in_smaller_leaf_ref; + const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data_offsets[leaf_index_ref + 1] - leaf_num_data_offsets[leaf_index_ref]; __shared__ float shared_hist[SHRAE_HIST_SIZE]; // 256 * 24 * 2, can use 24 features - //__shared__ int32_t shared_int_hist[SHRAE_HIST_SIZE]; - //uint32_t bin_offset = feature_group_offsets[blockIdx.x * 12]; - //const uint32_t next_feature_group_start = (blockIdx.x + 1) * 12; - //const uint32_t next_col_group_first_feature = next_feature_group_start > 28 ? 28 : next_feature_group_start; uint32_t num_bins_in_col_group = feature_group_offsets[blockDim.x]; const uint32_t num_items_per_thread = (2 * num_bins_in_col_group + NUM_THRADS_PER_BLOCK - 1) / NUM_THRADS_PER_BLOCK; const int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; @@ -55,94 +27,35 @@ __global__ void CUDAConstructHistogramKernel(const int* leaf_index, num_bins_in_col_group * 2 : thread_start + num_items_per_thread; for (uint32_t i = thread_start; i < thread_end; ++i) { shared_hist[i] = 0.0f; - //shared_int_hist[i] = 0; } __syncthreads(); - - /*if (blockIdx.x == 0 && threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.y == 0) { - printf("num_data_in_leaf = %d\n", num_data_in_smaller_leaf_ref); - printf("num_feature_groups_ref = %d", num_feature_groups_ref); - }*/ const unsigned int threadIdx_y = threadIdx.y; - const unsigned int blockIdx_x = blockIdx.x; const unsigned int blockIdx_y = blockIdx.y; - const unsigned int blockDim_x = blockDim.x; - const unsigned int offset = threadIdx_x % 2; - //if ((threadIdx_x < 24 && blockIdx_x < 2) || (threadIdx_x < 8 && blockIdx_x == 2)) { - //const int feature_group_index = threadIdx_x / 2 + blockIdx_x * blockDim_x / 8 * 3; - /*if (feature_group_index >= 28) { - printf("error feature_group_index = %d\n", feature_group_index); - }*/ - const data_size_t start = threadIdx_y * NUM_DATA_PER_THREAD + blockIdx_y * blockDim.y * NUM_DATA_PER_THREAD; - const data_size_t end = start + NUM_DATA_PER_THREAD > num_data_in_smaller_leaf_ref ? 
- num_data_in_smaller_leaf_ref : start + NUM_DATA_PER_THREAD; - /*if (offset == 0) { - // handle gradient - for (data_size_t i = start; i < end; ++i) { - const score_t gradient = cuda_gradients[i]; - const data_size_t data_index = data_indices_in_smaller_leaf[i]; - if (data_index != i) { - printf("error data_index = %d vs i = %d", data_index, i); - } - const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + feature_group_index]) + - feature_group_offsets_by_col_group[feature_group_index]; - //shared_hist[bin << 1] += gradient; - atomicAdd_system(shared_hist + (bin << 1), gradient); - } - } else { - // handle hessian - for (data_size_t i = start; i < end; ++i) { - const score_t hessian = cuda_hessians[i]; - const data_size_t data_index = data_indices_in_smaller_leaf[i]; - const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + feature_group_index]) + - feature_group_offsets_by_col_group[feature_group_index]; - //shared_hist[(bin << 1) + 1] += hessian; - atomicAdd_system(shared_hist + ((bin << 1) + 1), hessian); - } - }*/ - for (data_size_t i = start; i < end; ++i) { - const score_t grad = cuda_gradients[i]; - const score_t hess = cuda_hessians[i]; - const data_size_t data_index = data_indices_in_smaller_leaf[i]; - const uint32_t bin = static_cast(data[i * num_feature_groups_ref + threadIdx_x]) + - feature_group_offsets[threadIdx_x]; - const uint32_t pos = bin << 1; - float* pos_ptr = shared_hist + pos; - atomicAdd_system(pos_ptr, grad); - atomicAdd_system(pos_ptr + 1, hess); - //const int32_t grad = static_cast(cuda_int_gradients[i]); - //const int32_t hess = static_cast(cuda_int_hessians[i]); - /*const int32_t grad_and_hess = cuda_int_gradients_and_hessians[i]; - const data_size_t data_index = data_indices_in_smaller_leaf[i]; - const uint32_t bin = static_cast(data[i * num_feature_groups_ref + threadIdx_x]) + - feature_group_offsets[threadIdx_x]; - //const uint32_t pos = bin << 1; - int32_t* pos_ptr = shared_int_hist + bin; - atomicAdd_system(pos_ptr, grad_and_hess);*/ - //atomicAdd_system(pos_ptr + 1, hess); - } - //} + const data_size_t start = threadIdx_y * NUM_DATA_PER_THREAD + blockIdx_y * blockDim.y * NUM_DATA_PER_THREAD; + const data_size_t end = start + NUM_DATA_PER_THREAD > num_data_in_smaller_leaf_ref ? + num_data_in_smaller_leaf_ref : start + NUM_DATA_PER_THREAD; + for (data_size_t i = start; i < end; ++i) { + const score_t grad = cuda_gradients[i]; + const score_t hess = cuda_hessians[i]; + const data_size_t data_index = data_indices_ptr[i]; + const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + threadIdx_x]) + + feature_group_offsets[threadIdx_x]; + const uint32_t pos = bin << 1; + float* pos_ptr = shared_hist + pos; + atomicAdd_system(pos_ptr, grad); + atomicAdd_system(pos_ptr + 1, hess); + } __syncthreads(); - /*uint32_t bin_offset = feature_group_offsets[blockIdx.x * 12]; - const uint32_t next_feature_group_start = (blockIdx.x + 1) * 12; - const uint32_t next_col_group_first_feature = next_feature_group_start > 28 ? 28 : next_feature_group_start; - uint32_t num_bins_in_col_group = feature_group_offsets[next_col_group_first_feature] - bin_offset; - const uint32_t num_items_per_thread = (2 * num_bins_in_col_group + 1023) / 1024; - const int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; - const uint32_t thread_start = thread_idx * num_items_per_thread; - const uint32_t thread_end = thread_start + num_items_per_thread > num_bins_in_col_group * 2 ? 
- num_bins_in_col_group * 2 : thread_start + num_items_per_thread;*/ for (uint32_t i = thread_start; i < thread_end; ++i) { - //feature_histogram[i + bin_offset * 2] += shared_hist[thread_idx]; atomicAdd_system(feature_histogram + i, shared_hist[i]); } } void CUDAHistogramConstructor::LaunchConstructHistogramKernel( - const int* smaller_leaf_index, const data_size_t* num_data_in_leaf, const data_size_t* leaf_num_data_offset, - const data_size_t* data_indices_ptr, const score_t* cuda_gradients, const score_t* cuda_hessians, - const score_t* cuda_gradients_and_hessians) { - const int block_dim_x = 28; + const int* cuda_smaller_leaf_index, + const data_size_t* cuda_data_indices_in_smaller_leaf, + const data_size_t* cuda_leaf_num_data_offsets) { + const int block_dim_x = num_features_; // TODO(shiyu1994): only supports the case when the whole histogram can be loaded into shared memory const int block_dim_y = NUM_THRADS_PER_BLOCK / block_dim_x; const int grid_dim_y = ((num_data_ + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + block_dim_y - 1) / block_dim_y; const int grid_dim_x = (static_cast(num_feature_groups_ + NUM_FEATURE_PER_THREAD_GROUP - 1) / NUM_FEATURE_PER_THREAD_GROUP); @@ -150,11 +63,9 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( Log::Warning("gid_dim_x = %d, grid_dim_y = %d", grid_dim_x, grid_dim_y); dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); - CUDAConstructHistogramKernel<<>>(smaller_leaf_index, cuda_gradients, cuda_hessians, - data_indices_ptr, cuda_hist_, cuda_num_feature_groups_, leaf_num_data_offset, cuda_data_, num_data_in_leaf, - cuda_feature_group_bin_offsets_by_col_groups_, - cuda_feature_group_bin_offsets_, cuda_gradients_and_hessians, cuda_int_gradients_, cuda_int_hessians_, - cuda_int_gradients_and_hessians_); + CUDAConstructHistogramKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_hist_, cuda_num_feature_groups_, cuda_leaf_num_data_offsets, cuda_data_, + cuda_feature_group_bin_offsets_); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index d4725f677255..c840af4eee1a 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -15,55 +15,64 @@ #include -namespace LightGBM { - - #define SHRAE_HIST_SIZE (6144 * 2) #define NUM_DATA_PER_THREAD (400) #define NUM_THRADS_PER_BLOCK (504) #define NUM_FEATURE_PER_THREAD_GROUP (28) +namespace LightGBM { + class CUDAHistogramConstructor { public: - CUDAHistogramConstructor(const std::vector& feature_group_ids, - const Dataset* train_data, const int max_num_leaves, - hist_t* cuda_hist); + CUDAHistogramConstructor(const Dataset* train_data, const int num_leaves, const int num_threads, + const score_t* cuda_gradients, const score_t* cuda_hessians); - void Init(); + void Init(const Dataset* train_data); - void PushOneData(const uint32_t feature_bin_value, const int feature_group_id, const data_size_t data_index); - - void FinishLoad(); + void ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, + const data_size_t* cuda_data_indices_in_smaller_leaf, const data_size_t* cuda_data_indices_in_larger_leaf, + const data_size_t* cuda_leaf_num_data_offsets); - void ConstructHistogramForLeaf(const int* smaller_leaf_index, const int* larger_leaf_index, - const data_size_t* num_data_in_smaller_leaf, const int* 
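// The construction kernel above accumulates (gradient, hessian) pairs into a
// block-local shared-memory histogram with atomic adds, then flushes that
// block histogram into the global leaf histogram, again atomically. A
// self-contained sketch of the same two-level pattern with a fixed 256-bin
// histogram and plain atomicAdd (the patch uses atomicAdd_system and sizes
// the shared buffer from the feature-group offsets); all names and constants
// here are illustrative:

#include <cuda_runtime.h>
#include <cstdint>

constexpr int kBins = 256;  // bins covered by one block's shared histogram

__global__ void SharedMemHistogram(const uint8_t* bins, const float* grads,
                                   const float* hess, int num_data,
                                   float* global_hist /* 2 * kBins floats */) {
  __shared__ float shared_hist[2 * kBins];
  for (int i = threadIdx.x; i < 2 * kBins; i += blockDim.x) {
    shared_hist[i] = 0.0f;  // clear the block-local histogram
  }
  __syncthreads();
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_data;
       i += gridDim.x * blockDim.x) {
    const uint32_t pos = static_cast<uint32_t>(bins[i]) << 1;
    atomicAdd(shared_hist + pos, grads[i]);      // gradient slot
    atomicAdd(shared_hist + pos + 1, hess[i]);   // hessian slot
  }
  __syncthreads();
  // Flush the block-local histogram into the global one.
  for (int i = threadIdx.x; i < 2 * kBins; i += blockDim.x) {
    atomicAdd(global_hist + i, shared_hist[i]);
  }
}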
smaller_leaf_data_offset, const data_size_t* data_indices_ptr, - const score_t* cuda_gradients, const score_t* cuda_hessians, const score_t* cuda_gradients_and_hessians); + void LaunchConstructHistogramKernel(const int* cuda_leaf_index, + const data_size_t* cuda_data_indices_in_leaf, + const data_size_t* cuda_leaf_num_data_offsets); - void LaunchConstructHistogramKernel( - const int* smaller_leaf_index, const data_size_t* num_data_in_leaf, const data_size_t* leaf_num_data_offset, - const data_size_t* data_indices_ptr, const score_t* cuda_gradients, const score_t* cuda_hessians, - const score_t* cuda_gradients_and_hessians); + const hist_t* cuda_hist() const { return cuda_hist_; } - hist_t* cuda_hist() { return cuda_hist_; } + void TestAfterInit() { + std::vector test_data(data_.size(), 0); + CopyFromCUDADeviceToHost(test_data.data(), cuda_data_, data_.size()); + for (size_t i = 0; i < 100; ++i) { + Log::Warning("CUDAHistogramConstructor::TestAfterInit test_data[%d] = %d", i, test_data[i]); + } + } private: + void InitCUDAData(const Dataset* train_data); + + void PushOneData(const uint32_t feature_bin_value, const int feature_group_id, const data_size_t data_index); + + // Host memory // data on CPU, stored in row-wise style - std::vector cpu_data_; + const data_size_t num_data_; + const int num_features_; + const int num_leaves_; + const int num_threads_; + int num_total_bin_; + int num_feature_groups_; + std::vector data_; std::vector feature_group_bin_offsets_; - std::vector feature_group_bin_offsets_by_col_groups_; - uint8_t* cuda_data_; + + // CUDA memory, held by this object uint32_t* cuda_feature_group_bin_offsets_; - uint32_t* cuda_feature_group_bin_offsets_by_col_groups_; - const data_size_t num_data_; hist_t* cuda_hist_; - int num_total_bin_; int* cuda_num_total_bin_; - int num_feature_groups_; int* cuda_num_feature_groups_; - int8_t* cuda_int_gradients_; - int8_t* cuda_int_hessians_; - int32_t* cuda_int_gradients_and_hessians_; - const int max_num_leaves_; + uint8_t* cuda_data_; + + // CUDA memory, held by other objects + const score_t* cuda_gradients_; + const score_t* cuda_hessians_; }; } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index 5776ebd54866..33dab992d165 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -4,13 +4,15 @@ * license information. 
*/ +#ifdef USE_CUDA + #include "cuda_leaf_splits.hpp" namespace LightGBM { -CUDALeafSplits::CUDALeafSplits(const data_size_t num_data, +CUDALeafSplits::CUDALeafSplits(const data_size_t num_data, const int leaf_index, const score_t* cuda_gradients, const score_t* cuda_hessians, - const int* cuda_num_data): num_data_(num_data) { + const int* cuda_num_data): num_data_(num_data), leaf_index_(leaf_index) { cuda_sum_of_gradients_ = nullptr; cuda_sum_of_hessians_ = nullptr; cuda_num_data_in_leaf_ = nullptr; @@ -31,9 +33,10 @@ void CUDALeafSplits::Init() { AllocateCUDAMemory(num_blocks_init_from_gradients_, &cuda_sum_of_gradients_); AllocateCUDAMemory(num_blocks_init_from_gradients_, &cuda_sum_of_hessians_); - AllocateCUDAMemory(1, &cuda_num_data_in_leaf_); - AllocateCUDAMemory(1, &cuda_gain_); - AllocateCUDAMemory(1, &cuda_leaf_value_); + InitCUDAMemoryFromHostMemory(&cuda_num_data_in_leaf_, &num_data_, 1); + InitCUDAValueFromConstant(&cuda_gain_, 0.0f); + + InitCUDAMemoryFromHostMemory(&cuda_leaf_index_, &leaf_index_, 1); } void CUDALeafSplits::InitValues(const double* cuda_sum_of_gradients, const double* cuda_sum_of_hessians, @@ -45,10 +48,13 @@ void CUDALeafSplits::InitValues(const double* cuda_sum_of_gradients, const doubl cuda_data_indices_in_leaf_ = cuda_data_indices_in_leaf; CopyFromCUDADeviceToCUDADevice(cuda_gain_, cuda_gain, 1); CopyFromCUDADeviceToCUDADevice(cuda_leaf_value_, cuda_leaf_value, 1); + SynchronizeCUDADevice(); } void CUDALeafSplits::InitValues() { - + LaunchInitValuesKernal(); } } // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index 65318d41de6c..3eb32e5da1fd 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -4,6 +4,8 @@ * license information. 
*/ +#ifdef USE_CUDA + #include "cuda_leaf_splits.hpp" namespace LightGBM { @@ -13,12 +15,12 @@ __global__ void CUDAInitValuesKernel1(const score_t* cuda_gradients, const score __shared__ score_t shared_gradients[NUM_THRADS_PER_BLOCK_LEAF_SPLITS]; __shared__ score_t shared_hessians[NUM_THRADS_PER_BLOCK_LEAF_SPLITS]; const unsigned int tid = threadIdx.x; - const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD; + const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_LEAF_SPLITS; const unsigned int num_data_ref = static_cast(*cuda_num_data); shared_gradients[tid] = 0.0f; shared_hessians[tid] = 0.0f; __syncthreads(); - for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD; ++j) { + for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_LEAF_SPLITS; ++j) { if (i + j < num_data_ref) { shared_gradients[tid] += cuda_gradients[i + j]; shared_hessians[tid] += cuda_hessians[i + j]; @@ -26,7 +28,7 @@ __global__ void CUDAInitValuesKernel1(const score_t* cuda_gradients, const score } __syncthreads(); for (unsigned int s = 1; s < blockDim.x; s *= 2) { - if (tid % (2 * s) == 0 && (tid + s) < NUM_THRADS_PER_BLOCK_SPLITS_INIT) { + if (tid % (2 * s) == 0 && (tid + s) < NUM_THRADS_PER_BLOCK_LEAF_SPLITS) { shared_gradients[tid] += shared_gradients[tid + s]; shared_hessians[tid] += shared_hessians[tid + s]; } @@ -39,21 +41,25 @@ __global__ void CUDAInitValuesKernel1(const score_t* cuda_gradients, const score } __global__ void CUDAInitValuesKernel2(double* cuda_sum_of_gradients, double* cuda_sum_of_hessians) { - for (unsigned int i = 1; i < gridDim.x; ++i) { - cuda_sum_of_gradients[0] += cuda_sum_of_gradients[i]; - cuda_sum_of_hessians[0] += cuda_sum_of_hessians[i]; + if (blockIdx.x == 0) { + for (unsigned int i = 1; i < gridDim.x; ++i) { + cuda_sum_of_gradients[0] += cuda_sum_of_gradients[i]; + cuda_sum_of_hessians[0] += cuda_sum_of_hessians[i]; + } } } void CUDALeafSplits::LaunchInitValuesKernal() { - CUDAInitValuesKernel1<<>>( - cuda_gradients_, cuda_hessians_, cuda_num_data_, smaller_leaf_sum_gradients_, - smaller_leaf_sum_hessians_); + CUDAInitValuesKernel1<<>>( + cuda_gradients_, cuda_hessians_, cuda_num_data_, cuda_sum_of_gradients_, + cuda_sum_of_hessians_); CopyFromCUDADeviceToCUDADevice(cuda_num_data_in_leaf_, cuda_num_data_, 1); SynchronizeCUDADevice(); - CUDAInitValuesKernel2<<>>( - smaller_leaf_sum_gradients_, smaller_leaf_sum_hessians_); + CUDAInitValuesKernel2<<>>( + cuda_sum_of_gradients_, cuda_sum_of_hessians_); SynchronizeCUDADevice(); } } // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index ae065be81d8c..c2d9d918437f 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -20,6 +20,10 @@ namespace LightGBM { class CUDALeafSplits { public: + CUDALeafSplits(const data_size_t num_data, const int leaf_index, + const score_t* cuda_gradients, const score_t* cuda_hessians, + const int* cuda_num_data); + CUDALeafSplits(); void Init(); @@ -30,14 +34,31 @@ class CUDALeafSplits { void InitValues(); + const int* cuda_leaf_index() const { return cuda_leaf_index_; } + + const data_size_t* cuda_num_data_in_leaf() const { return cuda_num_data_in_leaf_; } + + const data_size_t* cuda_data_indices_in_leaf() const { return cuda_data_indices_in_leaf_; } + + void Test() { + PrintLastCUDAError(); + double test_sum_of_gradients = 0.0f, test_sum_of_hessians = 0.0f; + CopyFromCUDADeviceToHost(&test_sum_of_gradients, cuda_sum_of_gradients_, 
1); + CopyFromCUDADeviceToHost(&test_sum_of_hessians, cuda_sum_of_hessians_, 1); + Log::Warning("CUDALeafSplits::Test test_sum_of_gradients = %f", test_sum_of_gradients); + Log::Warning("CUDALeafSplits::Test test_sum_of_hessians = %f", test_sum_of_hessians); + } + private: void LaunchInitValuesKernal(); // Host memory const int num_data_; + const int leaf_index_; int num_blocks_init_from_gradients_; // CUDA memory, held by this object + int* cuda_leaf_index_; double* cuda_sum_of_gradients_; double* cuda_sum_of_hessians_; data_size_t* cuda_num_data_in_leaf_; diff --git a/src/treelearner/cuda/cuda_leaf_splits_init.cpp b/src/treelearner/cuda/cuda_leaf_splits_init.cpp deleted file mode 100644 index f20e2ff41881..000000000000 --- a/src/treelearner/cuda/cuda_leaf_splits_init.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifdef USE_CUDA - -#include "cuda_leaf_splits_init.hpp" - -#include - -namespace LightGBM { - -CUDALeafSplitsInit::CUDALeafSplitsInit(const score_t* cuda_gradients, - const score_t* cuda_hessians, const data_size_t num_data): -cuda_gradients_(cuda_gradients), cuda_hessians_(cuda_hessians), num_data_(num_data) {} - -void CUDALeafSplitsInit::Init() { - num_blocks_ = (num_data_ + INIT_SUM_BLOCK_SIZE - 1) / INIT_SUM_BLOCK_SIZE; - - CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); - void* smaller_leaf_sum_gradients_ptr = nullptr; - void* smaller_leaf_sum_hessians_ptr = nullptr; - CUDASUCCESS_OR_FATAL(cudaMalloc(&smaller_leaf_sum_gradients_ptr, num_blocks_ * sizeof(double))); - CUDASUCCESS_OR_FATAL(cudaMalloc(&smaller_leaf_sum_hessians_ptr, num_blocks_ * sizeof(double))); - CUDASUCCESS_OR_FATAL(cudaMemset(smaller_leaf_sum_gradients_ptr, 0, num_blocks_ * sizeof(double))); - CUDASUCCESS_OR_FATAL(cudaMemset(smaller_leaf_sum_hessians_ptr, 0, num_blocks_ * sizeof(double))); - smaller_leaf_sum_gradients_ = reinterpret_cast(smaller_leaf_sum_gradients_ptr); - smaller_leaf_sum_hessians_ = reinterpret_cast(smaller_leaf_sum_hessians_ptr); - - void* cuda_num_data_ptr = nullptr; - CUDASUCCESS_OR_FATAL(cudaMalloc(&cuda_num_data_ptr, sizeof(int))); - cuda_num_data_ = reinterpret_cast(cuda_num_data_ptr); - const void* num_data_ptr = reinterpret_cast(&num_data_); - CUDASUCCESS_OR_FATAL(cudaMemcpy(cuda_num_data_ptr, num_data_ptr, sizeof(int), cudaMemcpyHostToDevice)); - - AllocateCUDAMemory(1, &smaller_leaf_index_); - AllocateCUDAMemory(1, &larger_leaf_index_); - AllocateCUDAMemory(1, &cuda_num_data_); - CopyFromHostToCUDADevice(cuda_num_data_, &num_data_, 1); -} - -void CUDALeafSplitsInit::Compute() { - auto start = std::chrono::steady_clock::now(); - LaunchLeafSplitsInit(); - SynchronizeCUDADevice(); - auto end = std::chrono::steady_clock::now(); - double duration = (static_cast>(end - start)).count(); - Log::Warning("LaunchLeafSplitsInit time %f", duration); - Log::Warning(cudaGetErrorName(cudaGetLastError())); - CUDASUCCESS_OR_FATAL(cudaDeviceSynchronize()); - - const void* smaller_leaf_sum_gradients_ptr = reinterpret_cast(smaller_leaf_sum_gradients_); - const void* smaller_leaf_sum_hessians_ptr = reinterpret_cast(smaller_leaf_sum_hessians_); - void* host_smaller_leaf_sum_gradients_ptr = reinterpret_cast(&host_smaller_leaf_sum_gradients_); - void* host_smaller_leaf_sum_hessians_ptr = reinterpret_cast(&host_smaller_leaf_sum_hessians_); - CUDASUCCESS_OR_FATAL(cudaMemcpy(host_smaller_leaf_sum_gradients_ptr, smaller_leaf_sum_gradients_ptr, 
sizeof(double), cudaMemcpyDeviceToHost)); - CUDASUCCESS_OR_FATAL(cudaMemcpy(host_smaller_leaf_sum_hessians_ptr, smaller_leaf_sum_hessians_ptr, sizeof(double), cudaMemcpyDeviceToHost)); - Log::Warning("host_smaller_leaf_sum_gradients_ = %f", host_smaller_leaf_sum_gradients_); - Log::Warning("host_smaller_leaf_sum_hessians_ = %f", host_smaller_leaf_sum_hessians_); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_leaf_splits_init.cu b/src/treelearner/cuda/cuda_leaf_splits_init.cu deleted file mode 100644 index e1bcf931ffb1..000000000000 --- a/src/treelearner/cuda/cuda_leaf_splits_init.cu +++ /dev/null @@ -1,63 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifdef USE_CUDA - -#include "cuda_leaf_splits_init.hpp" - -namespace LightGBM { - -__global__ void CUDALeafSplitsInitKernel1(const float* cuda_gradients, const float* cuda_hessians, - const data_size_t* num_data, double* grad_sum_out, double* hess_sum_out) { - __shared__ float shared_gradients[NUM_THRADS_PER_BLOCK_SPLITS_INIT]; - __shared__ float shared_hessians[NUM_THRADS_PER_BLOCK_SPLITS_INIT]; - const unsigned int tid = threadIdx.x; - const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD; - const unsigned int num_data_ref = static_cast(*num_data); - shared_gradients[tid] = 0.0f; - shared_hessians[tid] = 0.0f; - __syncthreads(); - for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD; ++j) { - if (i + j < num_data_ref) { - shared_gradients[tid] += cuda_gradients[i + j]; - shared_hessians[tid] += cuda_hessians[i + j]; - } - } - __syncthreads(); - for (unsigned int s = 1; s < blockDim.x; s *= 2) { - if (tid % (2 * s) == 0 && (tid + s) < NUM_THRADS_PER_BLOCK_SPLITS_INIT) { - shared_gradients[tid] += shared_gradients[tid + s]; - shared_hessians[tid] += shared_hessians[tid + s]; - } - __syncthreads(); - } - if (tid == 0) { - grad_sum_out[blockIdx.x] += shared_gradients[0]; - hess_sum_out[blockIdx.x] += shared_hessians[0]; - } -} - -__global__ void CUDALeafSplitsInitKernel2(double* grad_sum_out, double* hess_sum_out) { - if (threadIdx.x == 0 && blockIdx.x == 0) { - for (unsigned int i = 1; i < gridDim.x; ++i) { - grad_sum_out[0] += grad_sum_out[i]; - hess_sum_out[0] += hess_sum_out[i]; - } - } -} - -void CUDALeafSplitsInit::LaunchLeafSplitsInit() { - CUDALeafSplitsInitKernel1<<>>( - cuda_gradients_, cuda_hessians_, cuda_num_data_, smaller_leaf_sum_gradients_, - smaller_leaf_sum_hessians_); - SynchronizeCUDADevice(); - CUDALeafSplitsInitKernel2<<>>( - smaller_leaf_sum_gradients_, smaller_leaf_sum_hessians_); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_leaf_splits_init.hpp b/src/treelearner/cuda/cuda_leaf_splits_init.hpp deleted file mode 100644 index 26b1fc64c8b5..000000000000 --- a/src/treelearner/cuda/cuda_leaf_splits_init.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
- */ -#ifndef LIGHTGBM_CUDA_LEAF_SPLITS_INIT_HPP_ -#define LIGHTGBM_CUDA_LEAF_SPLITS_INIT_HPP_ - -#ifdef USE_CUDA - -#include -#include -#include "new_cuda_utils.hpp" - -#define INIT_SUM_BLOCK_SIZE (6144) -#define NUM_THRADS_PER_BLOCK_SPLITS_INIT (1024) -#define NUM_DATA_THREAD_ADD (6) - -namespace LightGBM { - -class CUDALeafSplitsInit { - public: - CUDALeafSplitsInit(const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t num_data); - - void Init(); - - void Compute(); - - const double* smaller_leaf_sum_gradients() { return smaller_leaf_sum_gradients_; } - - const double* smaller_leaf_sum_hessians() { return smaller_leaf_sum_hessians_; } - - const double* larger_leaf_sum_gradients() { return larger_leaf_sum_gradients_; } - - const double* larger_leaf_sum_hessians() { return larger_leaf_sum_hessians_; } - - const int* smaller_leaf_index() { return smaller_leaf_index_; } - - const int* larger_leaf_index() { return larger_leaf_index_; } - - const double - - void LaunchLeafSplitsInit(); - - protected: - const score_t* cuda_gradients_; - const score_t* cuda_hessians_; - double* smaller_leaf_sum_gradients_; - double* smaller_leaf_sum_hessians_; - double host_smaller_leaf_sum_gradients_; - double host_smaller_leaf_sum_hessians_; - double* larger_leaf_sum_gradients_; - double* larger_leaf_sum_hessians_; - int* smaller_leaf_index_; - int* larger_leaf_index_; - int* cuda_num_data_; - - const int num_data_; - int num_blocks_; -}; - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_CUDA_LEAF_SPLITS_INIT_HPP_ diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 838d338e9deb..8cdb4dfccdfd 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -13,60 +13,53 @@ namespace LightGBM { -NewCUDATreeLearner::NewCUDATreeLearner(const Config* config): SerialTreeLearner(config) { - -} +NewCUDATreeLearner::NewCUDATreeLearner(const Config* config): SerialTreeLearner(config) {} NewCUDATreeLearner::~NewCUDATreeLearner() {} void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { + // use the first gpu by now SerialTreeLearner::Init(train_data, is_constant_hessian); - Log::Warning("NewCUDATreeLearner::Init step 1"); - int num_total_gpus = 0; - CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&num_total_gpus)); - Log::Warning("NewCUDATreeLearner::Init step 2"); - num_gpus_ = 1;//config_->num_gpu > num_total_gpus ? 
num_total_gpus : config_->num_gpu; num_threads_ = OMP_NUM_THREADS(); - Log::Warning("NewCUDATreeLearner::Init step 3"); - gradients_and_hessians_.resize(2 * num_data_); - AllocateFeatureTasks(); - Log::Warning("NewCUDATreeLearner::Init step 4"); - AllocateMemory(is_constant_hessian); - Log::Warning("NewCUDATreeLearner::Init step 5"); - - CreateCUDAHistogramConstructors(); - Log::Warning("NewCUDATreeLearner::Init step 6"); + CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + cuda_centralized_info_.reset(new CUDACentralizedInfo(num_data_, this->config_->num_leaves, num_features_)); + cuda_centralized_info_->Init(); + //cuda_centralized_info_->Test(); + + cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_, 0, cuda_centralized_info_->cuda_gradients(), + cuda_centralized_info_->cuda_hessians(), cuda_centralized_info_->cuda_num_data())); + cuda_smaller_leaf_splits_->Init(); + cuda_larger_leaf_splits_.reset(new CUDALeafSplits(num_data_, -1, cuda_centralized_info_->cuda_gradients(), + cuda_centralized_info_->cuda_hessians(), cuda_centralized_info_->cuda_num_data())); + cuda_larger_leaf_splits_->Init(); + + cuda_data_partition_.reset(new CUDADataPartition(num_data_, this->config_->num_leaves, + cuda_centralized_info_->cuda_num_data(), cuda_centralized_info_->cuda_num_leaves())); + cuda_data_partition_->Init(); + + cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, this->config_->num_leaves, num_threads_, + cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians())); + cuda_histogram_constructor_->Init(train_data_); + //cuda_histogram_constructor_->TestAfterInit(); } void NewCUDATreeLearner::BeforeTrain() { - SerialTreeLearner::BeforeTrain(); - #pragma omp parallel for schedule(static) num_threads(num_threads_) + cuda_centralized_info_->BeforeTrain(gradients_, hessians_); + cuda_smaller_leaf_splits_->InitValues(); + //cuda_smaller_leaf_splits_->Test(); + cuda_data_partition_->BeforeTrain(nullptr); + //cuda_data_partition_->Test(); + + //SerialTreeLearner::BeforeTrain(); + /*#pragma omp parallel for schedule(static) num_threads(num_threads_) for (int device_id = 0; device_id < num_gpus_; ++device_id) { CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); device_leaf_splits_initializers_[device_id]->Init(); - } -} - -void NewCUDATreeLearner::AllocateFeatureTasks() { - device_feature_groups_.resize(num_gpus_); - device_num_total_bins_.resize(num_gpus_, 0); - const int num_feature_groups = train_data_->num_feature_groups(); - const int num_feature_groups_per_device = (num_feature_groups + num_gpus_ - 1) / num_gpus_; - for (int device_id = 0; device_id < num_gpus_; ++device_id) { - device_feature_groups_[device_id].clear(); - const int device_feature_group_start = device_id * num_feature_groups_per_device; - const int device_feature_group_end = std::min(device_feature_group_start + num_feature_groups_per_device, num_feature_groups); - int& num_total_bin = device_num_total_bins_[device_id]; - num_total_bin = 0; - for (int group_id = device_feature_group_start; group_id < device_feature_group_end; ++group_id) { - device_feature_groups_[device_id].emplace_back(group_id); - num_total_bin += train_data_->FeatureGroupNumBin(group_id); - } - } + }*/ } void NewCUDATreeLearner::AllocateMemory(const bool is_constant_hessian) { - device_data_indices_.resize(num_gpus_, nullptr); + /*device_data_indices_.resize(num_gpus_, nullptr); device_gradients_.resize(num_gpus_, nullptr); device_gradients_and_hessians_.resize(num_gpus_, nullptr); if (!is_constant_hessian) { @@ -106,11 
+99,11 @@ void NewCUDATreeLearner::AllocateMemory(const bool is_constant_hessian) { void* histograms_ptr = reinterpret_cast(device_histograms_[device_id]); CUDASUCCESS_OR_FATAL(cudaMalloc(&histograms_ptr, num_total_bin * 2 * sizeof(double))); device_histograms_[device_id] = reinterpret_cast(histograms_ptr); - } + }*/ } void NewCUDATreeLearner::CreateCUDAHistogramConstructors() { - Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors num_gpus_ = %d", num_gpus_); + /*Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors num_gpus_ = %d", num_gpus_); device_histogram_constructors_.resize(num_gpus_); device_leaf_splits_initializers_.resize(num_gpus_); device_best_split_finders_.resize(num_gpus_); @@ -143,11 +136,11 @@ void NewCUDATreeLearner::CreateCUDAHistogramConstructors() { } Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 6", num_gpus_); PushDataIntoDeviceHistogramConstructors(); - Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 7", num_gpus_); + Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 7", num_gpus_);*/ } void NewCUDATreeLearner::PushDataIntoDeviceHistogramConstructors() { - #pragma omp parallel for schedule(static) num_threads(num_threads_) + /*#pragma omp parallel for schedule(static) num_threads(num_threads_) for (int device_id = 0; device_id < num_gpus_; ++device_id) { CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); CUDAHistogramConstructor* cuda_histogram_constructor = device_histogram_constructors_[device_id].get(); @@ -162,27 +155,27 @@ void NewCUDATreeLearner::PushDataIntoDeviceHistogramConstructors() { } // call finish load to tranfer data from CPU to GPU cuda_histogram_constructor->FinishLoad(); - } + }*/ } void NewCUDATreeLearner::FindBestSplits(const Tree* tree) { - std::vector is_feature_used(num_features_, 1); + /*std::vector is_feature_used(num_features_, 1); ConstructHistograms(is_feature_used, true); - FindBestSplitsFromHistograms(is_feature_used, true, tree); + FindBestSplitsFromHistograms(is_feature_used, true, tree);*/ } void NewCUDATreeLearner::ConstructHistograms(const std::vector& /*is_feature_used*/, bool /*use_subtract*/) { - #pragma omp parallel for schedule(static) num_threads(num_threads_) + /*#pragma omp parallel for schedule(static) num_threads(num_threads_) for (int device_id = 0; device_id < num_gpus_; ++device_id) { CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - } + }*/ } void NewCUDATreeLearner::FindBestSplitsFromHistograms(const std::vector& /*is_feature_used*/, bool /*use_subtract*/, const Tree* /*tree*/) { - #pragma omp parallel for schedule(static) num_threads(num_threads_) + /*#pragma omp parallel for schedule(static) num_threads(num_threads_) for (int device_id = 0; device_id < num_gpus_; ++device_id) { CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); device_best_split_finders_[device_id]->FindBestSplitsForLeaf( @@ -190,24 +183,33 @@ void NewCUDATreeLearner::FindBestSplitsFromHistograms(const std::vector& device_best_split_finders_[device_id]->FindBestSplitsForLeaf( device_leaf_splits_initializers_[device_id]->larger_leaf_index()); device_best_split_finders_[device_id]->FindBestFromAllSplits(); - } + }*/ } void NewCUDATreeLearner::Split(Tree* /*tree*/, int /*best_leaf*/, int* /*left_leaf*/, int* /*right_leaf*/) { - #pragma omp parallel for schedule(static) num_threads(num_threads_) + /*#pragma omp parallel for schedule(static) num_threads(num_threads_) for (int device_id = 0; device_id < num_gpus_; ++device_id) { 
CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); device_splitters_[device_id]->Split( device_best_split_finders_[device_id]->best_leaf(), device_best_split_finders_[device_id]->best_split_feature_index(), device_best_split_finders_[device_id]->best_split_threshold()); - } + }*/ } Tree* NewCUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, bool /*is_first_tree*/) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + gradients_ = gradients; + hessians_ = hessians; + BeforeTrain(); + cuda_histogram_constructor_->ConstructHistogramForLeaf( + cuda_smaller_leaf_splits_->cuda_leaf_index(), + cuda_larger_leaf_splits_->cuda_leaf_index(), + cuda_smaller_leaf_splits_->cuda_data_indices_in_leaf(), + cuda_larger_leaf_splits_->cuda_data_indices_in_leaf(), + cuda_data_partition_->cuda_leaf_num_data_offsets()); + /*CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); CUDASUCCESS_OR_FATAL(cudaMemcpy(device_gradients_[0], gradients, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice)); CUDASUCCESS_OR_FATAL(cudaMemcpy(device_hessians_[0], hessians, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice)); #pragma omp parallel for schedule(static) num_threads(num_threads_) @@ -225,7 +227,8 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, device_leaf_splits_initializers_[0]->larger_leaf_index(), device_splitters_[0]->leaf_num_data(), device_splitters_[0]->leaf_num_data_offsets(), device_splitters_[0]->data_indices(), device_gradients_[0], device_hessians_[0], device_gradients_and_hessians_[0]); - Log::Warning("after construction of root histograms"); + Log::Warning("after construction of root histograms");*/ + return nullptr; } void NewCUDATreeLearner::ResetTrainingData(const Dataset* /*train_data*/, diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index 2bb2003f1dc7..4e1a7a17ea94 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -9,10 +9,11 @@ #ifdef USE_CUDA #include "../serial_tree_learner.h" -#include "cuda_leaf_splits_init.hpp" +#include "cuda_leaf_splits.hpp" #include "cuda_histogram_constructor.hpp" -#include "cuda_data_splitter.hpp" +#include "cuda_data_partition.hpp" #include "cuda_best_split_finder.hpp" +#include "cuda_centralized_info.hpp" namespace LightGBM { @@ -54,16 +55,21 @@ class NewCUDATreeLearner: public SerialTreeLearner { int num_gpus_; // number of threads on CPU int num_threads_; - // gradient and hessian values packed togather - std::vector gradients_and_hessians_; - - // feature groups allocated to each device - std::vector> device_feature_groups_; - // number of total bins of feature groups allocated to each device - std::vector device_num_total_bins_; - // number of maximum work groups per device - std::vector device_num_workgroups_; + // CUDA components for tree training + // centralized information shared by other CUDA components + std::unique_ptr cuda_centralized_info_; + // leaf splits information for smaller and larger leaves + std::unique_ptr cuda_smaller_leaf_splits_; + std::unique_ptr cuda_larger_leaf_splits_; + // data partition that partitions data indices into different leaves + std::unique_ptr cuda_data_partition_; + // for histogram construction + std::unique_ptr cuda_histogram_constructor_; + // for best split information finding, given the histograms + std::unique_ptr cuda_best_split_finder_; + + /* // full data indices on CUDA devices, as the data indices of data_partition_ in CPU version std::vector device_data_indices_; // gradient 
values on CUDA devices @@ -82,7 +88,24 @@ class NewCUDATreeLearner: public SerialTreeLearner { // device best split finder std::vector> device_best_split_finders_; // device splitter - std::vector> device_splitters_; + std::vector> device_splitters_;*/ +}; + +} // namespace LightGBM + +#else // USE_CUDA + +// When GPU support is not compiled in, quit with an error message + +namespace LightGBM { + +class NewCUDATreeLearner: public SerialTreeLearner { + public: + #pragma warning(disable : 4702) + explicit NewCUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { + Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" + "Please recompile with CMake option -DUSE_CUDA=1"); + } }; } // namespace LightGBM diff --git a/src/treelearner/cuda/new_cuda_utils.cpp b/src/treelearner/cuda/new_cuda_utils.cpp index 58c38d5e6ae9..62b0559e8377 100644 --- a/src/treelearner/cuda/new_cuda_utils.cpp +++ b/src/treelearner/cuda/new_cuda_utils.cpp @@ -4,6 +4,8 @@ * license information. */ +#ifdef USE_CUDA + #include "new_cuda_utils.hpp" namespace LightGBM { @@ -18,3 +20,5 @@ void PrintLastCUDAError() { } } // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/new_cuda_utils.hpp b/src/treelearner/cuda/new_cuda_utils.hpp index a7cb6a981d1a..4c2ccb2a163d 100644 --- a/src/treelearner/cuda/new_cuda_utils.hpp +++ b/src/treelearner/cuda/new_cuda_utils.hpp @@ -30,6 +30,18 @@ void CopyFromHostToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size) { CUDASUCCESS_OR_FATAL(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyHostToDevice)); } +template +void InitCUDAMemoryFromHostMemory(T** dst_ptr, const T* src_ptr, size_t size) { + AllocateCUDAMemory(size, dst_ptr); + CopyFromHostToCUDADevice(*dst_ptr, src_ptr, size); +} + +template +void InitCUDAValueFromConstant(T** dst_ptr, const T value) { + AllocateCUDAMemory(1, dst_ptr); + CopyFromHostToCUDADevice(*dst_ptr, &value, 1); +} + template void CopyFromCUDADeviceToHost(T* dst_ptr, const T* src_ptr, size_t size) { void* void_dst_ptr = reinterpret_cast(dst_ptr); From 23bcaa2ae55082cd16bc515edce97beeb770c709 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 6 May 2021 09:38:55 +0000 Subject: [PATCH 005/166] tree learner cuda kernels --- CMakeLists.txt | 18 +- .../cuda/cuda_best_split_finder.cpp | 97 +++- .../cuda/cuda_best_split_finder.cu | 341 ++++++++++--- .../cuda/cuda_best_split_finder.hpp | 120 +++-- src/treelearner/cuda/cuda_data_partition.cpp | 113 ++++- src/treelearner/cuda/cuda_data_partition.cu | 475 ++++++++++++++++++ src/treelearner/cuda/cuda_data_partition.hpp | 128 ++++- .../cuda/cuda_histogram_constructor.cpp | 14 +- .../cuda/cuda_histogram_constructor.cu | 10 +- .../cuda/cuda_histogram_constructor.hpp | 22 +- src/treelearner/cuda/cuda_leaf_splits.cpp | 3 +- src/treelearner/cuda/cuda_leaf_splits.hpp | 12 +- .../cuda/new_cuda_tree_learner.cpp | 38 +- src/treelearner/cuda/new_cuda_utils.cu | 45 ++ src/treelearner/cuda/new_cuda_utils.hpp | 7 + src/treelearner/serial_tree_learner.cpp | 5 +- 16 files changed, 1279 insertions(+), 169 deletions(-) create mode 100644 src/treelearner/cuda/new_cuda_utils.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 51516a9ba938..05fb49e13ddd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -230,14 +230,20 @@ if(USE_CUDA) add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}") endforeach() - add_library(leaf_splits_init OBJECT src/treelearner/cuda/cuda_leaf_splits_init.cu) - set_target_properties(leaf_splits_init PROPERTIES 
CUDA_SEPARABLE_COMPILATION ON) + add_library(new_cuda_utils OBJECT src/treelearner/cuda/new_cuda_utils.cu) + set_target_properties(new_cuda_utils PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - add_library(data_splitter OBJECT src/treelearner/cuda/cuda_data_splitter.cu) - set_target_properties(data_splitter PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + add_library(cuda_leaf_splits OBJECT src/treelearner/cuda/cuda_leaf_splits.cu) + set_target_properties(cuda_leaf_splits PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - add_library(histogram_constructor OBJECT src/treelearner/cuda/cuda_histogram_constructor.cu) - set_target_properties(histogram_constructor PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + add_library(cuda_data_partition OBJECT src/treelearner/cuda/cuda_data_partition.cu) + set_target_properties(cuda_data_partition PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + + add_library(cuda_histogram_constructor OBJECT src/treelearner/cuda/cuda_histogram_constructor.cu) + set_target_properties(cuda_histogram_constructor PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + + add_library(cuda_best_split_finder OBJECT src/treelearner/cuda/cuda_best_split_finder.cu) + set_target_properties(cuda_best_split_finder PROPERTIES CUDA_SEPARABLE_COMPILATION ON) endif(USE_CUDA) if(USE_HDFS) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 091ee6c8ace9..a36533e39233 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -10,58 +10,89 @@ #include "cuda_leaf_splits.hpp" namespace LightGBM { -/* + CUDABestSplitFinder::CUDABestSplitFinder(const hist_t* cuda_hist, const Dataset* train_data, - const std::vector& feature_hist_offsets, const int num_leaves, - const double lambda_l1, const data_size_t min_data_in_leaf, const double min_sum_hessian_in_leaf, - const double min_gain_to_split): -cuda_hist_(cuda_hist), num_features_(train_data->num_features()), num_leaves_(num_leaves), -feature_hist_offsets_(feature_hist_offsets), num_total_bin_(feature_hist_offsets.back()), lambda_l1(lambda_l1_), -min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf), min_gain_to_split_(min_gain_to_split) { + const std::vector& feature_hist_offsets, const int num_leaves, + const double lambda_l1, const double lambda_l2, const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, const double min_gain_to_split, + const int* cuda_num_features): + num_features_(train_data->num_features()), num_leaves_(num_leaves), + num_total_bin_(feature_hist_offsets.back()), feature_hist_offsets_(feature_hist_offsets), lambda_l1_(lambda_l1), lambda_l2_(lambda_l2), + min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf), min_gain_to_split_(min_gain_to_split), + cuda_hist_(cuda_hist), cuda_num_features_(cuda_num_features) { feature_missing_type_.resize(num_features_); feature_mfb_offsets_.resize(num_features_); feature_default_bins_.resize(num_features_); + max_num_bin_in_feature_ = 0; for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { - const BinMapper* bin_mapper = train_data_->FeatureBinMapper(inner_feature_index); + const BinMapper* bin_mapper = train_data->FeatureBinMapper(inner_feature_index); const MissingType missing_type = bin_mapper->missing_type(); feature_missing_type_[inner_feature_index] = missing_type; feature_mfb_offsets_[inner_feature_index] = static_cast(bin_mapper->GetMostFreqBin() == 0); 
feature_default_bins_[inner_feature_index] = bin_mapper->GetDefaultBin(); + const int num_bin = bin_mapper->num_bin() - feature_mfb_offsets_[inner_feature_index]; + if (num_bin > max_num_bin_in_feature_) { + max_num_bin_in_feature_ = num_bin; + } + } + if (max_num_bin_in_feature_ > MAX_NUM_BIN_IN_FEATURE) { + Log::Fatal("feature bin size %d exceeds limit %d", max_num_bin_in_feature_, MAX_NUM_BIN_IN_FEATURE); } } void CUDABestSplitFinder::Init() { + AllocateCUDAMemory(1, &cuda_best_leaf_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_feature_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_default_left_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_threshold_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_gain_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_left_sum_gradient_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_left_sum_hessian_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_left_count_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_left_output_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_sum_gradient_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_sum_hessian_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_count_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_output_); + AllocateCUDAMemory(feature_hist_offsets_.size(), &cuda_feature_hist_offsets_); CopyFromHostToCUDADevice(cuda_feature_hist_offsets_, feature_hist_offsets_.data(), feature_hist_offsets_.size()); AllocateCUDAMemory(feature_mfb_offsets_.size(), &cuda_feature_mfb_offsets_); CopyFromHostToCUDADevice(cuda_feature_mfb_offsets_, feature_mfb_offsets_.data(), feature_mfb_offsets_.size()); - AllocateCUDAMemory(feature_default_bins_.since(), &cuda_feature_default_bins_); + AllocateCUDAMemory(feature_default_bins_.size(), &cuda_feature_default_bins_); CopyFromHostToCUDADevice(cuda_feature_default_bins_, feature_default_bins_.data(), feature_default_bins_.size()); AllocateCUDAMemory(1, &cuda_num_total_bin_); CopyFromHostToCUDADevice(cuda_num_total_bin_, &num_total_bin_, 1); AllocateCUDAMemory(num_features_, &cuda_feature_missing_type_); + CopyFromHostToCUDADevice(cuda_feature_missing_type_, feature_missing_type_.data(), static_cast(num_features_)); AllocateCUDAMemory(1, &cuda_lambda_l1_); CopyFromHostToCUDADevice(cuda_lambda_l1_, &lambda_l1_, 1); + InitCUDAMemoryFromHostMemory(&cuda_lambda_l2_, &lambda_l2_, 1); + AllocateCUDAMemory(num_total_bin_ * 2, &prefix_sum_hist_left_); AllocateCUDAMemory(num_total_bin_ * 2, &prefix_sum_hist_right_); // * 2 for smaller and larger leaves, * 2 for default left or not const size_t feature_best_split_info_buffer_size = static_cast(num_features_) * 4; AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_feature_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_default_left_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, cuda_best_split_gain_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, cuda_best_split_left_sum_gradient_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, cuda_best_split_left_sum_hessian_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, cuda_best_split_left_count_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, cuda_best_split_right_sum_gradient_); - 
AllocateCUDAMemory(feature_best_split_info_buffer_size, cuda_best_split_right_sum_hessian_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, cuda_best_split_right_count_); + AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_default_left_); + AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_threshold_); + AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_gain_); + AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_sum_gradient_); + AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_sum_hessian_); + AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_count_); + AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_output_); + AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_sum_gradient_); + AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_sum_hessian_); + AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_count_); + AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_output_); + AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_found_); AllocateCUDAMemory(1, &cuda_min_data_in_leaf_); CopyFromHostToCUDADevice(cuda_min_data_in_leaf_, &min_data_in_leaf_, 1); @@ -71,13 +102,35 @@ void CUDABestSplitFinder::Init() { CopyFromHostToCUDADevice(cuda_min_gain_to_split_, &min_gain_to_split_, 1); } -void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplitsInit* smaller_leaf_splits, - const CUDALeafSplitsInit* larger_leaf_splits ) { - LaunchFindBestSplitsForLeafKernel(smaller_leaf_id, larger_leaf_id, parent_gain); +void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_leaf_splits, + const CUDALeafSplits* larger_leaf_splits) { + auto start = std::chrono::steady_clock::now(); + LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits->cuda_leaf_index(), + larger_leaf_splits->cuda_leaf_index(), + smaller_leaf_splits->cuda_gain(), + larger_leaf_splits->cuda_gain(), + smaller_leaf_splits->cuda_sum_of_gradients(), + smaller_leaf_splits->cuda_sum_of_hessians(), + smaller_leaf_splits->cuda_num_data_in_leaf(), + larger_leaf_splits->cuda_sum_of_gradients(), + larger_leaf_splits->cuda_sum_of_hessians(), + larger_leaf_splits->cuda_num_data_in_leaf()); + SynchronizeCUDADevice(); + LaunchSyncBestSplitForLeafKernel(smaller_leaf_splits->cuda_leaf_index(), larger_leaf_splits->cuda_leaf_index()); + SynchronizeCUDADevice(); + auto end = std::chrono::steady_clock::now(); + double duration = (static_cast>(end - start)).count(); + Log::Warning("FindBestSplitsForLeaf time %f", duration); +} + +void CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves) { + auto start = std::chrono::steady_clock::now(); + LaunchFindBestFromAllSplitsKernel(cuda_cur_num_leaves); + auto end = std::chrono::steady_clock::now(); + double duration = (static_cast>(end - start)).count(); + Log::Warning("FindBestFromAllSplits time %f", duration); } -void CUDABestSplitFinder::FindBestFromAllSplits() {} -*/ } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 257af429b2cc..686f61054ee1 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -9,7 +9,7 @@ #include "cuda_best_split_finder.hpp" namespace LightGBM { -/* + __device__ 
double ThresholdL1(double s, double l1) { const double reg_s = fmax(0.0, fabs(s) - l1); if (s >= 0.0f) { @@ -74,15 +74,18 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, const double sum_gradients, const double sum_hessians, const data_size_t num_data, const bool reverse, const bool skip_default_bin, const bool na_as_missing, // output parameters + uint32_t* output_threshold, double* output_gain, uint8_t* output_default_left, double* output_left_sum_gradients, double* output_left_sum_hessians, data_size_t* output_left_num_data, + double* output_left_output, double* output_right_sum_gradients, double* output_right_sum_hessians, - data_size_t* output_right_num_data) { - + data_size_t* output_right_num_data, + double* output_right_output, + uint8_t* output_found) { double best_sum_left_gradient = NAN; double best_sum_left_hessian = NAN; double best_gain = kMinScore; @@ -92,6 +95,8 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, const bool use_l1 = lambda_l1 > 0.0f; const double min_gain_shift = parent_gain + min_gain_to_split; + *output_found = 0; + if (reverse) { double sum_right_gradient = 0.0f; double sum_right_hessian = kEpsilon; @@ -111,7 +116,7 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, const auto grad = GET_GRAD(feature_hist_ptr, t); const auto hess = GET_HESS(feature_hist_ptr, t); data_size_t cnt = - static_cast(Common::RoundInt(hess * cnt_factor)); + static_cast(__double2int_rn(hess * cnt_factor)); sum_right_gradient += grad; sum_right_hessian += hess; right_count += cnt; @@ -126,7 +131,7 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, break; } - double sum_left_hessian = sum_hessian - sum_right_hessian; + double sum_left_hessian = sum_hessians - sum_right_hessian; // if sum hessian too small if (sum_left_hessian < min_sum_hessian_in_leaf) { break; @@ -143,7 +148,7 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, if (current_gain <= min_gain_shift) { continue; } - + *output_found = 1; // better split point if (current_gain > best_gain) { best_left_count = left_count; @@ -161,17 +166,16 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, int t = 0; const int t_end = feature_num_bin - 2 - feature_mfb_offset; - if (na_as_missing) { if (feature_mfb_offset == 1) { - sum_left_gradient = sum_gradient; - sum_left_hessian = sum_hessian - kEpsilon; + sum_left_gradient = sum_gradients; + sum_left_hessian = sum_hessians - kEpsilon; left_count = num_data; for (int i = 0; i < feature_num_bin - feature_mfb_offset; ++i) { const auto grad = GET_GRAD(feature_hist_ptr, i); const auto hess = GET_HESS(feature_hist_ptr, i); data_size_t cnt = - static_cast(Common::RoundInt(hess * cnt_factor)); + static_cast(__double2int_rn(hess * cnt_factor)); sum_left_gradient -= grad; sum_left_hessian -= hess; left_count -= cnt; @@ -188,10 +192,10 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, } if (t >= 0) { sum_left_gradient += GET_GRAD(feature_hist_ptr, t); - const hist_t* hess = GET_HESS(feature_hist_ptr, t); + const hist_t hess = GET_HESS(feature_hist_ptr, t); sum_left_hessian += hess; left_count += static_cast( - Common::RoundInt(hess * cnt_factor)); + __double2int_rn(hess * cnt_factor)); } // if data not enough, or sum hessian too small if (left_count < min_data_in_leaf || @@ -204,13 +208,13 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* 
feature_hist_ptr, break; } - double sum_right_hessian = sum_hessian - sum_left_hessian; + double sum_right_hessian = sum_hessians - sum_left_hessian; // if sum hessian too small if (sum_right_hessian < min_sum_hessian_in_leaf) { break; } - double sum_right_gradient = sum_gradient - sum_left_gradient; + double sum_right_gradient = sum_gradients - sum_left_gradient; // current split gain double current_gain = GetSplitGains( @@ -221,7 +225,7 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, if (current_gain <= min_gain_shift) { continue; } - + *output_found = 1; // better split point if (current_gain > best_gain) { best_left_count = left_count; @@ -232,9 +236,29 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, } } } + + if (*output_found) { + *output_threshold = best_threshold; + *output_gain = best_gain; + *output_default_left = reverse; + *output_left_sum_gradients = best_sum_left_gradient; + *output_left_sum_hessians = best_sum_left_hessian; + *output_left_num_data = best_left_count; + + const double best_sum_right_gradient = sum_gradients - best_sum_left_gradient; + const double best_sum_right_hessian = sum_hessians - best_sum_left_hessian; + *output_right_sum_gradients = best_sum_right_gradient; + *output_right_sum_hessians = best_sum_right_hessian; + *output_right_num_data = num_data - best_left_count; + + *output_left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, + best_sum_left_hessian, lambda_l1, use_l1, lambda_l2); + *output_right_output = CalculateSplittedLeafOutput(best_sum_right_gradient, + best_sum_right_hessian, lambda_l1, use_l1, lambda_l2); + } } -__global__ void FindBestSplitsForLeafKernel(const hist_t* leaf_hist_ptr, +__global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* cuda_num_total_bin, const uint32_t* feature_hist_offsets, const uint8_t* feature_mfb_offsets, const uint32_t* feature_default_bins, const uint8_t* feature_missing_types, const double* lambda_l1, const double* lambda_l2, const int* smaller_leaf_id, const int* larger_leaf_id, const double* smaller_leaf_gain, const double* larger_leaf_gain, const double* sum_gradients_in_smaller_leaf, @@ -243,58 +267,68 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* leaf_hist_ptr, const data_size_t* num_data_in_larger_leaf, const data_size_t* min_data_in_leaf, const double* min_sum_hessian_in_leaf, const double* min_gain_to_split, // output - uint8_t* cuda_best_split_default_left, double* cuda_best_split_gain, double* cuda_best_split_left_sum_gradient, - double* cuda_best_split_left_sum_hessian, data_size_t* cuda_best_split_left_count, + uint32_t* cuda_best_split_threshold, uint8_t* cuda_best_split_default_left, + double* cuda_best_split_gain, double* cuda_best_split_left_sum_gradient, + double* cuda_best_split_left_sum_hessian, data_size_t* cuda_best_split_left_count, double* cuda_best_split_left_output, double* cuda_best_split_right_sum_gradient, double* cuda_best_split_right_sum_hessian, - data_size_t* cuda_best_split_right_count) { - const unsigned int num_features = blockDim.x / 2; - const unsigned int inner_feature_index = blockIdx.x % num_features; - const unsigned int threadIdx = threadIdx.x; - const unsigned int global_threadIdx = threadIdx + blockIdx.x * blockDim.x; - const bool reverse = threadIdx == 0 ?
true : false; - const bool smaller_or_larger_leaf = static_cast(blockIdx.x / num_features); + data_size_t* cuda_best_split_right_count, double* cuda_best_split_right_output, uint8_t* cuda_best_split_found) { + const unsigned int num_features = gridDim.x / 4; + const unsigned int inner_feature_index = (blockIdx.x / 2) % num_features; + const unsigned int global_block_idx = blockIdx.x; + const bool reverse = blockIdx.x % 2 == 0 ? true : false; + const bool smaller_or_larger = static_cast(blockIdx.x / (2 * num_features) == 0); const int num_bin = feature_hist_offsets[inner_feature_index + 1] - feature_hist_offsets[inner_feature_index]; - const uint8_t missing_type = feature_missing_type[inner_feature_index]; + const uint8_t missing_type = feature_missing_types[inner_feature_index]; const int leaf_index = smaller_or_larger ? *smaller_leaf_id : *larger_leaf_id; const double parent_gain = smaller_or_larger ? *smaller_leaf_gain : *larger_leaf_gain; const double sum_gradients = smaller_or_larger ? *sum_gradients_in_smaller_leaf : *sum_gradients_in_larger_leaf; const double sum_hessians = smaller_or_larger ? *sum_hessians_in_smaller_leaf : *sum_hessians_in_larger_leaf; const double num_data_in_leaf = smaller_or_larger ? *num_data_in_smaller_leaf : *num_data_in_larger_leaf; - double* out_left_sum_gradients = cuda_best_split_left_sum_gradient + global_threadIdx; - double* out_left_sum_hessians = cuda_best_split_left_sum_hessian + global_threadIdx; - double* out_right_sum_gradients = cuda_best_split_right_sum_gradient + global_threadIdx; - double* out_right_sum_hessians = cuda_best_split_right_sum_hessian + global_threadIdx; - data_size_t* out_left_num_data = cuda_best_split_left_count + global_threadIdx; - data_size_t* out_right_num_data = cuda_best_split_right_count + global_threadIdx; - uint8_t* out_default_left = cuda_best_split_default_left + global_threadIdx; - double* out_gain = cuda_best_split_gain + global_threadIdx; + uint32_t* out_threshold = cuda_best_split_threshold + global_block_idx; + double* out_left_sum_gradients = cuda_best_split_left_sum_gradient + global_block_idx; + double* out_left_sum_hessians = cuda_best_split_left_sum_hessian + global_block_idx; + double* out_right_sum_gradients = cuda_best_split_right_sum_gradient + global_block_idx; + double* out_right_sum_hessians = cuda_best_split_right_sum_hessian + global_block_idx; + data_size_t* out_left_num_data = cuda_best_split_left_count + global_block_idx; + data_size_t* out_right_num_data = cuda_best_split_right_count + global_block_idx; + double* out_left_output = cuda_best_split_left_output + global_block_idx; + double* out_right_output = cuda_best_split_right_output + global_block_idx; + uint8_t* out_found = cuda_best_split_found + global_block_idx; + uint8_t* out_default_left = cuda_best_split_default_left + global_block_idx; + double* out_gain = cuda_best_split_gain + global_block_idx; + if (leaf_index < 0) { + *out_found = 0; + return; + } + const int cuda_num_total_bin_ref = *cuda_num_total_bin; + const hist_t* hist_ptr = cuda_hist + (cuda_num_total_bin_ref * leaf_index + feature_hist_offsets[inner_feature_index]) * 2; if (num_bin > 2 && missing_type != 0) { if (missing_type == 1) { - FindBestSplitsForLeafKernelInner(leaf_hist_ptr + leaf_index, + FindBestSplitsForLeafKernelInner(hist_ptr, num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], - feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, *parent_gain, + feature_missing_types[inner_feature_index], *lambda_l1, 
*lambda_l2, parent_gain, *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, - num_data_in_leaf, reverse, true, false, out_gain, out_default_left, - out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, - out_right_sum_gradients, out_right_sum_hessians, out_right_num_data); + num_data_in_leaf, reverse, true, false, out_threshold, out_gain, out_default_left, + out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_output, + out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_output, out_found); } else { - FindBestSplitsForLeafKernelInner(leaf_hist_ptr + leaf_index, + FindBestSplitsForLeafKernelInner(hist_ptr, num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], - feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, *parent_gain, + feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, parent_gain, *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, - num_data_in_leaf, reverse, false, true, out_gain, out_default_left, - out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, - out_right_sum_gradients, out_right_sum_hessians, out_right_num_data); + num_data_in_leaf, reverse, false, true, out_threshold, out_gain, out_default_left, + out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_output, + out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_output, out_found); } } else { if (reverse) { - FindBestSplitsForLeafKernelInner(leaf_hist_ptr + leaf_index, + FindBestSplitsForLeafKernelInner(hist_ptr, num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], - feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, *parent_gain, + feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, parent_gain, *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, - num_data_in_leaf, reverse, true, false, out_gain, out_default_left, - out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, - out_right_sum_gradients, out_right_sum_hessians, out_right_num_data); + num_data_in_leaf, reverse, true, false, out_threshold, out_gain, out_default_left, + out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_output, + out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_output, out_found); } if (missing_type == 2) { *out_default_left = 0; @@ -306,26 +340,211 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel(const int* smaller_l const double* smaller_leaf_gain, const double* larger_leaf_gain, const double* sum_gradients_in_smaller_leaf, const double* sum_hessians_in_smaller_leaf, const data_size_t* num_data_in_smaller_leaf, const double* sum_gradients_in_larger_leaf, const double* sum_hessians_in_larger_leaf, - const data_size_t* num_data_in_larger_leaf, const data_size_t* min_data_in_leaf, - const double* min_sum_hessian_in_leaf) { - const int leaf_id_ref = *leaf_id; - const int num_total_bin_ref = *num_total_bin_; + const data_size_t* num_data_in_larger_leaf) { // * 2 for smaller and larger leaves, * 2 for split direction - const int num_blocks = num_features_ * 2; - FindBestSplitsForLeafKernel<<>>(cuda_hist_, cuda_feature_hist_offsets_, + const int num_blocks = num_features_ * 4; + FindBestSplitsForLeafKernel<<>>(cuda_hist_, + cuda_num_total_bin_, cuda_feature_hist_offsets_, 
cuda_feature_mfb_offsets_, cuda_feature_default_bins_, - cuda_feature_missing_type_, cuda_lambda_l1_, + cuda_feature_missing_type_, cuda_lambda_l1_, cuda_lambda_l2_, smaller_leaf_id, larger_leaf_id, smaller_leaf_gain, larger_leaf_gain, sum_gradients_in_smaller_leaf, sum_hessians_in_smaller_leaf, num_data_in_smaller_leaf, sum_gradients_in_larger_leaf, sum_hessians_in_larger_leaf, num_data_in_larger_leaf, - cuda_min_data_in_leaf_, cuda_min_sum_hessian_in_leaf_, cuda_min_gain_to_split, + cuda_min_data_in_leaf_, cuda_min_sum_hessian_in_leaf_, cuda_min_gain_to_split_, - cuda_best_split_default_left_, cuda_best_split_gain_, + cuda_best_split_threshold_, cuda_best_split_default_left_, cuda_best_split_gain_, cuda_best_split_left_sum_gradient_, cuda_best_split_left_sum_hessian_, - cuda_best_split_left_count_, cuda_best_split_right_sum_gradient_, - cuda_best_split_right_sum_hessian_, cuda_best_split_right_count_); + cuda_best_split_left_count_, cuda_best_split_left_output_, + cuda_best_split_right_sum_gradient_, cuda_best_split_right_sum_hessian_, + cuda_best_split_right_count_, cuda_best_split_right_output_, + cuda_best_split_found_); } -*/ + +__global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const int* larger_leaf_index, + const int* cuda_num_features, int* cuda_leaf_best_split_feature, uint8_t* cuda_leaf_best_split_default_left, + uint32_t* cuda_leaf_best_split_threshold, double* cuda_leaf_best_split_gain, double* cuda_leaf_best_split_left_sum_gradient, + double* cuda_leaf_best_split_left_sum_hessian, data_size_t* cuda_leaf_best_split_left_count, + double* cuda_leaf_best_split_left_output, double* cuda_leaf_best_split_right_sum_gradient, + double* cuda_leaf_best_split_right_sum_hessian, data_size_t* cuda_leaf_best_split_right_count, + double* cuda_leaf_best_split_right_output, + // input parameters + const int* cuda_best_split_feature, + const uint8_t* cuda_best_split_default_left, + const uint32_t* cuda_best_split_threshold, + const double* cuda_best_split_gain, + const double* cuda_best_split_left_sum_gradient, + const double* cuda_best_split_left_sum_hessian, + const data_size_t* cuda_best_split_left_count, + const double* cuda_best_split_left_output, + const double* cuda_best_split_right_sum_gradient, + const double* cuda_best_split_right_sum_hessian, + const data_size_t* cuda_best_split_right_count, + const double* cuda_best_split_right_output, + const uint8_t* cuda_best_split_found) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + const int num_features_ref = *cuda_num_features; + const int smaller_leaf_index_ref = *smaller_leaf_index; + const int larger_leaf_index_ref = *larger_leaf_index; + + double& smaller_leaf_best_gain = cuda_leaf_best_split_gain[smaller_leaf_index_ref]; + int& smaller_leaf_best_split_feature = cuda_leaf_best_split_feature[smaller_leaf_index_ref]; + uint8_t& smaller_leaf_best_split_default_left = cuda_leaf_best_split_default_left[smaller_leaf_index_ref]; + uint32_t& smaller_leaf_best_split_threshold = cuda_leaf_best_split_threshold[smaller_leaf_index_ref]; + double& smaller_leaf_best_split_left_sum_gradient = cuda_leaf_best_split_left_sum_gradient[smaller_leaf_index_ref]; + double& smaller_leaf_best_split_left_sum_hessian = cuda_leaf_best_split_left_sum_hessian[smaller_leaf_index_ref]; + data_size_t& smaller_leaf_best_split_left_count = cuda_leaf_best_split_left_count[smaller_leaf_index_ref]; + double& smaller_leaf_best_split_left_output = cuda_leaf_best_split_left_output[smaller_leaf_index_ref]; + double& 
smaller_leaf_best_split_right_sum_gradient = cuda_leaf_best_split_right_sum_gradient[smaller_leaf_index_ref]; + double& smaller_leaf_best_split_right_sum_hessian = cuda_leaf_best_split_right_sum_hessian[smaller_leaf_index_ref]; + data_size_t& smaller_leaf_best_split_right_count = cuda_leaf_best_split_right_count[smaller_leaf_index_ref]; + double& smaller_leaf_best_split_right_output = cuda_leaf_best_split_right_output[smaller_leaf_index_ref]; + + double& larger_leaf_best_gain = cuda_leaf_best_split_gain[larger_leaf_index_ref]; + int& larger_leaf_best_split_feature = cuda_leaf_best_split_feature[larger_leaf_index_ref]; + uint8_t& larger_leaf_best_split_default_left = cuda_leaf_best_split_default_left[larger_leaf_index_ref]; + uint32_t& larger_leaf_best_split_threshold = cuda_leaf_best_split_threshold[larger_leaf_index_ref]; + double& larger_leaf_best_split_left_sum_gradient = cuda_leaf_best_split_left_sum_gradient[larger_leaf_index_ref]; + double& larger_leaf_best_split_left_sum_hessian = cuda_leaf_best_split_left_sum_hessian[larger_leaf_index_ref]; + data_size_t& larger_leaf_best_split_left_count = cuda_leaf_best_split_left_count[larger_leaf_index_ref]; + double& larger_leaf_best_split_left_output = cuda_leaf_best_split_left_output[larger_leaf_index_ref]; + double& larger_leaf_best_split_right_sum_gradient = cuda_leaf_best_split_right_sum_gradient[larger_leaf_index_ref]; + double& larger_leaf_best_split_right_sum_hessian = cuda_leaf_best_split_right_sum_hessian[larger_leaf_index_ref]; + data_size_t& larger_leaf_best_split_right_count = cuda_leaf_best_split_right_count[larger_leaf_index_ref]; + double& larger_leaf_best_split_right_output = cuda_leaf_best_split_right_output[larger_leaf_index_ref]; + + smaller_leaf_best_gain = kMinScore; + larger_leaf_best_gain = kMinScore; + int larger_leaf_offset = 2 * num_features_ref; + for (int feature_index = 0; feature_index < num_features_ref; ++feature_index) { + const int smaller_reverse_index = 2 * feature_index; + const uint8_t smaller_reverse_found = cuda_best_split_found[smaller_reverse_index]; + if (smaller_reverse_found) { + const double smaller_reverse_gain = cuda_best_split_gain[smaller_reverse_index]; + if (smaller_reverse_gain > smaller_leaf_best_gain) { + //printf("reverse smaller leaf new best, feature_index = %d, split_gain = %f, default_left = %d, threshold = %d\n", + // feature_index, smaller_reverse_gain, cuda_best_split_default_left[smaller_reverse_index], + // cuda_best_split_threshold[smaller_reverse_index]); + smaller_leaf_best_gain = smaller_reverse_gain; + smaller_leaf_best_split_feature = feature_index; + smaller_leaf_best_split_default_left = cuda_best_split_default_left[smaller_reverse_index]; + smaller_leaf_best_split_threshold = cuda_best_split_threshold[smaller_reverse_index]; + smaller_leaf_best_split_left_sum_gradient = cuda_best_split_left_sum_gradient[smaller_reverse_index]; + smaller_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[smaller_reverse_index]; + smaller_leaf_best_split_left_count = cuda_best_split_left_count[smaller_reverse_index]; + smaller_leaf_best_split_left_output = cuda_best_split_left_output[smaller_reverse_index]; + smaller_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[smaller_reverse_index]; + smaller_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[smaller_reverse_index]; + smaller_leaf_best_split_right_count = cuda_best_split_right_count[smaller_reverse_index]; + smaller_leaf_best_split_right_output = 
cuda_best_split_right_output[smaller_reverse_index]; + } + } + const int smaller_non_reverse_index = 2 * feature_index + 1; + const uint8_t smaller_non_reverse_found = cuda_best_split_found[smaller_non_reverse_index]; + if (smaller_non_reverse_found) { + const double smaller_non_reverse_gain = cuda_best_split_gain[smaller_non_reverse_index]; + if (smaller_non_reverse_gain > smaller_leaf_best_gain) { + //printf("non reverse smaller leaf new best, feature_index = %d, split_gain = %f, default_left = %d, threshold = %d\n", + // feature_index, smaller_non_reverse_gain, cuda_best_split_default_left[smaller_non_reverse_index], + // cuda_best_split_threshold[smaller_non_reverse_index]); + smaller_leaf_best_gain = smaller_non_reverse_gain; + smaller_leaf_best_split_feature = feature_index; + smaller_leaf_best_split_default_left = cuda_best_split_default_left[smaller_non_reverse_index]; + smaller_leaf_best_split_threshold = cuda_best_split_threshold[smaller_non_reverse_index]; + smaller_leaf_best_split_left_sum_gradient = cuda_best_split_left_sum_gradient[smaller_non_reverse_index]; + smaller_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[smaller_non_reverse_index]; + smaller_leaf_best_split_left_count = cuda_best_split_left_count[smaller_non_reverse_index]; + smaller_leaf_best_split_left_output = cuda_best_split_left_output[smaller_non_reverse_index]; + smaller_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[smaller_non_reverse_index]; + smaller_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[smaller_non_reverse_index]; + smaller_leaf_best_split_right_count = cuda_best_split_right_count[smaller_non_reverse_index]; + smaller_leaf_best_split_right_output = cuda_best_split_right_output[smaller_non_reverse_index]; + } + } + + if (larger_leaf_index_ref >= 0) { + const int larger_reverse_index = 2 * feature_index + larger_leaf_offset; + const uint8_t larger_reverse_found = cuda_best_split_found[larger_reverse_index]; + if (larger_reverse_found) { + const double larger_reverse_gain = cuda_best_split_gain[larger_reverse_index]; + if (larger_reverse_gain > larger_leaf_best_gain) { + larger_leaf_best_gain = larger_reverse_gain; + larger_leaf_best_split_feature = feature_index; + larger_leaf_best_split_default_left = cuda_best_split_default_left[larger_reverse_index]; + larger_leaf_best_split_threshold = cuda_best_split_threshold[larger_reverse_index]; + larger_leaf_best_split_left_sum_gradient = cuda_best_split_left_sum_gradient[larger_reverse_index]; + larger_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[larger_reverse_index]; + larger_leaf_best_split_left_count = cuda_best_split_left_count[larger_reverse_index]; + larger_leaf_best_split_left_output = cuda_best_split_left_output[larger_reverse_index]; + larger_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[larger_reverse_index]; + larger_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[larger_reverse_index]; + larger_leaf_best_split_right_count = cuda_best_split_right_count[larger_reverse_index]; + larger_leaf_best_split_right_output = cuda_best_split_right_output[larger_reverse_index]; + } + } + const int larger_non_reverse_index = 2 * feature_index + 1 + larger_leaf_offset; + const uint8_t larger_non_reverse_found = cuda_best_split_found[larger_non_reverse_index]; + if (larger_non_reverse_found) { + const double larger_non_reverse_gain = cuda_best_split_gain[larger_non_reverse_index]; + if (larger_non_reverse_gain > 
larger_leaf_best_gain) { + larger_leaf_best_gain = larger_non_reverse_gain; + larger_leaf_best_split_feature = feature_index; + larger_leaf_best_split_default_left = cuda_best_split_default_left[larger_non_reverse_index]; + larger_leaf_best_split_threshold = cuda_best_split_threshold[larger_non_reverse_index]; + larger_leaf_best_split_left_sum_gradient = cuda_best_split_left_sum_gradient[larger_non_reverse_index]; + larger_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[larger_non_reverse_index]; + larger_leaf_best_split_left_count = cuda_best_split_left_count[larger_non_reverse_index]; + larger_leaf_best_split_left_output = cuda_best_split_left_output[larger_non_reverse_index]; + larger_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[larger_non_reverse_index]; + larger_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[larger_non_reverse_index]; + larger_leaf_best_split_right_count = cuda_best_split_right_count[larger_non_reverse_index]; + larger_leaf_best_split_right_output = cuda_best_split_right_output[larger_non_reverse_index]; + } + } + } + } + } +} + +void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel(const int* smaller_leaf_index, const int* larger_leaf_index) { + SyncBestSplitForLeafKernel<<<1, 1>>>(smaller_leaf_index, larger_leaf_index, + cuda_num_features_, cuda_leaf_best_split_feature_, cuda_leaf_best_split_default_left_, + cuda_leaf_best_split_threshold_, cuda_leaf_best_split_gain_, cuda_leaf_best_split_left_sum_gradient_, + cuda_leaf_best_split_left_sum_hessian_, cuda_leaf_best_split_left_count_, + cuda_leaf_best_split_left_output_, cuda_leaf_best_split_right_sum_gradient_, + cuda_leaf_best_split_right_sum_hessian_, cuda_leaf_best_split_right_count_, + cuda_leaf_best_split_right_output_, + cuda_best_split_feature_, + cuda_best_split_default_left_, + cuda_best_split_threshold_, + cuda_best_split_gain_, + cuda_best_split_left_sum_gradient_, + cuda_best_split_left_sum_hessian_, + cuda_best_split_left_count_, + cuda_best_split_left_output_, + cuda_best_split_right_sum_gradient_, + cuda_best_split_right_sum_hessian_, + cuda_best_split_right_count_, + cuda_best_split_right_output_, + cuda_best_split_found_); +} + +__global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, + const double* cuda_leaf_best_split_gain, int* out_best_leaf) { + const int cuda_cur_num_leaves_ref = *cuda_cur_num_leaves; + double best_gain = kMinScore; + for (int leaf_index = 0; leaf_index < cuda_cur_num_leaves_ref; ++leaf_index) { + const double leaf_best_gain = cuda_leaf_best_split_gain[leaf_index]; + if (leaf_best_gain > best_gain) { + best_gain = leaf_best_gain; + *out_best_leaf = leaf_index; + } + } +} + +void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves) { + FindBestFromAllSplitsKernel<<<1, 1>>>(cuda_cur_num_leaves, cuda_leaf_best_split_gain_, cuda_best_leaf_); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 95e58f381dd3..400716c62a8d 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -10,80 +10,136 @@ #ifdef USE_CUDA #include "new_cuda_utils.hpp" +#include "cuda_leaf_splits.hpp" #include #include #include +#define MAX_NUM_BIN_IN_FEATURE (256) + namespace LightGBM { class CUDABestSplitFinder { -/* public: + public: CUDABestSplitFinder(const hist_t* cuda_hist, const Dataset* train_data, - const 
std::vector& feature_hist_offsets, const int num_leaves); + const std::vector& feature_hist_offsets, const int num_leaves, + const double lambda_l1, const double lambda_l2, const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, const double min_gain_to_split, + const int* cuda_num_features); void Init(); - void FindBestSplitsForLeaf(const int* smaller_leaf_id, const int* larger_leaf_id, const double* parent_gain); + void FindBestSplitsForLeaf(const CUDALeafSplits* smaller_leaf_splits, const CUDALeafSplits* larger_leaf_splits); - void FindBestFromAllSplits(); + void FindBestFromAllSplits(const int* cuda_cur_num_leaves); - int* best_leaf() { return cuda_best_leaf_; } + const int* cuda_best_leaf() const { return cuda_best_leaf_; } - int* best_split_feature_index() { return cuda_best_split_feature_index_; } + const int* cuda_leaf_best_split_feature() const { return cuda_leaf_best_split_feature_; } - int* best_split_threshold() { return cuda_best_split_threshold_; } + const uint32_t* cuda_leaf_best_split_threshold() const { return cuda_leaf_best_split_threshold_; } - private: - void LaunchFindBestSplitsForLeafKernel(const int* smaller_leaf_id, const int* larger_leaf_id, const double* parent_gain); + const uint8_t* cuda_leaf_best_split_default_left() const { return cuda_leaf_best_split_default_left_; } - int* cuda_best_leaf_; - int* cuda_best_split_feature_index_; - int* cuda_best_split_threshold_; + void TestAfterInit() { + PrintLastCUDAError(); + } - double* cuda_leaf_best_split_gain_; - int* cuda_leaf_best_split_feature_; - int* cuda_leaf_best_split_threshold_; + void TestAfterFindBestSplits() { + PrintLastCUDAError(); + const size_t feature_best_split_info_buffer_size = static_cast(num_features_) * 4; + std::vector test_best_split_threshold(feature_best_split_info_buffer_size, 0); + std::vector test_best_split_found(feature_best_split_info_buffer_size, 0); + CopyFromCUDADeviceToHost(test_best_split_threshold.data(), + cuda_best_split_threshold_, feature_best_split_info_buffer_size); + CopyFromCUDADeviceToHost(test_best_split_found.data(), + cuda_best_split_found_, feature_best_split_info_buffer_size); + for (size_t i = 0; i < feature_best_split_info_buffer_size; ++i) { + Log::Warning("test_best_split_threshold[%d] = %d", i, test_best_split_threshold[i]); + Log::Warning("test_best_split_found[%d] = %d", i, test_best_split_found[i]); + } - int* cuda_best_split_feature_; - uint8_t* cuda_best_split_default_left_; - double* cuda_best_split_gain_; - double* cuda_best_split_left_sum_gradient_; - double* cuda_best_split_left_sum_hessian_; - data_size_t* cuda_best_split_left_count_; - double* cuda_best_split_right_sum_gradient_; - double* cuda_best_split_right_sum_hessian_; - data_size_t* cuda_best_split_right_count_; + int test_best_leaf = 0; + CopyFromCUDADeviceToHost(&test_best_leaf, cuda_best_leaf_, 1); + Log::Warning("test_best_leaf = %d", test_best_leaf); + } - const hist_t* cuda_hist_; - hist_t* prefix_sum_hist_left_; - hist_t* prefix_sum_hist_right_; + private: + void LaunchFindBestSplitsForLeafKernel(const int* smaller_leaf_id, const int* larger_leaf_id, + const double* smaller_leaf_gain, const double* larger_leaf_gain, const double* sum_gradients_in_smaller_leaf, + const double* sum_hessians_in_smaller_leaf, const data_size_t* num_data_in_smaller_leaf, + const double* sum_gradients_in_larger_leaf, const double* sum_hessians_in_larger_leaf, + const data_size_t* num_data_in_larger_leaf); + + void LaunchSyncBestSplitForLeafKernel(const int* smaller_leaf_index, const int* 
larger_leaf_index); + + void LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves); + + // Host memory const int num_features_; const int num_leaves_; const int num_total_bin_; - - int* cuda_num_total_bin_; - + int max_num_bin_in_feature_; std::vector feature_hist_offsets_; std::vector feature_mfb_offsets_; std::vector feature_default_bins_; - // None --> 0, Zero --> 1, NaN --> 2 std::vector feature_missing_type_; const double lambda_l1_; + const double lambda_l2_; const data_size_t min_data_in_leaf_; const double min_sum_hessian_in_leaf_; const double min_gain_to_split_; + // CUDA memory, held by this object + // for per leaf best split information + int* cuda_best_leaf_; + int* cuda_leaf_best_split_feature_; + uint8_t* cuda_leaf_best_split_default_left_; + uint32_t* cuda_leaf_best_split_threshold_; + double* cuda_leaf_best_split_gain_; + double* cuda_leaf_best_split_left_sum_gradient_; + double* cuda_leaf_best_split_left_sum_hessian_; + data_size_t* cuda_leaf_best_split_left_count_; + double* cuda_leaf_best_split_left_output_; + double* cuda_leaf_best_split_right_sum_gradient_; + double* cuda_leaf_best_split_right_sum_hessian_; + data_size_t* cuda_leaf_best_split_right_count_; + double* cuda_leaf_best_split_right_output_; + // for best split information when finding best split + int* cuda_best_split_feature_; + uint8_t* cuda_best_split_default_left_; + uint32_t* cuda_best_split_threshold_; + double* cuda_best_split_gain_; + double* cuda_best_split_left_sum_gradient_; + double* cuda_best_split_left_sum_hessian_; + data_size_t* cuda_best_split_left_count_; + double* cuda_best_split_left_output_; + double* cuda_best_split_right_sum_gradient_; + double* cuda_best_split_right_sum_hessian_; + data_size_t* cuda_best_split_right_count_; + double* cuda_best_split_right_output_; + uint8_t* cuda_best_split_found_; + int* cuda_num_total_bin_; + // TODO(shiyu1994): use prefix sum to accelerate best split finding + hist_t* prefix_sum_hist_left_; + hist_t* prefix_sum_hist_right_; + // feature information uint32_t* cuda_feature_hist_offsets_; uint8_t* cuda_feature_mfb_offsets_; uint32_t* cuda_feature_default_bins_; uint8_t* cuda_feature_missing_type_; double* cuda_lambda_l1_; + double* cuda_lambda_l2_; data_size_t* cuda_min_data_in_leaf_; double* cuda_min_sum_hessian_in_leaf_; - double* cuda_min_gain_to_split_;*/ + double* cuda_min_gain_to_split_; + + // CUDA memory, held by other object + const hist_t* cuda_hist_; + const int* cuda_num_features_; }; } diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 77317d512068..752e7b1850e0 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -10,35 +10,130 @@ namespace LightGBM { -CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_leaves, - const data_size_t* cuda_num_data, const int* cuda_num_leaves): - num_data_(num_data), num_leaves_(num_leaves) { +CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_features, const int num_leaves, + const int num_threads, const data_size_t* cuda_num_data, const int* cuda_num_leaves, const uint8_t* cuda_data, + const int* cuda_num_features, const std::vector& feature_hist_offsets, const Dataset* train_data): + num_data_(num_data), num_features_(num_features), num_leaves_(num_leaves), num_threads_(num_threads), + cuda_data_(cuda_data), cuda_num_features_(cuda_num_features) { cuda_num_data_ = cuda_num_data; cuda_num_leaves_ = cuda_num_leaves; + 
max_num_split_indices_blocks_ = (num_data_ + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION; + feature_default_bins_.resize(train_data->num_features()); + feature_most_freq_bins_.resize(train_data->num_features()); + feature_max_bins_.resize(train_data->num_features()); + feature_min_bins_.resize(train_data->num_features()); + feature_missing_is_zero_.resize(train_data->num_features()); + feature_missing_is_na_.resize(train_data->num_features()); + feature_mfb_is_zero_.resize(train_data->num_features()); + feature_mfb_is_na_.resize(train_data->num_features()); + int cur_group = 0; + uint32_t prev_group_bins = 0; + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + const int group = train_data->Feature2Group(feature_index); + if (cur_group != group) { + prev_group_bins += static_cast(train_data->FeatureGroupNumBin(cur_group)); + cur_group = group; + } + const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); + feature_default_bins_[feature_index] = bin_mapper->GetDefaultBin(); + feature_most_freq_bins_[feature_index] = bin_mapper->GetMostFreqBin(); + Log::Warning("feature_index = %d, feature_hist_offsets[feature_index] = %d, prev_group_bins = %d", + feature_index, feature_hist_offsets[feature_index], prev_group_bins); + feature_min_bins_[feature_index] = feature_hist_offsets[feature_index] - prev_group_bins; + feature_max_bins_[feature_index] = feature_hist_offsets[feature_index + 1] - prev_group_bins - 1; + const MissingType missing_type = bin_mapper->missing_type(); + if (missing_type == MissingType::None) { + feature_missing_is_zero_[feature_index] = 0; + feature_missing_is_na_[feature_index] = 0; + feature_mfb_is_zero_[feature_index] = 0; + feature_mfb_is_na_[feature_index] = 0; + } else if (missing_type == MissingType::Zero) { + feature_missing_is_zero_[feature_index] = 1; + feature_missing_is_na_[feature_index] = 0; + if (bin_mapper->GetMostFreqBin() == bin_mapper->GetDefaultBin()) { + feature_mfb_is_zero_[feature_index] = 1; + } else { + feature_mfb_is_zero_[feature_index] = 0; + } + feature_mfb_is_na_[feature_index] = 0; + } else if (missing_type == MissingType::NaN) { + feature_missing_is_zero_[feature_index] = 0; + feature_missing_is_na_[feature_index] = 1; + feature_mfb_is_zero_[feature_index] = 0; + if (bin_mapper->GetMostFreqBin() == bin_mapper->GetDefaultBin()) { + feature_mfb_is_na_[feature_index] = 1; + } else { + feature_mfb_is_na_[feature_index] = 0; + } + } + } } void CUDADataPartition::Init() { // allocate CUDA memory AllocateCUDAMemory(static_cast(num_data_), &cuda_data_indices_); - AllocateCUDAMemory(static_cast(num_leaves_) + 1, &cuda_leaf_num_data_offsets_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_data_start_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_data_end_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_num_data_); + InitCUDAValueFromConstant(&cuda_cur_num_leaves_, 1); + AllocateCUDAMemory(static_cast(num_data_), &cuda_data_to_left_); + AllocateCUDAMemory(static_cast(max_num_split_indices_blocks_), &cuda_block_data_to_left_offset_); + AllocateCUDAMemory(static_cast(max_num_split_indices_blocks_), &cuda_block_data_to_right_offset_); + AllocateCUDAMemory(static_cast(num_data_), &cuda_out_data_indices_in_leaf_); + + InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bins_, feature_most_freq_bins_.data(), static_cast(num_features_)); + InitCUDAMemoryFromHostMemory(&cuda_feature_default_bins_, feature_default_bins_.data(), 
static_cast(num_features_)); + InitCUDAMemoryFromHostMemory(&cuda_feature_max_bins_, feature_max_bins_.data(), static_cast(num_features_)); + InitCUDAMemoryFromHostMemory(&cuda_feature_min_bins_, feature_min_bins_.data(), static_cast(num_features_)); + InitCUDAMemoryFromHostMemory(&cuda_feature_missing_is_zero_, feature_missing_is_zero_.data(), static_cast(num_features_)); + InitCUDAMemoryFromHostMemory(&cuda_feature_missing_is_na_, feature_missing_is_na_.data(), static_cast(num_features_)); + InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_zero_, feature_mfb_is_zero_.data(), static_cast(num_features_)); + InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_na_, feature_mfb_is_na_.data(), static_cast(num_features_)); } void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { if (data_indices == nullptr) { // no bagging LaunchFillDataIndicesBeforeTrain(); - SetCUDAMemory(cuda_leaf_num_data_offsets_, 0, static_cast(num_leaves_) + 1); + SetCUDAMemory(cuda_leaf_num_data_, 0, static_cast(num_leaves_)); + SetCUDAMemory(cuda_leaf_data_start_, 0, static_cast(num_leaves_)); + SetCUDAMemory(cuda_leaf_data_end_, 0, static_cast(num_leaves_)); SynchronizeCUDADevice(); - CopyFromCUDADeviceToCUDADevice(cuda_leaf_num_data_offsets_ + 1, cuda_num_data_, 1); + CopyFromCUDADeviceToCUDADevice(cuda_leaf_num_data_, cuda_num_data_, 1); + CopyFromCUDADeviceToCUDADevice(cuda_leaf_data_end_, cuda_num_data_, 1); SynchronizeCUDADevice(); } else { Log::Fatal("bagging is not supported by GPU"); } } -void CUDADataPartition::Split(const int* /*leaf_id*/, - const int* /*best_split_feature*/, - const int* /*best_split_threshold*/) {} +void CUDADataPartition::Split(const int* leaf_id, + const int* best_split_feature, + const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left) { + auto start = std::chrono::steady_clock::now(); + GenDataToLeftBitVector(leaf_id, best_split_feature, best_split_threshold, best_split_default_left); + auto end = std::chrono::steady_clock::now(); + double duration = (static_cast>(end - start)).count(); + Log::Warning("CUDADataPartition::GenDataToLeftBitVector time %f", duration); + start = std::chrono::steady_clock::now(); + SplitInner(leaf_id); + end = std::chrono::steady_clock::now(); + duration = (static_cast>(end - start)).count(); + Log::Warning("CUDADataPartition::SplitInner time %f", duration); +} + +void CUDADataPartition::GenDataToLeftBitVector(const int* leaf_id, + const int* best_split_feature, + const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left) { + LaunchGenDataToLeftBitVectorKernel(leaf_id, best_split_feature, best_split_threshold, best_split_default_left); +} + +void CUDADataPartition::SplitInner(const int* leaf_index) { + LaunchSplitInnerKernel(leaf_index); +} Tree* CUDADataPartition::GetCPUTree() {} diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 5f30391f0f41..fd6be0deae2b 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -10,6 +10,46 @@ namespace LightGBM { +#define CONFLICT_FREE_INDEX(n) \ + ((n) + ((n) >> LOG_NUM_BANKS_DATA_PARTITION)) \ + +__device__ void PrefixSum(uint32_t* elements, unsigned int n) { + unsigned int offset = 1; + unsigned int threadIdx_x = threadIdx.x; + const unsigned int conflict_free_n_minus_1 = CONFLICT_FREE_INDEX(n - 1); + const uint32_t last_element = elements[conflict_free_n_minus_1]; + __syncthreads(); + for (int d = (n >> 1); d > 0; d >>= 1) { + if (threadIdx_x < d) { + 
const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; + } + offset <<= 1; + __syncthreads(); + } + if (threadIdx_x == 0) { + elements[conflict_free_n_minus_1] = 0; + } + __syncthreads(); + for (int d = 1; d < n; d <<= 1) { + offset >>= 1; + if (threadIdx_x < d) { + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); + const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); + const uint32_t src_val = elements[conflict_free_src_pos]; + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; + elements[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + } + if (threadIdx_x == 0) { + elements[CONFLICT_FREE_INDEX(n)] = elements[conflict_free_n_minus_1] + last_element; + } +} + __global__ void FillDataIndicesBeforeTrainKernel(const data_size_t* cuda_num_data, data_size_t* data_indices) { const data_size_t num_data_ref = *cuda_num_data; @@ -24,6 +64,441 @@ void CUDADataPartition::LaunchFillDataIndicesBeforeTrain() { FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_); } +__global__ void GenDataToLeftBitVectorKernel(const int* leaf_index, const data_size_t* cuda_leaf_data_start, + const data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, const int* best_split_feature, + const uint32_t* best_split_threshold, const int* cuda_num_features, const uint8_t* cuda_data, + const uint32_t* default_bin, const uint32_t* most_freq_bin, const uint8_t* default_left, + const uint32_t* min_bin, const uint32_t* max_bin, const uint8_t* missing_is_zero, const uint8_t* missing_is_na, + const uint8_t* mfb_is_zero, const uint8_t* mfb_is_na, + uint8_t* cuda_data_to_left) { + const int leaf_index_ref = *leaf_index; + const int best_split_feature_ref = best_split_feature[leaf_index_ref]; + const int num_features_ref = *cuda_num_features; + const uint32_t best_split_threshold_ref = best_split_threshold[leaf_index_ref]; + const uint8_t default_left_ref = default_left[leaf_index_ref]; + const data_size_t leaf_num_data_offset = cuda_leaf_data_start[leaf_index_ref]; + const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref]; + const data_size_t* data_indices_in_leaf = cuda_data_indices + leaf_num_data_offset; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const unsigned int global_feature_value_index = global_data_index * num_features_ref + best_split_feature_ref; + const uint32_t default_bin_ref = default_bin[best_split_feature_ref]; + const uint32_t most_freq_bin_ref = most_freq_bin[best_split_feature_ref]; + const uint32_t max_bin_ref = max_bin[best_split_feature_ref]; + const uint32_t min_bin_ref = min_bin[best_split_feature_ref]; + const uint8_t missing_is_zero_ref = missing_is_zero[best_split_feature_ref]; + const uint8_t missing_is_na_ref = missing_is_na[best_split_feature_ref]; + const uint8_t mfb_is_zero_ref = mfb_is_zero[best_split_feature_ref]; + const uint8_t mfb_is_na_ref = mfb_is_na[best_split_feature_ref]; + + uint32_t th = best_split_threshold_ref + min_bin_ref; + uint32_t t_zero_bin = min_bin_ref + default_bin_ref; + if (most_freq_bin_ref == 0) { + --th; + --t_zero_bin; + } + uint8_t 
split_default_to_left = 0; + uint8_t split_missing_default_to_left = 0; + if (most_freq_bin_ref <= best_split_threshold_ref) { + split_default_to_left = 1; + } + if (missing_is_zero_ref || missing_is_na_ref) { + if (default_left_ref) { + split_missing_default_to_left = 1; + } + } + + if (local_data_index < static_cast(num_data_in_leaf)) { + const uint32_t bin = static_cast(cuda_data[global_feature_value_index]); + if (min_bin_ref < max_bin_ref) { + if ((missing_is_zero_ref && !mfb_is_zero_ref && bin == t_zero_bin)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin < min_bin_ref || bin > max_bin_ref) { + if ((missing_is_na_ref || mfb_is_na_ref) || (missing_is_zero_ref || mfb_is_zero_ref)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + } + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; + } else { + cuda_data_to_left[local_data_index] = 1; + } + } else { + if (missing_is_zero_ref || !mfb_is_zero_ref && bin == t_zero_bin) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin != max_bin_ref) { + if ((missing_is_na_ref && mfb_is_na_ref) || (missing_is_zero_ref && mfb_is_zero_ref)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + } + } else { + if (missing_is_na_ref && !mfb_is_na_ref) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + } + } + } + } +} + +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const int* leaf_index, const int* best_split_feature, + const uint32_t* best_split_threshold, const uint8_t* best_split_default_left) { + GenDataToLeftBitVectorKernel<<>>( + leaf_index, cuda_leaf_data_start_, cuda_leaf_num_data_, + cuda_data_indices_, best_split_feature, best_split_threshold, + cuda_num_features_, cuda_data_, + cuda_feature_default_bins_, cuda_feature_most_freq_bins_, best_split_default_left, + cuda_feature_min_bins_, cuda_feature_max_bins_, cuda_feature_missing_is_zero_, cuda_feature_missing_is_na_, + cuda_feature_mfb_is_zero_, cuda_feature_mfb_is_na_, + cuda_data_to_left_); + SynchronizeCUDADevice(); +} + +__global__ void PrepareOffsetKernel(const int* leaf_index, + const data_size_t* cuda_leaf_num_data, const uint8_t* split_to_left_bit_vector, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer) { + const unsigned int blockDim_x = blockDim.x; + __shared__ uint32_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; + __shared__ uint32_t thread_to_right_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; + const unsigned int threadIdx_x = threadIdx.x; + const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); + const unsigned int global_read_index = blockIdx.x * blockDim.x * 2 + threadIdx_x; + const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[*leaf_index]; + if (global_read_index < num_data_in_leaf_ref) { + const uint8_t bit = split_to_left_bit_vector[global_read_index]; + thread_to_left_offset_cnt[conflict_free_threadIdx_x] = bit; + thread_to_right_offset_cnt[conflict_free_threadIdx_x] = 1 - bit; + } else { + 
thread_to_left_offset_cnt[conflict_free_threadIdx_x] = 0; + thread_to_right_offset_cnt[conflict_free_threadIdx_x] = 0; + } + const unsigned int conflict_free_threadIdx_x_offseted = CONFLICT_FREE_INDEX(threadIdx_x + blockDim_x); + if (global_read_index + blockDim_x < num_data_in_leaf_ref) { + const uint8_t bit = split_to_left_bit_vector[global_read_index + blockDim_x]; + thread_to_left_offset_cnt[conflict_free_threadIdx_x_offseted] = bit; + thread_to_right_offset_cnt[conflict_free_threadIdx_x_offseted] = 1 - bit; + } else { + thread_to_left_offset_cnt[conflict_free_threadIdx_x_offseted] = 0; + thread_to_right_offset_cnt[conflict_free_threadIdx_x_offseted] = 0; + } + __syncthreads(); + PrefixSum(thread_to_left_offset_cnt, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + PrefixSum(thread_to_right_offset_cnt, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + __syncthreads(); + if (threadIdx_x == 0) { + const unsigned int conflict_free_blockDim_x_times_2 = CONFLICT_FREE_INDEX(blockDim_x << 1); + block_to_left_offset_buffer[blockIdx.x + 1] = thread_to_left_offset_cnt[conflict_free_blockDim_x_times_2]; + block_to_right_offset_buffer[blockIdx.x + 1] = thread_to_right_offset_cnt[conflict_free_blockDim_x_times_2]; + } +} + +__global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* block_to_left_offset_buffer, + data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, + data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, + int* cuda_cur_num_leaves) { + __shared__ uint32_t block_to_left_offset[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; + __shared__ uint32_t block_to_right_offset[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; + const int leaf_index_ref = *leaf_index; + const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref]; + const unsigned int blockDim_x = blockDim.x; + const unsigned int threadIdx_x = threadIdx.x; + const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); + const unsigned int conflict_free_threadIdx_x_plus_blockDim_x = CONFLICT_FREE_INDEX(threadIdx_x + blockDim_x); + const uint32_t num_blocks = (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION; + const uint32_t num_aggregate_blocks = (num_blocks + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION; + uint32_t left_prev_sum = 0; + for (uint32_t block_id = 0; block_id < num_aggregate_blocks; ++block_id) { + const unsigned int read_index = block_id * blockDim_x * 2 + threadIdx_x; + if (read_index < num_blocks) { + block_to_left_offset[conflict_free_threadIdx_x] = block_to_left_offset_buffer[read_index + 1]; + } else { + block_to_left_offset[conflict_free_threadIdx_x] = 0; + } + const unsigned int read_index_plus_blockDim_x = read_index + blockDim_x; + if (read_index_plus_blockDim_x < num_blocks) { + block_to_left_offset[conflict_free_threadIdx_x_plus_blockDim_x] = block_to_left_offset_buffer[read_index_plus_blockDim_x + 1]; + } else { + block_to_left_offset[conflict_free_threadIdx_x_plus_blockDim_x] = 0; + } + if (threadIdx_x == 0) { + block_to_left_offset[0] += left_prev_sum; + } + __syncthreads(); + PrefixSum(block_to_left_offset, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + __syncthreads(); + if (threadIdx_x == 0) { + left_prev_sum = 
block_to_left_offset[CONFLICT_FREE_INDEX(SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION)]; + } + if (read_index < num_blocks) { + const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); + block_to_left_offset_buffer[read_index + 1] = block_to_left_offset[conflict_free_threadIdx_x_plus_1]; + } + if (read_index_plus_blockDim_x < num_blocks) { + const unsigned int conflict_free_threadIdx_x_plus_1_plus_blockDim_x = CONFLICT_FREE_INDEX(threadIdx_x + 1 + blockDim_x); + block_to_left_offset_buffer[read_index_plus_blockDim_x + 1] = block_to_left_offset[conflict_free_threadIdx_x_plus_1_plus_blockDim_x]; + } + __syncthreads(); + } + const unsigned int to_left_total_cnt = block_to_left_offset_buffer[num_blocks]; + uint32_t right_prev_sum = to_left_total_cnt; + for (uint32_t block_id = 0; block_id < num_aggregate_blocks; ++block_id) { + const unsigned int read_index = block_id * blockDim_x * 2 + threadIdx_x; + if (read_index < num_blocks) { + block_to_right_offset[conflict_free_threadIdx_x] = block_to_right_offset_buffer[read_index + 1]; + } else { + block_to_right_offset[conflict_free_threadIdx_x] = 0; + } + const unsigned int read_index_plus_blockDim_x = read_index + blockDim_x; + if (read_index_plus_blockDim_x < num_blocks) { + block_to_right_offset[conflict_free_threadIdx_x_plus_blockDim_x] = block_to_right_offset_buffer[read_index_plus_blockDim_x + 1]; + } else { + block_to_right_offset[conflict_free_threadIdx_x_plus_blockDim_x] = 0; + } + if (threadIdx_x == 0) { + block_to_right_offset[0] += right_prev_sum; + } + __syncthreads(); + PrefixSum(block_to_right_offset, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + __syncthreads(); + if (threadIdx_x == 0) { + right_prev_sum = block_to_right_offset[CONFLICT_FREE_INDEX(SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION)]; + } + if (read_index < num_blocks) { + const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); + block_to_right_offset_buffer[read_index + 1] = block_to_right_offset[conflict_free_threadIdx_x_plus_1]; + } + if (read_index_plus_blockDim_x < num_blocks) { + const unsigned int conflict_free_threadIdx_x_plus_1_plus_blockDim_x = CONFLICT_FREE_INDEX(threadIdx_x + 1 + blockDim_x); + block_to_right_offset_buffer[read_index_plus_blockDim_x + 1] = block_to_right_offset[conflict_free_threadIdx_x_plus_1_plus_blockDim_x]; + } + __syncthreads(); + } + if (blockIdx.x == 0 && threadIdx.x == 0) { + const int cur_max_leaf_index = (*cuda_cur_num_leaves); + block_to_left_offset_buffer[0] = 0; + const unsigned int to_left_total_cnt = block_to_left_offset_buffer[num_blocks]; + block_to_right_offset_buffer[0] = to_left_total_cnt; + const data_size_t old_leaf_data_end = cuda_leaf_data_end[leaf_index_ref]; + cuda_leaf_data_end[leaf_index_ref] = cuda_leaf_data_start[leaf_index_ref] + static_cast(to_left_total_cnt); + cuda_leaf_num_data[leaf_index_ref] = static_cast(to_left_total_cnt); + cuda_leaf_data_start[cur_max_leaf_index] = cuda_leaf_data_end[leaf_index_ref]; + cuda_leaf_data_end[cur_max_leaf_index] = old_leaf_data_end; + cuda_leaf_num_data[cur_max_leaf_index] = block_to_right_offset_buffer[num_blocks] - to_left_total_cnt; + ++(*cuda_cur_num_leaves); + } +} + +__global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_leaves, + const data_size_t* cuda_leaf_data_start, const data_size_t* cuda_leaf_num_data, + const data_size_t* cuda_data_indices, const uint8_t* split_to_left_bit_vector, + const data_size_t* block_to_left_offset_buffer, const data_size_t* block_to_right_offset_buffer, + 
data_size_t* out_data_indices_in_leaf) { + __shared__ uint8_t thread_split_to_left_bit_vector[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; + __shared__ uint32_t thread_to_left_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; + __shared__ uint32_t thread_to_right_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; + const int leaf_index_ref = *leaf_index; + const data_size_t leaf_num_data_offset = cuda_leaf_data_start[leaf_index_ref]; + const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[leaf_index_ref] + cuda_leaf_num_data[(*cuda_cur_num_leaves) - 1]; + const unsigned int threadIdx_x = threadIdx.x; + const unsigned int blockDim_x = blockDim.x; + const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); + const unsigned int global_thread_index = blockIdx.x * blockDim_x * 2 + threadIdx_x; + const data_size_t* cuda_data_indices_in_leaf = cuda_data_indices + leaf_num_data_offset; + if (global_thread_index < num_data_in_leaf_ref) { + const uint8_t bit = split_to_left_bit_vector[global_thread_index]; + thread_split_to_left_bit_vector[threadIdx_x] = bit; + thread_to_left_pos[conflict_free_threadIdx_x_plus_1] = bit; + thread_to_right_pos[conflict_free_threadIdx_x_plus_1] = 1 - bit; + } else { + thread_split_to_left_bit_vector[threadIdx_x] = 0; + thread_to_left_pos[conflict_free_threadIdx_x_plus_1] = 0; + thread_to_right_pos[conflict_free_threadIdx_x_plus_1] = 0; + } + const unsigned int conflict_free_threadIdx_x_plus_blockDim_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + blockDim_x + 1); + const unsigned int global_thread_index_plus_blockDim_x = global_thread_index + blockDim_x; + if (global_thread_index_plus_blockDim_x < num_data_in_leaf_ref) { + const uint8_t bit = split_to_left_bit_vector[global_thread_index_plus_blockDim_x]; + thread_split_to_left_bit_vector[threadIdx_x + blockDim_x] = bit; + thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] = bit; + thread_to_right_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] = 1 - bit; + } else { + thread_split_to_left_bit_vector[threadIdx_x + blockDim_x] = 0; + thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] = 0; + thread_to_right_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] = 0; + } + __syncthreads(); + if (threadIdx_x == 0) { + const uint32_t to_right_block_offset = block_to_right_offset_buffer[blockIdx.x]; + const uint32_t to_left_block_offset = block_to_left_offset_buffer[blockIdx.x]; + thread_to_left_pos[0] = to_left_block_offset; + thread_to_right_pos[0] = to_right_block_offset; + } + __syncthreads(); + PrefixSum(thread_to_left_pos, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + PrefixSum(thread_to_right_pos, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + __syncthreads(); + if (global_thread_index < num_data_in_leaf_ref) { + if (thread_split_to_left_bit_vector[threadIdx_x] == 1) { + out_data_indices_in_leaf[thread_to_left_pos[conflict_free_threadIdx_x_plus_1]] = cuda_data_indices_in_leaf[global_thread_index]; + } else { + out_data_indices_in_leaf[thread_to_right_pos[conflict_free_threadIdx_x_plus_1]] = cuda_data_indices_in_leaf[global_thread_index]; + } + } + if (global_thread_index_plus_blockDim_x < num_data_in_leaf_ref) { + if (thread_split_to_left_bit_vector[threadIdx_x + blockDim_x] == 1) { + out_data_indices_in_leaf[thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]] = 
cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x]; + } else { + out_data_indices_in_leaf[thread_to_right_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x]; + } + } + /*if (thread_to_left_pos[conflict_free_threadIdx_x_plus_1] == 0) { + printf("thread_to_left_pos[%d] = %d, global_thread_index = %d, thread_split_to_left_bit_vector[%d] = %d\n", + conflict_free_threadIdx_x_plus_1, thread_to_left_pos[conflict_free_threadIdx_x_plus_1], global_thread_index, threadIdx_x, thread_split_to_left_bit_vector[threadIdx_x]); + } + if (thread_to_right_pos[conflict_free_threadIdx_x_plus_1] == 0) { + printf("thread_to_right_pos[%d] = %d, global_thread_index = %d, thread_split_to_left_bit_vector[%d] = %d\n", + conflict_free_threadIdx_x_plus_1, thread_to_left_pos[conflict_free_threadIdx_x_plus_1], global_thread_index, threadIdx_x, thread_split_to_left_bit_vector[threadIdx_x]); + } + if (thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] == 0) { + printf("thread_to_left_pos[%d] = %d, global_thread_index = %d, thread_split_to_left_bit_vector[%d + %ds] = %d\n", + conflict_free_threadIdx_x_plus_blockDim_x_plus_1, thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1], global_thread_index_plus_blockDim_x, threadIdx_x, blockDim_x, thread_split_to_left_bit_vector[threadIdx_x + blockDim_x]); + } + if (thread_to_right_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] == 0) { + printf("thread_to_right_pos[%d] = %d, global_thread_index = %d, thread_split_to_left_bit_vector[%d + %d] = %d\n", + conflict_free_threadIdx_x_plus_blockDim_x_plus_1, thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1], global_thread_index_plus_blockDim_x, threadIdx_x, blockDim_x, thread_split_to_left_bit_vector[threadIdx_x + blockDim_x]); + }*/ +} + +/*__global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_leaves, + const data_size_t* cuda_leaf_data_start, const data_size_t* cuda_leaf_num_data, + const data_size_t* cuda_data_indices, const uint8_t* split_to_left_bit_vector, + const data_size_t* block_to_left_offset_buffer, const data_size_t* block_to_right_offset_buffer, + data_size_t* out_data_indices_in_leaf) { + __shared__ uint8_t thread_split_to_left_bit_vector[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; + __shared__ uint32_t thread_to_left_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; + __shared__ uint32_t thread_to_right_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; + const int leaf_index_ref = *leaf_index; + const data_size_t leaf_num_data_offset = cuda_leaf_data_start[leaf_index_ref]; + const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[leaf_index_ref] + cuda_leaf_num_data[(*cuda_cur_num_leaves) - 1]; + const unsigned int threadIdx_x = threadIdx.x; + const unsigned int global_thread_index = blockIdx.x * blockDim.x + threadIdx_x; + const data_size_t* cuda_data_indices_in_leaf = cuda_data_indices + leaf_num_data_offset; + if (global_thread_index < num_data_in_leaf_ref) { + thread_split_to_left_bit_vector[threadIdx_x] = split_to_left_bit_vector[global_thread_index]; + } else { + thread_split_to_left_bit_vector[threadIdx_x] = 0; + } + __syncthreads(); + if (threadIdx_x == 0) { + const uint32_t to_right_block_offset = block_to_right_offset_buffer[blockIdx.x]; + const uint32_t to_left_block_offset = block_to_left_offset_buffer[blockIdx.x]; + thread_to_left_pos[0] = to_left_block_offset; + thread_to_right_pos[0] = to_right_block_offset; + for (unsigned int i = 0; i < blockDim.x - 
1; ++i) { + const unsigned int tmp_global_thread_index = blockIdx.x * blockDim.x + i; + if (tmp_global_thread_index < num_data_in_leaf_ref) { + if (thread_split_to_left_bit_vector[i] == 0) { + thread_to_right_pos[i + 1] = thread_to_right_pos[i] + 1; + thread_to_left_pos[i + 1] = thread_to_left_pos[i]; + } else { + thread_to_left_pos[i + 1] = thread_to_left_pos[i] + 1; + thread_to_right_pos[i + 1] = thread_to_right_pos[i]; + } + } else { + thread_to_left_pos[i + 1] = thread_to_left_pos[i]; + thread_to_right_pos[i + 1] = thread_to_right_pos[i]; + } + } + } + __syncthreads(); + if (global_thread_index < num_data_in_leaf_ref) { + if (thread_split_to_left_bit_vector[threadIdx_x] == 1) { + out_data_indices_in_leaf[thread_to_left_pos[threadIdx_x]] = cuda_data_indices_in_leaf[global_thread_index]; + } else { + out_data_indices_in_leaf[thread_to_right_pos[threadIdx_x]] = cuda_data_indices_in_leaf[global_thread_index]; + } + } +}*/ + +__global__ void CopyDataIndicesKernel(const int* leaf_index, + const int* cuda_cur_num_leaves, + const data_size_t* cuda_leaf_data_start, + const data_size_t* cuda_leaf_num_data, + const data_size_t* out_data_indices_in_leaf, + data_size_t* cuda_data_indices) { + const int leaf_index_ref = *leaf_index; + const data_size_t leaf_num_data_offset = cuda_leaf_data_start[leaf_index_ref]; + const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[leaf_index_ref] + cuda_leaf_num_data[(*cuda_cur_num_leaves) - 1]; + const unsigned int threadIdx_x = threadIdx.x; + const unsigned int global_thread_index = blockIdx.x * blockDim.x + threadIdx_x; + data_size_t* cuda_data_indices_in_leaf = cuda_data_indices + leaf_num_data_offset; + if (global_thread_index < num_data_in_leaf_ref) { + cuda_data_indices_in_leaf[global_thread_index] = out_data_indices_in_leaf[global_thread_index]; + } +} + +void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index) { + auto start = std::chrono::steady_clock::now(); + PrepareOffsetKernel<<>>( + leaf_index, cuda_leaf_num_data_, cuda_data_to_left_, + cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_); + SynchronizeCUDADevice(); + auto end = std::chrono::steady_clock::now(); + double duration = (static_cast>(end - start)).count(); + Log::Warning("CUDADataPartition::PrepareOffsetKernel time %f", duration); + start = std::chrono::steady_clock::now(); + AggregateBlockOffsetKernel<<<1, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION / 2>>>(leaf_index, cuda_block_data_to_left_offset_, + cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, + cuda_cur_num_leaves_); + SynchronizeCUDADevice(); + end = std::chrono::steady_clock::now(); + duration = (static_cast>(end - start)).count(); + Log::Warning("CUDADataPartition::AggregateBlockOffsetKernel time %f", duration); + start = std::chrono::steady_clock::now(); + SplitInnerKernel<<>>( + leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, + cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + cuda_out_data_indices_in_leaf_); + SynchronizeCUDADevice(); + end = std::chrono::steady_clock::now(); + duration = (static_cast>(end - start)).count(); + Log::Warning("CUDADataPartition::SplitInnerKernel time %f", duration); + start = std::chrono::steady_clock::now(); + CopyDataIndicesKernel<<>>( + leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, cuda_data_indices_); + SynchronizeCUDADevice(); + end = 
std::chrono::steady_clock::now(); + duration = (static_cast>(end - start)).count(); + Log::Warning("CUDADataPartition::CopyDataIndicesKernel time %f", duration); +} + +__global__ void PrefixSumKernel(uint32_t* cuda_elements) { + __shared__ uint32_t elements[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1]; + const unsigned int threadIdx_x = threadIdx.x; + const unsigned int global_read_index = blockIdx.x * blockDim.x * 2 + threadIdx_x; + elements[threadIdx_x] = cuda_elements[global_read_index]; + elements[threadIdx_x + blockDim.x] = cuda_elements[global_read_index + blockDim.x]; + __syncthreads(); + PrefixSum(elements, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + __syncthreads(); + cuda_elements[global_read_index] = elements[threadIdx_x]; + cuda_elements[global_read_index + blockDim.x] = elements[threadIdx_x + blockDim.x]; +} + +void CUDADataPartition::LaunchPrefixSumKernel(uint32_t* cuda_elements) { + PrefixSumKernel<<<1, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION / 2>>>(cuda_elements); + SynchronizeCUDADevice(); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index e13925ffb183..1ebbf6b9889b 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -13,24 +13,27 @@ #include "new_cuda_utils.hpp" #define FILL_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) +#define SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) +#define NUM_BANKS_DATA_PARTITION (32) +#define LOG_NUM_BANKS_DATA_PARTITION (5) namespace LightGBM { class CUDADataPartition { public: - CUDADataPartition(const data_size_t num_data, const int num_leaves, - const data_size_t* cuda_num_data, const int* cuda_num_leaves); + CUDADataPartition(const data_size_t num_data, const int num_features, const int num_leaves, + const int num_threads, const data_size_t* cuda_num_data, const int* cuda_num_leaves, const uint8_t* cuda_data, + const int* cuda_num_features, const std::vector& feature_hist_offsets, const Dataset* train_data); void Init(); void BeforeTrain(const data_size_t* data_indices); - void Split(const int* leaf_id, const int* best_split_feature, const int* best_split_threshold); + void Split(const int* leaf_id, const int* best_split_feature, + const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); Tree* GetCPUTree(); - const data_size_t* cuda_leaf_num_data_offsets() { return cuda_leaf_num_data_offsets_; } - void Test() { PrintLastCUDAError(); std::vector test_data_indices(num_data_, -1); @@ -38,28 +41,137 @@ class CUDADataPartition { for (data_size_t i = 0; i < num_data_; ++i) { CHECK_EQ(i, test_data_indices[i]); } + data_size_t test_leaf_data_start_0 = 0, test_leaf_data_end_0 = 0, test_leaf_num_data_0 = 0; + data_size_t test_leaf_data_start_1 = 0, test_leaf_data_end_1 = 0, test_leaf_num_data_1 = 0; + CopyFromCUDADeviceToHost(&test_leaf_data_start_0, cuda_leaf_data_start_, 1); + CopyFromCUDADeviceToHost(&test_leaf_data_end_0, cuda_leaf_data_end_, 1); + CopyFromCUDADeviceToHost(&test_leaf_num_data_0, cuda_leaf_num_data_, 1); + CopyFromCUDADeviceToHost(&test_leaf_data_start_1, cuda_leaf_data_start_ + 1, 1); + CopyFromCUDADeviceToHost(&test_leaf_data_end_1, cuda_leaf_data_end_ + 1, 1); + CopyFromCUDADeviceToHost(&test_leaf_num_data_1, cuda_leaf_num_data_ + 1, 1); + Log::Warning("test_leaf_data_start_0 = %d", test_leaf_data_start_0); + Log::Warning("test_leaf_data_end_0 = %d", test_leaf_data_end_0); + Log::Warning("test_leaf_num_data_0 = %d", test_leaf_num_data_0); + 
Log::Warning("test_leaf_data_start_1 = %d", test_leaf_data_start_1); + Log::Warning("test_leaf_data_end_1 = %d", test_leaf_data_end_1); + Log::Warning("test_leaf_num_data_1 = %d", test_leaf_num_data_1); Log::Warning("CUDADataPartition::Test Pass"); } - const data_size_t* cuda_leaf_num_data_offsets() const { return cuda_leaf_num_data_offsets_; } + void TestAfterSplit() { + std::vector test_bit_vector(num_data_, 0); + CopyFromCUDADeviceToHost(test_bit_vector.data(), cuda_data_to_left_, static_cast(num_data_)); + data_size_t num_data_to_left = 0; + #pragma omp parallel for schedule(static) num_threads(num_threads_) reduction(+:num_data_to_left) + for (data_size_t data_index = 0; data_index < num_data_; ++data_index) { + if (test_bit_vector[data_index]) { + ++num_data_to_left; + } + } + Log::Warning("CUDADataPartition::TestAfterSplit num_data_to_left = %d", num_data_to_left); + std::vector test_data_indices(num_data_, 0); + CopyFromCUDADeviceToHost(test_data_indices.data(), cuda_data_indices_, static_cast(num_data_)); + std::vector test_leaf_num_data(num_leaves_, 0), test_leaf_data_start(num_leaves_, 0), test_leaf_data_end(num_leaves_, 0); + CopyFromCUDADeviceToHost(test_leaf_num_data.data(), cuda_leaf_num_data_, static_cast(num_leaves_)); + CopyFromCUDADeviceToHost(test_leaf_data_start.data(), cuda_leaf_data_start_, static_cast(num_leaves_)); + CopyFromCUDADeviceToHost(test_leaf_data_end.data(), cuda_leaf_data_end_, static_cast(num_leaves_)); + for (int i = 0; i < num_leaves_; ++i) { + Log::Warning("test_leaf_num_data[%d] = %d", i, test_leaf_num_data[i]); + Log::Warning("test_leaf_data_start[%d] = %d", i, test_leaf_data_start[i]); + Log::Warning("test_leaf_data_end[%d] = %d", i, test_leaf_data_end[i]); + } + const data_size_t num_data_in_leaf_0 = test_leaf_num_data[0]; + const int check_window_size = 10; + for (data_size_t i = 0; i < check_window_size; ++i) { + Log::Warning("test_data_indices[%d] = %d", i, test_data_indices[i]); + } + for (data_size_t i = num_data_in_leaf_0 - check_window_size; i < num_data_in_leaf_0; ++i) { + Log::Warning("test_data_indices[%d] = %d", i, test_data_indices[i]); + } + for (data_size_t i = num_data_in_leaf_0; i < num_data_in_leaf_0 + check_window_size; ++i) { + Log::Warning("test_data_indices[%d] = %d", i, test_data_indices[i]); + } + } + + void TestPrefixSum() { + std::vector test_elements(SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION, 1); + uint32_t* cuda_elements = nullptr; + InitCUDAMemoryFromHostMemory(&cuda_elements, test_elements.data(), test_elements.size()); + LaunchPrefixSumKernel(cuda_elements); + CopyFromCUDADeviceToHost(test_elements.data(), cuda_elements, test_elements.size()); + for (int i = 0; i < SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION; ++i) { + Log::Warning("test_elements[%d] = %d", i, test_elements[i]); + } + } + + const data_size_t* cuda_leaf_data_start() const { return cuda_leaf_data_start_; } + + const data_size_t* cuda_leaf_data_end() const { return cuda_leaf_data_end_; } + + const data_size_t* cuda_leaf_num_data() const { return cuda_leaf_num_data_; } + + //const data_size_t* cuda_leaf_num_data_offsets() const { return cuda_leaf_num_data_offsets_; } + + const data_size_t* cuda_data_indices() const { return cuda_data_indices_; } + + const int* cuda_cur_num_leaves() const { return cuda_cur_num_leaves_; } private: + void GenDataToLeftBitVector(const int* leaf_id, const int* best_split_feature, + const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); + + void SplitInner(const int* leaf_index); + // kernel launch functions 
void LaunchFillDataIndicesBeforeTrain(); - void LaunchSplitKernel(const int* leaf_id, const int* best_split_feature, const int* best_split_threshold); + void LaunchSplitInnerKernel(const int* leaf_index); + + void LaunchGenDataToLeftBitVectorKernel(const int* leaf_index, const int* best_split_feature, + const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); + + void LaunchPrefixSumKernel(uint32_t* cuda_elements); // Host memory const data_size_t num_data_; + const int num_features_; const int num_leaves_; + const int num_threads_; + int max_num_split_indices_blocks_; + std::vector feature_default_bins_; + std::vector feature_most_freq_bins_; + std::vector feature_max_bins_; + std::vector feature_min_bins_; + std::vector feature_missing_is_zero_; + std::vector feature_missing_is_na_; + std::vector feature_mfb_is_zero_; + std::vector feature_mfb_is_na_; // CUDA memory, held by this object data_size_t* cuda_data_indices_; - data_size_t* cuda_leaf_num_data_offsets_; + data_size_t* cuda_leaf_data_start_; + data_size_t* cuda_leaf_data_end_; + data_size_t* cuda_leaf_num_data_; + int* cuda_cur_num_leaves_; + // for split + uint8_t* cuda_data_to_left_; + data_size_t* cuda_block_data_to_left_offset_; + data_size_t* cuda_block_data_to_right_offset_; + data_size_t* cuda_out_data_indices_in_leaf_; + uint32_t* cuda_feature_default_bins_; + uint32_t* cuda_feature_most_freq_bins_; + uint32_t* cuda_feature_max_bins_; + uint32_t* cuda_feature_min_bins_; + uint8_t* cuda_feature_missing_is_zero_; + uint8_t* cuda_feature_missing_is_na_; + uint8_t* cuda_feature_mfb_is_zero_; + uint8_t* cuda_feature_mfb_is_na_; // CUDA memory, held by other object const data_size_t* cuda_num_data_; const int* cuda_num_leaves_; + const uint8_t* cuda_data_; + const int* cuda_num_features_; }; } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index b0b73cb0159e..08eac8bb233a 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -8,15 +8,13 @@ #include "cuda_histogram_constructor.hpp" -#include - namespace LightGBM { CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, const int num_leaves, const int num_threads, const score_t* cuda_gradients, const score_t* cuda_hessians): num_data_(train_data->num_data()), - num_features_(train_data->num_features()), num_leaves_(num_leaves), - num_feature_groups_(train_data->num_feature_groups()), num_threads_(num_threads), + num_features_(train_data->num_features()), num_leaves_(num_leaves), num_threads_(num_threads), + num_feature_groups_(train_data->num_feature_groups()), cuda_gradients_(cuda_gradients), cuda_hessians_(cuda_hessians) { int offset = 0; for (int group_id = 0; group_id < train_data->num_feature_groups(); ++group_id) { @@ -71,16 +69,16 @@ void CUDAHistogramConstructor::PushOneData(const uint32_t feature_bin_value, void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const int* /*cuda_larger_leaf_index*/, const data_size_t* cuda_data_indices_in_smaller_leaf, const data_size_t* /*cuda_data_indices_in_larger_leaf*/, - const data_size_t* cuda_leaf_num_data_offsets) { + const data_size_t* cuda_leaf_num_data) { auto start = std::chrono::steady_clock::now(); - LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_leaf_num_data_offsets, cuda_data_indices_in_smaller_leaf); + LaunchConstructHistogramKernel(cuda_smaller_leaf_index, 
cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data); SynchronizeCUDADevice(); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); Log::Warning("LaunchConstructHistogramKernel time %f", duration); - PrintLastCUDAError(); + /*PrintLastCUDAError(); std::vector cpu_hist(6143 * 2, 0.0f); - CopyFromCUDADeviceToHost(cpu_hist.data(), cuda_hist_, 6143 * 2); + CopyFromCUDADeviceToHost(cpu_hist.data(), cuda_hist_, 6143 * 2);*/ } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 54b0aff10b08..934821c38526 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -13,11 +13,11 @@ namespace LightGBM { __global__ void CUDAConstructHistogramKernel(const int* leaf_index, const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t* data_indices_ptr, hist_t* feature_histogram, const int* num_feature_groups, - const data_size_t* leaf_num_data_offsets, const uint8_t* data, const uint32_t* feature_group_offsets) { + const data_size_t* leaf_num_data, const uint8_t* data, const uint32_t* feature_group_offsets) { const unsigned int threadIdx_x = threadIdx.x; const int num_feature_groups_ref = *num_feature_groups; const int leaf_index_ref = *leaf_index; - const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data_offsets[leaf_index_ref + 1] - leaf_num_data_offsets[leaf_index_ref]; + const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; __shared__ float shared_hist[SHRAE_HIST_SIZE]; // 256 * 24 * 2, can use 24 features uint32_t num_bins_in_col_group = feature_group_offsets[blockDim.x]; const uint32_t num_items_per_thread = (2 * num_bins_in_col_group + NUM_THRADS_PER_BLOCK - 1) / NUM_THRADS_PER_BLOCK; @@ -31,7 +31,7 @@ __global__ void CUDAConstructHistogramKernel(const int* leaf_index, __syncthreads(); const unsigned int threadIdx_y = threadIdx.y; const unsigned int blockIdx_y = blockIdx.y; - const data_size_t start = threadIdx_y * NUM_DATA_PER_THREAD + blockIdx_y * blockDim.y * NUM_DATA_PER_THREAD; + const data_size_t start = (threadIdx_y + blockIdx_y * blockDim.y) * NUM_DATA_PER_THREAD; const data_size_t end = start + NUM_DATA_PER_THREAD > num_data_in_smaller_leaf_ref ? 
num_data_in_smaller_leaf_ref : start + NUM_DATA_PER_THREAD; for (data_size_t i = start; i < end; ++i) { @@ -54,7 +54,7 @@ __global__ void CUDAConstructHistogramKernel(const int* leaf_index, void CUDAHistogramConstructor::LaunchConstructHistogramKernel( const int* cuda_smaller_leaf_index, const data_size_t* cuda_data_indices_in_smaller_leaf, - const data_size_t* cuda_leaf_num_data_offsets) { + const data_size_t* cuda_leaf_num_data) { const int block_dim_x = num_features_; // TODO(shiyu1994): only supports the case when the whole histogram can be loaded into shared memory const int block_dim_y = NUM_THRADS_PER_BLOCK / block_dim_x; const int grid_dim_y = ((num_data_ + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + block_dim_y - 1) / block_dim_y; @@ -64,7 +64,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); CUDAConstructHistogramKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_hist_, cuda_num_feature_groups_, cuda_leaf_num_data_offsets, cuda_data_, + cuda_data_indices_in_smaller_leaf, cuda_hist_, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_, cuda_feature_group_bin_offsets_); } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index c840af4eee1a..29e472b0c9a9 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -31,14 +31,12 @@ class CUDAHistogramConstructor { void ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, const data_size_t* cuda_data_indices_in_smaller_leaf, const data_size_t* cuda_data_indices_in_larger_leaf, - const data_size_t* cuda_leaf_num_data_offsets); - - void LaunchConstructHistogramKernel(const int* cuda_leaf_index, - const data_size_t* cuda_data_indices_in_leaf, - const data_size_t* cuda_leaf_num_data_offsets); + const data_size_t* cuda_leaf_num_data); const hist_t* cuda_hist() const { return cuda_hist_; } + const uint8_t* cuda_data() const { return cuda_data_; } + void TestAfterInit() { std::vector test_data(data_.size(), 0); CopyFromCUDADeviceToHost(test_data.data(), cuda_data_, data_.size()); @@ -47,7 +45,21 @@ class CUDAHistogramConstructor { } } + void TestAfterConstructHistogram() { + PrintLastCUDAError(); + std::vector test_hist(num_total_bin_ * 2, 0.0f); + CopyFromCUDADeviceToHost(test_hist.data(), cuda_hist_, static_cast(num_total_bin_) * 2); + for (int i = 0; i < 100; ++i) { + Log::Warning("bin %d grad %f hess %f", i, test_hist[2 * i], test_hist[2 * i + 1]); + } + } + private: + + void LaunchConstructHistogramKernel(const int* cuda_leaf_index, + const data_size_t* cuda_data_indices_in_leaf, + const data_size_t* cuda_leaf_num_data); + void InitCUDAData(const Dataset* train_data); void PushOneData(const uint32_t feature_bin_value, const int feature_group_id, const data_size_t data_index); diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index 33dab992d165..cacddacb0bb5 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -51,8 +51,9 @@ void CUDALeafSplits::InitValues(const double* cuda_sum_of_gradients, const doubl SynchronizeCUDADevice(); } -void CUDALeafSplits::InitValues() { +void CUDALeafSplits::InitValues(const data_size_t* cuda_data_indices_in_leaf) { LaunchInitValuesKernal(); + 
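  // The root leaf's data-index pointer is wired to the data partition's full
  // index buffer (passed in by the tree learner), so the first histogram pass
  // can iterate over every row before any split has been performed.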
cuda_data_indices_in_leaf_ = cuda_data_indices_in_leaf; } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index c2d9d918437f..d1617ef83f9e 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -32,14 +32,20 @@ class CUDALeafSplits { const data_size_t* cuda_num_data_in_leaf, const data_size_t* cuda_data_indices_in_leaf, const double* cuda_gain, const double* cuda_leaf_value); - void InitValues(); + void InitValues(const data_size_t* cuda_data_indices_in_leaf); const int* cuda_leaf_index() const { return cuda_leaf_index_; } - const data_size_t* cuda_num_data_in_leaf() const { return cuda_num_data_in_leaf_; } - const data_size_t* cuda_data_indices_in_leaf() const { return cuda_data_indices_in_leaf_; } + const double* cuda_gain() const { return cuda_gain_; } + + const double* cuda_sum_of_gradients() const { return cuda_sum_of_gradients_; } + + const double* cuda_sum_of_hessians() const { return cuda_sum_of_hessians_; } + + const data_size_t* cuda_num_data_in_leaf() const { return cuda_num_data_in_leaf_; } + void Test() { PrintLastCUDAError(); double test_sum_of_gradients = 0.0f, test_sum_of_hessians = 0.0f; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 8cdb4dfccdfd..ea35ae40b5a8 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -24,7 +24,7 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); cuda_centralized_info_.reset(new CUDACentralizedInfo(num_data_, this->config_->num_leaves, num_features_)); cuda_centralized_info_->Init(); - //cuda_centralized_info_->Test(); + cuda_centralized_info_->Test(); cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_, 0, cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), cuda_centralized_info_->cuda_num_data())); @@ -33,22 +33,32 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia cuda_centralized_info_->cuda_hessians(), cuda_centralized_info_->cuda_num_data())); cuda_larger_leaf_splits_->Init(); - cuda_data_partition_.reset(new CUDADataPartition(num_data_, this->config_->num_leaves, - cuda_centralized_info_->cuda_num_data(), cuda_centralized_info_->cuda_num_leaves())); - cuda_data_partition_->Init(); - cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, this->config_->num_leaves, num_threads_, cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians())); cuda_histogram_constructor_->Init(train_data_); //cuda_histogram_constructor_->TestAfterInit(); + + cuda_data_partition_.reset(new CUDADataPartition(num_data_, num_features_, this->config_->num_leaves, num_threads_, + cuda_centralized_info_->cuda_num_data(), cuda_centralized_info_->cuda_num_leaves(), + cuda_histogram_constructor_->cuda_data(), cuda_centralized_info_->cuda_num_features(), + share_state_->feature_hist_offsets(), train_data_)); + cuda_data_partition_->Init(); + + cuda_best_split_finder_.reset(new CUDABestSplitFinder(cuda_histogram_constructor_->cuda_hist(), + train_data_, this->share_state_->feature_hist_offsets(), this->config_->num_leaves, + this->config_->lambda_l1, this->config_->lambda_l2, this->config_->min_data_in_leaf, + this->config_->min_sum_hessian_in_leaf, this->config_->min_gain_to_split, + cuda_centralized_info_->cuda_num_features())); + 
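  // Initialization order matters here: the data partition is constructed after
  // the histogram constructor so it can reuse the row-wise feature buffer
  // (cuda_histogram_constructor_->cuda_data()), and the best-split finder is
  // configured with the regularization and leaf constraints from config_
  // (lambda_l1/l2, min_data_in_leaf, min_sum_hessian_in_leaf, min_gain_to_split).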
cuda_best_split_finder_->Init(); + cuda_best_split_finder_->TestAfterInit(); } void NewCUDATreeLearner::BeforeTrain() { cuda_centralized_info_->BeforeTrain(gradients_, hessians_); - cuda_smaller_leaf_splits_->InitValues(); + cuda_smaller_leaf_splits_->InitValues(cuda_data_partition_->cuda_data_indices()); //cuda_smaller_leaf_splits_->Test(); cuda_data_partition_->BeforeTrain(nullptr); - //cuda_data_partition_->Test(); + cuda_data_partition_->Test(); //SerialTreeLearner::BeforeTrain(); /*#pragma omp parallel for schedule(static) num_threads(num_threads_) @@ -203,12 +213,24 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, gradients_ = gradients; hessians_ = hessians; BeforeTrain(); + cuda_data_partition_->Test(); cuda_histogram_constructor_->ConstructHistogramForLeaf( cuda_smaller_leaf_splits_->cuda_leaf_index(), cuda_larger_leaf_splits_->cuda_leaf_index(), cuda_smaller_leaf_splits_->cuda_data_indices_in_leaf(), cuda_larger_leaf_splits_->cuda_data_indices_in_leaf(), - cuda_data_partition_->cuda_leaf_num_data_offsets()); + cuda_data_partition_->cuda_leaf_num_data()); + cuda_best_split_finder_->FindBestSplitsForLeaf(cuda_smaller_leaf_splits_.get(), + cuda_larger_leaf_splits_.get()); + cuda_best_split_finder_->FindBestFromAllSplits(cuda_data_partition_->cuda_cur_num_leaves()); + cuda_best_split_finder_->TestAfterFindBestSplits(); + //cuda_data_partition_->TestPrefixSum(); + cuda_data_partition_->Split(cuda_best_split_finder_->cuda_best_leaf(), + cuda_best_split_finder_->cuda_leaf_best_split_feature(), + cuda_best_split_finder_->cuda_leaf_best_split_threshold(), + cuda_best_split_finder_->cuda_leaf_best_split_default_left()); + cuda_data_partition_->TestAfterSplit(); + //cuda_histogram_constructor_->TestAfterConstructHistogram(); /*CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); CUDASUCCESS_OR_FATAL(cudaMemcpy(device_gradients_[0], gradients, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice)); CUDASUCCESS_OR_FATAL(cudaMemcpy(device_hessians_[0], hessians, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice)); diff --git a/src/treelearner/cuda/new_cuda_utils.cu b/src/treelearner/cuda/new_cuda_utils.cu new file mode 100644 index 000000000000..3be69ee4ad92 --- /dev/null +++ b/src/treelearner/cuda/new_cuda_utils.cu @@ -0,0 +1,45 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#include "new_cuda_utils.hpp" + +namespace LightGBM { + +/*template <> +__device__ void PrefixSum(uint32_t* elements, unsigned int n) { + unsigned int offset = 1; + unsigned int threadIdx_x = threadIdx.x; + for (int d = (n >> 1); d > 0; d >>= 1) { + if (threadIdx_x < d) { + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + elements[dst_pos] += elements[src_pos]; + } + offset <<= 1; + __syncthreads(); + } + const uint32_t last_element = elements[n - 1]; + if (threadIdx_x == 0) { + elements[n - 1] = 0; + } + __syncthreads(); + for (int d = 1; d < n; d <<= 1) { + if (threadIdx_x < d) { + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const uint32_t src_val = elements[src_pos]; + elements[src_pos] = elements[dst_pos]; + elements[dst_pos] += src_val; + } + offset >>= 1; + __syncthreads(); + } + if (threadIdx_x == 0) { + elements[n] = elements[n - 1] + last_element; + } +}*/ + +} // namespace LightGBM diff --git a/src/treelearner/cuda/new_cuda_utils.hpp b/src/treelearner/cuda/new_cuda_utils.hpp index 4c2ccb2a163d..98820187ed9f 100644 --- a/src/treelearner/cuda/new_cuda_utils.hpp +++ b/src/treelearner/cuda/new_cuda_utils.hpp @@ -13,6 +13,10 @@ #include #include +#include + +#define PREFIX_SUM_ARRAY_SIZE_NEW_CUDA_UTILS (1024) + namespace LightGBM { template @@ -67,6 +71,9 @@ void SetCUDAMemory(T* dst_ptr, int value, size_t size) { void PrintLastCUDAError(); +//template +//__device__ void PrefixSum(T* elements, unsigned int n); + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index e063cc9d46b3..ff3375d22c07 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -576,7 +576,8 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, } *left_leaf = best_leaf; auto next_leaf_id = tree->NextLeafId(); - + Log::Warning("best_split_info.feature = %d, best_split_info.threshold = %d", + best_split_info.feature, best_split_info.threshold); // update before tree split constraints_->BeforeSplit(best_leaf, next_leaf_id, best_split_info.monotone_type); @@ -595,6 +596,8 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, best_split_info.left_count = data_partition_->leaf_count(*left_leaf); best_split_info.right_count = data_partition_->leaf_count(next_leaf_id); } + Log::Warning("data_partition_->leaf_count(*left_leaf) = %d, data_partition_->leaf_count(next_leaf_id) = %d", + data_partition_->leaf_count(*left_leaf), data_partition_->leaf_count(next_leaf_id)); // split tree, will return right leaf *right_leaf = tree->Split( best_leaf, inner_feature_index, best_split_info.feature, From 6c14cd9b1810b2693c37b50ed2ce5b2a4e0b400f Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 7 May 2021 12:47:27 +0000 Subject: [PATCH 006/166] single tree framework ready --- .../cuda/cuda_best_split_finder.cpp | 8 +- .../cuda/cuda_best_split_finder.cu | 71 +++++++--- .../cuda/cuda_best_split_finder.hpp | 26 ++++ src/treelearner/cuda/cuda_data_partition.cpp | 62 ++++++++- src/treelearner/cuda/cuda_data_partition.cu | 96 +++++++++++-- src/treelearner/cuda/cuda_data_partition.hpp | 45 +++++- .../cuda/cuda_histogram_constructor.cpp | 36 ++++- .../cuda/cuda_histogram_constructor.cu | 130 +++++++++++++++++- .../cuda/cuda_histogram_constructor.hpp | 23 +++- 
src/treelearner/cuda/cuda_leaf_splits.cpp | 8 +- src/treelearner/cuda/cuda_leaf_splits.hpp | 18 ++- .../cuda/new_cuda_tree_learner.cpp | 60 +++++++- 12 files changed, 523 insertions(+), 60 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index a36533e39233..ed6fd72d25c6 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -49,10 +49,12 @@ void CUDABestSplitFinder::Init() { AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_left_sum_gradient_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_left_sum_hessian_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_left_count_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_left_gain_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_left_output_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_sum_gradient_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_sum_hessian_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_count_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_gain_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_output_); AllocateCUDAMemory(feature_hist_offsets_.size(), &cuda_feature_hist_offsets_); @@ -87,10 +89,12 @@ void CUDABestSplitFinder::Init() { AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_sum_gradient_); AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_sum_hessian_); AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_count_); + AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_gain_); AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_output_); AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_sum_gradient_); AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_sum_hessian_); AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_count_); + AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_gain_); AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_output_); AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_found_); @@ -120,7 +124,7 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_le SynchronizeCUDADevice(); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); - Log::Warning("FindBestSplitsForLeaf time %f", duration); + //Log::Warning("FindBestSplitsForLeaf time %f", duration); } void CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves) { @@ -128,7 +132,7 @@ void CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves) LaunchFindBestFromAllSplitsKernel(cuda_cur_num_leaves); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); - Log::Warning("FindBestFromAllSplits time %f", duration); + //Log::Warning("FindBestFromAllSplits time %f", duration); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 686f61054ee1..64912bed4842 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ 
b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -80,10 +80,12 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, double* output_left_sum_gradients, double* output_left_sum_hessians, data_size_t* output_left_num_data, + double* output_left_gain, double* output_left_output, double* output_right_sum_gradients, double* output_right_sum_hessians, data_size_t* output_right_num_data, + double* output_right_gain, double* output_right_output, uint8_t* output_found) { double best_sum_left_gradient = NAN; @@ -253,8 +255,12 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, *output_left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, lambda_l1, use_l1, lambda_l2); + *output_left_gain = GetLeafGainGivenOutput(best_sum_left_gradient, + best_sum_left_hessian, lambda_l1, use_l1, lambda_l2, *output_left_output); *output_left_output = CalculateSplittedLeafOutput(best_sum_right_gradient, best_sum_right_hessian, lambda_l1, use_l1, lambda_l2); + *output_right_gain = GetLeafGainGivenOutput(best_sum_right_gradient, + best_sum_right_hessian, lambda_l1, use_l1, lambda_l2, *output_right_output); } } @@ -269,9 +275,11 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* // output uint32_t* cuda_best_split_threshold, uint8_t* cuda_best_split_default_left, double* cuda_best_split_gain, double* cuda_best_split_left_sum_gradient, - double* cuda_best_split_left_sum_hessian, data_size_t* cuda_best_split_left_count, double* cuda_best_split_left_output, + double* cuda_best_split_left_sum_hessian, data_size_t* cuda_best_split_left_count, + double* cuda_best_split_left_gain, double* cuda_best_split_left_output, double* cuda_best_split_right_sum_gradient, double* cuda_best_split_right_sum_hessian, - data_size_t* cuda_best_split_right_count, double* cuda_best_split_right_output, uint8_t* cuda_best_split_found) { + data_size_t* cuda_best_split_right_count, double* cuda_best_split_right_gain, + double* cuda_best_split_right_output, uint8_t* cuda_best_split_found) { const unsigned int num_features = gridDim.x / 4; const unsigned int inner_feature_index = (blockIdx.x / 2) % num_features; const unsigned int global_block_idx = blockIdx.x; @@ -293,6 +301,8 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* data_size_t* out_right_num_data = cuda_best_split_right_count + global_block_idx; double* out_left_output = cuda_best_split_left_output + global_block_idx; double* out_right_output = cuda_best_split_right_output + global_block_idx; + double* out_left_gain = cuda_best_split_left_gain + global_block_idx; + double* out_right_gain = cuda_best_split_right_gain + global_block_idx; uint8_t* out_found = cuda_best_split_found + global_block_idx; uint8_t* out_default_left = cuda_best_split_default_left + global_block_idx; double* out_gain = cuda_best_split_gain + global_block_idx; @@ -309,16 +319,16 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, parent_gain, *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, num_data_in_leaf, reverse, true, false, out_threshold, out_gain, out_default_left, - out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_output, - out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_output, out_found); + out_left_sum_gradients, out_left_sum_hessians, 
out_left_num_data, out_left_gain, out_left_output, + out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found); } else { FindBestSplitsForLeafKernelInner(hist_ptr, num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, parent_gain, *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, num_data_in_leaf, reverse, false, true, out_threshold, out_gain, out_default_left, - out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_output, - out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_output, out_found); + out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_gain, out_left_output, + out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found); } } else { if (reverse) { @@ -327,8 +337,8 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, parent_gain, *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, num_data_in_leaf, reverse, true, false, out_threshold, out_gain, out_default_left, - out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_output, - out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_output, out_found); + out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_gain, out_left_output, + out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found); } if (missing_type == 2) { *out_default_left = 0; @@ -354,18 +364,20 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel(const int* smaller_l cuda_best_split_threshold_, cuda_best_split_default_left_, cuda_best_split_gain_, cuda_best_split_left_sum_gradient_, cuda_best_split_left_sum_hessian_, - cuda_best_split_left_count_, cuda_best_split_left_output_, + cuda_best_split_left_count_, cuda_best_split_left_gain_, cuda_best_split_left_output_, cuda_best_split_right_sum_gradient_, cuda_best_split_right_sum_hessian_, - cuda_best_split_right_count_, cuda_best_split_right_output_, + cuda_best_split_right_count_, cuda_best_split_right_gain_, cuda_best_split_right_output_, cuda_best_split_found_); } __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const int* larger_leaf_index, const int* cuda_num_features, int* cuda_leaf_best_split_feature, uint8_t* cuda_leaf_best_split_default_left, - uint32_t* cuda_leaf_best_split_threshold, double* cuda_leaf_best_split_gain, double* cuda_leaf_best_split_left_sum_gradient, - double* cuda_leaf_best_split_left_sum_hessian, data_size_t* cuda_leaf_best_split_left_count, - double* cuda_leaf_best_split_left_output, double* cuda_leaf_best_split_right_sum_gradient, - double* cuda_leaf_best_split_right_sum_hessian, data_size_t* cuda_leaf_best_split_right_count, + uint32_t* cuda_leaf_best_split_threshold, double* cuda_leaf_best_split_gain, + double* cuda_leaf_best_split_left_sum_gradient, double* cuda_leaf_best_split_left_sum_hessian, + data_size_t* cuda_leaf_best_split_left_count, double* cuda_leaf_best_split_left_gain, + double* cuda_leaf_best_split_left_output, + double* cuda_leaf_best_split_right_sum_gradient, double* cuda_leaf_best_split_right_sum_hessian, + data_size_t* cuda_leaf_best_split_right_count, 
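    // SyncBestSplitForLeafKernel is launched with a single thread (<<<1, 1>>>,
    // see the launcher below): it walks the per-(feature, direction) candidates
    // produced by FindBestSplitsForLeafKernel and keeps the highest-gain split
    // for the smaller and the larger leaf, copying the winner's left/right
    // statistics (sums, counts, gains, outputs) into the per-leaf buffers.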
double* cuda_leaf_best_split_right_gain, double* cuda_leaf_best_split_right_output, // input parameters const int* cuda_best_split_feature, @@ -375,10 +387,12 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const const double* cuda_best_split_left_sum_gradient, const double* cuda_best_split_left_sum_hessian, const data_size_t* cuda_best_split_left_count, + const double* cuda_best_split_left_gain, const double* cuda_best_split_left_output, const double* cuda_best_split_right_sum_gradient, const double* cuda_best_split_right_sum_hessian, const data_size_t* cuda_best_split_right_count, + const double* cuda_best_split_right_gain, const double* cuda_best_split_right_output, const uint8_t* cuda_best_split_found) { if (blockIdx.x == 0 && threadIdx.x == 0) { @@ -393,10 +407,12 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const double& smaller_leaf_best_split_left_sum_gradient = cuda_leaf_best_split_left_sum_gradient[smaller_leaf_index_ref]; double& smaller_leaf_best_split_left_sum_hessian = cuda_leaf_best_split_left_sum_hessian[smaller_leaf_index_ref]; data_size_t& smaller_leaf_best_split_left_count = cuda_leaf_best_split_left_count[smaller_leaf_index_ref]; + double& smaller_leaf_best_split_left_gain = cuda_leaf_best_split_left_gain[smaller_leaf_index_ref]; double& smaller_leaf_best_split_left_output = cuda_leaf_best_split_left_output[smaller_leaf_index_ref]; double& smaller_leaf_best_split_right_sum_gradient = cuda_leaf_best_split_right_sum_gradient[smaller_leaf_index_ref]; double& smaller_leaf_best_split_right_sum_hessian = cuda_leaf_best_split_right_sum_hessian[smaller_leaf_index_ref]; data_size_t& smaller_leaf_best_split_right_count = cuda_leaf_best_split_right_count[smaller_leaf_index_ref]; + double& smaller_leaf_best_split_right_gain = cuda_leaf_best_split_right_gain[smaller_leaf_index_ref]; double& smaller_leaf_best_split_right_output = cuda_leaf_best_split_right_output[smaller_leaf_index_ref]; double& larger_leaf_best_gain = cuda_leaf_best_split_gain[larger_leaf_index_ref]; @@ -406,10 +422,12 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const double& larger_leaf_best_split_left_sum_gradient = cuda_leaf_best_split_left_sum_gradient[larger_leaf_index_ref]; double& larger_leaf_best_split_left_sum_hessian = cuda_leaf_best_split_left_sum_hessian[larger_leaf_index_ref]; data_size_t& larger_leaf_best_split_left_count = cuda_leaf_best_split_left_count[larger_leaf_index_ref]; + double& larger_leaf_best_split_left_gain = cuda_leaf_best_split_left_gain[larger_leaf_index_ref]; double& larger_leaf_best_split_left_output = cuda_leaf_best_split_left_output[larger_leaf_index_ref]; double& larger_leaf_best_split_right_sum_gradient = cuda_leaf_best_split_right_sum_gradient[larger_leaf_index_ref]; double& larger_leaf_best_split_right_sum_hessian = cuda_leaf_best_split_right_sum_hessian[larger_leaf_index_ref]; data_size_t& larger_leaf_best_split_right_count = cuda_leaf_best_split_right_count[larger_leaf_index_ref]; + double& larger_leaf_best_split_right_gain = cuda_leaf_best_split_right_gain[larger_leaf_index_ref]; double& larger_leaf_best_split_right_output = cuda_leaf_best_split_right_output[larger_leaf_index_ref]; smaller_leaf_best_gain = kMinScore; @@ -424,6 +442,7 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const //printf("reverse smaller leaf new best, feature_index = %d, split_gain = %f, default_left = %d, threshold = %d\n", // feature_index, smaller_reverse_gain, 
cuda_best_split_default_left[smaller_reverse_index], // cuda_best_split_threshold[smaller_reverse_index]); + //printf("leaf index %d gain update to %f\n", smaller_leaf_index_ref, smaller_reverse_gain); smaller_leaf_best_gain = smaller_reverse_gain; smaller_leaf_best_split_feature = feature_index; smaller_leaf_best_split_default_left = cuda_best_split_default_left[smaller_reverse_index]; @@ -431,10 +450,12 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const smaller_leaf_best_split_left_sum_gradient = cuda_best_split_left_sum_gradient[smaller_reverse_index]; smaller_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[smaller_reverse_index]; smaller_leaf_best_split_left_count = cuda_best_split_left_count[smaller_reverse_index]; + smaller_leaf_best_split_left_gain = cuda_best_split_left_gain[smaller_reverse_index]; smaller_leaf_best_split_left_output = cuda_best_split_left_output[smaller_reverse_index]; smaller_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[smaller_reverse_index]; smaller_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[smaller_reverse_index]; smaller_leaf_best_split_right_count = cuda_best_split_right_count[smaller_reverse_index]; + smaller_leaf_best_split_right_gain = cuda_best_split_right_gain[smaller_reverse_index]; smaller_leaf_best_split_right_output = cuda_best_split_right_output[smaller_reverse_index]; } } @@ -446,6 +467,7 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const //printf("non reverse smaller leaf new best, feature_index = %d, split_gain = %f, default_left = %d, threshold = %d\n", // feature_index, smaller_non_reverse_gain, cuda_best_split_default_left[smaller_non_reverse_index], // cuda_best_split_threshold[smaller_non_reverse_index]); + //printf("leaf index %d gain update to %f\n", smaller_leaf_index_ref, smaller_non_reverse_gain); smaller_leaf_best_gain = smaller_non_reverse_gain; smaller_leaf_best_split_feature = feature_index; smaller_leaf_best_split_default_left = cuda_best_split_default_left[smaller_non_reverse_index]; @@ -453,10 +475,12 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const smaller_leaf_best_split_left_sum_gradient = cuda_best_split_left_sum_gradient[smaller_non_reverse_index]; smaller_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[smaller_non_reverse_index]; smaller_leaf_best_split_left_count = cuda_best_split_left_count[smaller_non_reverse_index]; + smaller_leaf_best_split_left_gain = cuda_best_split_left_gain[smaller_non_reverse_index]; smaller_leaf_best_split_left_output = cuda_best_split_left_output[smaller_non_reverse_index]; smaller_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[smaller_non_reverse_index]; smaller_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[smaller_non_reverse_index]; smaller_leaf_best_split_right_count = cuda_best_split_right_count[smaller_non_reverse_index]; + smaller_leaf_best_split_right_gain = cuda_best_split_right_gain[smaller_non_reverse_index]; smaller_leaf_best_split_right_output = cuda_best_split_right_output[smaller_non_reverse_index]; } } @@ -467,6 +491,7 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const if (larger_reverse_found) { const double larger_reverse_gain = cuda_best_split_gain[larger_reverse_index]; if (larger_reverse_gain > larger_leaf_best_gain) { + //printf("leaf index %d gain update to %f\n", larger_leaf_index_ref, 
larger_reverse_gain); larger_leaf_best_gain = larger_reverse_gain; larger_leaf_best_split_feature = feature_index; larger_leaf_best_split_default_left = cuda_best_split_default_left[larger_reverse_index]; @@ -474,10 +499,12 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const larger_leaf_best_split_left_sum_gradient = cuda_best_split_left_sum_gradient[larger_reverse_index]; larger_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[larger_reverse_index]; larger_leaf_best_split_left_count = cuda_best_split_left_count[larger_reverse_index]; + larger_leaf_best_split_left_gain = cuda_best_split_left_gain[larger_reverse_index]; larger_leaf_best_split_left_output = cuda_best_split_left_output[larger_reverse_index]; larger_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[larger_reverse_index]; larger_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[larger_reverse_index]; larger_leaf_best_split_right_count = cuda_best_split_right_count[larger_reverse_index]; + larger_leaf_best_split_right_gain = cuda_best_split_right_gain[larger_reverse_index]; larger_leaf_best_split_right_output = cuda_best_split_right_output[larger_reverse_index]; } } @@ -486,6 +513,7 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const if (larger_non_reverse_found) { const double larger_non_reverse_gain = cuda_best_split_gain[larger_non_reverse_index]; if (larger_non_reverse_gain > larger_leaf_best_gain) { + //printf("leaf index %d gain update to %f\n", larger_leaf_index_ref, larger_non_reverse_gain); larger_leaf_best_gain = larger_non_reverse_gain; larger_leaf_best_split_feature = feature_index; larger_leaf_best_split_default_left = cuda_best_split_default_left[larger_non_reverse_index]; @@ -493,10 +521,12 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const larger_leaf_best_split_left_sum_gradient = cuda_best_split_left_sum_gradient[larger_non_reverse_index]; larger_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[larger_non_reverse_index]; larger_leaf_best_split_left_count = cuda_best_split_left_count[larger_non_reverse_index]; + larger_leaf_best_split_left_gain = cuda_best_split_left_gain[larger_non_reverse_index]; larger_leaf_best_split_left_output = cuda_best_split_left_output[larger_non_reverse_index]; larger_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[larger_non_reverse_index]; larger_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[larger_non_reverse_index]; larger_leaf_best_split_right_count = cuda_best_split_right_count[larger_non_reverse_index]; + larger_leaf_best_split_right_gain = cuda_best_split_right_gain[larger_non_reverse_index]; larger_leaf_best_split_right_output = cuda_best_split_right_output[larger_non_reverse_index]; } } @@ -508,10 +538,12 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel(const int* smaller_leaf_index, const int* larger_leaf_index) { SyncBestSplitForLeafKernel<<<1, 1>>>(smaller_leaf_index, larger_leaf_index, cuda_num_features_, cuda_leaf_best_split_feature_, cuda_leaf_best_split_default_left_, - cuda_leaf_best_split_threshold_, cuda_leaf_best_split_gain_, cuda_leaf_best_split_left_sum_gradient_, - cuda_leaf_best_split_left_sum_hessian_, cuda_leaf_best_split_left_count_, - cuda_leaf_best_split_left_output_, cuda_leaf_best_split_right_sum_gradient_, - 
cuda_leaf_best_split_right_sum_hessian_, cuda_leaf_best_split_right_count_, + cuda_leaf_best_split_threshold_, cuda_leaf_best_split_gain_, + cuda_leaf_best_split_left_sum_gradient_, cuda_leaf_best_split_left_sum_hessian_, + cuda_leaf_best_split_left_count_, cuda_leaf_best_split_left_gain_, + cuda_leaf_best_split_left_output_, + cuda_leaf_best_split_right_sum_gradient_, cuda_leaf_best_split_right_sum_hessian_, + cuda_leaf_best_split_right_count_, cuda_leaf_best_split_right_gain_, cuda_leaf_best_split_right_output_, cuda_best_split_feature_, cuda_best_split_default_left_, @@ -520,10 +552,12 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel(const int* smaller_le cuda_best_split_left_sum_gradient_, cuda_best_split_left_sum_hessian_, cuda_best_split_left_count_, + cuda_best_split_left_gain_, cuda_best_split_left_output_, cuda_best_split_right_sum_gradient_, cuda_best_split_right_sum_hessian_, cuda_best_split_right_count_, + cuda_best_split_right_gain_, cuda_best_split_right_output_, cuda_best_split_found_); } @@ -534,6 +568,7 @@ __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, double best_gain = kMinScore; for (int leaf_index = 0; leaf_index < cuda_cur_num_leaves_ref; ++leaf_index) { const double leaf_best_gain = cuda_leaf_best_split_gain[leaf_index]; + //printf("cuda_leaf_best_split_gain[%d] = %f\n", leaf_index, leaf_best_gain); if (leaf_best_gain > best_gain) { best_gain = leaf_best_gain; *out_best_leaf = leaf_index; diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 400716c62a8d..72c173b1e438 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -43,6 +43,28 @@ class CUDABestSplitFinder { const uint8_t* cuda_leaf_best_split_default_left() const { return cuda_leaf_best_split_default_left_; } + const double* cuda_leaf_best_split_gain() const { return cuda_leaf_best_split_gain_; } + + const double* cuda_leaf_best_split_left_sum_gradient() const { return cuda_leaf_best_split_left_sum_gradient_; } + + const double* cuda_leaf_best_split_left_sum_hessian() const { return cuda_leaf_best_split_left_sum_hessian_; } + + const data_size_t* cuda_leaf_best_split_left_count() const { return cuda_leaf_best_split_left_count_; } + + const double* cuda_leaf_best_split_left_gain() const { return cuda_leaf_best_split_left_gain_; } + + const double* cuda_leaf_best_split_left_output() const { return cuda_leaf_best_split_left_output_; } + + const double* cuda_leaf_best_split_right_sum_gradient() const { return cuda_leaf_best_split_right_sum_gradient_; } + + const double* cuda_leaf_best_split_right_sum_hessian() const { return cuda_leaf_best_split_right_sum_hessian_; } + + const data_size_t* cuda_leaf_best_split_right_count() const { return cuda_leaf_best_split_right_count_; } + + const double* cuda_leaf_best_split_right_gain() const { return cuda_leaf_best_split_right_gain_; } + + const double* cuda_leaf_best_split_right_output() const { return cuda_leaf_best_split_right_output_; } + void TestAfterInit() { PrintLastCUDAError(); } @@ -103,10 +125,12 @@ class CUDABestSplitFinder { double* cuda_leaf_best_split_left_sum_gradient_; double* cuda_leaf_best_split_left_sum_hessian_; data_size_t* cuda_leaf_best_split_left_count_; + double* cuda_leaf_best_split_left_gain_; double* cuda_leaf_best_split_left_output_; double* cuda_leaf_best_split_right_sum_gradient_; double* cuda_leaf_best_split_right_sum_hessian_; data_size_t* cuda_leaf_best_split_right_count_; 
+ double* cuda_leaf_best_split_right_gain_; double* cuda_leaf_best_split_right_output_; // for best split information when finding best split int* cuda_best_split_feature_; @@ -116,10 +140,12 @@ class CUDABestSplitFinder { double* cuda_best_split_left_sum_gradient_; double* cuda_best_split_left_sum_hessian_; data_size_t* cuda_best_split_left_count_; + double* cuda_best_split_left_gain_; double* cuda_best_split_left_output_; double* cuda_best_split_right_sum_gradient_; double* cuda_best_split_right_sum_hessian_; data_size_t* cuda_best_split_right_count_; + double* cuda_best_split_right_gain_; double* cuda_best_split_right_output_; uint8_t* cuda_best_split_found_; int* cuda_num_total_bin_; diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 752e7b1850e0..18d29672f138 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -111,17 +111,42 @@ void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { void CUDADataPartition::Split(const int* leaf_id, const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left) { + const uint8_t* best_split_default_left, + const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, + const double* best_left_gain, const double* best_left_leaf_value, + const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, + const double* best_right_gain, const double* best_right_leaf_value, + // for leaf splits information update + int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, + double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, + double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, + const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, + double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, + double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer) { auto start = std::chrono::steady_clock::now(); GenDataToLeftBitVector(leaf_id, best_split_feature, best_split_threshold, best_split_default_left); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); - Log::Warning("CUDADataPartition::GenDataToLeftBitVector time %f", duration); + //Log::Warning("CUDADataPartition::GenDataToLeftBitVector time %f", duration); start = std::chrono::steady_clock::now(); - SplitInner(leaf_id); + SplitInner(leaf_id, + best_left_sum_gradients, best_left_sum_hessians, best_left_count, + best_left_gain, best_left_leaf_value, + best_right_sum_gradients, best_right_sum_hessians, best_right_count, + best_right_gain, best_right_leaf_value, + smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, + smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, + smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, + smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, + larger_leaf_cuda_sum_of_hessians_pointer, 
larger_leaf_cuda_num_data_in_leaf_pointer, + larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, + larger_leaf_cuda_data_indices_in_leaf_pointer_pointer); end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); - Log::Warning("CUDADataPartition::SplitInner time %f", duration); + //Log::Warning("CUDADataPartition::SplitInner time %f", duration); } void CUDADataPartition::GenDataToLeftBitVector(const int* leaf_id, @@ -131,8 +156,33 @@ void CUDADataPartition::GenDataToLeftBitVector(const int* leaf_id, LaunchGenDataToLeftBitVectorKernel(leaf_id, best_split_feature, best_split_threshold, best_split_default_left); } -void CUDADataPartition::SplitInner(const int* leaf_index) { - LaunchSplitInnerKernel(leaf_index); +void CUDADataPartition::SplitInner(const int* leaf_index, + const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, + const double* best_left_gain, const double* best_left_leaf_value, + const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, + const double* best_right_gain, const double* best_right_leaf_value, + // for leaf splits information update + int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, + double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, + double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, + const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, + double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, + double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer) { + LaunchSplitInnerKernel(leaf_index, + best_left_sum_gradients, best_left_sum_hessians, best_left_count, + best_left_gain, best_left_leaf_value, + best_right_sum_gradients, best_right_sum_hessians, best_right_count, + best_right_gain, best_right_leaf_value, + smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, + smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, + smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, + smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, + larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, + larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, + larger_leaf_cuda_data_indices_in_leaf_pointer_pointer); } Tree* CUDADataPartition::GetCPUTree() {} diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index fd6be0deae2b..0abe7798e93d 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -169,6 +169,9 @@ __global__ void PrepareOffsetKernel(const int* leaf_index, const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); const unsigned int global_read_index = blockIdx.x * blockDim.x * 2 + threadIdx_x; const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[*leaf_index]; + /*if (blockIdx.x == 0 && threadIdx_x == 0) { + printf("PrepareOffsetKernel leaf_index = %d, num_data_in_leaf = 
%d\n", (*leaf_index), num_data_in_leaf_ref); + }*/ if (global_read_index < num_data_in_leaf_ref) { const uint8_t bit = split_to_left_bit_vector[global_read_index]; thread_to_left_offset_cnt[conflict_free_threadIdx_x] = bit; @@ -199,8 +202,21 @@ __global__ void PrepareOffsetKernel(const int* leaf_index, __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, - data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, - int* cuda_cur_num_leaves) { + data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, + int* cuda_cur_num_leaves, + const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, + const double* best_left_gain, const double* best_left_leaf_value, + const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, + const double* best_right_gain, const double* best_right_leaf_value, + // for leaf splits information update + int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, + double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, + double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, + const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, + double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, + double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer) { __shared__ uint32_t block_to_left_offset[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; __shared__ uint32_t block_to_right_offset[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2 + @@ -292,6 +308,40 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b cuda_leaf_data_end[cur_max_leaf_index] = old_leaf_data_end; cuda_leaf_num_data[cur_max_leaf_index] = block_to_right_offset_buffer[num_blocks] - to_left_total_cnt; ++(*cuda_cur_num_leaves); + + if (cuda_leaf_num_data[leaf_index_ref] < cuda_leaf_num_data[cur_max_leaf_index]) { + *smaller_leaf_cuda_leaf_index_pointer = leaf_index_ref; + *smaller_leaf_cuda_sum_of_gradients_pointer = *best_left_sum_gradients; + *smaller_leaf_cuda_sum_of_hessians_pointer = *best_left_sum_hessians; + *smaller_leaf_cuda_num_data_in_leaf_pointer = *best_left_count; + *smaller_leaf_cuda_gain_pointer = *best_left_gain; + *smaller_leaf_cuda_leaf_value_pointer = *best_left_leaf_value; + *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[leaf_index_ref]; + + *larger_leaf_cuda_leaf_index_pointer = cur_max_leaf_index; + *larger_leaf_cuda_sum_of_gradients_pointer = *best_right_sum_gradients; + *larger_leaf_cuda_sum_of_hessians_pointer = *best_right_sum_hessians; + *larger_leaf_cuda_num_data_in_leaf_pointer = *best_right_count; + *larger_leaf_cuda_gain_pointer = *best_right_gain; + *larger_leaf_cuda_leaf_value_pointer = *best_right_leaf_value; + *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[cur_max_leaf_index]; + } else { + 
*larger_leaf_cuda_leaf_index_pointer = leaf_index_ref; + *larger_leaf_cuda_sum_of_gradients_pointer = *best_left_sum_gradients; + *larger_leaf_cuda_sum_of_hessians_pointer = *best_left_sum_hessians; + *larger_leaf_cuda_num_data_in_leaf_pointer = *best_left_count; + *larger_leaf_cuda_gain_pointer = *best_left_gain; + *larger_leaf_cuda_leaf_value_pointer = *best_left_leaf_value; + *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[leaf_index_ref]; + + *smaller_leaf_cuda_leaf_index_pointer = cur_max_leaf_index; + *smaller_leaf_cuda_sum_of_gradients_pointer = *best_right_sum_gradients; + *smaller_leaf_cuda_sum_of_hessians_pointer = *best_right_sum_hessians; + *smaller_leaf_cuda_num_data_in_leaf_pointer = *best_right_count; + *smaller_leaf_cuda_gain_pointer = *best_right_gain; + *smaller_leaf_cuda_leaf_value_pointer = *best_right_leaf_value; + *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[cur_max_leaf_index]; + } } } @@ -446,7 +496,20 @@ __global__ void CopyDataIndicesKernel(const int* leaf_index, } } -void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index) { +void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, + const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, + const double* best_left_gain, const double* best_left_leaf_value, + const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, + const double* best_right_gain, const double* best_right_leaf_value, + // for leaf splits information update + int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, + double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, + double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, + const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, + double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, + double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer) { auto start = std::chrono::steady_clock::now(); PrepareOffsetKernel<<>>( leaf_index, cuda_leaf_num_data_, cuda_data_to_left_, @@ -454,15 +517,30 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index) { SynchronizeCUDADevice(); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); - Log::Warning("CUDADataPartition::PrepareOffsetKernel time %f", duration); + //Log::Warning("CUDADataPartition::PrepareOffsetKernel time %f", duration); start = std::chrono::steady_clock::now(); AggregateBlockOffsetKernel<<<1, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION / 2>>>(leaf_index, cuda_block_data_to_left_offset_, - cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, - cuda_cur_num_leaves_); + cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, + cuda_leaf_num_data_, cuda_data_indices_, + cuda_cur_num_leaves_, + + best_left_sum_gradients, best_left_sum_hessians, best_left_count, + best_left_gain, best_left_leaf_value, + best_right_sum_gradients, best_right_sum_hessians, best_right_count, + best_right_gain, 
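    // AggregateBlockOffsetKernel runs as a single block (half of
    // SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION threads): it scans the per-block
    // left/right counts produced by PrepareOffsetKernel into global offsets and,
    // at the end, updates the leaf boundaries and fills the smaller/larger
    // leaf-split structures, which is why the best-split statistics are
    // forwarded in the long argument list that follows.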
best_right_leaf_value, + + smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, + smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, + smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, + smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, + larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, + larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, + larger_leaf_cuda_data_indices_in_leaf_pointer_pointer); SynchronizeCUDADevice(); end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); - Log::Warning("CUDADataPartition::AggregateBlockOffsetKernel time %f", duration); + //Log::Warning("CUDADataPartition::AggregateBlockOffsetKernel time %f", duration); start = std::chrono::steady_clock::now(); SplitInnerKernel<<>>( leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, @@ -471,14 +549,14 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index) { SynchronizeCUDADevice(); end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); - Log::Warning("CUDADataPartition::SplitInnerKernel time %f", duration); + //Log::Warning("CUDADataPartition::SplitInnerKernel time %f", duration); start = std::chrono::steady_clock::now(); CopyDataIndicesKernel<<>>( leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, cuda_data_indices_); SynchronizeCUDADevice(); end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); - Log::Warning("CUDADataPartition::CopyDataIndicesKernel time %f", duration); + //Log::Warning("CUDADataPartition::CopyDataIndicesKernel time %f", duration); } __global__ void PrefixSumKernel(uint32_t* cuda_elements) { diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 1ebbf6b9889b..286593b7da38 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -30,7 +30,20 @@ class CUDADataPartition { void BeforeTrain(const data_size_t* data_indices); void Split(const int* leaf_id, const int* best_split_feature, - const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); + const uint32_t* best_split_threshold, const uint8_t* best_split_default_left, + const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, + const double* best_left_gain, const double* best_left_leaf_value, + const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, + const double* best_right_gain, const double* best_right_leaf_value, + // for splits information update + int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, + double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, + double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, + const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, + double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, + double* 
larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer); Tree* GetCPUTree(); @@ -120,12 +133,38 @@ class CUDADataPartition { void GenDataToLeftBitVector(const int* leaf_id, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); - void SplitInner(const int* leaf_index); + void SplitInner(const int* leaf_index, + const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, + const double* best_left_gain, const double* best_left_leaf_value, + const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, + const double* best_right_gain, const double* best_right_leaf_value, + // for leaf splits information update + int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, + double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, + double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, + const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, + double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, + double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer); // kernel launch functions void LaunchFillDataIndicesBeforeTrain(); - void LaunchSplitInnerKernel(const int* leaf_index); + void LaunchSplitInnerKernel(const int* leaf_index, + const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, + const double* best_left_gain, const double* best_left_leaf_value, + const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, + const double* best_right_gain, const double* best_right_leaf_value, + // for leaf splits information update + int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, + double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, + double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, + const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, + double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, + double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer); void LaunchGenDataToLeftBitVectorKernel(const int* leaf_index, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 08eac8bb233a..99bb32bf6b7e 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -12,7 +12,8 @@ namespace LightGBM { CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, const int num_leaves, const int num_threads, - 
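  // The constructor below now also receives the host-side feature_hist_offsets
  // and records per-feature bin metadata (bin counts, most-frequent bins, and
  // an "mfb offset" of 1 whenever the most frequent bin is bin 0); Init() then
  // copies these vectors to device memory for use by the construction kernels.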
const score_t* cuda_gradients, const score_t* cuda_hessians): num_data_(train_data->num_data()), + const score_t* cuda_gradients, const score_t* cuda_hessians, + const std::vector& feature_hist_offsets): num_data_(train_data->num_data()), num_features_(train_data->num_features()), num_leaves_(num_leaves), num_threads_(num_threads), num_feature_groups_(train_data->num_feature_groups()), cuda_gradients_(cuda_gradients), cuda_hessians_(cuda_hessians) { @@ -21,7 +22,22 @@ CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, feature_group_bin_offsets_.emplace_back(offset); offset += train_data->FeatureGroupNumBin(group_id); } + for (int feature_index = 0; feature_index < train_data->num_features(); ++feature_index) { + const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); + const uint32_t most_freq_bin = bin_mapper->GetMostFreqBin(); + if (most_freq_bin == 0) { + feature_mfb_offsets_.emplace_back(1); + } else { + feature_mfb_offsets_.emplace_back(0); + } + feature_num_bins_.emplace_back(static_cast(bin_mapper->num_bin())); + feature_most_freq_bins_.emplace_back(most_freq_bin); + } feature_group_bin_offsets_.emplace_back(offset); + feature_hist_offsets_.clear(); + for (size_t i = 0; i < feature_hist_offsets.size(); ++i) { + feature_hist_offsets_.emplace_back(feature_hist_offsets[i]); + } num_total_bin_ = offset; } @@ -40,6 +56,20 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data) { InitCUDAMemoryFromHostMemory(&cuda_feature_group_bin_offsets_, feature_group_bin_offsets_.data(), feature_group_bin_offsets_.size()); + InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_offsets_, + feature_mfb_offsets_.data(), feature_mfb_offsets_.size()); + + InitCUDAMemoryFromHostMemory(&cuda_feature_num_bins_, + feature_num_bins_.data(), feature_num_bins_.size()); + + InitCUDAMemoryFromHostMemory(&cuda_feature_hist_offsets_, + feature_hist_offsets_.data(), feature_hist_offsets_.size()); + + InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bins_, + feature_most_freq_bins_.data(), feature_most_freq_bins_.size()); + + InitCUDAValueFromConstant(&cuda_num_features_, num_features_); + InitCUDAData(train_data); } @@ -68,14 +98,14 @@ void CUDAHistogramConstructor::PushOneData(const uint32_t feature_bin_value, } void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const int* /*cuda_larger_leaf_index*/, - const data_size_t* cuda_data_indices_in_smaller_leaf, const data_size_t* /*cuda_data_indices_in_larger_leaf*/, + const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** /*cuda_data_indices_in_larger_leaf*/, const data_size_t* cuda_leaf_num_data) { auto start = std::chrono::steady_clock::now(); LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data); SynchronizeCUDADevice(); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); - Log::Warning("LaunchConstructHistogramKernel time %f", duration); + //Log::Warning("LaunchConstructHistogramKernel time %f", duration); /*PrintLastCUDAError(); std::vector cpu_hist(6143 * 2, 0.0f); CopyFromCUDADeviceToHost(cpu_hist.data(), cuda_hist_, 6143 * 2);*/ diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 934821c38526..c910754a171f 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -10,14 +10,52 @@ namespace LightGBM { +__device__ 
void PrefixSum(hist_t* elements, unsigned int n) { + unsigned int offset = 1; + unsigned int threadIdx_x = threadIdx.x; + const unsigned int conflict_free_n_minus_1 = (n - 1); + const hist_t last_element = elements[conflict_free_n_minus_1]; + __syncthreads(); + for (int d = (n >> 1); d > 0; d >>= 1) { + if (threadIdx_x < d) { + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + elements[(dst_pos)] += elements[(src_pos)]; + } + offset <<= 1; + __syncthreads(); + } + if (threadIdx_x == 0) { + elements[conflict_free_n_minus_1] = 0; + } + __syncthreads(); + for (int d = 1; d < n; d <<= 1) { + offset >>= 1; + if (threadIdx_x < d) { + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int conflict_free_dst_pos = (dst_pos); + const unsigned int conflict_free_src_pos = (src_pos); + const hist_t src_val = elements[conflict_free_src_pos]; + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; + elements[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + } + if (threadIdx_x == 0) { + elements[(n)] = elements[conflict_free_n_minus_1] + last_element; + } +} + __global__ void CUDAConstructHistogramKernel(const int* leaf_index, const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t* data_indices_ptr, hist_t* feature_histogram, const int* num_feature_groups, + const data_size_t** data_indices_ptr, hist_t* feature_histogram, const int* num_feature_groups, const data_size_t* leaf_num_data, const uint8_t* data, const uint32_t* feature_group_offsets) { const unsigned int threadIdx_x = threadIdx.x; const int num_feature_groups_ref = *num_feature_groups; const int leaf_index_ref = *leaf_index; const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; + const data_size_t* data_indices_ref = *data_indices_ptr; __shared__ float shared_hist[SHRAE_HIST_SIZE]; // 256 * 24 * 2, can use 24 features uint32_t num_bins_in_col_group = feature_group_offsets[blockDim.x]; const uint32_t num_items_per_thread = (2 * num_bins_in_col_group + NUM_THRADS_PER_BLOCK - 1) / NUM_THRADS_PER_BLOCK; @@ -37,7 +75,7 @@ __global__ void CUDAConstructHistogramKernel(const int* leaf_index, for (data_size_t i = start; i < end; ++i) { const score_t grad = cuda_gradients[i]; const score_t hess = cuda_hessians[i]; - const data_size_t data_index = data_indices_ptr[i]; + const data_size_t data_index = data_indices_ref[i]; const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + threadIdx_x]) + feature_group_offsets[threadIdx_x]; const uint32_t pos = bin << 1; @@ -53,14 +91,14 @@ __global__ void CUDAConstructHistogramKernel(const int* leaf_index, void CUDAHistogramConstructor::LaunchConstructHistogramKernel( const int* cuda_smaller_leaf_index, - const data_size_t* cuda_data_indices_in_smaller_leaf, + const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t* cuda_leaf_num_data) { const int block_dim_x = num_features_; // TODO(shiyu1994): only supports the case when the whole histogram can be loaded into shared memory const int block_dim_y = NUM_THRADS_PER_BLOCK / block_dim_x; const int grid_dim_y = ((num_data_ + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + block_dim_y - 1) / block_dim_y; const int grid_dim_x = (static_cast(num_feature_groups_ + NUM_FEATURE_PER_THREAD_GROUP - 1) / NUM_FEATURE_PER_THREAD_GROUP); - Log::Warning("block_dim_x = %d, block_dim_y = %d", 
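// --- Editorial note (not part of this patch): the PrefixSum device function above is a
// Blelloch-style work-efficient scan. A minimal host-side reference of the result it is
// expected to produce, assuming n is a power of two (as the callers arrange via padding):
// an exclusive prefix sum over elements[0..n-1], with the inclusive total written to
// elements[n].
#include <cstddef>
#include <vector>

static void PrefixSumReference(std::vector<double>* elements, size_t n) {
  std::vector<double>& e = *elements;  // must hold at least n + 1 entries
  double running_total = 0.0;
  for (size_t i = 0; i < n; ++i) {
    const double value = e[i];
    e[i] = running_total;              // exclusive scan: sum of all earlier entries
    running_total += value;
  }
  e[n] = running_total;                // total stored one slot past the end, as in the kernel
}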
block_dim_x, block_dim_y); - Log::Warning("gid_dim_x = %d, grid_dim_y = %d", grid_dim_x, grid_dim_y); + //Log::Warning("block_dim_x = %d, block_dim_y = %d", block_dim_x, block_dim_y); + //Log::Warning("gid_dim_x = %d, grid_dim_y = %d", grid_dim_x, grid_dim_y); dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); CUDAConstructHistogramKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, @@ -68,6 +106,88 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_feature_group_bin_offsets_); } +__global__ void SubtractAndFixHistogramKernel(const int* cuda_smaller_leaf_index, + const int* cuda_larger_leaf_index, const uint8_t* cuda_feature_mfb_offsets, + const uint32_t* cuda_feature_num_bins, const int* cuda_num_total_bin, + hist_t* cuda_hist) { + const int cuda_num_total_bin_ref = *cuda_num_total_bin; + const unsigned int global_thread_index = threadIdx.x + blockIdx.x * blockDim.x; + const int cuda_smaller_leaf_index_ref = *cuda_smaller_leaf_index; + const int cuda_larger_leaf_index_ref = *cuda_larger_leaf_index; + const hist_t* smaller_leaf_hist = cuda_hist + (cuda_smaller_leaf_index_ref * cuda_num_total_bin_ref * 2); + hist_t* larger_leaf_hist = cuda_hist + (cuda_larger_leaf_index_ref * cuda_num_total_bin_ref * 2); + if (global_thread_index < 2 * cuda_num_total_bin_ref) { + larger_leaf_hist[global_thread_index] -= smaller_leaf_hist[global_thread_index]; + } +} + +__global__ void FixHistogramKernel(const int* cuda_smaller_leaf_index, + const int* cuda_larger_leaf_index, + const uint32_t* cuda_feature_num_bins, const int* cuda_num_features, + const int* cuda_num_total_bin, const uint32_t* cuda_feature_hist_offsets, + const uint32_t* cuda_feature_most_freq_bins, + const double* smaller_leaf_sum_gradients, const double* smaller_leaf_sum_hessians, + const double* larger_leaf_sum_gradients, const double* larger_leaf_sum_hessians, + hist_t* cuda_hist) { + const int cuda_num_features_ref = *cuda_num_features; + const unsigned int blockIdx_x = blockIdx.x; + const int feature_index = blockIdx_x % cuda_num_features_ref; + const bool larger_or_smaller = static_cast(blockIdx_x / cuda_num_features_ref); + const int leaf_index_ref = larger_or_smaller ? *cuda_larger_leaf_index : *cuda_smaller_leaf_index; + const int cuda_num_total_bin_ref = *cuda_num_total_bin; + const uint32_t feature_hist_offset = cuda_feature_hist_offsets[feature_index]; + const uint32_t most_freq_bin = cuda_feature_most_freq_bins[feature_index]; + if (most_freq_bin > 0) { + const double leaf_sum_gradients = larger_or_smaller ? *larger_leaf_sum_gradients : *smaller_leaf_sum_gradients; + const double leaf_sum_hessians = larger_or_smaller ? 
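// --- Editorial note (not part of this patch): a simplified sequential view of what
// CUDAConstructHistogramKernel above accumulates for one leaf. Shared-memory tiling, the
// thread-to-feature-group mapping and the exact gradient ordering are omitted; the names
// below are illustrative only. Histograms store interleaved [gradient, hessian] per bin.
#include <cstdint>
#include <vector>

static void ConstructHistogramSketch(const std::vector<int>& data_indices_in_leaf,
                                     const std::vector<uint8_t>& row_major_bins,   // num_data x num_groups
                                     int num_feature_groups,
                                     const std::vector<uint32_t>& group_bin_offsets,
                                     const std::vector<float>& gradients,
                                     const std::vector<float>& hessians,
                                     std::vector<double>* hist) {                  // 2 entries per bin
  for (const int row : data_indices_in_leaf) {
    for (int group = 0; group < num_feature_groups; ++group) {
      const uint32_t bin = row_major_bins[row * num_feature_groups + group] + group_bin_offsets[group];
      (*hist)[bin * 2] += gradients[row];       // gradient sum for this bin
      (*hist)[bin * 2 + 1] += hessians[row];    // hessian sum for this bin
    }
  }
}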
*larger_leaf_sum_hessians : *smaller_leaf_sum_hessians; + hist_t* feature_hist = cuda_hist + cuda_num_total_bin_ref * 2 * leaf_index_ref + feature_hist_offset * 2; + __shared__ double hist_gradients[FIX_HISTOGRAM_SHARED_MEM_SIZE + 1]; + __shared__ double hist_hessians[FIX_HISTOGRAM_SHARED_MEM_SIZE + 1]; + const unsigned int threadIdx_x = threadIdx.x; + const uint32_t num_bin = cuda_feature_num_bins[feature_index]; + if (threadIdx_x < num_bin) { + if (threadIdx_x == most_freq_bin) { + hist_gradients[threadIdx_x] = 0.0f; + hist_hessians[threadIdx_x] = 0.0f; + } else { + hist_gradients[threadIdx_x] = feature_hist[threadIdx_x << 1]; + hist_hessians[threadIdx_x] = feature_hist[(threadIdx_x << 1) + 1]; + } + } + uint32_t num_bin_aligned = 1; + uint32_t num_bin_to_shift = num_bin; + while (num_bin_to_shift > 0) { + num_bin_to_shift >>= 1; + num_bin_aligned <<= 1; + } + __syncthreads(); + PrefixSum(hist_gradients, num_bin_aligned); + PrefixSum(hist_hessians, num_bin_aligned); + __syncthreads(); + feature_hist[most_freq_bin << 1] = leaf_sum_gradients - hist_gradients[num_bin_aligned]; + feature_hist[(most_freq_bin << 1) + 1] = leaf_sum_hessians - hist_hessians[num_bin_aligned]; + } +} + +void CUDAHistogramConstructor::LaunchSubtractAndFixHistogramKernel(const int* cuda_smaller_leaf_index, + const int* cuda_larger_leaf_index, const double* smaller_leaf_sum_gradients, const double* smaller_leaf_sum_hessians, + const double* larger_leaf_sum_gradients, const double* larger_leaf_sum_hessians) { + const int num_subtract_threads = 2 * num_total_bin_; + const int num_subtract_blocks = (num_subtract_threads + SUBTRACT_BLOCK_SIZE - 1) / SUBTRACT_BLOCK_SIZE; + SubtractAndFixHistogramKernel<<>>( + cuda_smaller_leaf_index, cuda_larger_leaf_index, cuda_feature_mfb_offsets_, + cuda_feature_num_bins_, cuda_num_total_bin_, cuda_hist_); + SynchronizeCUDADevice(); + FixHistogramKernel<<<2 * num_features_, FIX_HISTOGRAM_BLOCK_SIZE>>>( + cuda_smaller_leaf_index, cuda_larger_leaf_index, + cuda_feature_num_bins_, cuda_num_features_, + cuda_num_total_bin_, cuda_feature_hist_offsets_, + cuda_feature_most_freq_bins_, smaller_leaf_sum_gradients, smaller_leaf_sum_hessians, + larger_leaf_sum_gradients, larger_leaf_sum_hessians, + cuda_hist_); + SynchronizeCUDADevice(); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 29e472b0c9a9..f3da7b08e5a9 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -19,18 +19,21 @@ #define NUM_DATA_PER_THREAD (400) #define NUM_THRADS_PER_BLOCK (504) #define NUM_FEATURE_PER_THREAD_GROUP (28) +#define SUBTRACT_BLOCK_SIZE (1024) +#define FIX_HISTOGRAM_SHARED_MEM_SIZE (1024) +#define FIX_HISTOGRAM_BLOCK_SIZE (512) namespace LightGBM { class CUDAHistogramConstructor { public: CUDAHistogramConstructor(const Dataset* train_data, const int num_leaves, const int num_threads, - const score_t* cuda_gradients, const score_t* cuda_hessians); + const score_t* cuda_gradients, const score_t* cuda_hessians, const std::vector& feature_hist_offsets); void Init(const Dataset* train_data); void ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, - const data_size_t* cuda_data_indices_in_smaller_leaf, const data_size_t* cuda_data_indices_in_larger_leaf, + const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** cuda_data_indices_in_larger_leaf, const data_size_t* 
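// --- Editorial note (not part of this patch): a host-side sketch of the idea behind
// SubtractAndFixHistogramKernel / FixHistogramKernel above, shown here for one child leaf
// and a single feature's slice of the histogram (the kernels handle both children and all
// features). The names are illustrative. The child whose slot still holds the parent
// histogram obtains its own histogram by subtracting the sibling's, and a feature's
// most-frequent bin, skipped during construction, is recovered as the leaf totals minus
// the sum over the remaining bins.
#include <cstdint>
#include <vector>

static void SubtractAndFixSketch(const std::vector<double>& sibling_hist,
                                 std::vector<double>* leaf_hist,   // holds the parent histogram on entry
                                 uint32_t most_freq_bin,
                                 double leaf_sum_gradients,
                                 double leaf_sum_hessians) {
  std::vector<double>& hist = *leaf_hist;
  // Subtraction: parent histogram minus the sibling leaf's histogram, entry by entry.
  for (size_t i = 0; i < hist.size(); ++i) {
    hist[i] -= sibling_hist[i];
  }
  // Fix: rebuild the skipped most-frequent bin from the leaf-level sums.
  const size_t num_bin = hist.size() / 2;
  double other_grad = 0.0, other_hess = 0.0;
  for (size_t bin = 0; bin < num_bin; ++bin) {
    if (bin == most_freq_bin) continue;
    other_grad += hist[bin * 2];
    other_hess += hist[bin * 2 + 1];
  }
  hist[most_freq_bin * 2] = leaf_sum_gradients - other_grad;
  hist[most_freq_bin * 2 + 1] = leaf_sum_hessians - other_hess;
}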
cuda_leaf_num_data); const hist_t* cuda_hist() const { return cuda_hist_; } @@ -55,11 +58,14 @@ class CUDAHistogramConstructor { } private: - void LaunchConstructHistogramKernel(const int* cuda_leaf_index, - const data_size_t* cuda_data_indices_in_leaf, + const data_size_t** cuda_data_indices_in_leaf, const data_size_t* cuda_leaf_num_data); + void LaunchSubtractAndFixHistogramKernel(const int* cuda_smaller_leaf_index, + const int* cuda_larger_leaf_index, const double* smaller_leaf_sum_gradients, const double* smaller_leaf_sum_hessians, + const double* larger_leaf_sum_gradients, const double* larger_leaf_sum_hessians); + void InitCUDAData(const Dataset* train_data); void PushOneData(const uint32_t feature_bin_value, const int feature_group_id, const data_size_t data_index); @@ -74,13 +80,22 @@ class CUDAHistogramConstructor { int num_feature_groups_; std::vector data_; std::vector feature_group_bin_offsets_; + std::vector feature_mfb_offsets_; + std::vector feature_num_bins_; + std::vector feature_hist_offsets_; + std::vector feature_most_freq_bins_; // CUDA memory, held by this object uint32_t* cuda_feature_group_bin_offsets_; + uint8_t* cuda_feature_mfb_offsets_; + uint32_t* cuda_feature_num_bins_; + uint32_t* cuda_feature_hist_offsets_; + uint32_t* cuda_feature_most_freq_bins_; hist_t* cuda_hist_; int* cuda_num_total_bin_; int* cuda_num_feature_groups_; uint8_t* cuda_data_; + int* cuda_num_features_; // CUDA memory, held by other objects const score_t* cuda_gradients_; diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index cacddacb0bb5..ae837452ce7c 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -35,6 +35,9 @@ void CUDALeafSplits::Init() { InitCUDAMemoryFromHostMemory(&cuda_num_data_in_leaf_, &num_data_, 1); InitCUDAValueFromConstant(&cuda_gain_, 0.0f); + // since smooth is not used, so the output value for root node is useless + InitCUDAValueFromConstant(&cuda_leaf_value_, 0.0f); + AllocateCUDAMemory(1, &cuda_data_indices_in_leaf_); InitCUDAMemoryFromHostMemory(&cuda_leaf_index_, &leaf_index_, 1); } @@ -45,7 +48,7 @@ void CUDALeafSplits::InitValues(const double* cuda_sum_of_gradients, const doubl CopyFromCUDADeviceToCUDADevice(cuda_sum_of_gradients_, cuda_sum_of_gradients, 1); CopyFromCUDADeviceToCUDADevice(cuda_sum_of_hessians_, cuda_sum_of_hessians, 1); CopyFromCUDADeviceToCUDADevice(cuda_num_data_in_leaf_, cuda_num_data_in_leaf, 1); - cuda_data_indices_in_leaf_ = cuda_data_indices_in_leaf; + CopyFromHostToCUDADevice(cuda_data_indices_in_leaf_, &cuda_data_indices_in_leaf, 1); CopyFromCUDADeviceToCUDADevice(cuda_gain_, cuda_gain, 1); CopyFromCUDADeviceToCUDADevice(cuda_leaf_value_, cuda_leaf_value, 1); SynchronizeCUDADevice(); @@ -53,7 +56,8 @@ void CUDALeafSplits::InitValues(const double* cuda_sum_of_gradients, const doubl void CUDALeafSplits::InitValues(const data_size_t* cuda_data_indices_in_leaf) { LaunchInitValuesKernal(); - cuda_data_indices_in_leaf_ = cuda_data_indices_in_leaf; + CopyFromHostToCUDADevice(cuda_data_indices_in_leaf_, &cuda_data_indices_in_leaf, 1); + SynchronizeCUDADevice(); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index d1617ef83f9e..78ea846fbfaf 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -36,7 +36,7 @@ class CUDALeafSplits { const int* cuda_leaf_index() const { return cuda_leaf_index_; } - const data_size_t* 
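// --- Editorial note (not part of this patch): the change above makes
// cuda_data_indices_in_leaf_ a pointer-to-pointer stored in device memory, so kernels can
// dereference it to find the current leaf's index buffer instead of receiving a fresh host
// argument on every launch. A minimal CUDA sketch of that pattern, with illustrative names
// (plain CUDA runtime API, error checks omitted):
#include <cuda_runtime.h>

__global__ void UseIndirectBuffer(const int** indices_ptr, int* out) {
  const int* indices = *indices_ptr;  // dereference the device-resident pointer
  out[threadIdx.x] = indices[threadIdx.x];
}

static void StoreDevicePointerOnDevice(const int* d_indices, const int*** d_indices_ptr_out) {
  // Allocate one pointer-sized slot on the device and copy the pointer *value* into it.
  cudaMalloc(reinterpret_cast<void**>(d_indices_ptr_out), sizeof(const int*));
  cudaMemcpy(*d_indices_ptr_out, &d_indices, sizeof(const int*), cudaMemcpyHostToDevice);
}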
cuda_data_indices_in_leaf() const { return cuda_data_indices_in_leaf_; } + const data_size_t** cuda_data_indices_in_leaf() const { return cuda_data_indices_in_leaf_; } const double* cuda_gain() const { return cuda_gain_; } @@ -46,6 +46,20 @@ class CUDALeafSplits { const data_size_t* cuda_num_data_in_leaf() const { return cuda_num_data_in_leaf_; } + int* cuda_leaf_index_pointer() const { return cuda_leaf_index_; } + + double* cuda_sum_of_gradients_pointer() const { return cuda_sum_of_gradients_; } + + double* cuda_sum_of_hessians_pointer() const { return cuda_sum_of_hessians_; } + + data_size_t* cuda_num_data_in_leaf_pointer() const { return cuda_num_data_in_leaf_; } + + double* cuda_gain_pointer() const { return cuda_gain_; } + + double* cuda_leaf_value_pointer() const { return cuda_leaf_value_; } + + const data_size_t** cuda_data_indices_in_leaf_pointer_pointer() { return cuda_data_indices_in_leaf_; } + void Test() { PrintLastCUDAError(); double test_sum_of_gradients = 0.0f, test_sum_of_hessians = 0.0f; @@ -72,7 +86,7 @@ class CUDALeafSplits { double* cuda_leaf_value_; // CUDA memory, held by other object - const data_size_t* cuda_data_indices_in_leaf_; + const data_size_t** cuda_data_indices_in_leaf_; const score_t* cuda_gradients_; const score_t* cuda_hessians_; const int* cuda_num_data_; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index ea35ae40b5a8..4383da763b9a 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -34,7 +34,7 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia cuda_larger_leaf_splits_->Init(); cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, this->config_->num_leaves, num_threads_, - cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians())); + cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), share_state_->feature_hist_offsets())); cuda_histogram_constructor_->Init(train_data_); //cuda_histogram_constructor_->TestAfterInit(); @@ -209,11 +209,59 @@ void NewCUDATreeLearner::Split(Tree* /*tree*/, int /*best_leaf*/, } Tree* NewCUDATreeLearner::Train(const score_t* gradients, - const score_t *hessians, bool /*is_first_tree*/) { + const score_t* hessians, bool /*is_first_tree*/) { gradients_ = gradients; hessians_ = hessians; BeforeTrain(); - cuda_data_partition_->Test(); + const auto start = std::chrono::steady_clock::now(); + for (int i = 0; i < config_->num_leaves - 1; ++i) { + cuda_histogram_constructor_->ConstructHistogramForLeaf( + cuda_smaller_leaf_splits_->cuda_leaf_index(), + cuda_larger_leaf_splits_->cuda_leaf_index(), + cuda_smaller_leaf_splits_->cuda_data_indices_in_leaf(), + cuda_larger_leaf_splits_->cuda_data_indices_in_leaf(), + cuda_data_partition_->cuda_leaf_num_data()); + + cuda_best_split_finder_->FindBestSplitsForLeaf(cuda_smaller_leaf_splits_.get(), + cuda_larger_leaf_splits_.get()); + + cuda_best_split_finder_->FindBestFromAllSplits(cuda_data_partition_->cuda_cur_num_leaves()); + + cuda_data_partition_->Split(cuda_best_split_finder_->cuda_best_leaf(), + cuda_best_split_finder_->cuda_leaf_best_split_feature(), + cuda_best_split_finder_->cuda_leaf_best_split_threshold(), + cuda_best_split_finder_->cuda_leaf_best_split_default_left(), + + cuda_best_split_finder_->cuda_leaf_best_split_left_sum_gradient(), + cuda_best_split_finder_->cuda_leaf_best_split_left_sum_hessian(), + 
cuda_best_split_finder_->cuda_leaf_best_split_left_count(), + cuda_best_split_finder_->cuda_leaf_best_split_left_gain(), + cuda_best_split_finder_->cuda_leaf_best_split_left_output(), + cuda_best_split_finder_->cuda_leaf_best_split_right_sum_gradient(), + cuda_best_split_finder_->cuda_leaf_best_split_right_sum_hessian(), + cuda_best_split_finder_->cuda_leaf_best_split_right_count(), + cuda_best_split_finder_->cuda_leaf_best_split_right_gain(), + cuda_best_split_finder_->cuda_leaf_best_split_right_output(), + + cuda_smaller_leaf_splits_->cuda_leaf_index_pointer(), + cuda_smaller_leaf_splits_->cuda_sum_of_gradients_pointer(), + cuda_smaller_leaf_splits_->cuda_sum_of_hessians_pointer(), + cuda_smaller_leaf_splits_->cuda_num_data_in_leaf_pointer(), + cuda_smaller_leaf_splits_->cuda_gain_pointer(), + cuda_smaller_leaf_splits_->cuda_leaf_value_pointer(), + cuda_smaller_leaf_splits_->cuda_data_indices_in_leaf_pointer_pointer(), + cuda_larger_leaf_splits_->cuda_leaf_index_pointer(), + cuda_larger_leaf_splits_->cuda_sum_of_gradients_pointer(), + cuda_larger_leaf_splits_->cuda_sum_of_hessians_pointer(), + cuda_larger_leaf_splits_->cuda_num_data_in_leaf_pointer(), + cuda_larger_leaf_splits_->cuda_gain_pointer(), + cuda_larger_leaf_splits_->cuda_leaf_value_pointer(), + cuda_larger_leaf_splits_->cuda_data_indices_in_leaf_pointer_pointer()); + } + const auto end = std::chrono::steady_clock::now(); + const double duration = (static_cast>(end - start)).count(); + Log::Warning("Train time %f", duration); + /*cuda_data_partition_->Test(); cuda_histogram_constructor_->ConstructHistogramForLeaf( cuda_smaller_leaf_splits_->cuda_leaf_index(), cuda_larger_leaf_splits_->cuda_leaf_index(), @@ -223,13 +271,13 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, cuda_best_split_finder_->FindBestSplitsForLeaf(cuda_smaller_leaf_splits_.get(), cuda_larger_leaf_splits_.get()); cuda_best_split_finder_->FindBestFromAllSplits(cuda_data_partition_->cuda_cur_num_leaves()); - cuda_best_split_finder_->TestAfterFindBestSplits(); + cuda_best_split_finder_->TestAfterFindBestSplits();*/ //cuda_data_partition_->TestPrefixSum(); - cuda_data_partition_->Split(cuda_best_split_finder_->cuda_best_leaf(), + /*cuda_data_partition_->Split(cuda_best_split_finder_->cuda_best_leaf(), cuda_best_split_finder_->cuda_leaf_best_split_feature(), cuda_best_split_finder_->cuda_leaf_best_split_threshold(), cuda_best_split_finder_->cuda_leaf_best_split_default_left()); - cuda_data_partition_->TestAfterSplit(); + cuda_data_partition_->TestAfterSplit();*/ //cuda_histogram_constructor_->TestAfterConstructHistogram(); /*CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); CUDASUCCESS_OR_FATAL(cudaMemcpy(device_gradients_[0], gradients, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice)); From aa0b3ded97982a8301401b1455e51fc056549f0d Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sun, 9 May 2021 13:37:38 +0000 Subject: [PATCH 007/166] single tree training framework --- .../cuda/cuda_best_split_finder.cpp | 5 +- .../cuda/cuda_best_split_finder.cu | 48 ++- .../cuda/cuda_best_split_finder.hpp | 4 +- src/treelearner/cuda/cuda_data_partition.cpp | 50 ++- src/treelearner/cuda/cuda_data_partition.cu | 294 ++++++++++++------ src/treelearner/cuda/cuda_data_partition.hpp | 52 +++- .../cuda/cuda_histogram_constructor.cpp | 16 +- .../cuda/cuda_histogram_constructor.cu | 162 ++++++---- .../cuda/cuda_histogram_constructor.hpp | 27 +- src/treelearner/cuda/cuda_leaf_splits.cpp | 8 +- src/treelearner/cuda/cuda_leaf_splits.cu | 16 +- src/treelearner/cuda/cuda_leaf_splits.hpp | 8 
+- .../cuda/new_cuda_tree_learner.cpp | 65 +++- 13 files changed, 536 insertions(+), 219 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index ed6fd72d25c6..53f07e458064 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -116,9 +116,11 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_le smaller_leaf_splits->cuda_sum_of_gradients(), smaller_leaf_splits->cuda_sum_of_hessians(), smaller_leaf_splits->cuda_num_data_in_leaf(), + smaller_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), larger_leaf_splits->cuda_sum_of_gradients(), larger_leaf_splits->cuda_sum_of_hessians(), - larger_leaf_splits->cuda_num_data_in_leaf()); + larger_leaf_splits->cuda_num_data_in_leaf(), + larger_leaf_splits->cuda_hist_in_leaf_pointer_pointer()); SynchronizeCUDADevice(); LaunchSyncBestSplitForLeafKernel(smaller_leaf_splits->cuda_leaf_index(), larger_leaf_splits->cuda_leaf_index()); SynchronizeCUDADevice(); @@ -130,6 +132,7 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_le void CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves) { auto start = std::chrono::steady_clock::now(); LaunchFindBestFromAllSplitsKernel(cuda_cur_num_leaves); + SynchronizeCUDADevice(); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); //Log::Warning("FindBestFromAllSplits time %f", duration); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 64912bed4842..9707845fa31f 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -241,7 +241,7 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, if (*output_found) { *output_threshold = best_threshold; - *output_gain = best_gain; + *output_gain = best_gain - min_gain_shift; *output_default_left = reverse; *output_left_sum_gradients = best_sum_left_gradient; *output_left_sum_hessians = best_sum_left_hessian; @@ -257,7 +257,7 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, best_sum_left_hessian, lambda_l1, use_l1, lambda_l2); *output_left_gain = GetLeafGainGivenOutput(best_sum_left_gradient, best_sum_left_hessian, lambda_l1, use_l1, lambda_l2, *output_left_output); - *output_left_output = CalculateSplittedLeafOutput(best_sum_right_gradient, + *output_right_output = CalculateSplittedLeafOutput(best_sum_right_gradient, best_sum_right_hessian, lambda_l1, use_l1, lambda_l2); *output_right_gain = GetLeafGainGivenOutput(best_sum_right_gradient, best_sum_right_hessian, lambda_l1, use_l1, lambda_l2, *output_right_output); @@ -268,14 +268,14 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* const uint32_t* feature_hist_offsets, const uint8_t* feature_mfb_offsets, const uint32_t* feature_default_bins, const uint8_t* feature_missing_types, const double* lambda_l1, const double* lambda_l2, const int* smaller_leaf_id, const int* larger_leaf_id, const double* smaller_leaf_gain, const double* larger_leaf_gain, const double* sum_gradients_in_smaller_leaf, - const double* sum_hessians_in_smaller_leaf, const data_size_t* num_data_in_smaller_leaf, + const double* sum_hessians_in_smaller_leaf, const data_size_t* num_data_in_smaller_leaf, hist_t** smaller_leaf_hist, const double* sum_gradients_in_larger_leaf, const double* 
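// --- Editorial note (not part of this patch): the `*output_gain = best_gain - min_gain_shift`
// change above records the gain relative to the parent, as the CPU learner does. Ignoring L1
// regularization, path smoothing and constraints, the quantities involved look like the
// sketch below (illustrative helper names):
static double LeafGain(double sum_grad, double sum_hess, double lambda_l2) {
  return (sum_grad * sum_grad) / (sum_hess + lambda_l2);
}

static double LeafOutput(double sum_grad, double sum_hess, double lambda_l2) {
  return -sum_grad / (sum_hess + lambda_l2);   // optimal leaf value for the second-order expansion
}

// Relative gain of splitting a parent into (left, right); min_gain_to_split is the extra
// shift that the fix above subtracts from the raw left + right gain.
static double SplitGain(double g_left, double h_left, double g_right, double h_right,
                        double lambda_l2, double min_gain_to_split) {
  const double parent_gain = LeafGain(g_left + g_right, h_left + h_right, lambda_l2);
  const double min_gain_shift = parent_gain + min_gain_to_split;
  return LeafGain(g_left, h_left, lambda_l2) + LeafGain(g_right, h_right, lambda_l2) - min_gain_shift;
}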
sum_hessians_in_larger_leaf, - const data_size_t* num_data_in_larger_leaf, const data_size_t* min_data_in_leaf, + const data_size_t* num_data_in_larger_leaf, hist_t** larger_leaf_hist, const data_size_t* min_data_in_leaf, const double* min_sum_hessian_in_leaf, const double* min_gain_to_split, // output uint32_t* cuda_best_split_threshold, uint8_t* cuda_best_split_default_left, double* cuda_best_split_gain, double* cuda_best_split_left_sum_gradient, - double* cuda_best_split_left_sum_hessian, data_size_t* cuda_best_split_left_count, + double* cuda_best_split_left_sum_hessian, data_size_t* cuda_best_split_left_count, double* cuda_best_split_left_gain, double* cuda_best_split_left_output, double* cuda_best_split_right_sum_gradient, double* cuda_best_split_right_sum_hessian, data_size_t* cuda_best_split_right_count, double* cuda_best_split_right_gain, @@ -289,6 +289,9 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* const uint8_t missing_type = feature_missing_types[inner_feature_index]; const int leaf_index = smaller_or_larger ? *smaller_leaf_id : *larger_leaf_id; const double parent_gain = smaller_or_larger ? *smaller_leaf_gain : *larger_leaf_gain; + /*if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("parent_gain = %f\n", parent_gain); + }*/ const double sum_gradients = smaller_or_larger ? *sum_gradients_in_smaller_leaf : *sum_gradients_in_larger_leaf; const double sum_hessians = smaller_or_larger ? *sum_hessians_in_smaller_leaf : *sum_hessians_in_larger_leaf; const double num_data_in_leaf = smaller_or_larger ? *num_data_in_smaller_leaf : *num_data_in_larger_leaf; @@ -311,7 +314,8 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* return; } const int cuda_num_total_bin_ref = *cuda_num_total_bin; - const hist_t* hist_ptr = cuda_hist + (cuda_num_total_bin_ref * leaf_index + feature_hist_offsets[inner_feature_index]) * 2; + const hist_t* hist_ptr = smaller_or_larger ? 
*smaller_leaf_hist + feature_hist_offsets[inner_feature_index] * 2 : + *larger_leaf_hist + feature_hist_offsets[inner_feature_index] * 2;// cuda_hist + (cuda_num_total_bin_ref * leaf_index + feature_hist_offsets[inner_feature_index]) * 2; if (num_bin > 2 && missing_type != 0) { if (missing_type == 1) { FindBestSplitsForLeafKernelInner(hist_ptr, @@ -348,9 +352,9 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel(const int* smaller_leaf_id, const int* larger_leaf_id, const double* smaller_leaf_gain, const double* larger_leaf_gain, const double* sum_gradients_in_smaller_leaf, - const double* sum_hessians_in_smaller_leaf, const data_size_t* num_data_in_smaller_leaf, + const double* sum_hessians_in_smaller_leaf, const data_size_t* num_data_in_smaller_leaf, hist_t** smaller_leaf_hist, const double* sum_gradients_in_larger_leaf, const double* sum_hessians_in_larger_leaf, - const data_size_t* num_data_in_larger_leaf) { + const data_size_t* num_data_in_larger_leaf, hist_t** larger_leaf_hist) { // * 2 for smaller and larger leaves, * 2 for split direction const int num_blocks = num_features_ * 4; FindBestSplitsForLeafKernel<<>>(cuda_hist_, @@ -358,8 +362,8 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel(const int* smaller_l cuda_feature_mfb_offsets_, cuda_feature_default_bins_, cuda_feature_missing_type_, cuda_lambda_l1_, cuda_lambda_l2_, smaller_leaf_id, larger_leaf_id, smaller_leaf_gain, larger_leaf_gain, - sum_gradients_in_smaller_leaf, sum_hessians_in_smaller_leaf, num_data_in_smaller_leaf, - sum_gradients_in_larger_leaf, sum_hessians_in_larger_leaf, num_data_in_larger_leaf, + sum_gradients_in_smaller_leaf, sum_hessians_in_smaller_leaf, num_data_in_smaller_leaf, smaller_leaf_hist, + sum_gradients_in_larger_leaf, sum_hessians_in_larger_leaf, num_data_in_larger_leaf, larger_leaf_hist, cuda_min_data_in_leaf_, cuda_min_sum_hessian_in_leaf_, cuda_min_gain_to_split_, cuda_best_split_threshold_, cuda_best_split_default_left_, cuda_best_split_gain_, @@ -451,12 +455,18 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const smaller_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[smaller_reverse_index]; smaller_leaf_best_split_left_count = cuda_best_split_left_count[smaller_reverse_index]; smaller_leaf_best_split_left_gain = cuda_best_split_left_gain[smaller_reverse_index]; + //printf("leaf index %d split left gain update to %f\n", smaller_leaf_index_ref, smaller_leaf_best_split_left_gain); smaller_leaf_best_split_left_output = cuda_best_split_left_output[smaller_reverse_index]; smaller_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[smaller_reverse_index]; smaller_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[smaller_reverse_index]; smaller_leaf_best_split_right_count = cuda_best_split_right_count[smaller_reverse_index]; smaller_leaf_best_split_right_gain = cuda_best_split_right_gain[smaller_reverse_index]; + //printf("leaf index %d split right gain update to %f\n", smaller_leaf_index_ref, smaller_leaf_best_split_right_gain); smaller_leaf_best_split_right_output = cuda_best_split_right_output[smaller_reverse_index]; + /*printf("smaller_leaf_index = %d, smaller_leaf_best_gain = %f, smaller_leaf_best_split_left_sum_gradient = %f, smaller_leaf_best_split_left_sum_hessian = %f\n", + smaller_leaf_index_ref, smaller_leaf_best_gain, smaller_leaf_best_split_left_sum_gradient, 
smaller_leaf_best_split_left_sum_hessian); + printf("smaller_leaf_index = %d, smaller_leaf_best_gain = %f, smaller_leaf_best_split_right_sum_gradient = %f, smaller_leaf_best_split_right_sum_hessian = %f\n", + smaller_leaf_index_ref, smaller_leaf_best_gain, smaller_leaf_best_split_right_sum_gradient, smaller_leaf_best_split_right_sum_hessian);*/ } } const int smaller_non_reverse_index = 2 * feature_index + 1; @@ -476,12 +486,18 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const smaller_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[smaller_non_reverse_index]; smaller_leaf_best_split_left_count = cuda_best_split_left_count[smaller_non_reverse_index]; smaller_leaf_best_split_left_gain = cuda_best_split_left_gain[smaller_non_reverse_index]; + //printf("leaf index %d split left gain update to %f\n", smaller_leaf_index_ref, smaller_leaf_best_split_left_gain); smaller_leaf_best_split_left_output = cuda_best_split_left_output[smaller_non_reverse_index]; smaller_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[smaller_non_reverse_index]; smaller_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[smaller_non_reverse_index]; smaller_leaf_best_split_right_count = cuda_best_split_right_count[smaller_non_reverse_index]; smaller_leaf_best_split_right_gain = cuda_best_split_right_gain[smaller_non_reverse_index]; + //printf("leaf index %d split right gain update to %f\n", smaller_leaf_index_ref, smaller_leaf_best_split_right_gain); smaller_leaf_best_split_right_output = cuda_best_split_right_output[smaller_non_reverse_index]; + /*printf("smaller_leaf_index = %d, smaller_leaf_best_gain = %f, smaller_leaf_best_split_left_sum_gradient = %f, smaller_leaf_best_split_left_sum_hessian = %f\n", + smaller_leaf_index_ref, smaller_leaf_best_gain, smaller_leaf_best_split_left_sum_gradient, smaller_leaf_best_split_left_sum_hessian); + printf("smaller_leaf_index = %d, smaller_leaf_best_gain = %f, smaller_leaf_best_split_right_sum_gradient = %f, smaller_leaf_best_split_right_sum_hessian = %f\n", + smaller_leaf_index_ref, smaller_leaf_best_gain, smaller_leaf_best_split_right_sum_gradient, smaller_leaf_best_split_right_sum_hessian);*/ } } @@ -500,12 +516,18 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const larger_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[larger_reverse_index]; larger_leaf_best_split_left_count = cuda_best_split_left_count[larger_reverse_index]; larger_leaf_best_split_left_gain = cuda_best_split_left_gain[larger_reverse_index]; + //printf("leaf index %d split left gain update to %f\n", larger_leaf_index_ref, larger_leaf_best_split_left_gain); larger_leaf_best_split_left_output = cuda_best_split_left_output[larger_reverse_index]; larger_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[larger_reverse_index]; larger_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[larger_reverse_index]; larger_leaf_best_split_right_count = cuda_best_split_right_count[larger_reverse_index]; larger_leaf_best_split_right_gain = cuda_best_split_right_gain[larger_reverse_index]; + //printf("leaf index %d split right gain update to %f\n", larger_leaf_index_ref, larger_leaf_best_split_right_gain); larger_leaf_best_split_right_output = cuda_best_split_right_output[larger_reverse_index]; + /*printf("larger_leaf_index = %d, larger_leaf_best_gain = %f, larger_leaf_best_split_left_sum_gradient = %f, larger_leaf_best_split_left_sum_hessian = 
%f\n", + larger_leaf_index_ref, larger_leaf_best_gain, larger_leaf_best_split_left_sum_gradient, larger_leaf_best_split_left_sum_hessian); + printf("larger_leaf_index = %d, larger_leaf_best_gain = %f, larger_leaf_best_split_right_sum_gradient = %f, larger_leaf_best_split_right_sum_hessian = %f\n", + larger_leaf_index_ref, larger_leaf_best_gain, larger_leaf_best_split_right_sum_gradient, larger_leaf_best_split_right_sum_hessian);*/ } } const int larger_non_reverse_index = 2 * feature_index + 1 + larger_leaf_offset; @@ -522,12 +544,18 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const larger_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[larger_non_reverse_index]; larger_leaf_best_split_left_count = cuda_best_split_left_count[larger_non_reverse_index]; larger_leaf_best_split_left_gain = cuda_best_split_left_gain[larger_non_reverse_index]; + //printf("leaf index %d split left gain update to %f\n", larger_leaf_index_ref, larger_leaf_best_split_left_gain); larger_leaf_best_split_left_output = cuda_best_split_left_output[larger_non_reverse_index]; larger_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[larger_non_reverse_index]; larger_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[larger_non_reverse_index]; larger_leaf_best_split_right_count = cuda_best_split_right_count[larger_non_reverse_index]; larger_leaf_best_split_right_gain = cuda_best_split_right_gain[larger_non_reverse_index]; + //printf("leaf index %d split right gain update to %f\n", larger_leaf_index_ref, larger_leaf_best_split_right_gain); larger_leaf_best_split_right_output = cuda_best_split_right_output[larger_non_reverse_index]; + /*printf("larger_leaf_index = %d, larger_leaf_best_gain = %f, larger_leaf_best_split_left_sum_gradient = %f, larger_leaf_best_split_left_sum_hessian = %f\n", + larger_leaf_index_ref, larger_leaf_best_gain, larger_leaf_best_split_left_sum_gradient, larger_leaf_best_split_left_sum_hessian); + printf("larger_leaf_index = %d, larger_leaf_best_gain = %f, larger_leaf_best_split_right_sum_gradient = %f, larger_leaf_best_split_right_sum_hessian = %f\n", + larger_leaf_index_ref, larger_leaf_best_gain, larger_leaf_best_split_right_sum_gradient, larger_leaf_best_split_right_sum_hessian);*/ } } } diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 72c173b1e438..18541c4e51e0 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -91,9 +91,9 @@ class CUDABestSplitFinder { private: void LaunchFindBestSplitsForLeafKernel(const int* smaller_leaf_id, const int* larger_leaf_id, const double* smaller_leaf_gain, const double* larger_leaf_gain, const double* sum_gradients_in_smaller_leaf, - const double* sum_hessians_in_smaller_leaf, const data_size_t* num_data_in_smaller_leaf, + const double* sum_hessians_in_smaller_leaf, const data_size_t* num_data_in_smaller_leaf, hist_t** smaller_leaf_hist, const double* sum_gradients_in_larger_leaf, const double* sum_hessians_in_larger_leaf, - const data_size_t* num_data_in_larger_leaf); + const data_size_t* num_data_in_larger_leaf, hist_t** larger_leaf_hist); void LaunchSyncBestSplitForLeafKernel(const int* smaller_leaf_index, const int* larger_leaf_index); diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 18d29672f138..c579dca309fb 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ 
b/src/treelearner/cuda/cuda_data_partition.cpp @@ -12,13 +12,16 @@ namespace LightGBM { CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_features, const int num_leaves, const int num_threads, const data_size_t* cuda_num_data, const int* cuda_num_leaves, const uint8_t* cuda_data, - const int* cuda_num_features, const std::vector& feature_hist_offsets, const Dataset* train_data): + const int* cuda_num_features, const std::vector& feature_hist_offsets, const Dataset* train_data, + hist_t* cuda_hist): num_data_(num_data), num_features_(num_features), num_leaves_(num_leaves), num_threads_(num_threads), - cuda_data_(cuda_data), cuda_num_features_(cuda_num_features) { + num_total_bin_(feature_hist_offsets.back()), cuda_data_(cuda_data), cuda_num_features_(cuda_num_features), + cuda_hist_(cuda_hist) { cuda_num_data_ = cuda_num_data; cuda_num_leaves_ = cuda_num_leaves; max_num_split_indices_blocks_ = (num_data_ + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION; + cur_num_leaves_ = 1; feature_default_bins_.resize(train_data->num_features()); feature_most_freq_bins_.resize(train_data->num_features()); feature_max_bins_.resize(train_data->num_features()); @@ -38,8 +41,8 @@ CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_f const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); feature_default_bins_[feature_index] = bin_mapper->GetDefaultBin(); feature_most_freq_bins_[feature_index] = bin_mapper->GetMostFreqBin(); - Log::Warning("feature_index = %d, feature_hist_offsets[feature_index] = %d, prev_group_bins = %d", - feature_index, feature_hist_offsets[feature_index], prev_group_bins); + /*Log::Warning("feature_index = %d, feature_hist_offsets[feature_index] = %d, prev_group_bins = %d", + feature_index, feature_hist_offsets[feature_index], prev_group_bins);*/ feature_min_bins_[feature_index] = feature_hist_offsets[feature_index] - prev_group_bins; feature_max_bins_[feature_index] = feature_hist_offsets[feature_index + 1] - prev_group_bins - 1; const MissingType missing_type = bin_mapper->missing_type(); @@ -68,6 +71,8 @@ CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_f } } } + num_data_in_leaf_.resize(num_leaves_, 0); + num_data_in_leaf_[0] = num_data_; } void CUDADataPartition::Init() { @@ -76,11 +81,14 @@ void CUDADataPartition::Init() { AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_data_start_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_data_end_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_num_data_); + InitCUDAValueFromConstant(&cuda_num_total_bin_, num_total_bin_); InitCUDAValueFromConstant(&cuda_cur_num_leaves_, 1); AllocateCUDAMemory(static_cast(num_data_), &cuda_data_to_left_); AllocateCUDAMemory(static_cast(max_num_split_indices_blocks_), &cuda_block_data_to_left_offset_); AllocateCUDAMemory(static_cast(max_num_split_indices_blocks_), &cuda_block_data_to_right_offset_); AllocateCUDAMemory(static_cast(num_data_), &cuda_out_data_indices_in_leaf_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_hist_pool_); + CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1); InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bins_, feature_most_freq_bins_.data(), static_cast(num_features_)); InitCUDAMemoryFromHostMemory(&cuda_feature_default_bins_, feature_default_bins_.data(), static_cast(num_features_)); @@ -121,17 +129,25 @@ void CUDADataPartition::Split(const int* leaf_id, double* 
smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** smaller_leaf_cuda_hist_pointer_pointer, int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer) { + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** larger_leaf_cuda_hist_pointer_pointer) { + int leaf_index_cpu = 0; + global_timer.Start("GenDataToLeftBitVector"); + CopyFromCUDADeviceToHost(&leaf_index_cpu, leaf_id, 1); + const data_size_t num_data_in_leaf = num_data_in_leaf_[leaf_index_cpu]; auto start = std::chrono::steady_clock::now(); - GenDataToLeftBitVector(leaf_id, best_split_feature, best_split_threshold, best_split_default_left); + GenDataToLeftBitVector(leaf_id, num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); + global_timer.Stop("GenDataToLeftBitVector"); //Log::Warning("CUDADataPartition::GenDataToLeftBitVector time %f", duration); + global_timer.Start("SplitInner"); start = std::chrono::steady_clock::now(); - SplitInner(leaf_id, + SplitInner(leaf_id, num_data_in_leaf, best_left_sum_gradients, best_left_sum_hessians, best_left_count, best_left_gain, best_left_leaf_value, best_right_sum_gradients, best_right_sum_hessians, best_right_count, @@ -140,23 +156,27 @@ void CUDADataPartition::Split(const int* leaf_id, smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + smaller_leaf_cuda_hist_pointer_pointer, larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, - larger_leaf_cuda_data_indices_in_leaf_pointer_pointer); + larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + larger_leaf_cuda_hist_pointer_pointer, leaf_index_cpu); end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); + global_timer.Stop("SplitInner"); //Log::Warning("CUDADataPartition::SplitInner time %f", duration); } void CUDADataPartition::GenDataToLeftBitVector(const int* leaf_id, + const data_size_t num_data_in_leaf, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left) { - LaunchGenDataToLeftBitVectorKernel(leaf_id, best_split_feature, best_split_threshold, best_split_default_left); + LaunchGenDataToLeftBitVectorKernel(leaf_id, num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left); } -void CUDADataPartition::SplitInner(const int* leaf_index, +void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_data_in_leaf, const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, const double* 
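// --- Editorial note (not part of this patch): Split() above now copies the chosen leaf
// index back to the host so the launch configuration can be derived from the CPU-side
// num_data_in_leaf_ bookkeeping. A minimal sketch of that single-value read-back using the
// plain CUDA runtime API (illustrative name, error checks omitted):
#include <cuda_runtime.h>

static int ReadLeafIndexFromDevice(const int* d_leaf_index) {
  int leaf_index = 0;
  cudaMemcpy(&leaf_index, d_leaf_index, sizeof(int), cudaMemcpyDeviceToHost);  // synchronizes with the default stream
  return leaf_index;
}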
best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, @@ -166,11 +186,13 @@ void CUDADataPartition::SplitInner(const int* leaf_index, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** smaller_leaf_cuda_hist_pointer_pointer, int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer) { - LaunchSplitInnerKernel(leaf_index, + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** larger_leaf_cuda_hist_pointer_pointer, const int cpu_leaf_index) { + LaunchSplitInnerKernel(leaf_index, num_data_in_leaf, best_left_sum_gradients, best_left_sum_hessians, best_left_count, best_left_gain, best_left_leaf_value, best_right_sum_gradients, best_right_sum_hessians, best_right_count, @@ -179,10 +201,12 @@ void CUDADataPartition::SplitInner(const int* leaf_index, smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + smaller_leaf_cuda_hist_pointer_pointer, larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, - larger_leaf_cuda_data_indices_in_leaf_pointer_pointer); + larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_index); } Tree* CUDADataPartition::GetCPUTree() {} diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 0abe7798e93d..19fa0c348071 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -71,7 +71,13 @@ __global__ void GenDataToLeftBitVectorKernel(const int* leaf_index, const data_s const uint32_t* min_bin, const uint32_t* max_bin, const uint8_t* missing_is_zero, const uint8_t* missing_is_na, const uint8_t* mfb_is_zero, const uint8_t* mfb_is_na, uint8_t* cuda_data_to_left) { + /*if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("GenDataToLeftBitVectorKernel step 0\n"); + }*/ const int leaf_index_ref = *leaf_index; + /*if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("GenDataToLeftBitVectorKernel leaf_index_ref = %d\n", leaf_index_ref); + }*/ const int best_split_feature_ref = best_split_feature[leaf_index_ref]; const int num_features_ref = *cuda_num_features; const uint32_t best_split_threshold_ref = best_split_threshold[leaf_index_ref]; @@ -80,73 +86,92 @@ __global__ void GenDataToLeftBitVectorKernel(const int* leaf_index, const data_s const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref]; const data_size_t* data_indices_in_leaf = cuda_data_indices + leaf_num_data_offset; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const unsigned int global_feature_value_index = 
global_data_index * num_features_ref + best_split_feature_ref; - const uint32_t default_bin_ref = default_bin[best_split_feature_ref]; - const uint32_t most_freq_bin_ref = most_freq_bin[best_split_feature_ref]; - const uint32_t max_bin_ref = max_bin[best_split_feature_ref]; - const uint32_t min_bin_ref = min_bin[best_split_feature_ref]; - const uint8_t missing_is_zero_ref = missing_is_zero[best_split_feature_ref]; - const uint8_t missing_is_na_ref = missing_is_na[best_split_feature_ref]; - const uint8_t mfb_is_zero_ref = mfb_is_zero[best_split_feature_ref]; - const uint8_t mfb_is_na_ref = mfb_is_na[best_split_feature_ref]; - - uint32_t th = best_split_threshold_ref + min_bin_ref; - uint32_t t_zero_bin = min_bin_ref + default_bin_ref; - if (most_freq_bin_ref == 0) { - --th; - --t_zero_bin; - } - uint8_t split_default_to_left = 0; - uint8_t split_missing_default_to_left = 0; - if (most_freq_bin_ref <= best_split_threshold_ref) { - split_default_to_left = 1; - } - if (missing_is_zero_ref || missing_is_na_ref) { - if (default_left_ref) { - split_missing_default_to_left = 1; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const unsigned int global_feature_value_index = global_data_index * num_features_ref + best_split_feature_ref; + const uint32_t default_bin_ref = default_bin[best_split_feature_ref]; + const uint32_t most_freq_bin_ref = most_freq_bin[best_split_feature_ref]; + const uint32_t max_bin_ref = max_bin[best_split_feature_ref]; + const uint32_t min_bin_ref = min_bin[best_split_feature_ref]; + const uint8_t missing_is_zero_ref = missing_is_zero[best_split_feature_ref]; + const uint8_t missing_is_na_ref = missing_is_na[best_split_feature_ref]; + const uint8_t mfb_is_zero_ref = mfb_is_zero[best_split_feature_ref]; + const uint8_t mfb_is_na_ref = mfb_is_na[best_split_feature_ref]; + /*if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("GenDataToLeftBitVectorKernel step 1\n"); + }*/ + uint32_t th = best_split_threshold_ref + min_bin_ref; + uint32_t t_zero_bin = min_bin_ref + default_bin_ref; + if (most_freq_bin_ref == 0) { + --th; + --t_zero_bin; } - } - - if (local_data_index < static_cast(num_data_in_leaf)) { - const uint32_t bin = static_cast(cuda_data[global_feature_value_index]); - if (min_bin_ref < max_bin_ref) { - if ((missing_is_zero_ref && !mfb_is_zero_ref && bin == t_zero_bin)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - } else if (bin < min_bin_ref || bin > max_bin_ref) { - if ((missing_is_na_ref || mfb_is_na_ref) || (missing_is_zero_ref || mfb_is_zero_ref)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - } - } else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - } else { - cuda_data_to_left[local_data_index] = 1; + uint8_t split_default_to_left = 0; + uint8_t split_missing_default_to_left = 0; + if (most_freq_bin_ref <= best_split_threshold_ref) { + split_default_to_left = 1; + } + if (missing_is_zero_ref || missing_is_na_ref) { + if (default_left_ref) { + split_missing_default_to_left = 1; } - } else { - if (missing_is_zero_ref || !mfb_is_zero_ref && bin == t_zero_bin) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - } else if (bin != max_bin_ref) { - if ((missing_is_na_ref && mfb_is_na_ref) || (missing_is_zero_ref && mfb_is_zero_ref)) { + } + /*if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("GenDataToLeftBitVectorKernel 
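// --- Editorial note (not part of this patch): a much-simplified view of the per-row decision
// this kernel (GenDataToLeftBitVectorKernel) makes. The real code also distinguishes zero vs.
// NaN missing values, most-frequent-bin offsets and out-of-range bins; here the default and
// missing handling is collapsed into a single branch for brevity (illustrative name):
#include <cstdint>

static inline uint8_t GoesLeftSimplified(uint32_t bin, uint32_t split_threshold,
                                         uint32_t default_bin, uint8_t default_left) {
  if (bin == default_bin) {
    return default_left;                   // rows at the default / most frequent bin follow default_left
  }
  return bin <= split_threshold ? 1 : 0;   // numerical split: bins at or below the threshold go left
}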
step 2\n"); + }*/ + if (local_data_index < static_cast(num_data_in_leaf)) { + /*if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("GenDataToLeftBitVectorKernel step 3\n"); + }*/ + const uint32_t bin = static_cast(cuda_data[global_feature_value_index]); + if (min_bin_ref < max_bin_ref) { + if ((missing_is_zero_ref && !mfb_is_zero_ref && bin == t_zero_bin)) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin < min_bin_ref || bin > max_bin_ref) { + if ((missing_is_na_ref || mfb_is_na_ref) || (missing_is_zero_ref || mfb_is_zero_ref)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + } + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; } else { - cuda_data_to_left[local_data_index] = split_default_to_left; + cuda_data_to_left[local_data_index] = 1; } } else { - if (missing_is_na_ref && !mfb_is_na_ref) { + if (missing_is_zero_ref || !mfb_is_zero_ref && bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin != max_bin_ref) { + if ((missing_is_na_ref && mfb_is_na_ref) || (missing_is_zero_ref && mfb_is_zero_ref)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + } } else { - cuda_data_to_left[local_data_index] = split_default_to_left; + if (missing_is_na_ref && !mfb_is_na_ref) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + } } } + /*if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("GenDataToLeftBitVectorKernel step 4\n"); + }*/ } } } -void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const int* leaf_index, const int* best_split_feature, +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const int* leaf_index, const data_size_t num_data_in_leaf, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left) { - GenDataToLeftBitVectorKernel<<>>( + const int num_blocks = std::max(80, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; + int split_indices_block_size_data_partition_aligned = 1; + while (split_indices_block_size_data_partition > 0) { + split_indices_block_size_data_partition_aligned <<= 1; + split_indices_block_size_data_partition >>= 1; + } + GenDataToLeftBitVectorKernel<<>>( leaf_index, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, best_split_feature, best_split_threshold, cuda_num_features_, cuda_data_, @@ -159,12 +184,13 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const int* leaf_index __global__ void PrepareOffsetKernel(const int* leaf_index, const data_size_t* cuda_leaf_num_data, const uint8_t* split_to_left_bit_vector, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer) { + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { const unsigned int blockDim_x = blockDim.x; __shared__ uint32_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; - __shared__ uint32_t 
thread_to_right_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; + //__shared__ uint32_t thread_to_right_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + // (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const unsigned int threadIdx_x = threadIdx.x; const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); const unsigned int global_read_index = blockIdx.x * blockDim.x * 2 + threadIdx_x; @@ -175,28 +201,36 @@ __global__ void PrepareOffsetKernel(const int* leaf_index, if (global_read_index < num_data_in_leaf_ref) { const uint8_t bit = split_to_left_bit_vector[global_read_index]; thread_to_left_offset_cnt[conflict_free_threadIdx_x] = bit; - thread_to_right_offset_cnt[conflict_free_threadIdx_x] = 1 - bit; + //thread_to_right_offset_cnt[conflict_free_threadIdx_x] = 1 - bit; } else { thread_to_left_offset_cnt[conflict_free_threadIdx_x] = 0; - thread_to_right_offset_cnt[conflict_free_threadIdx_x] = 0; + //thread_to_right_offset_cnt[conflict_free_threadIdx_x] = 0; } const unsigned int conflict_free_threadIdx_x_offseted = CONFLICT_FREE_INDEX(threadIdx_x + blockDim_x); if (global_read_index + blockDim_x < num_data_in_leaf_ref) { const uint8_t bit = split_to_left_bit_vector[global_read_index + blockDim_x]; thread_to_left_offset_cnt[conflict_free_threadIdx_x_offseted] = bit; - thread_to_right_offset_cnt[conflict_free_threadIdx_x_offseted] = 1 - bit; + //thread_to_right_offset_cnt[conflict_free_threadIdx_x_offseted] = 1 - bit; } else { thread_to_left_offset_cnt[conflict_free_threadIdx_x_offseted] = 0; - thread_to_right_offset_cnt[conflict_free_threadIdx_x_offseted] = 0; + //thread_to_right_offset_cnt[conflict_free_threadIdx_x_offseted] = 0; } __syncthreads(); - PrefixSum(thread_to_left_offset_cnt, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); - PrefixSum(thread_to_right_offset_cnt, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + PrefixSum(thread_to_left_offset_cnt, split_indices_block_size_data_partition); + //PrefixSum(thread_to_right_offset_cnt, split_indices_block_size_data_partition); __syncthreads(); if (threadIdx_x == 0) { const unsigned int conflict_free_blockDim_x_times_2 = CONFLICT_FREE_INDEX(blockDim_x << 1); - block_to_left_offset_buffer[blockIdx.x + 1] = thread_to_left_offset_cnt[conflict_free_blockDim_x_times_2]; - block_to_right_offset_buffer[blockIdx.x + 1] = thread_to_right_offset_cnt[conflict_free_blockDim_x_times_2]; + const data_size_t num_data_in_block = (blockIdx.x + 1) * blockDim.x * 2 <= num_data_in_leaf_ref ? 
static_cast(blockDim_x * 2) : + num_data_in_leaf_ref - static_cast(blockIdx.x * blockDim.x * 2); + if (num_data_in_block > 0) { + const data_size_t data_to_left = static_cast(thread_to_left_offset_cnt[conflict_free_blockDim_x_times_2]); + block_to_left_offset_buffer[blockIdx.x + 1] = data_to_left; + block_to_right_offset_buffer[blockIdx.x + 1] = num_data_in_block - data_to_left; + } else { + block_to_left_offset_buffer[blockIdx.x + 1] = 0; + block_to_right_offset_buffer[blockIdx.x + 1] = 0; + } } } @@ -213,10 +247,14 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** smaller_leaf_cuda_hist_pointer_pointer, int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer) { + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** larger_leaf_cuda_hist_pointer_pointer, + const int* cuda_num_total_bin, + hist_t* cuda_hist, hist_t** cuda_hist_pool, const int split_indices_block_size_data_partition) { __shared__ uint32_t block_to_left_offset[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; __shared__ uint32_t block_to_right_offset[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2 + @@ -227,8 +265,8 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b const unsigned int threadIdx_x = threadIdx.x; const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); const unsigned int conflict_free_threadIdx_x_plus_blockDim_x = CONFLICT_FREE_INDEX(threadIdx_x + blockDim_x); - const uint32_t num_blocks = (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION; - const uint32_t num_aggregate_blocks = (num_blocks + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION; + const uint32_t num_blocks = (num_data_in_leaf + split_indices_block_size_data_partition - 1) / split_indices_block_size_data_partition; + const uint32_t num_aggregate_blocks = (num_blocks + split_indices_block_size_data_partition - 1) / split_indices_block_size_data_partition; uint32_t left_prev_sum = 0; for (uint32_t block_id = 0; block_id < num_aggregate_blocks; ++block_id) { const unsigned int read_index = block_id * blockDim_x * 2 + threadIdx_x; @@ -247,10 +285,10 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b block_to_left_offset[0] += left_prev_sum; } __syncthreads(); - PrefixSum(block_to_left_offset, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + PrefixSum(block_to_left_offset, split_indices_block_size_data_partition); __syncthreads(); if (threadIdx_x == 0) { - left_prev_sum = block_to_left_offset[CONFLICT_FREE_INDEX(SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION)]; + left_prev_sum = block_to_left_offset[CONFLICT_FREE_INDEX(split_indices_block_size_data_partition)]; } if (read_index < num_blocks) { const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 
1); @@ -281,10 +319,10 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b block_to_right_offset[0] += right_prev_sum; } __syncthreads(); - PrefixSum(block_to_right_offset, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + PrefixSum(block_to_right_offset, split_indices_block_size_data_partition); __syncthreads(); if (threadIdx_x == 0) { - right_prev_sum = block_to_right_offset[CONFLICT_FREE_INDEX(SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION)]; + right_prev_sum = block_to_right_offset[CONFLICT_FREE_INDEX(split_indices_block_size_data_partition)]; } if (read_index < num_blocks) { const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); @@ -307,40 +345,67 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b cuda_leaf_data_start[cur_max_leaf_index] = cuda_leaf_data_end[leaf_index_ref]; cuda_leaf_data_end[cur_max_leaf_index] = old_leaf_data_end; cuda_leaf_num_data[cur_max_leaf_index] = block_to_right_offset_buffer[num_blocks] - to_left_total_cnt; + const int cuda_num_total_bin_ref = *cuda_num_total_bin; ++(*cuda_cur_num_leaves); - + //printf("best_left_gain = %f, best_right_gain = %f\n", *best_left_gain, *best_right_gain); + //printf("cuda_leaf_num_data[%d] = %d, cuda_leaf_num_data[%d] = %d\n", + // leaf_index_ref, cuda_leaf_num_data[leaf_index_ref], cur_max_leaf_index, cuda_leaf_num_data[cur_max_leaf_index]); if (cuda_leaf_num_data[leaf_index_ref] < cuda_leaf_num_data[cur_max_leaf_index]) { *smaller_leaf_cuda_leaf_index_pointer = leaf_index_ref; - *smaller_leaf_cuda_sum_of_gradients_pointer = *best_left_sum_gradients; - *smaller_leaf_cuda_sum_of_hessians_pointer = *best_left_sum_hessians; - *smaller_leaf_cuda_num_data_in_leaf_pointer = *best_left_count; - *smaller_leaf_cuda_gain_pointer = *best_left_gain; - *smaller_leaf_cuda_leaf_value_pointer = *best_left_leaf_value; + *smaller_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; + *smaller_leaf_cuda_sum_of_hessians_pointer = best_left_sum_hessians[leaf_index_ref]; + *smaller_leaf_cuda_num_data_in_leaf_pointer = best_left_count[leaf_index_ref]; + *smaller_leaf_cuda_gain_pointer = best_left_gain[leaf_index_ref]; + *smaller_leaf_cuda_leaf_value_pointer = best_left_leaf_value[leaf_index_ref]; *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[leaf_index_ref]; *larger_leaf_cuda_leaf_index_pointer = cur_max_leaf_index; - *larger_leaf_cuda_sum_of_gradients_pointer = *best_right_sum_gradients; - *larger_leaf_cuda_sum_of_hessians_pointer = *best_right_sum_hessians; - *larger_leaf_cuda_num_data_in_leaf_pointer = *best_right_count; - *larger_leaf_cuda_gain_pointer = *best_right_gain; - *larger_leaf_cuda_leaf_value_pointer = *best_right_leaf_value; + *larger_leaf_cuda_sum_of_gradients_pointer = best_right_sum_gradients[leaf_index_ref]; + *larger_leaf_cuda_sum_of_hessians_pointer = best_right_sum_hessians[leaf_index_ref]; + *larger_leaf_cuda_num_data_in_leaf_pointer = best_right_count[leaf_index_ref]; + *larger_leaf_cuda_gain_pointer = best_right_gain[leaf_index_ref]; + *larger_leaf_cuda_leaf_value_pointer = best_right_leaf_value[leaf_index_ref]; *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[cur_max_leaf_index]; + + hist_t* parent_hist_ptr = cuda_hist_pool[leaf_index_ref]; + cuda_hist_pool[cur_max_leaf_index] = parent_hist_ptr; + cuda_hist_pool[leaf_index_ref] = cuda_hist + 2 * cur_max_leaf_index * cuda_num_total_bin_ref; + 
*smaller_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[leaf_index_ref]; + *larger_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[cur_max_leaf_index]; + //if (leaf_index_ref == 2) { + /*printf("*************** leaf %d sum_gradients %f sum_hessians %f\n", leaf_index_ref, *smaller_leaf_cuda_sum_of_gradients_pointer, + *smaller_leaf_cuda_sum_of_hessians_pointer); + printf("*************** leaf %d sum_gradients %f sum_hessians %f\n", cur_max_leaf_index, *larger_leaf_cuda_sum_of_gradients_pointer, + *larger_leaf_cuda_sum_of_hessians_pointer); + printf("leaf 2 cuda ptr in CUDA kernel = %ld\n", cuda_hist_pool[leaf_index_ref]);*/ + //} } else { *larger_leaf_cuda_leaf_index_pointer = leaf_index_ref; - *larger_leaf_cuda_sum_of_gradients_pointer = *best_left_sum_gradients; - *larger_leaf_cuda_sum_of_hessians_pointer = *best_left_sum_hessians; - *larger_leaf_cuda_num_data_in_leaf_pointer = *best_left_count; - *larger_leaf_cuda_gain_pointer = *best_left_gain; - *larger_leaf_cuda_leaf_value_pointer = *best_left_leaf_value; + *larger_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; + *larger_leaf_cuda_sum_of_hessians_pointer = best_left_sum_hessians[leaf_index_ref]; + *larger_leaf_cuda_num_data_in_leaf_pointer = best_left_count[leaf_index_ref]; + *larger_leaf_cuda_gain_pointer = best_left_gain[leaf_index_ref]; + *larger_leaf_cuda_leaf_value_pointer = best_left_leaf_value[leaf_index_ref]; *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[leaf_index_ref]; *smaller_leaf_cuda_leaf_index_pointer = cur_max_leaf_index; - *smaller_leaf_cuda_sum_of_gradients_pointer = *best_right_sum_gradients; - *smaller_leaf_cuda_sum_of_hessians_pointer = *best_right_sum_hessians; - *smaller_leaf_cuda_num_data_in_leaf_pointer = *best_right_count; - *smaller_leaf_cuda_gain_pointer = *best_right_gain; - *smaller_leaf_cuda_leaf_value_pointer = *best_right_leaf_value; + *smaller_leaf_cuda_sum_of_gradients_pointer = best_right_sum_gradients[leaf_index_ref]; + *smaller_leaf_cuda_sum_of_hessians_pointer = best_right_sum_hessians[leaf_index_ref]; + *smaller_leaf_cuda_num_data_in_leaf_pointer = best_right_count[leaf_index_ref]; + *smaller_leaf_cuda_gain_pointer = best_right_gain[leaf_index_ref]; + *smaller_leaf_cuda_leaf_value_pointer = best_right_leaf_value[leaf_index_ref]; *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[cur_max_leaf_index]; + + cuda_hist_pool[cur_max_leaf_index] = cuda_hist + 2 * cur_max_leaf_index * cuda_num_total_bin_ref; + *smaller_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[cur_max_leaf_index]; + *larger_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[leaf_index_ref]; + /*printf("*************** leaf %d sum_gradients %f sum_hessians %f\n", cur_max_leaf_index, *smaller_leaf_cuda_sum_of_gradients_pointer, + *smaller_leaf_cuda_sum_of_hessians_pointer); + printf("*************** leaf %d sum_gradients %f sum_hessians %f\n", leaf_index_ref, *larger_leaf_cuda_sum_of_gradients_pointer, + *larger_leaf_cuda_sum_of_hessians_pointer); + if (leaf_index_ref == 2) { + printf("error unexpected, 2 should be the smaller leaf\n"); + }*/ } } } @@ -349,7 +414,7 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ const data_size_t* cuda_leaf_data_start, const data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, const uint8_t* split_to_left_bit_vector, const data_size_t* block_to_left_offset_buffer, const data_size_t* block_to_right_offset_buffer, - 
data_size_t* out_data_indices_in_leaf) { + data_size_t* out_data_indices_in_leaf, const int split_indices_block_size_data_partition) { __shared__ uint8_t thread_split_to_left_bit_vector[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; __shared__ uint32_t thread_to_left_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; @@ -393,8 +458,8 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ thread_to_right_pos[0] = to_right_block_offset; } __syncthreads(); - PrefixSum(thread_to_left_pos, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); - PrefixSum(thread_to_right_pos, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + PrefixSum(thread_to_left_pos, split_indices_block_size_data_partition); + PrefixSum(thread_to_right_pos, split_indices_block_size_data_partition); __syncthreads(); if (global_thread_index < num_data_in_leaf_ref) { if (thread_split_to_left_bit_vector[threadIdx_x] == 1) { @@ -496,7 +561,7 @@ __global__ void CopyDataIndicesKernel(const int* leaf_index, } } -void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, +void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data_size_t num_data_in_leaf, const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, @@ -506,20 +571,33 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** smaller_leaf_cuda_hist_pointer_pointer, int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer) { + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** larger_leaf_cuda_hist_pointer_pointer, const int cpu_leaf_index) { + //Log::Warning("num_data_in_leaf = %d", num_data_in_leaf); + const int num_blocks = std::max(80, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; + int split_indices_block_size_data_partition_aligned = 1; + while (split_indices_block_size_data_partition > 0) { + split_indices_block_size_data_partition_aligned <<= 1; + split_indices_block_size_data_partition >>= 1; + } + //Log::Warning("num_blocks = %d, split_indices_block_size_data_partition_aligned = %d", num_blocks, split_indices_block_size_data_partition_aligned); auto start = std::chrono::steady_clock::now(); - PrepareOffsetKernel<<>>( + const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; + //Log::Warning("num_blocks_final = %d", num_blocks_final); + PrepareOffsetKernel<<>>( leaf_index, cuda_leaf_num_data_, cuda_data_to_left_, - 
cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_); + cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, split_indices_block_size_data_partition_aligned); SynchronizeCUDADevice(); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); //Log::Warning("CUDADataPartition::PrepareOffsetKernel time %f", duration); start = std::chrono::steady_clock::now(); - AggregateBlockOffsetKernel<<<1, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION / 2>>>(leaf_index, cuda_block_data_to_left_offset_, + AggregateBlockOffsetKernel<<<1, split_indices_block_size_data_partition_aligned / 2>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, cuda_cur_num_leaves_, @@ -533,25 +611,35 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + smaller_leaf_cuda_hist_pointer_pointer, larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, - larger_leaf_cuda_data_indices_in_leaf_pointer_pointer); + larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + larger_leaf_cuda_hist_pointer_pointer, + cuda_num_total_bin_, + cuda_hist_, + cuda_hist_pool_, split_indices_block_size_data_partition_aligned); + const auto copy_start = std::chrono::steady_clock::now(); + CopyFromCUDADeviceToHost(num_data_in_leaf_.data(), cuda_leaf_num_data_, num_leaves_); SynchronizeCUDADevice(); + const auto copy_end = std::chrono::steady_clock::now(); + const auto copy_duration = (static_cast>(copy_end - copy_start)).count(); + //Log::Warning("CUDADataPartition::CopyFromCUDADeviceToHost time %f", copy_duration); end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); //Log::Warning("CUDADataPartition::AggregateBlockOffsetKernel time %f", duration); start = std::chrono::steady_clock::now(); - SplitInnerKernel<<>>( + SplitInnerKernel<<>>( leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - cuda_out_data_indices_in_leaf_); + cuda_out_data_indices_in_leaf_, split_indices_block_size_data_partition_aligned); SynchronizeCUDADevice(); end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); //Log::Warning("CUDADataPartition::SplitInnerKernel time %f", duration); start = std::chrono::steady_clock::now(); - CopyDataIndicesKernel<<>>( + CopyDataIndicesKernel<<>>( leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, cuda_data_indices_); SynchronizeCUDADevice(); end = std::chrono::steady_clock::now(); diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 286593b7da38..877c5d2c7221 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -10,6 +10,7 @@ #include #include +#include #include "new_cuda_utils.hpp" #define FILL_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) @@ -23,7 +24,8 @@ class CUDADataPartition { public: 
CUDADataPartition(const data_size_t num_data, const int num_features, const int num_leaves, const int num_threads, const data_size_t* cuda_num_data, const int* cuda_num_leaves, const uint8_t* cuda_data, - const int* cuda_num_features, const std::vector& feature_hist_offsets, const Dataset* train_data); + const int* cuda_num_features, const std::vector& feature_hist_offsets, const Dataset* train_data, + hist_t* cuda_hist); void Init(); @@ -40,10 +42,12 @@ class CUDADataPartition { double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** smaller_leaf_cuda_hist_pointer_pointer, int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer); + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** larger_leaf_cuda_hist_pointer_pointer); Tree* GetCPUTree(); @@ -88,20 +92,31 @@ class CUDADataPartition { CopyFromCUDADeviceToHost(test_leaf_num_data.data(), cuda_leaf_num_data_, static_cast(num_leaves_)); CopyFromCUDADeviceToHost(test_leaf_data_start.data(), cuda_leaf_data_start_, static_cast(num_leaves_)); CopyFromCUDADeviceToHost(test_leaf_data_end.data(), cuda_leaf_data_end_, static_cast(num_leaves_)); - for (int i = 0; i < num_leaves_; ++i) { + /*for (int i = 0; i < num_leaves_; ++i) { Log::Warning("test_leaf_num_data[%d] = %d", i, test_leaf_num_data[i]); Log::Warning("test_leaf_data_start[%d] = %d", i, test_leaf_data_start[i]); Log::Warning("test_leaf_data_end[%d] = %d", i, test_leaf_data_end[i]); - } - const data_size_t num_data_in_leaf_0 = test_leaf_num_data[0]; + }*/ + const data_size_t start_pos = test_leaf_data_start[2]; const int check_window_size = 10; for (data_size_t i = 0; i < check_window_size; ++i) { Log::Warning("test_data_indices[%d] = %d", i, test_data_indices[i]); } - for (data_size_t i = num_data_in_leaf_0 - check_window_size; i < num_data_in_leaf_0; ++i) { + Log::Warning("=========================================================="); + for (data_size_t i = start_pos - check_window_size; i < start_pos; ++i) { + Log::Warning("test_data_indices[%d] = %d", i, test_data_indices[i]); + } + Log::Warning("=========================================================="); + for (data_size_t i = start_pos; i < start_pos + check_window_size; ++i) { + Log::Warning("test_data_indices[%d] = %d", i, test_data_indices[i]); + } + Log::Warning("=========================================================="); + const data_size_t end_pos = test_leaf_data_end[2]; + for (data_size_t i = end_pos - check_window_size; i < end_pos; ++i) { Log::Warning("test_data_indices[%d] = %d", i, test_data_indices[i]); } - for (data_size_t i = num_data_in_leaf_0; i < num_data_in_leaf_0 + check_window_size; ++i) { + Log::Warning("=========================================================="); + for (data_size_t i = end_pos; i < end_pos + check_window_size; ++i) { Log::Warning("test_data_indices[%d] = %d", i, test_data_indices[i]); } } @@ -130,10 +145,10 @@ class CUDADataPartition { const int* cuda_cur_num_leaves() const { return cuda_cur_num_leaves_; } private: - void 
GenDataToLeftBitVector(const int* leaf_id, const int* best_split_feature, + void GenDataToLeftBitVector(const int* leaf_id, const data_size_t num_data_in_leaf, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); - void SplitInner(const int* leaf_index, + void SplitInner(const int* leaf_index, const data_size_t num_data_in_leaf, const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, @@ -143,15 +158,17 @@ class CUDADataPartition { double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** smaller_leaf_cuda_hist_pointer_pointer, int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer); + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** larger_leaf_cuda_hist_pointer_pointer, const int cpu_leaf_index); // kernel launch functions void LaunchFillDataIndicesBeforeTrain(); - void LaunchSplitInnerKernel(const int* leaf_index, + void LaunchSplitInnerKernel(const int* leaf_index, const data_size_t num_data_in_leaf, const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, @@ -161,12 +178,14 @@ class CUDADataPartition { double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** smaller_leaf_cuda_hist_pointer_pointer, int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer); + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** larger_leaf_cuda_hist_pointer_pointer, const int cpu_leaf_index); - void LaunchGenDataToLeftBitVectorKernel(const int* leaf_index, const int* best_split_feature, + void LaunchGenDataToLeftBitVectorKernel(const int* leaf_index, const data_size_t num_data_in_leaf, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); void LaunchPrefixSumKernel(uint32_t* cuda_elements); @@ -176,6 +195,7 @@ class CUDADataPartition { const int num_features_; const int num_leaves_; const int num_threads_; + const int num_total_bin_; int max_num_split_indices_blocks_; std::vector feature_default_bins_; std::vector 
feature_most_freq_bins_;
@@ -185,6 +205,8 @@ class CUDADataPartition {
   std::vector feature_missing_is_na_;
   std::vector feature_mfb_is_zero_;
   std::vector feature_mfb_is_na_;
+  std::vector num_data_in_leaf_;
+  int cur_num_leaves_;
 
   // CUDA memory, held by this object
   data_size_t* cuda_data_indices_;
@@ -205,12 +227,16 @@ class CUDADataPartition {
   uint8_t* cuda_feature_missing_is_na_;
   uint8_t* cuda_feature_mfb_is_zero_;
   uint8_t* cuda_feature_mfb_is_na_;
+  int* cuda_num_total_bin_;
+  // for histogram pool
+  hist_t** cuda_hist_pool_;
 
   // CUDA memory, held by other object
   const data_size_t* cuda_num_data_;
   const int* cuda_num_leaves_;
   const uint8_t* cuda_data_;
   const int* cuda_num_features_;
+  hist_t* cuda_hist_;
 };
 
 }  // namespace LightGBM
diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp
index 99bb32bf6b7e..74b4245b5e07 100644
--- a/src/treelearner/cuda/cuda_histogram_constructor.cpp
+++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp
@@ -48,6 +48,7 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data) {
   AllocateCUDAMemory(num_feature_groups_ * num_data_, &cuda_data_);
 
   AllocateCUDAMemory(num_total_bin_ * 2 * num_leaves_, &cuda_hist_);
+  SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_);
 
   InitCUDAMemoryFromHostMemory(&cuda_num_total_bin_, &num_total_bin_, 1);
 
@@ -97,15 +98,24 @@ void CUDAHistogramConstructor::PushOneData(const uint32_t feature_bin_value,
   data_[index] = feature_bin_value_uint8;
 }
 
-void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const int* /*cuda_larger_leaf_index*/,
-  const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** /*cuda_data_indices_in_larger_leaf*/,
+void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index,
+  const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** cuda_data_indices_in_larger_leaf,
+  const double* cuda_smaller_leaf_sum_gradients, const double* cuda_smaller_leaf_sum_hessians, hist_t** cuda_smaller_leaf_hist,
+  const double* cuda_larger_leaf_sum_gradients, const double* cuda_larger_leaf_sum_hessians, hist_t** cuda_larger_leaf_hist,
   const data_size_t* cuda_leaf_num_data) {
   auto start = std::chrono::steady_clock::now();
-  LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data);
+  LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data, cuda_smaller_leaf_hist);
   SynchronizeCUDADevice();
   auto end = std::chrono::steady_clock::now();
   double duration = (static_cast>(end - start)).count();
   //Log::Warning("LaunchConstructHistogramKernel time %f", duration);
+  start = std::chrono::steady_clock::now();
+  LaunchSubtractHistogramKernel(cuda_smaller_leaf_index,
+    cuda_larger_leaf_index, cuda_smaller_leaf_sum_gradients, cuda_smaller_leaf_sum_hessians,
+    cuda_larger_leaf_sum_gradients, cuda_larger_leaf_sum_hessians, cuda_smaller_leaf_hist, cuda_larger_leaf_hist);
+  end = std::chrono::steady_clock::now();
+  duration = (static_cast>(end - start)).count();
+  //Log::Warning("LaunchSubtractHistogramKernel time %f", duration);
   /*PrintLastCUDAError();
   std::vector cpu_hist(6143 * 2, 0.0f);
   CopyFromCUDADeviceToHost(cpu_hist.data(), cuda_hist_, 6143 * 2);*/
diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu
index c910754a171f..033c103a6b53 100644
--- 
a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -49,15 +49,27 @@ __device__ void PrefixSum(hist_t* elements, unsigned int n) { __global__ void CUDAConstructHistogramKernel(const int* leaf_index, const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t** data_indices_ptr, hist_t* feature_histogram, const int* num_feature_groups, - const data_size_t* leaf_num_data, const uint8_t* data, const uint32_t* feature_group_offsets) { + const data_size_t** data_indices_ptr, hist_t** feature_histogram, const int* num_feature_groups, + const data_size_t* leaf_num_data, const uint8_t* data, const uint32_t* feature_group_offsets, + const int* /*cuda_num_total_bin*/) { const unsigned int threadIdx_x = threadIdx.x; const int num_feature_groups_ref = *num_feature_groups; const int leaf_index_ref = *leaf_index; + const int dim_y = gridDim.y * blockDim.y; + //const int cuda_num_total_bin_ref = *cuda_num_total_bin; + hist_t* feature_histogram_ptr = *feature_histogram; + /*if (blockIdx.x == 0 && threadIdx.x == 0 && blockIdx.y == 0 && threadIdx.y == 0) { + printf("construct histogram for leaf %d\n", leaf_index_ref); + }*/ const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf_ref + dim_y - 1) / dim_y; const data_size_t* data_indices_ref = *data_indices_ptr; __shared__ float shared_hist[SHRAE_HIST_SIZE]; // 256 * 24 * 2, can use 24 features uint32_t num_bins_in_col_group = feature_group_offsets[blockDim.x]; + /*if (blockIdx.x == 0 && threadIdx.x == 0 && blockIdx.y == 0 && threadIdx.y == 0) { + printf("cuda_num_total_bin_ref = %d\n", cuda_num_total_bin_ref); + printf("num_bins_in_col_group %d\n", num_bins_in_col_group); + }*/ const uint32_t num_items_per_thread = (2 * num_bins_in_col_group + NUM_THRADS_PER_BLOCK - 1) / NUM_THRADS_PER_BLOCK; const int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; const uint32_t thread_start = thread_idx * num_items_per_thread; @@ -69,13 +81,24 @@ __global__ void CUDAConstructHistogramKernel(const int* leaf_index, __syncthreads(); const unsigned int threadIdx_y = threadIdx.y; const unsigned int blockIdx_y = blockIdx.y; - const data_size_t start = (threadIdx_y + blockIdx_y * blockDim.y) * NUM_DATA_PER_THREAD; - const data_size_t end = start + NUM_DATA_PER_THREAD > num_data_in_smaller_leaf_ref ? - num_data_in_smaller_leaf_ref : start + NUM_DATA_PER_THREAD; + const data_size_t start = (threadIdx_y + blockIdx_y * blockDim.y) * num_data_per_thread; + const data_size_t end = start + num_data_per_thread > num_data_in_smaller_leaf_ref ? 
+ num_data_in_smaller_leaf_ref : start + num_data_per_thread; + /*if (blockIdx.x == 0 && threadIdx.x == 0 && blockIdx.y == 0 && threadIdx.y == 0) { + if (leaf_index_ref == 2) { + for (data_size_t i = 0; i < 10; ++i) { + printf("leaf 2 data index %d = %d\n", i, data_indices_ref[i]); + } + printf("===========================================\n"); + for (data_size_t i = 1030726 - 10; i < 1030726; ++i) { + printf("leaf 2 data index %d = %d\n", i, data_indices_ref[i]); + } + } + }*/ for (data_size_t i = start; i < end; ++i) { - const score_t grad = cuda_gradients[i]; - const score_t hess = cuda_hessians[i]; const data_size_t data_index = data_indices_ref[i]; + const score_t grad = cuda_gradients[data_index]; + const score_t hess = cuda_hessians[data_index]; const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + threadIdx_x]) + feature_group_offsets[threadIdx_x]; const uint32_t pos = bin << 1; @@ -85,107 +108,138 @@ __global__ void CUDAConstructHistogramKernel(const int* leaf_index, } __syncthreads(); for (uint32_t i = thread_start; i < thread_end; ++i) { - atomicAdd_system(feature_histogram + i, shared_hist[i]); + atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); } } void CUDAHistogramConstructor::LaunchConstructHistogramKernel( const int* cuda_smaller_leaf_index, const data_size_t** cuda_data_indices_in_smaller_leaf, - const data_size_t* cuda_leaf_num_data) { + const data_size_t* cuda_leaf_num_data, + hist_t** cuda_leaf_hist) { + int smaller_leaf_index = 0; + CopyFromCUDADeviceToHost(&smaller_leaf_index, cuda_smaller_leaf_index, 1); + SynchronizeCUDADevice(); + data_size_t smaller_leaf_num_data = 0; + CopyFromCUDADeviceToHost(&smaller_leaf_num_data, cuda_leaf_num_data + smaller_leaf_index, 1); + SynchronizeCUDADevice(); const int block_dim_x = num_features_; // TODO(shiyu1994): only supports the case when the whole histogram can be loaded into shared memory const int block_dim_y = NUM_THRADS_PER_BLOCK / block_dim_x; - const int grid_dim_y = ((num_data_ + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + block_dim_y - 1) / block_dim_y; + const int min_grid_dim_y = 80; + const int grid_dim_y = std::max(min_grid_dim_y, ((smaller_leaf_num_data + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + block_dim_y - 1) / block_dim_y); const int grid_dim_x = (static_cast(num_feature_groups_ + NUM_FEATURE_PER_THREAD_GROUP - 1) / NUM_FEATURE_PER_THREAD_GROUP); + //Log::Warning("smaller_leaf_num_data = %d", smaller_leaf_num_data); //Log::Warning("block_dim_x = %d, block_dim_y = %d", block_dim_x, block_dim_y); //Log::Warning("gid_dim_x = %d, grid_dim_y = %d", grid_dim_x, grid_dim_y); dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); CUDAConstructHistogramKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_hist_, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_, - cuda_feature_group_bin_offsets_); + cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_, + cuda_feature_group_bin_offsets_, cuda_num_total_bin_); } -__global__ void SubtractAndFixHistogramKernel(const int* cuda_smaller_leaf_index, +__global__ void SubtractHistogramKernel(const int* /*cuda_smaller_leaf_index*/, const int* cuda_larger_leaf_index, const uint8_t* cuda_feature_mfb_offsets, const uint32_t* cuda_feature_num_bins, const int* cuda_num_total_bin, - hist_t* cuda_hist) { + hist_t** cuda_smaller_leaf_hist, hist_t** cuda_larger_leaf_hist) { const int 
cuda_num_total_bin_ref = *cuda_num_total_bin; const unsigned int global_thread_index = threadIdx.x + blockIdx.x * blockDim.x; - const int cuda_smaller_leaf_index_ref = *cuda_smaller_leaf_index; + //const int cuda_smaller_leaf_index_ref = *cuda_smaller_leaf_index; const int cuda_larger_leaf_index_ref = *cuda_larger_leaf_index; - const hist_t* smaller_leaf_hist = cuda_hist + (cuda_smaller_leaf_index_ref * cuda_num_total_bin_ref * 2); - hist_t* larger_leaf_hist = cuda_hist + (cuda_larger_leaf_index_ref * cuda_num_total_bin_ref * 2); - if (global_thread_index < 2 * cuda_num_total_bin_ref) { - larger_leaf_hist[global_thread_index] -= smaller_leaf_hist[global_thread_index]; + if (cuda_larger_leaf_index_ref >= 0) { + const hist_t* smaller_leaf_hist = *cuda_smaller_leaf_hist; //cuda_hist + (cuda_smaller_leaf_index_ref * cuda_num_total_bin_ref * 2); + hist_t* larger_leaf_hist = *cuda_larger_leaf_hist; //cuda_hist + (cuda_larger_leaf_index_ref * cuda_num_total_bin_ref * 2); + if (global_thread_index < 2 * cuda_num_total_bin_ref) { + larger_leaf_hist[global_thread_index] -= smaller_leaf_hist[global_thread_index]; + } } } __global__ void FixHistogramKernel(const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, const uint32_t* cuda_feature_num_bins, const int* cuda_num_features, - const int* cuda_num_total_bin, const uint32_t* cuda_feature_hist_offsets, + const int* /*cuda_num_total_bin*/, const uint32_t* cuda_feature_hist_offsets, const uint32_t* cuda_feature_most_freq_bins, const double* smaller_leaf_sum_gradients, const double* smaller_leaf_sum_hessians, const double* larger_leaf_sum_gradients, const double* larger_leaf_sum_hessians, - hist_t* cuda_hist) { + hist_t** cuda_smaller_leaf_hist, hist_t** cuda_larger_leaf_hist) { const int cuda_num_features_ref = *cuda_num_features; const unsigned int blockIdx_x = blockIdx.x; const int feature_index = blockIdx_x % cuda_num_features_ref; const bool larger_or_smaller = static_cast(blockIdx_x / cuda_num_features_ref); const int leaf_index_ref = larger_or_smaller ? *cuda_larger_leaf_index : *cuda_smaller_leaf_index; - const int cuda_num_total_bin_ref = *cuda_num_total_bin; - const uint32_t feature_hist_offset = cuda_feature_hist_offsets[feature_index]; - const uint32_t most_freq_bin = cuda_feature_most_freq_bins[feature_index]; - if (most_freq_bin > 0) { - const double leaf_sum_gradients = larger_or_smaller ? *larger_leaf_sum_gradients : *smaller_leaf_sum_gradients; - const double leaf_sum_hessians = larger_or_smaller ? *larger_leaf_sum_hessians : *smaller_leaf_sum_hessians; - hist_t* feature_hist = cuda_hist + cuda_num_total_bin_ref * 2 * leaf_index_ref + feature_hist_offset * 2; - __shared__ double hist_gradients[FIX_HISTOGRAM_SHARED_MEM_SIZE + 1]; - __shared__ double hist_hessians[FIX_HISTOGRAM_SHARED_MEM_SIZE + 1]; - const unsigned int threadIdx_x = threadIdx.x; - const uint32_t num_bin = cuda_feature_num_bins[feature_index]; - if (threadIdx_x < num_bin) { - if (threadIdx_x == most_freq_bin) { + __shared__ double hist_gradients[FIX_HISTOGRAM_SHARED_MEM_SIZE + 1]; + __shared__ double hist_hessians[FIX_HISTOGRAM_SHARED_MEM_SIZE + 1]; + if (leaf_index_ref >= 0) { + //const int cuda_num_total_bin_ref = *cuda_num_total_bin; + const uint32_t feature_hist_offset = cuda_feature_hist_offsets[feature_index]; + const uint32_t most_freq_bin = cuda_feature_most_freq_bins[feature_index]; + if (most_freq_bin > 0) { + const double leaf_sum_gradients = larger_or_smaller ? 
*larger_leaf_sum_gradients : *smaller_leaf_sum_gradients; + const double leaf_sum_hessians = larger_or_smaller ? *larger_leaf_sum_hessians : *smaller_leaf_sum_hessians; + hist_t* feature_hist = larger_or_smaller ? (*cuda_larger_leaf_hist) + feature_hist_offset * 2 : + (*cuda_smaller_leaf_hist) + feature_hist_offset * 2; + //cuda_hist + cuda_num_total_bin_ref * 2 * leaf_index_ref + feature_hist_offset * 2; + const unsigned int threadIdx_x = threadIdx.x; + const uint32_t num_bin = cuda_feature_num_bins[feature_index]; + if (threadIdx_x < num_bin) { + if (threadIdx_x == most_freq_bin) { + hist_gradients[threadIdx_x] = 0.0f; + hist_hessians[threadIdx_x] = 0.0f; + } else { + hist_gradients[threadIdx_x] = feature_hist[threadIdx_x << 1]; + hist_hessians[threadIdx_x] = feature_hist[(threadIdx_x << 1) + 1]; + } + } else { hist_gradients[threadIdx_x] = 0.0f; hist_hessians[threadIdx_x] = 0.0f; - } else { - hist_gradients[threadIdx_x] = feature_hist[threadIdx_x << 1]; - hist_hessians[threadIdx_x] = feature_hist[(threadIdx_x << 1) + 1]; + } + uint32_t num_bin_aligned = 1; + uint32_t num_bin_to_shift = num_bin - 1; + while (num_bin_to_shift > 0) { + num_bin_to_shift >>= 1; + num_bin_aligned <<= 1; + } + /*if (threadIdx.x == 0) { + printf("num_bin_aligned = %d\n", num_bin_aligned); + }*/ + __syncthreads(); + PrefixSum(hist_gradients, num_bin_aligned); + PrefixSum(hist_hessians, num_bin_aligned); + __syncthreads(); + if (threadIdx_x == most_freq_bin) { + feature_hist[most_freq_bin << 1] = leaf_sum_gradients - hist_gradients[num_bin_aligned]; + feature_hist[(most_freq_bin << 1) + 1] = leaf_sum_hessians - hist_hessians[num_bin_aligned]; + } + if (threadIdx.x == 0) { + //printf("fix most freq bin: feature_hist_offset %d + most_freq_bin %d = %d, num_bin_aligned = %d, leaf_sum_gradients = %f, leaf_sum_hessians = %f, hist_gradients[num_bin_aligned] = %f, hist_hessians[num_bin_aligned] = %f, feature_hist[most_freq_bin << 1] = %f, feature_hist[(most_freq_bin << 1) + 1] = %f\n", + // feature_hist_offset, most_freq_bin, feature_hist_offset + most_freq_bin, num_bin_aligned, leaf_sum_gradients, leaf_sum_hessians, hist_gradients[num_bin_aligned], hist_hessians[num_bin_aligned], feature_hist[most_freq_bin << 1], feature_hist[(most_freq_bin << 1) + 1]); } } - uint32_t num_bin_aligned = 1; - uint32_t num_bin_to_shift = num_bin; - while (num_bin_to_shift > 0) { - num_bin_to_shift >>= 1; - num_bin_aligned <<= 1; - } - __syncthreads(); - PrefixSum(hist_gradients, num_bin_aligned); - PrefixSum(hist_hessians, num_bin_aligned); - __syncthreads(); - feature_hist[most_freq_bin << 1] = leaf_sum_gradients - hist_gradients[num_bin_aligned]; - feature_hist[(most_freq_bin << 1) + 1] = leaf_sum_hessians - hist_hessians[num_bin_aligned]; } } -void CUDAHistogramConstructor::LaunchSubtractAndFixHistogramKernel(const int* cuda_smaller_leaf_index, +void CUDAHistogramConstructor::LaunchSubtractHistogramKernel(const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, const double* smaller_leaf_sum_gradients, const double* smaller_leaf_sum_hessians, - const double* larger_leaf_sum_gradients, const double* larger_leaf_sum_hessians) { + const double* larger_leaf_sum_gradients, const double* larger_leaf_sum_hessians, + hist_t** cuda_smaller_leaf_hist, hist_t** cuda_larger_leaf_hist) { const int num_subtract_threads = 2 * num_total_bin_; const int num_subtract_blocks = (num_subtract_threads + SUBTRACT_BLOCK_SIZE - 1) / SUBTRACT_BLOCK_SIZE; - SubtractAndFixHistogramKernel<<>>( + //Log::Warning("Before SubtractHistogramKernel"); + 
SubtractHistogramKernel<<>>( cuda_smaller_leaf_index, cuda_larger_leaf_index, cuda_feature_mfb_offsets_, - cuda_feature_num_bins_, cuda_num_total_bin_, cuda_hist_); + cuda_feature_num_bins_, cuda_num_total_bin_, cuda_smaller_leaf_hist, cuda_larger_leaf_hist); SynchronizeCUDADevice(); + //Log::Warning("After SubtractHistogramKernel"); FixHistogramKernel<<<2 * num_features_, FIX_HISTOGRAM_BLOCK_SIZE>>>( cuda_smaller_leaf_index, cuda_larger_leaf_index, cuda_feature_num_bins_, cuda_num_features_, cuda_num_total_bin_, cuda_feature_hist_offsets_, cuda_feature_most_freq_bins_, smaller_leaf_sum_gradients, smaller_leaf_sum_hessians, larger_leaf_sum_gradients, larger_leaf_sum_hessians, - cuda_hist_); + cuda_smaller_leaf_hist, cuda_larger_leaf_hist); SynchronizeCUDADevice(); + //Log::Warning("After FixHistogramKernel"); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index f3da7b08e5a9..470d08ef69ab 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -11,6 +11,8 @@ #include #include +#include + #include "new_cuda_utils.hpp" #include @@ -34,10 +36,16 @@ class CUDAHistogramConstructor { void ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** cuda_data_indices_in_larger_leaf, + const double* cuda_smaller_leaf_sum_gradients, const double* cuda_smaller_leaf_sum_hessians, hist_t** cuda_smaller_leaf_hist, + const double* cuda_larger_leaf_sum_gradients, const double* cuda_larger_leaf_sum_hessians, hist_t** cuda_larger_leaf_hist, const data_size_t* cuda_leaf_num_data); const hist_t* cuda_hist() const { return cuda_hist_; } + hist_t* cuda_hist_pointer() const { return cuda_hist_; } + + hist_t* cuda_hist_pointer() { return cuda_hist_; } + const uint8_t* cuda_data() const { return cuda_data_; } void TestAfterInit() { @@ -51,20 +59,31 @@ class CUDAHistogramConstructor { void TestAfterConstructHistogram() { PrintLastCUDAError(); std::vector test_hist(num_total_bin_ * 2, 0.0f); - CopyFromCUDADeviceToHost(test_hist.data(), cuda_hist_, static_cast(num_total_bin_) * 2); + /*CopyFromCUDADeviceToHost(test_hist.data(), cuda_hist_, static_cast(num_total_bin_) * 2); for (int i = 0; i < 100; ++i) { Log::Warning("bin %d grad %f hess %f", i, test_hist[2 * i], test_hist[2 * i + 1]); + }*/ + const hist_t* leaf_2_cuda_hist_ptr = cuda_hist_ + 3 * 2 * num_total_bin_; + Log::Warning("cuda_hist_ptr = %ld", leaf_2_cuda_hist_ptr); + CopyFromCUDADeviceToHost(test_hist.data(), leaf_2_cuda_hist_ptr, 2 * num_total_bin_); + std::ofstream fout("leaf_2_cuda_hist.txt"); + for (int i = 0; i < num_total_bin_; ++i) { + Log::Warning("bin %d grad %f hess %f", i, test_hist[2 * i], test_hist[2 * i + 1]); + fout << "bin " << i << " grad " << test_hist[2 * i] << " hess " << test_hist[2 * i + 1] << "\n"; } + fout.close(); } private: void LaunchConstructHistogramKernel(const int* cuda_leaf_index, const data_size_t** cuda_data_indices_in_leaf, - const data_size_t* cuda_leaf_num_data); + const data_size_t* cuda_leaf_num_data, + hist_t** cuda_leaf_hist); - void LaunchSubtractAndFixHistogramKernel(const int* cuda_smaller_leaf_index, + void LaunchSubtractHistogramKernel(const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, const double* smaller_leaf_sum_gradients, const double* smaller_leaf_sum_hessians, - const double* larger_leaf_sum_gradients, const 
double* larger_leaf_sum_hessians); + const double* larger_leaf_sum_gradients, const double* larger_leaf_sum_hessians, + hist_t** cuda_smaller_leaf_hist, hist_t** cuda_larger_leaf_hist); void InitCUDAData(const Dataset* train_data); diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index ae837452ce7c..be190d9a6c37 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -34,29 +34,33 @@ void CUDALeafSplits::Init() { AllocateCUDAMemory(num_blocks_init_from_gradients_, &cuda_sum_of_hessians_); InitCUDAMemoryFromHostMemory(&cuda_num_data_in_leaf_, &num_data_, 1); + // TODO(shiyu1994): should initialize root gain for min_gain_shift InitCUDAValueFromConstant(&cuda_gain_, 0.0f); // since smooth is not used, so the output value for root node is useless InitCUDAValueFromConstant(&cuda_leaf_value_, 0.0f); AllocateCUDAMemory(1, &cuda_data_indices_in_leaf_); + AllocateCUDAMemory(1, &cuda_hist_in_leaf_); InitCUDAMemoryFromHostMemory(&cuda_leaf_index_, &leaf_index_, 1); } void CUDALeafSplits::InitValues(const double* cuda_sum_of_gradients, const double* cuda_sum_of_hessians, - const data_size_t* cuda_num_data_in_leaf, const data_size_t* cuda_data_indices_in_leaf, + const data_size_t* cuda_num_data_in_leaf, const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, const double* cuda_gain, const double* cuda_leaf_value) { CopyFromCUDADeviceToCUDADevice(cuda_sum_of_gradients_, cuda_sum_of_gradients, 1); CopyFromCUDADeviceToCUDADevice(cuda_sum_of_hessians_, cuda_sum_of_hessians, 1); CopyFromCUDADeviceToCUDADevice(cuda_num_data_in_leaf_, cuda_num_data_in_leaf, 1); CopyFromHostToCUDADevice(cuda_data_indices_in_leaf_, &cuda_data_indices_in_leaf, 1); + CopyFromHostToCUDADevice(cuda_hist_in_leaf_, &cuda_hist_in_leaf, 1); CopyFromCUDADeviceToCUDADevice(cuda_gain_, cuda_gain, 1); CopyFromCUDADeviceToCUDADevice(cuda_leaf_value_, cuda_leaf_value, 1); SynchronizeCUDADevice(); } -void CUDALeafSplits::InitValues(const data_size_t* cuda_data_indices_in_leaf) { +void CUDALeafSplits::InitValues(const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf) { LaunchInitValuesKernal(); CopyFromHostToCUDADevice(cuda_data_indices_in_leaf_, &cuda_data_indices_in_leaf, 1); + CopyFromHostToCUDADevice(cuda_hist_in_leaf_, &cuda_hist_in_leaf, 1); SynchronizeCUDADevice(); } diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index 3eb32e5da1fd..984140478718 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -42,22 +42,34 @@ __global__ void CUDAInitValuesKernel1(const score_t* cuda_gradients, const score __global__ void CUDAInitValuesKernel2(double* cuda_sum_of_gradients, double* cuda_sum_of_hessians) { if (blockIdx.x == 0) { + double sum_of_gradients = 0.0f; + double sum_of_hessians = 0.0f; for (unsigned int i = 1; i < gridDim.x; ++i) { - cuda_sum_of_gradients[0] += cuda_sum_of_gradients[i]; - cuda_sum_of_hessians[0] += cuda_sum_of_hessians[i]; + sum_of_gradients += cuda_sum_of_gradients[i]; + sum_of_hessians += cuda_sum_of_hessians[i]; } + cuda_sum_of_gradients[0] += sum_of_gradients; + cuda_sum_of_hessians[0] += sum_of_hessians; } } void CUDALeafSplits::LaunchInitValuesKernal() { + auto start = std::chrono::steady_clock::now(); CUDAInitValuesKernel1<<>>( cuda_gradients_, cuda_hessians_, cuda_num_data_, cuda_sum_of_gradients_, cuda_sum_of_hessians_); CopyFromCUDADeviceToCUDADevice(cuda_num_data_in_leaf_, 
cuda_num_data_, 1); SynchronizeCUDADevice(); + auto end = std::chrono::steady_clock::now(); + auto duration = static_cast>(end - start); + Log::Warning("CUDAInitValuesKernel1 duration = %f", duration.count()); + start = std::chrono::steady_clock::now(); CUDAInitValuesKernel2<<>>( cuda_sum_of_gradients_, cuda_sum_of_hessians_); SynchronizeCUDADevice(); + end = std::chrono::steady_clock::now(); + duration = static_cast>(end - start); + Log::Warning("CUDAInitValuesKernel2 duration = %f", duration.count()); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index 78ea846fbfaf..47b0aefca29c 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -10,6 +10,7 @@ #include #include +#include #include "new_cuda_utils.hpp" #define INIT_SUM_BLOCK_SIZE_LEAF_SPLITS (6144) @@ -30,9 +31,9 @@ class CUDALeafSplits { void InitValues(const double* cuda_sum_of_gradients, const double* cuda_sum_of_hessians, const data_size_t* cuda_num_data_in_leaf, const data_size_t* cuda_data_indices_in_leaf, - const double* cuda_gain, const double* cuda_leaf_value); + hist_t* cuda_hist_in_leaf, const double* cuda_gain, const double* cuda_leaf_value); - void InitValues(const data_size_t* cuda_data_indices_in_leaf); + void InitValues(const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf); const int* cuda_leaf_index() const { return cuda_leaf_index_; } @@ -60,6 +61,8 @@ class CUDALeafSplits { const data_size_t** cuda_data_indices_in_leaf_pointer_pointer() { return cuda_data_indices_in_leaf_; } + hist_t** cuda_hist_in_leaf_pointer_pointer() const { return cuda_hist_in_leaf_; } + void Test() { PrintLastCUDAError(); double test_sum_of_gradients = 0.0f, test_sum_of_hessians = 0.0f; @@ -87,6 +90,7 @@ class CUDALeafSplits { // CUDA memory, held by other object const data_size_t** cuda_data_indices_in_leaf_; + hist_t** cuda_hist_in_leaf_; const score_t* cuda_gradients_; const score_t* cuda_hessians_; const int* cuda_num_data_; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 4383da763b9a..9498259a1fab 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -24,7 +24,7 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); cuda_centralized_info_.reset(new CUDACentralizedInfo(num_data_, this->config_->num_leaves, num_features_)); cuda_centralized_info_->Init(); - cuda_centralized_info_->Test(); + //cuda_centralized_info_->Test(); cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_, 0, cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), cuda_centralized_info_->cuda_num_data())); @@ -41,7 +41,7 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia cuda_data_partition_.reset(new CUDADataPartition(num_data_, num_features_, this->config_->num_leaves, num_threads_, cuda_centralized_info_->cuda_num_data(), cuda_centralized_info_->cuda_num_leaves(), cuda_histogram_constructor_->cuda_data(), cuda_centralized_info_->cuda_num_features(), - share_state_->feature_hist_offsets(), train_data_)); + share_state_->feature_hist_offsets(), train_data_, cuda_histogram_constructor_->cuda_hist_pointer())); cuda_data_partition_->Init(); cuda_best_split_finder_.reset(new CUDABestSplitFinder(cuda_histogram_constructor_->cuda_hist(), @@ -50,15 +50,23 @@ void 
NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia this->config_->min_sum_hessian_in_leaf, this->config_->min_gain_to_split, cuda_centralized_info_->cuda_num_features())); cuda_best_split_finder_->Init(); - cuda_best_split_finder_->TestAfterInit(); + //cuda_best_split_finder_->TestAfterInit(); } void NewCUDATreeLearner::BeforeTrain() { + auto start = std::chrono::steady_clock::now(); cuda_centralized_info_->BeforeTrain(gradients_, hessians_); - cuda_smaller_leaf_splits_->InitValues(cuda_data_partition_->cuda_data_indices()); + auto end = std::chrono::steady_clock::now(); + auto duration = static_cast>(end - start); + Log::Warning("cuda_centralized_info_->BeforeTrain duration = %f", duration.count()); + cuda_smaller_leaf_splits_->InitValues(cuda_data_partition_->cuda_data_indices(), cuda_histogram_constructor_->cuda_hist_pointer()); //cuda_smaller_leaf_splits_->Test(); + start = std::chrono::steady_clock::now(); cuda_data_partition_->BeforeTrain(nullptr); - cuda_data_partition_->Test(); + end = std::chrono::steady_clock::now(); + duration = static_cast>(end - start); + Log::Warning("cuda_data_partition_->BeforeTrain duration = %f", duration.count()); + //cuda_data_partition_->Test(); //SerialTreeLearner::BeforeTrain(); /*#pragma omp parallel for schedule(static) num_threads(num_threads_) @@ -212,21 +220,45 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, const score_t* hessians, bool /*is_first_tree*/) { gradients_ = gradients; hessians_ = hessians; - BeforeTrain(); const auto start = std::chrono::steady_clock::now(); + auto before_train_start = std::chrono::steady_clock::now(); + BeforeTrain(); + auto before_train_end = std::chrono::steady_clock::now(); + double construct_histogram_time = 0.0f; + double find_best_split_time = 0.0f; + double split_data_indices_time = 0.0f; for (int i = 0; i < config_->num_leaves - 1; ++i) { + //Log::Warning("Before ConstructHistogramForLeaf"); + auto start = std::chrono::steady_clock::now(); cuda_histogram_constructor_->ConstructHistogramForLeaf( cuda_smaller_leaf_splits_->cuda_leaf_index(), cuda_larger_leaf_splits_->cuda_leaf_index(), cuda_smaller_leaf_splits_->cuda_data_indices_in_leaf(), cuda_larger_leaf_splits_->cuda_data_indices_in_leaf(), + cuda_smaller_leaf_splits_->cuda_sum_of_gradients_pointer(), + cuda_smaller_leaf_splits_->cuda_sum_of_hessians_pointer(), + cuda_smaller_leaf_splits_->cuda_hist_in_leaf_pointer_pointer(), + cuda_larger_leaf_splits_->cuda_sum_of_gradients_pointer(), + cuda_larger_leaf_splits_->cuda_sum_of_hessians_pointer(), + cuda_larger_leaf_splits_->cuda_hist_in_leaf_pointer_pointer(), cuda_data_partition_->cuda_leaf_num_data()); - + auto end = std::chrono::steady_clock::now(); + auto duration = static_cast>(end - start); + construct_histogram_time += duration.count(); + /*if (i == 3) { + cuda_histogram_constructor_->TestAfterConstructHistogram(); + }*/ + //Log::Warning("Before FindBestSplitsForLeaf"); + start = std::chrono::steady_clock::now(); cuda_best_split_finder_->FindBestSplitsForLeaf(cuda_smaller_leaf_splits_.get(), cuda_larger_leaf_splits_.get()); - + //Log::Warning("Before FindBestFromAllSplits"); cuda_best_split_finder_->FindBestFromAllSplits(cuda_data_partition_->cuda_cur_num_leaves()); - + end = std::chrono::steady_clock::now(); + duration = static_cast>(end - start); + find_best_split_time += duration.count(); + //Log::Warning("Before Split"); + start = std::chrono::steady_clock::now(); cuda_data_partition_->Split(cuda_best_split_finder_->cuda_best_leaf(), 
cuda_best_split_finder_->cuda_leaf_best_split_feature(), cuda_best_split_finder_->cuda_leaf_best_split_threshold(), @@ -250,17 +282,30 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, cuda_smaller_leaf_splits_->cuda_gain_pointer(), cuda_smaller_leaf_splits_->cuda_leaf_value_pointer(), cuda_smaller_leaf_splits_->cuda_data_indices_in_leaf_pointer_pointer(), + cuda_smaller_leaf_splits_->cuda_hist_in_leaf_pointer_pointer(), cuda_larger_leaf_splits_->cuda_leaf_index_pointer(), cuda_larger_leaf_splits_->cuda_sum_of_gradients_pointer(), cuda_larger_leaf_splits_->cuda_sum_of_hessians_pointer(), cuda_larger_leaf_splits_->cuda_num_data_in_leaf_pointer(), cuda_larger_leaf_splits_->cuda_gain_pointer(), cuda_larger_leaf_splits_->cuda_leaf_value_pointer(), - cuda_larger_leaf_splits_->cuda_data_indices_in_leaf_pointer_pointer()); + cuda_larger_leaf_splits_->cuda_data_indices_in_leaf_pointer_pointer(), + cuda_larger_leaf_splits_->cuda_hist_in_leaf_pointer_pointer()); + end = std::chrono::steady_clock::now(); + duration = static_cast>(end - start); + split_data_indices_time += duration.count(); + /*if (i == 2) { + cuda_data_partition_->TestAfterSplit(); + }*/ } const auto end = std::chrono::steady_clock::now(); const double duration = (static_cast>(end - start)).count(); Log::Warning("Train time %f", duration); + Log::Warning("before train time %f", static_cast>(before_train_end - before_train_start).count()); + Log::Warning("construct histogram time %f", construct_histogram_time); + Log::Warning("find best split time %f", find_best_split_time); + Log::Warning("split data indices time %f", split_data_indices_time); + global_timer.Print(); /*cuda_data_partition_->Test(); cuda_histogram_constructor_->ConstructHistogramForLeaf( cuda_smaller_leaf_splits_->cuda_leaf_index(), From bc85ced78182c1548cd876f8ca58de3fa05aabfa Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sun, 9 May 2021 13:38:45 +0000 Subject: [PATCH 008/166] remove comments --- src/treelearner/cuda/cuda_data_partition.cu | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 19fa0c348071..a52fa515ce0b 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -347,9 +347,6 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b cuda_leaf_num_data[cur_max_leaf_index] = block_to_right_offset_buffer[num_blocks] - to_left_total_cnt; const int cuda_num_total_bin_ref = *cuda_num_total_bin; ++(*cuda_cur_num_leaves); - //printf("best_left_gain = %f, best_right_gain = %f\n", *best_left_gain, *best_right_gain); - //printf("cuda_leaf_num_data[%d] = %d, cuda_leaf_num_data[%d] = %d\n", - // leaf_index_ref, cuda_leaf_num_data[leaf_index_ref], cur_max_leaf_index, cuda_leaf_num_data[cur_max_leaf_index]); if (cuda_leaf_num_data[leaf_index_ref] < cuda_leaf_num_data[cur_max_leaf_index]) { *smaller_leaf_cuda_leaf_index_pointer = leaf_index_ref; *smaller_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; @@ -372,13 +369,6 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b cuda_hist_pool[leaf_index_ref] = cuda_hist + 2 * cur_max_leaf_index * cuda_num_total_bin_ref; *smaller_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[leaf_index_ref]; *larger_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[cur_max_leaf_index]; - //if (leaf_index_ref == 2) { - /*printf("*************** leaf %d sum_gradients %f sum_hessians %f\n", 
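Editor's note: the cuda_hist_pool bookkeeping above routes the smaller- and larger-leaf histogram pointers through one pooled device buffer in which every leaf owns a fixed slice of 2 * num_total_bin entries (gradient and hessian interleaved per bin). A sketch of the addressing only, with hist_t assumed to be double as in LightGBM; how the parent's slice is reassigned between the two children is decided by the kernel above:

#include <cstddef>

using hist_t = double;  // assumption: LightGBM's histogram entry type

// Illustrative helper (not in the patch): start of leaf `leaf_index`'s histogram inside
// the pooled buffer, matching `cuda_hist + 2 * leaf_index * num_total_bin` above.
__host__ __device__ inline hist_t* LeafHistogram(hist_t* cuda_hist, int leaf_index,
                                                 int num_total_bin) {
  return cuda_hist +
         2 * static_cast<std::size_t>(leaf_index) * static_cast<std::size_t>(num_total_bin);
}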
leaf_index_ref, *smaller_leaf_cuda_sum_of_gradients_pointer, - *smaller_leaf_cuda_sum_of_hessians_pointer); - printf("*************** leaf %d sum_gradients %f sum_hessians %f\n", cur_max_leaf_index, *larger_leaf_cuda_sum_of_gradients_pointer, - *larger_leaf_cuda_sum_of_hessians_pointer); - printf("leaf 2 cuda ptr in CUDA kernel = %ld\n", cuda_hist_pool[leaf_index_ref]);*/ - //} } else { *larger_leaf_cuda_leaf_index_pointer = leaf_index_ref; *larger_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; @@ -399,13 +389,6 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b cuda_hist_pool[cur_max_leaf_index] = cuda_hist + 2 * cur_max_leaf_index * cuda_num_total_bin_ref; *smaller_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[cur_max_leaf_index]; *larger_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[leaf_index_ref]; - /*printf("*************** leaf %d sum_gradients %f sum_hessians %f\n", cur_max_leaf_index, *smaller_leaf_cuda_sum_of_gradients_pointer, - *smaller_leaf_cuda_sum_of_hessians_pointer); - printf("*************** leaf %d sum_gradients %f sum_hessians %f\n", leaf_index_ref, *larger_leaf_cuda_sum_of_gradients_pointer, - *larger_leaf_cuda_sum_of_hessians_pointer); - if (leaf_index_ref == 2) { - printf("error unexpected, 2 should be the smaller leaf\n"); - }*/ } } } From 18d957a4e9058cde5f9e2b8994c13330f4581513 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 10 May 2021 05:53:11 +0000 Subject: [PATCH 009/166] boosting with cuda --- .../cuda/cuda_best_split_finder.cpp | 7 + .../cuda/cuda_best_split_finder.cu | 4 + .../cuda/cuda_best_split_finder.hpp | 2 + src/treelearner/cuda/cuda_data_partition.cpp | 47 ++++++ src/treelearner/cuda/cuda_data_partition.cu | 38 ++++- src/treelearner/cuda/cuda_data_partition.hpp | 46 +++++- .../cuda/cuda_histogram_constructor.cpp | 4 + .../cuda/cuda_histogram_constructor.hpp | 4 +- src/treelearner/cuda/cuda_leaf_splits.cpp | 16 ++ src/treelearner/cuda/cuda_leaf_splits.cu | 3 + src/treelearner/cuda/cuda_leaf_splits.hpp | 2 + .../cuda/new_cuda_tree_learner.cpp | 145 ++++++++++++++++-- .../cuda/new_cuda_tree_learner.hpp | 4 + 13 files changed, 305 insertions(+), 17 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 53f07e458064..f2543efacac8 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -106,6 +106,13 @@ void CUDABestSplitFinder::Init() { CopyFromHostToCUDADevice(cuda_min_gain_to_split_, &min_gain_to_split_, 1); } +void CUDABestSplitFinder::BeforeTrain() { + const size_t feature_best_split_info_buffer_size = static_cast(num_features_) * 4; + SetCUDAMemory(cuda_leaf_best_split_gain_, 0, static_cast(num_leaves_)); + SetCUDAMemory(cuda_best_split_found_, 0, feature_best_split_info_buffer_size); + SetCUDAMemory(cuda_best_split_gain_, 0, feature_best_split_info_buffer_size); +} + void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_leaf_splits, const CUDALeafSplits* larger_leaf_splits) { auto start = std::chrono::steady_clock::now(); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 9707845fa31f..1d7e15e36695 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -602,6 +602,10 @@ __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, *out_best_leaf = leaf_index; } } + if (best_gain <= 0.0f) 
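Editor's note: FindBestFromAllSplitsKernel selects, among all current leaves, the one whose stored best split has the highest gain; the added check merely warns when that gain is not positive. A minimal single-thread sketch of that selection under assumed buffer names (the real kernel may distribute the scan across threads, as the ReduceBestGainForLeaves helper in a later patch does):

#include <cstdint>

// Sketch only: scan the per-leaf best gains and report the winning leaf index.
__global__ void FindBestLeafSketch(const int* cur_num_leaves,
                                   const double* leaf_best_gain,
                                   const uint8_t* leaf_best_found,
                                   int* out_best_leaf) {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    double best_gain = -1e300;  // plays the role of kMinScore
    int best_leaf = -1;
    for (int leaf = 0; leaf < *cur_num_leaves; ++leaf) {
      if (leaf_best_found[leaf] && leaf_best_gain[leaf] > best_gain) {
        best_gain = leaf_best_gain[leaf];
        best_leaf = leaf;
      }
    }
    *out_best_leaf = best_leaf;
  }
}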
{ + printf("error !!! too smaller best gain %f\n", best_gain); + } + //printf("find best cuda_leaf_best_split_gain[%d] = %f\n", *out_best_leaf, best_gain); } void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves) { diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 18541c4e51e0..94591a6da17c 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -31,6 +31,8 @@ class CUDABestSplitFinder { void Init(); + void BeforeTrain(); + void FindBestSplitsForLeaf(const CUDALeafSplits* smaller_leaf_splits, const CUDALeafSplits* larger_leaf_splits); void FindBestFromAllSplits(const int* cuda_cur_num_leaves); diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index c579dca309fb..0b8b03983606 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -98,6 +98,24 @@ void CUDADataPartition::Init() { InitCUDAMemoryFromHostMemory(&cuda_feature_missing_is_na_, feature_missing_is_na_.data(), static_cast(num_features_)); InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_zero_, feature_mfb_is_zero_.data(), static_cast(num_features_)); InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_na_, feature_mfb_is_na_.data(), static_cast(num_features_)); + + AllocateCUDAMemory(static_cast(num_leaves_), &tree_split_leaf_index_); + AllocateCUDAMemory(static_cast(num_leaves_), &tree_inner_feature_index_); + AllocateCUDAMemory(static_cast(num_leaves_), &tree_threshold_); + AllocateCUDAMemory(static_cast(num_leaves_), &tree_left_output_); + AllocateCUDAMemory(static_cast(num_leaves_), &tree_right_output_); + AllocateCUDAMemory(static_cast(num_leaves_), &tree_left_count_); + AllocateCUDAMemory(static_cast(num_leaves_), &tree_right_count_); + AllocateCUDAMemory(static_cast(num_leaves_), &tree_left_sum_hessian_); + AllocateCUDAMemory(static_cast(num_leaves_), &tree_right_sum_hessian_); + AllocateCUDAMemory(static_cast(num_leaves_), &tree_gain_); + AllocateCUDAMemory(static_cast(num_leaves_), &tree_default_left_); + + AllocateCUDAMemory(static_cast(num_leaves_), &data_partition_leaf_output_); + + AllocateCUDAMemory(static_cast(num_data_), &train_data_score_tmp_); + + cpu_train_data_score_tmp_.resize(num_data_, 0.0f); } void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { @@ -111,12 +129,19 @@ void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { CopyFromCUDADeviceToCUDADevice(cuda_leaf_num_data_, cuda_num_data_, 1); CopyFromCUDADeviceToCUDADevice(cuda_leaf_data_end_, cuda_num_data_, 1); SynchronizeCUDADevice(); + cur_num_leaves_ = 1; + CopyFromHostToCUDADevice(cuda_cur_num_leaves_, &cur_num_leaves_, 1); + num_data_in_leaf_.clear(); + num_data_in_leaf_.resize(num_leaves_, 0); + num_data_in_leaf_[0] = num_data_; + CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1); } else { Log::Fatal("bagging is not supported by GPU"); } } void CUDADataPartition::Split(const int* leaf_id, + const double* best_split_gain, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left, @@ -146,6 +171,20 @@ void CUDADataPartition::Split(const int* leaf_id, global_timer.Stop("GenDataToLeftBitVector"); //Log::Warning("CUDADataPartition::GenDataToLeftBitVector time %f", duration); global_timer.Start("SplitInner"); + CopyFromCUDADeviceToCUDADevice(tree_split_leaf_index_ + cur_num_leaves_ - 1, leaf_id, 
1); + CopyFromCUDADeviceToCUDADevice(tree_inner_feature_index_ + cur_num_leaves_ - 1, best_split_feature + leaf_index_cpu, 1); + CopyFromCUDADeviceToCUDADevice(tree_threshold_ + cur_num_leaves_ - 1, best_split_threshold + leaf_index_cpu, 1); + CopyFromCUDADeviceToCUDADevice(tree_left_output_ + cur_num_leaves_ - 1, best_left_leaf_value + leaf_index_cpu, 1); + CopyFromCUDADeviceToCUDADevice(tree_right_output_ + cur_num_leaves_ - 1, best_right_leaf_value + leaf_index_cpu, 1); + CopyFromCUDADeviceToCUDADevice(tree_left_count_ + cur_num_leaves_ - 1, best_left_count + leaf_index_cpu, 1); + CopyFromCUDADeviceToCUDADevice(tree_right_count_ + cur_num_leaves_ - 1, best_right_count + leaf_index_cpu, 1); + CopyFromCUDADeviceToCUDADevice(tree_left_sum_hessian_ + cur_num_leaves_ - 1, best_left_sum_hessians + leaf_index_cpu, 1); + CopyFromCUDADeviceToCUDADevice(tree_right_sum_hessian_ + cur_num_leaves_ - 1, best_right_sum_hessians + leaf_index_cpu, 1); + CopyFromCUDADeviceToCUDADevice(tree_gain_ + cur_num_leaves_ - 1, best_split_gain + leaf_index_cpu, 1); + CopyFromCUDADeviceToCUDADevice(tree_default_left_ + cur_num_leaves_ - 1, best_split_default_left + leaf_index_cpu, 1); + CopyFromCUDADeviceToCUDADevice(data_partition_leaf_output_ + leaf_index_cpu, best_left_leaf_value + leaf_index_cpu, 1); + CopyFromCUDADeviceToCUDADevice(data_partition_leaf_output_ + cur_num_leaves_, best_right_leaf_value + leaf_index_cpu, 1); + start = std::chrono::steady_clock::now(); SplitInner(leaf_id, num_data_in_leaf, best_left_sum_gradients, best_left_sum_hessians, best_left_count, @@ -207,10 +246,18 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_index); + ++cur_num_leaves_; } Tree* CUDADataPartition::GetCPUTree() {} +void CUDADataPartition::UpdateTrainScore(const double learning_rate, double* train_score) { + LaunchAddPredictionToScoreKernel(learning_rate); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (data_size_t i = 0; i < num_data_; ++i) { + train_score[i] += cpu_train_data_score_tmp_[i]; + } +} } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index a52fa515ce0b..8ad843f74dd8 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -7,6 +7,7 @@ #ifdef USE_CUDA #include "cuda_data_partition.hpp" +#include namespace LightGBM { @@ -351,7 +352,7 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b *smaller_leaf_cuda_leaf_index_pointer = leaf_index_ref; *smaller_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; *smaller_leaf_cuda_sum_of_hessians_pointer = best_left_sum_hessians[leaf_index_ref]; - *smaller_leaf_cuda_num_data_in_leaf_pointer = best_left_count[leaf_index_ref]; + *smaller_leaf_cuda_num_data_in_leaf_pointer = to_left_total_cnt;//best_left_count[leaf_index_ref]; *smaller_leaf_cuda_gain_pointer = best_left_gain[leaf_index_ref]; *smaller_leaf_cuda_leaf_value_pointer = best_left_leaf_value[leaf_index_ref]; *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[leaf_index_ref]; @@ -359,7 +360,7 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b *larger_leaf_cuda_leaf_index_pointer = cur_max_leaf_index; 
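Editor's note: the device-to-device copies above record each split entirely on the GPU: every field is written at index cur_num_leaves_ - 1, so the full split history of one tree is available without a host round trip until BuildTree() reads it back. Viewed from the host, the parallel tree_* arrays form a struct-of-arrays; a hypothetical array-of-structs equivalent for reference (the type itself is not in the patch):

#include <cstdint>

// One entry per Split() call while growing a tree; field names map onto the
// corresponding tree_* device arrays allocated in CUDADataPartition::Init().
struct CUDASplitRecord {
  int leaf_index;               // tree_split_leaf_index_
  int inner_feature_index;      // tree_inner_feature_index_
  std::uint32_t threshold;      // tree_threshold_ (bin index, not a real feature value)
  double left_output;           // tree_left_output_
  double right_output;          // tree_right_output_
  int left_count;               // tree_left_count_  (data_size_t, a 32-bit int)
  int right_count;              // tree_right_count_
  double left_sum_hessian;      // tree_left_sum_hessian_
  double right_sum_hessian;     // tree_right_sum_hessian_
  double gain;                  // tree_gain_
  std::uint8_t default_left;    // tree_default_left_
};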
*larger_leaf_cuda_sum_of_gradients_pointer = best_right_sum_gradients[leaf_index_ref]; *larger_leaf_cuda_sum_of_hessians_pointer = best_right_sum_hessians[leaf_index_ref]; - *larger_leaf_cuda_num_data_in_leaf_pointer = best_right_count[leaf_index_ref]; + *larger_leaf_cuda_num_data_in_leaf_pointer = cuda_leaf_num_data[cur_max_leaf_index];//best_right_count[leaf_index_ref]; *larger_leaf_cuda_gain_pointer = best_right_gain[leaf_index_ref]; *larger_leaf_cuda_leaf_value_pointer = best_right_leaf_value[leaf_index_ref]; *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[cur_max_leaf_index]; @@ -373,7 +374,7 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b *larger_leaf_cuda_leaf_index_pointer = leaf_index_ref; *larger_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; *larger_leaf_cuda_sum_of_hessians_pointer = best_left_sum_hessians[leaf_index_ref]; - *larger_leaf_cuda_num_data_in_leaf_pointer = best_left_count[leaf_index_ref]; + *larger_leaf_cuda_num_data_in_leaf_pointer = to_left_total_cnt;//best_left_count[leaf_index_ref]; *larger_leaf_cuda_gain_pointer = best_left_gain[leaf_index_ref]; *larger_leaf_cuda_leaf_value_pointer = best_left_leaf_value[leaf_index_ref]; *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[leaf_index_ref]; @@ -381,7 +382,7 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b *smaller_leaf_cuda_leaf_index_pointer = cur_max_leaf_index; *smaller_leaf_cuda_sum_of_gradients_pointer = best_right_sum_gradients[leaf_index_ref]; *smaller_leaf_cuda_sum_of_hessians_pointer = best_right_sum_hessians[leaf_index_ref]; - *smaller_leaf_cuda_num_data_in_leaf_pointer = best_right_count[leaf_index_ref]; + *smaller_leaf_cuda_num_data_in_leaf_pointer = cuda_leaf_num_data[cur_max_leaf_index];//best_right_count[leaf_index_ref]; *smaller_leaf_cuda_gain_pointer = best_right_gain[leaf_index_ref]; *smaller_leaf_cuda_leaf_value_pointer = best_right_leaf_value[leaf_index_ref]; *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[cur_max_leaf_index]; @@ -648,6 +649,35 @@ void CUDADataPartition::LaunchPrefixSumKernel(uint32_t* cuda_elements) { SynchronizeCUDADevice(); } +__global__ void AddPredictionToScoreKernel(const double* data_partition_leaf_output, + const data_size_t* num_data_in_leaf, const data_size_t* data_indices_in_leaf, + const data_size_t* leaf_data_start, const double learning_rate, double* output_score) { + const unsigned int threadIdx_x = threadIdx.x; + const unsigned int blockIdx_x = blockIdx.x; + const unsigned int blockDim_x = blockDim.x; + const data_size_t num_data = num_data_in_leaf[blockIdx_x]; + const data_size_t* data_indices = data_indices_in_leaf + leaf_data_start[blockIdx_x]; + const double leaf_prediction_value = data_partition_leaf_output[blockIdx_x] * learning_rate; + /*if (threadIdx_x == 0) { + printf("leaf index = %d, leaf_prediction_value = %f\n", blockIdx_x, leaf_prediction_value); + }*/ + for (unsigned int offset = 0; offset < static_cast(num_data); offset += blockDim_x) { + const data_size_t inner_data_index = static_cast(offset + threadIdx_x); + if (inner_data_index < num_data) { + const data_size_t data_index = data_indices[inner_data_index]; + output_score[data_index] = leaf_prediction_value; + } + } +} + +void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double learning_rate) { + 
AddPredictionToScoreKernel<<>>(data_partition_leaf_output_, + cuda_leaf_num_data_, cuda_data_indices_, cuda_leaf_data_start_, learning_rate, train_data_score_tmp_); + SynchronizeCUDADevice(); + CopyFromCUDADeviceToHost(cpu_train_data_score_tmp_.data(), train_data_score_tmp_, static_cast(num_data_)); + SynchronizeCUDADevice(); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 877c5d2c7221..5b05c618927a 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -31,7 +31,7 @@ class CUDADataPartition { void BeforeTrain(const data_size_t* data_indices); - void Split(const int* leaf_id, const int* best_split_feature, + void Split(const int* leaf_id, const double* best_split_gain, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left, const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, @@ -132,6 +132,8 @@ class CUDADataPartition { } } + void UpdateTrainScore(const double learning_rate, double* train_score); + const data_size_t* cuda_leaf_data_start() const { return cuda_leaf_data_start_; } const data_size_t* cuda_leaf_data_end() const { return cuda_leaf_data_end_; } @@ -144,6 +146,30 @@ class CUDADataPartition { const int* cuda_cur_num_leaves() const { return cuda_cur_num_leaves_; } + const int* tree_split_leaf_index() const { return tree_split_leaf_index_; } + + const int* tree_inner_feature_index() const { return tree_inner_feature_index_; } + + const uint32_t* tree_threshold() const { return tree_threshold_; } + + const double* tree_left_output() const { return tree_left_output_; } + + const double* tree_right_output() const { return tree_right_output_; } + + const data_size_t* tree_left_count() const { return tree_left_count_; } + + const data_size_t* tree_right_count() const { return tree_right_count_; } + + const double* tree_left_sum_hessian() const { return tree_left_sum_hessian_; } + + const double* tree_right_sum_hessian() const { return tree_right_sum_hessian_; } + + const double* tree_gain() const { return tree_gain_; } + + const uint8_t* tree_default_left() const { return tree_default_left_; } + + const double* train_data_score_tmp() const { return train_data_score_tmp_; } + private: void GenDataToLeftBitVector(const int* leaf_id, const data_size_t num_data_in_leaf, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); @@ -190,6 +216,8 @@ class CUDADataPartition { void LaunchPrefixSumKernel(uint32_t* cuda_elements); + void LaunchAddPredictionToScoreKernel(const double learning_rate); + // Host memory const data_size_t num_data_; const int num_features_; @@ -207,6 +235,7 @@ class CUDADataPartition { std::vector feature_mfb_is_na_; std::vector num_data_in_leaf_; int cur_num_leaves_; + std::vector cpu_train_data_score_tmp_; // CUDA memory, held by this object data_size_t* cuda_data_indices_; @@ -230,6 +259,21 @@ class CUDADataPartition { int* cuda_num_total_bin_; // for histogram pool hist_t** cuda_hist_pool_; + // for tree structure + int* tree_split_leaf_index_; + int* tree_inner_feature_index_; + uint32_t* tree_threshold_; + double* tree_left_output_; + double* tree_right_output_; + data_size_t* tree_left_count_; + data_size_t* tree_right_count_; + double* tree_left_sum_hessian_; + double* 
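Editor's note: AddPredictionToScoreKernel and its launcher update the boosting scores: one block per leaf writes learning_rate * leaf_output into a temporary device buffer, which is then copied to the host and accumulated into the training scores with OpenMP. The kernel's <<<...>>> launch arguments were lost in extraction, so the grid/block sizes below are assumptions, as are the sketch names:

#include <cuda_runtime.h>
#include <vector>

using data_size_t = int;  // assumption: LightGBM's data_size_t is a 32-bit int

// Simplified restatement: one block per leaf; threads stride over that leaf's data indices.
__global__ void AddPredictionToScoreSketch(const double* leaf_output,
                                           const data_size_t* leaf_num_data,
                                           const data_size_t* data_indices,
                                           const data_size_t* leaf_data_start,
                                           const double learning_rate,
                                           double* out_score) {
  const data_size_t num_data = leaf_num_data[blockIdx.x];
  const data_size_t* indices = data_indices + leaf_data_start[blockIdx.x];
  const double value = leaf_output[blockIdx.x] * learning_rate;
  for (data_size_t i = static_cast<data_size_t>(threadIdx.x); i < num_data;
       i += static_cast<data_size_t>(blockDim.x)) {
    out_score[indices[i]] = value;
  }
}

// Assumed launch: one block per current leaf, a fixed block size of 1024 threads.
void UpdateTrainScoreSketch(int num_leaves, data_size_t num_data, double learning_rate,
                            const double* d_leaf_output, const data_size_t* d_leaf_num_data,
                            const data_size_t* d_data_indices,
                            const data_size_t* d_leaf_data_start,
                            double* d_score_tmp, std::vector<double>* h_score_tmp,
                            double* train_score) {
  AddPredictionToScoreSketch<<<num_leaves, 1024>>>(d_leaf_output, d_leaf_num_data,
                                                   d_data_indices, d_leaf_data_start,
                                                   learning_rate, d_score_tmp);
  cudaDeviceSynchronize();
  cudaMemcpy(h_score_tmp->data(), d_score_tmp, sizeof(double) * num_data,
             cudaMemcpyDeviceToHost);
  #pragma omp parallel for schedule(static)
  for (data_size_t i = 0; i < num_data; ++i) {
    train_score[i] += (*h_score_tmp)[i];
  }
}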
tree_right_sum_hessian_; + double* tree_gain_; + uint8_t* tree_default_left_; + double* data_partition_leaf_output_; + // for train data update + double* train_data_score_tmp_; // CUDA memory, held by other object const data_size_t* cuda_num_data_; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 74b4245b5e07..3e869ffc0081 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -41,6 +41,10 @@ CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, num_total_bin_ = offset; } +void CUDAHistogramConstructor::BeforeTrain() { + SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_); +} + void CUDAHistogramConstructor::Init(const Dataset* train_data) { // allocate CPU memory data_.resize(num_data_ * num_feature_groups_, 0); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 470d08ef69ab..73b7cdd0cf45 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -40,6 +40,8 @@ class CUDAHistogramConstructor { const double* cuda_larger_leaf_sum_gradients, const double* cuda_larger_leaf_sum_hessians, hist_t** cuda_larger_leaf_hist, const data_size_t* cuda_leaf_num_data); + void BeforeTrain(); + const hist_t* cuda_hist() const { return cuda_hist_; } hist_t* cuda_hist_pointer() const { return cuda_hist_; } @@ -63,7 +65,7 @@ class CUDAHistogramConstructor { for (int i = 0; i < 100; ++i) { Log::Warning("bin %d grad %f hess %f", i, test_hist[2 * i], test_hist[2 * i + 1]); }*/ - const hist_t* leaf_2_cuda_hist_ptr = cuda_hist_ + 3 * 2 * num_total_bin_; + const hist_t* leaf_2_cuda_hist_ptr = cuda_hist_;// + 3 * 2 * num_total_bin_; Log::Warning("cuda_hist_ptr = %ld", leaf_2_cuda_hist_ptr); CopyFromCUDADeviceToHost(test_hist.data(), leaf_2_cuda_hist_ptr, 2 * num_total_bin_); std::ofstream fout("leaf_2_cuda_hist.txt"); diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index be190d9a6c37..a27fa834b766 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -57,10 +57,26 @@ void CUDALeafSplits::InitValues(const double* cuda_sum_of_gradients, const doubl SynchronizeCUDADevice(); } +void CUDALeafSplits::InitValues() { + SetCUDAMemory(cuda_sum_of_gradients_, 0, num_blocks_init_from_gradients_); + SetCUDAMemory(cuda_sum_of_hessians_, 0, num_blocks_init_from_gradients_); + const int larger_leaf_index = -1; + CopyFromHostToCUDADevice(cuda_leaf_index_, &larger_leaf_index, 1); + SetCUDAMemory(cuda_gain_, 0, 1); + SetCUDAMemory(cuda_leaf_value_, 0, 1); + SynchronizeCUDADevice(); +} + void CUDALeafSplits::InitValues(const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf) { + SetCUDAMemory(cuda_sum_of_gradients_, 0, num_blocks_init_from_gradients_); + SetCUDAMemory(cuda_sum_of_hessians_, 0, num_blocks_init_from_gradients_); LaunchInitValuesKernal(); + SetCUDAMemory(cuda_leaf_index_, 0, 1); CopyFromHostToCUDADevice(cuda_data_indices_in_leaf_, &cuda_data_indices_in_leaf, 1); CopyFromHostToCUDADevice(cuda_hist_in_leaf_, &cuda_hist_in_leaf, 1); + CopyFromHostToCUDADevice(cuda_num_data_in_leaf_, &num_data_, 1); + SetCUDAMemory(cuda_gain_, 0, 1); + SetCUDAMemory(cuda_leaf_value_, 0, 1); SynchronizeCUDADevice(); } diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu 
b/src/treelearner/cuda/cuda_leaf_splits.cu index 984140478718..af844623f4fb 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -48,6 +48,8 @@ __global__ void CUDAInitValuesKernel2(double* cuda_sum_of_gradients, double* cud sum_of_gradients += cuda_sum_of_gradients[i]; sum_of_hessians += cuda_sum_of_hessians[i]; } + //printf("sum_of_gradients = %f\n", sum_of_gradients); + //printf("sum_of_hessians = %f\n", sum_of_hessians); cuda_sum_of_gradients[0] += sum_of_gradients; cuda_sum_of_hessians[0] += sum_of_hessians; } @@ -69,6 +71,7 @@ void CUDALeafSplits::LaunchInitValuesKernal() { SynchronizeCUDADevice(); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); + //Log::Warning("cuda_sum_of_gradients_ = %f, cuda_sum_of_hessians_ = %f", *cuda_sum_of_gradients_, *cuda_sum_of_hessians_); Log::Warning("CUDAInitValuesKernel2 duration = %f", duration.count()); } diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index 47b0aefca29c..5af6c1ae5480 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -35,6 +35,8 @@ class CUDALeafSplits { void InitValues(const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf); + void InitValues(); + const int* cuda_leaf_index() const { return cuda_leaf_index_; } const data_size_t** cuda_data_indices_in_leaf() const { return cuda_data_indices_in_leaf_; } diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 9498259a1fab..58effa0b9693 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -55,17 +55,28 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia void NewCUDATreeLearner::BeforeTrain() { auto start = std::chrono::steady_clock::now(); - cuda_centralized_info_->BeforeTrain(gradients_, hessians_); + cuda_data_partition_->BeforeTrain(nullptr); auto end = std::chrono::steady_clock::now(); auto duration = static_cast>(end - start); - Log::Warning("cuda_centralized_info_->BeforeTrain duration = %f", duration.count()); + //Log::Warning("cuda_data_partition_->BeforeTrain duration = %f", duration.count()); + start = std::chrono::steady_clock::now(); + cuda_centralized_info_->BeforeTrain(gradients_, hessians_); + end = std::chrono::steady_clock::now(); + duration = static_cast>(end - start); + //Log::Warning("cuda_centralized_info_->BeforeTrain duration = %f", duration.count()); cuda_smaller_leaf_splits_->InitValues(cuda_data_partition_->cuda_data_indices(), cuda_histogram_constructor_->cuda_hist_pointer()); + cuda_larger_leaf_splits_->InitValues(); //cuda_smaller_leaf_splits_->Test(); start = std::chrono::steady_clock::now(); - cuda_data_partition_->BeforeTrain(nullptr); + cuda_histogram_constructor_->BeforeTrain(); + end = std::chrono::steady_clock::now(); + duration = static_cast>(end - start); + //Log::Warning("cuda_histogram_constructor_->BeforeTrain() duration = %f", duration.count()); + start = std::chrono::steady_clock::now(); + cuda_best_split_finder_->BeforeTrain(); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); - Log::Warning("cuda_data_partition_->BeforeTrain duration = %f", duration.count()); + //Log::Warning("cuda_best_split_finder_->BeforeTrain() duration = %f", duration.count()); //cuda_data_partition_->Test(); //SerialTreeLearner::BeforeTrain(); @@ -216,6 +227,106 @@ void 
NewCUDATreeLearner::Split(Tree* /*tree*/, int /*best_leaf*/, }*/ } +/*void NewCUDATreeLearner::SplitTree(Tree* tree) { + int leaf_index = 0; + int inner_feature_index = 0; + uint32_t threshold = 0; + double left_output = 0.0f; + double right_output = 0.0f; + data_size_t left_count = 0; + data_size_t right_count = 0; + double left_sum_hessian = 0.0f; + double right_sum_hessian = 0.0f; + double gain = 0.0f; + uint8_t default_left = 0; + CopyFromCUDADeviceToHost(&leaf_index, cuda_best_split_finder_->cuda_best_leaf(), 1); + CopyFromCUDADeviceToHost(&inner_feature_index, cuda_best_split_finder_->cuda_leaf_best_split_feature() + leaf_index, 1); + CopyFromCUDADeviceToHost(&threshold, cuda_best_split_finder_->cuda_leaf_best_split_threshold() + leaf_index, 1); + CopyFromCUDADeviceToHost(&left_output, cuda_best_split_finder_->cuda_leaf_best_split_left_output() + leaf_index, 1); + CopyFromCUDADeviceToHost(&right_output, cuda_best_split_finder_->cuda_leaf_best_split_right_output() + leaf_index, 1); + CopyFromCUDADeviceToHost(&left_count, cuda_best_split_finder_->cuda_leaf_best_split_left_count() + leaf_index, 1); + CopyFromCUDADeviceToHost(&right_count, cuda_best_split_finder_->cuda_leaf_best_split_right_count() + leaf_index, 1); + CopyFromCUDADeviceToHost(&left_sum_hessian, cuda_best_split_finder_->cuda_leaf_best_split_left_sum_hessian() + leaf_index, 1); + CopyFromCUDADeviceToHost(&right_sum_hessian, cuda_best_split_finder_->cuda_leaf_best_split_right_sum_hessian() + leaf_index, 1); + CopyFromCUDADeviceToHost(&gain, cuda_best_split_finder_->cuda_leaf_best_split_gain() + leaf_index, 1); + CopyFromCUDADeviceToHost(&default_left, cuda_best_split_finder_->cuda_leaf_best_split_default_left() + leaf_index, 1); + SynchronizeCUDADevice(); + int real_feature_index = train_data_->RealFeatureIndex(inner_feature_index); + double real_split_threshold = train_data_->RealThreshold(inner_feature_index, threshold); + tree->Split(leaf_index, inner_feature_index, real_feature_index, threshold, real_split_threshold, left_output, right_output, left_count, right_count, + left_sum_hessian, right_sum_hessian, gain, train_data_->FeatureBinMapper(inner_feature_index)->missing_type(), static_cast(default_left)); +}*/ + +void NewCUDATreeLearner::AddPredictionToScore(const Tree* /*tree*/, double* out_score) const { + const auto start = std::chrono::steady_clock::now(); + cuda_data_partition_->UpdateTrainScore(config_->learning_rate, out_score); + const auto end = std::chrono::steady_clock::now(); + const auto duration = static_cast>(end - start).count(); + Log::Warning("AddPredictionToScore time %f", duration); +} + +Tree* NewCUDATreeLearner::BuildTree() { + std::unique_ptr tree(new Tree(config_->num_leaves, false, false)); + std::vector leaf_index(config_->num_leaves); + std::vector inner_feature_index(config_->num_leaves); + std::vector threshold(config_->num_leaves); + std::vector left_output(config_->num_leaves); + std::vector right_output(config_->num_leaves); + std::vector left_count(config_->num_leaves); + std::vector right_count(config_->num_leaves); + std::vector left_sum_hessian(config_->num_leaves); + std::vector right_sum_hessian(config_->num_leaves); + std::vector gain(config_->num_leaves); + std::vector default_left(config_->num_leaves); + //Log::Warning("BuildTree step 0"); + CopyFromCUDADeviceToHost(leaf_index.data(), cuda_data_partition_->tree_split_leaf_index(), config_->num_leaves); + CopyFromCUDADeviceToHost(inner_feature_index.data(), cuda_data_partition_->tree_inner_feature_index(), config_->num_leaves); + 
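Editor's note: the BuildTree() routine being added here pulls the recorded split log back to the host, one CopyFromCUDADeviceToHost call per field, and then replays the num_leaves - 1 splits on a CPU Tree in the order they were made, translating bin thresholds and inner feature indices to real values only at that point. A sketch of what the copy helper is assumed to do (the wrapper name exists in the patch; its body here is an assumption):

#include <cuda_runtime.h>

// Assumption: CopyFromCUDADeviceToHost<T>(dst, src, n) is a typed wrapper over cudaMemcpy.
template <typename T>
void CopyFromCUDADeviceToHostSketch(T* host_dst, const T* device_src, size_t n) {
  cudaMemcpy(host_dst, device_src, n * sizeof(T), cudaMemcpyDeviceToHost);
}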
CopyFromCUDADeviceToHost(threshold.data(), cuda_data_partition_->tree_threshold(), config_->num_leaves); + CopyFromCUDADeviceToHost(left_output.data(), cuda_data_partition_->tree_left_output(), config_->num_leaves); + CopyFromCUDADeviceToHost(right_output.data(), cuda_data_partition_->tree_right_output(), config_->num_leaves); + CopyFromCUDADeviceToHost(left_count.data(), cuda_data_partition_->tree_left_count(), config_->num_leaves); + CopyFromCUDADeviceToHost(right_count.data(), cuda_data_partition_->tree_right_count(), config_->num_leaves); + CopyFromCUDADeviceToHost(left_sum_hessian.data(), cuda_data_partition_->tree_left_sum_hessian(), config_->num_leaves); + CopyFromCUDADeviceToHost(right_sum_hessian.data(), cuda_data_partition_->tree_right_sum_hessian(), config_->num_leaves); + CopyFromCUDADeviceToHost(gain.data(), cuda_data_partition_->tree_gain(), config_->num_leaves); + CopyFromCUDADeviceToHost(default_left.data(), cuda_data_partition_->tree_default_left(), config_->num_leaves); + //Log::Warning("BuildTree step 1"); + for (int i = 0; i < config_->num_leaves - 1; ++i) { + /*Log::Warning("BuildTree step 2"); + Log::Warning("leaf_index[i] = %d", leaf_index[i]); + Log::Warning("inner_feature_index[i] = %d", inner_feature_index[i]); + Log::Warning("train_data_->RealFeatureIndex(inner_feature_index[i]) = %d", train_data_->RealFeatureIndex(inner_feature_index[i])); + Log::Warning("threshold[i] = %d", threshold[i]); + Log::Warning("train_data_->RealThreshold(inner_feature_index[i], threshold[i]) = %f", train_data_->RealThreshold(inner_feature_index[i], threshold[i])); + Log::Warning("left_output[i] = %f", left_output[i]); + Log::Warning("right_output[i] = %f", right_output[i]); + Log::Warning("left_count[i] = %d", left_count[i]); + Log::Warning("right_count[i] = %d", right_count[i]); + Log::Warning("left_sum_hessian[i] = %f", left_sum_hessian[i]); + Log::Warning("right_sum_hessian[i] = %f", right_sum_hessian[i]); + Log::Warning("gain[i] = %f", gain[i]); + Log::Warning("train_data_->FeatureBinMapper(inner_feature_index[i])->missing_type() = %d", train_data_->FeatureBinMapper(inner_feature_index[i])->missing_type()); + Log::Warning("default_left[i] = %d", default_left[i]);*/ + tree->Split( + leaf_index[i], + inner_feature_index[i], + train_data_->RealFeatureIndex(inner_feature_index[i]), + threshold[i], + train_data_->RealThreshold(inner_feature_index[i], threshold[i]), + left_output[i], + right_output[i], + left_count[i], + right_count[i], + left_sum_hessian[i], + right_sum_hessian[i], + gain[i], + train_data_->FeatureBinMapper(inner_feature_index[i])->missing_type(), + static_cast(default_left[i])); + } + //Log::Warning("BuildTree step 3"); + return tree.release(); +} + Tree* NewCUDATreeLearner::Train(const score_t* gradients, const score_t* hessians, bool /*is_first_tree*/) { gradients_ = gradients; @@ -227,6 +338,8 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, double construct_histogram_time = 0.0f; double find_best_split_time = 0.0f; double split_data_indices_time = 0.0f; + double split_tree_time = 0.0f; + //std::unique_ptr tree(new Tree(config_->num_leaves, false, false)); for (int i = 0; i < config_->num_leaves - 1; ++i) { //Log::Warning("Before ConstructHistogramForLeaf"); auto start = std::chrono::steady_clock::now(); @@ -242,12 +355,12 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, cuda_larger_leaf_splits_->cuda_sum_of_hessians_pointer(), cuda_larger_leaf_splits_->cuda_hist_in_leaf_pointer_pointer(), cuda_data_partition_->cuda_leaf_num_data()); + /*if 
(i == 0) { + cuda_histogram_constructor_->TestAfterConstructHistogram(); + }*/ auto end = std::chrono::steady_clock::now(); auto duration = static_cast>(end - start); construct_histogram_time += duration.count(); - /*if (i == 3) { - cuda_histogram_constructor_->TestAfterConstructHistogram(); - }*/ //Log::Warning("Before FindBestSplitsForLeaf"); start = std::chrono::steady_clock::now(); cuda_best_split_finder_->FindBestSplitsForLeaf(cuda_smaller_leaf_splits_.get(), @@ -258,8 +371,14 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, duration = static_cast>(end - start); find_best_split_time += duration.count(); //Log::Warning("Before Split"); + //start = std::chrono::steady_clock::now(); + //SplitTree(tree.get()); + //end = std::chrono::steady_clock::now(); + //duration = static_cast>(end - start); + //split_tree_time += duration.count(); start = std::chrono::steady_clock::now(); cuda_data_partition_->Split(cuda_best_split_finder_->cuda_best_leaf(), + cuda_best_split_finder_->cuda_leaf_best_split_gain(), cuda_best_split_finder_->cuda_leaf_best_split_feature(), cuda_best_split_finder_->cuda_leaf_best_split_threshold(), cuda_best_split_finder_->cuda_leaf_best_split_default_left(), @@ -294,17 +413,21 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); split_data_indices_time += duration.count(); - /*if (i == 2) { - cuda_data_partition_->TestAfterSplit(); - }*/ } const auto end = std::chrono::steady_clock::now(); const double duration = (static_cast>(end - start)).count(); + const auto build_tree_start = std::chrono::steady_clock::now(); + //Log::Warning("Before BuildTree"); + std::unique_ptr tree(BuildTree()); + const auto build_tree_end = std::chrono::steady_clock::now(); + const auto build_tre_duration = (static_cast>(build_tree_end - build_tree_start)).count(); Log::Warning("Train time %f", duration); Log::Warning("before train time %f", static_cast>(before_train_end - before_train_start).count()); Log::Warning("construct histogram time %f", construct_histogram_time); Log::Warning("find best split time %f", find_best_split_time); Log::Warning("split data indices time %f", split_data_indices_time); + //Log::Warning("split tree time %f", split_tree_time); + Log::Warning("build tree time %f", build_tre_duration); global_timer.Print(); /*cuda_data_partition_->Test(); cuda_histogram_constructor_->ConstructHistogramForLeaf( @@ -343,7 +466,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, device_splitters_[0]->leaf_num_data(), device_splitters_[0]->leaf_num_data_offsets(), device_splitters_[0]->data_indices(), device_gradients_[0], device_hessians_[0], device_gradients_and_hessians_[0]); Log::Warning("after construction of root histograms");*/ - return nullptr; + return tree.release(); } void NewCUDATreeLearner::ResetTrainingData(const Dataset* /*train_data*/, diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index 4e1a7a17ea94..0b181b9bbf14 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -32,6 +32,8 @@ class NewCUDATreeLearner: public SerialTreeLearner { void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override; + void AddPredictionToScore(const Tree* tree, double* out_score) const override; + protected: void AllocateFeatureTasks(); @@ -51,6 +53,8 @@ class NewCUDATreeLearner: public SerialTreeLearner { void 
BeforeTrain() override; + Tree* BuildTree(); + // number of GPUs int num_gpus_; // number of threads on CPU From 28186c033732eca2e6889f1cc3987e3b62316aa6 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 13 May 2021 07:40:05 +0000 Subject: [PATCH 010/166] optimize for best split find --- .../cuda/cuda_best_split_finder.cu | 782 ++++++++++++++---- .../cuda/cuda_best_split_finder.hpp | 5 +- src/treelearner/cuda/cuda_data_partition.cu | 1 + .../cuda/new_cuda_tree_learner.cpp | 17 +- 4 files changed, 622 insertions(+), 183 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 1d7e15e36695..1f72c275a870 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -10,6 +10,120 @@ namespace LightGBM { +#define CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(n) \ + ((n) + ((n) >> LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER)) \ + +__device__ void PrefixSumHist(hist_t* elements, unsigned int n) { + unsigned int offset = 1; + unsigned int threadIdx_x = threadIdx.x; + const unsigned int conflict_free_n_minus_1 = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(n - 1); + const hist_t last_element = elements[conflict_free_n_minus_1]; + __syncthreads(); + for (int d = (n >> 1); d > 0; d >>= 1) { + if (threadIdx_x < d) { + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + elements[CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(dst_pos)] += elements[CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(src_pos)]; + } + offset <<= 1; + __syncthreads(); + } + if (threadIdx_x == 0) { + elements[conflict_free_n_minus_1] = 0; + } + __syncthreads(); + for (int d = 1; d < n; d <<= 1) { + offset >>= 1; + if (threadIdx_x < d) { + const unsigned int dst_pos = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(offset * (2 * threadIdx_x + 2) - 1); + const unsigned int src_pos = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(offset * (2 * threadIdx_x + 1) - 1); + //const unsigned int conflict_free_dst_pos = (dst_pos); + //const unsigned int conflict_free_src_pos = (src_pos); + const hist_t src_val = elements[src_pos]; + elements[src_pos] = elements[dst_pos]; + elements[dst_pos] += src_val; + } + __syncthreads(); + } + if (threadIdx_x == 0) { + elements[CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(n)] = elements[conflict_free_n_minus_1] + last_element; + } + __syncthreads(); +} + +__device__ void PrefixSumHistCnt(data_size_t* elements, unsigned int n) { + unsigned int offset = 1; + unsigned int threadIdx_x = threadIdx.x; + const unsigned int conflict_free_n_minus_1 = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(n - 1); + const data_size_t last_element = elements[conflict_free_n_minus_1]; + __syncthreads(); + for (int d = (n >> 1); d > 0; d >>= 1) { + if (threadIdx_x < d) { + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + elements[CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(dst_pos)] += elements[CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(src_pos)]; + } + offset <<= 1; + __syncthreads(); + } + if (threadIdx_x == 0) { + elements[conflict_free_n_minus_1] = 0; + } + __syncthreads(); + for (int d = 1; d < n; d <<= 1) { + offset >>= 1; + if (threadIdx_x < d) { + const unsigned int dst_pos = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(offset * (2 * threadIdx_x + 2) - 1); + const unsigned int src_pos = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(offset * (2 * threadIdx_x + 1) - 1); + //const unsigned int conflict_free_dst_pos = 
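Editor's note: PrefixSumHist and PrefixSumHistCnt below are the standard work-efficient (Blelloch) exclusive scan in shared memory: an up-sweep builds partial sums in place, a down-sweep turns them into prefix sums, and the grand total is parked in slot n; the CONFLICT_FREE_INDEX padding spreads consecutive elements across shared-memory banks. A self-contained sketch of the same pattern on a generic element type (the macro value, padding name and function name are illustrative):

#include <cuda_runtime.h>

#define LOG_NUM_BANKS_SKETCH 5  // assumption: 32 shared-memory banks
#define CONFLICT_FREE(i) ((i) + ((i) >> LOG_NUM_BANKS_SKETCH))

// Exclusive scan over n elements (n a power of two) held in padded shared memory;
// call from one block with at least n / 2 active threads. Slot n receives the total.
template <typename T>
__device__ void BlellochExclusiveScan(T* data, unsigned int n) {
  const unsigned int tid = threadIdx.x;
  const T last = data[CONFLICT_FREE(n - 1)];
  unsigned int offset = 1;
  __syncthreads();
  // up-sweep: build a reduction tree of partial sums in place
  for (unsigned int d = n >> 1; d > 0; d >>= 1) {
    if (tid < d) {
      const unsigned int src = offset * (2 * tid + 1) - 1;
      const unsigned int dst = offset * (2 * tid + 2) - 1;
      data[CONFLICT_FREE(dst)] += data[CONFLICT_FREE(src)];
    }
    offset <<= 1;
    __syncthreads();
  }
  if (tid == 0) data[CONFLICT_FREE(n - 1)] = T(0);
  __syncthreads();
  // down-sweep: propagate the partial sums back down as exclusive prefix sums
  for (unsigned int d = 1; d < n; d <<= 1) {
    offset >>= 1;
    if (tid < d) {
      const unsigned int src = CONFLICT_FREE(offset * (2 * tid + 1) - 1);
      const unsigned int dst = CONFLICT_FREE(offset * (2 * tid + 2) - 1);
      const T tmp = data[src];
      data[src] = data[dst];
      data[dst] += tmp;
    }
    __syncthreads();
  }
  // as in PrefixSumHist, the element past the end holds the total sum
  if (tid == 0) data[CONFLICT_FREE(n)] = data[CONFLICT_FREE(n - 1)] + last;
  __syncthreads();
}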
(dst_pos); + //const unsigned int conflict_free_src_pos = (src_pos); + const data_size_t src_val = elements[src_pos]; + elements[src_pos] = elements[dst_pos]; + elements[dst_pos] += src_val; + } + __syncthreads(); + } + if (threadIdx_x == 0) { + elements[CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(n)] = elements[conflict_free_n_minus_1] + last_element; + } +} + +__device__ void ReduceBestGain(double* gain, hist_t* sum_gradients, + hist_t* sum_hessians, data_size_t* num_data, uint8_t* found, + uint32_t* threshold_value) { + const unsigned int tid = threadIdx.x; + const unsigned int conflict_free_tid_plus_1 = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(tid + 1); + for (unsigned int s = 1; s < MAX_NUM_BIN_IN_FEATURE; s *= 2) { + if (tid % (2 * s) == 0 && (tid + s) < MAX_NUM_BIN_IN_FEATURE) { + const uint32_t tid_s = tid + s; + const uint32_t conflict_free_tid_s_plus_1 = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(tid_s + 1); + if ((found[tid_s] && !found[tid]) || (found[tid_s] && found[tid] && gain[tid_s] > gain[tid])) { + gain[tid] = gain[tid_s]; + sum_gradients[conflict_free_tid_plus_1] = sum_gradients[conflict_free_tid_s_plus_1]; + sum_hessians[conflict_free_tid_plus_1] = sum_hessians[conflict_free_tid_s_plus_1]; + num_data[conflict_free_tid_plus_1] = num_data[conflict_free_tid_s_plus_1]; + found[tid] = found[tid_s]; + threshold_value[tid] = threshold_value[tid_s]; + } + } + __syncthreads(); + } +} + +__device__ void ReduceBestGainForLeaves(double* gain, int* leaves) { + const unsigned int tid = threadIdx.x; + for (unsigned int s = 1; s < NUM_THREADS_FIND_BEST_LEAF; s *= 2) { + if (tid % (2 * s) == 0 && (tid + s) < MAX_NUM_BIN_IN_FEATURE) { + const uint32_t tid_s = tid + s; + if ((leaves[tid] == -1 && leaves[tid_s] != -1) || (leaves[tid] != -1 && leaves[tid_s] != -1 && gain[tid_s] > gain[tid])) { + gain[tid] = gain[tid_s]; + leaves[tid] = leaves[tid_s]; + } + } + __syncthreads(); + } +} + __device__ double ThresholdL1(double s, double l1) { const double reg_s = fmax(0.0, fabs(s) - l1); if (s >= 0.0f) { @@ -66,7 +180,7 @@ __device__ double GetSplitGains(double sum_left_gradients, l1, use_l1, l2); } -__device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, +__device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr, const uint32_t feature_num_bin, const uint8_t feature_mfb_offset, const uint32_t feature_default_bin, const uint8_t feature_missing_type, const double lambda_l1, const double lambda_l2, const double parent_gain, const data_size_t min_data_in_leaf, @@ -87,180 +201,430 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, data_size_t* output_right_num_data, double* output_right_gain, double* output_right_output, - uint8_t* output_found) { - double best_sum_left_gradient = NAN; - double best_sum_left_hessian = NAN; - double best_gain = kMinScore; - data_size_t best_left_count = 0; - uint32_t best_threshold = feature_num_bin; + uint8_t* output_found, const int inner_feature_index) { + const double cnt_factor = num_data / sum_hessians; const bool use_l1 = lambda_l1 > 0.0f; const double min_gain_shift = parent_gain + min_gain_to_split; *output_found = 0; - if (reverse) { - double sum_right_gradient = 0.0f; - double sum_right_hessian = kEpsilon; - data_size_t right_count = 0; - - int t = feature_num_bin - 1 - feature_mfb_offset - na_as_missing; - const int t_end = 1 - feature_mfb_offset; - - // from right to left, and we don't need data in bin0 - for (; t >= t_end; --t) { - // need to skip default bin - if (skip_default_bin) { - 
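Editor's note: ThresholdL1 and GetSplitGains above implement the usual L1/L2-regularized gain: each leaf contributes ThresholdL1(G, l1)^2 / (H + l2), and a split is accepted only if the two children together beat parent_gain + min_gain_to_split (the "min_gain_shift"). A structure-only sketch; LightGBM's exact scaling constants live in the real GetSplitGains / CalculateSplittedLeafOutput implementations, and the names below are illustrative:

#include <cmath>

__device__ inline double ThresholdL1Sketch(double s, double l1) {
  const double reg = fmax(0.0, fabs(s) - l1);   // soft-threshold the gradient sum
  return s >= 0.0 ? reg : -reg;
}

// Per-leaf objective reduction: ThresholdL1(G, l1)^2 / (H + l2).
__device__ inline double LeafGainSketch(double G, double H, double l1, double l2) {
  const double g = ThresholdL1Sketch(G, l1);
  return (g * g) / (H + l2);
}

// A candidate split is kept only if this exceeds parent_gain + min_gain_to_split.
__device__ inline double SplitGainSketch(double G_l, double H_l, double G_r, double H_r,
                                         double l1, double l2) {
  return LeafGainSketch(G_l, H_l, l1, l2) + LeafGainSketch(G_r, H_r, l1, l2);
}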
if ((t + feature_mfb_offset) == static_cast(feature_default_bin)) { - continue; - } - } - const auto grad = GET_GRAD(feature_hist_ptr, t); - const auto hess = GET_HESS(feature_hist_ptr, t); - data_size_t cnt = - static_cast(__double2int_rn(hess * cnt_factor)); - sum_right_gradient += grad; - sum_right_hessian += hess; - right_count += cnt; - // if data not enough, or sum hessian too small - if (right_count < min_data_in_leaf || - sum_right_hessian < min_sum_hessian_in_leaf) { - continue; - } - data_size_t left_count = num_data - right_count; - // if data not enough - if (left_count < min_data_in_leaf) { - break; - } + __shared__ hist_t local_grad_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; + __shared__ hist_t local_hess_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; + __shared__ data_size_t local_cnt_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; + __shared__ double local_gain[MAX_NUM_BIN_IN_FEATURE]; + __shared__ uint8_t threshold_found[MAX_NUM_BIN_IN_FEATURE]; + __shared__ uint32_t threshold_value[MAX_NUM_BIN_IN_FEATURE]; - double sum_left_hessian = sum_hessians - sum_right_hessian; - // if sum hessian too small - if (sum_left_hessian < min_sum_hessian_in_leaf) { - break; + const unsigned int threadIdx_x = threadIdx.x; + const bool skip_sum = (skip_default_bin && (threadIdx_x + feature_mfb_offset) == static_cast(feature_default_bin)); + const uint32_t feature_num_bin_minus_offset = feature_num_bin - feature_mfb_offset; + const bool skip_split = (skip_default_bin && (feature_num_bin_minus_offset - 1 - threadIdx_x + feature_mfb_offset == static_cast(feature_default_bin))); + /*if (threadIdx_x == 0) { + printf("feature_num_bin_minus_offset = %d\n", feature_num_bin_minus_offset); + }*/ + const unsigned int bin_offset = threadIdx_x << 1; + /*hist_t default_bin_grad = 0.0f; + hist_t default_bin_hess = 0.0f; + if (feature_default_bin >= feature_mfb_offset) { + const uint32_t default_bin_pos = feature_default_bin - feature_mfb_offset; + default_bin_grad = feature_hist_ptr[default_bin_pos << 1]; + default_bin_hess = feature_hist_ptr[(default_bin_pos << 1) + 1]; + }*/ + const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(threadIdx_x); + if (!reverse) { + if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { + local_grad_hist[conflict_free_threadIdx_x] = feature_hist_ptr[bin_offset]; + const hist_t hess = feature_hist_ptr[bin_offset + 1]; + local_hess_hist[conflict_free_threadIdx_x] = hess; + local_cnt_hist[conflict_free_threadIdx_x] = static_cast(__double2int_rn(hess * cnt_factor)); + } else { + local_grad_hist[conflict_free_threadIdx_x] = 0.0f; + local_hess_hist[conflict_free_threadIdx_x] = 0.0f; + local_cnt_hist[conflict_free_threadIdx_x] = 0; + } + } else { + if (threadIdx_x < feature_num_bin_minus_offset) { + const unsigned int write_index = feature_num_bin_minus_offset - 1 - threadIdx_x; + const unsigned int conflict_free_write_index = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(write_index); + if (!skip_sum) { + local_grad_hist[conflict_free_write_index] = feature_hist_ptr[bin_offset]; + const hist_t hess = feature_hist_ptr[bin_offset + 1]; + local_hess_hist[conflict_free_write_index] = hess; + local_cnt_hist[conflict_free_write_index] = static_cast(__double2int_rn(hess * cnt_factor)); + } else { + //printf("unwrite gradient = %f, hessian = %f\n", 
feature_hist_ptr[bin_offset], feature_hist_ptr[bin_offset + 1]); + local_grad_hist[conflict_free_write_index] = 0.0f; + local_hess_hist[conflict_free_write_index] = 0.0f; + local_cnt_hist[conflict_free_write_index] = 0; } - - double sum_left_gradient = sum_gradients - sum_right_gradient; - - // current split gain - double current_gain = GetSplitGains( + } else { + local_grad_hist[conflict_free_threadIdx_x] = 0.0f; + local_hess_hist[conflict_free_threadIdx_x] = 0.0f; + local_cnt_hist[conflict_free_threadIdx_x] = 0; + } + } + __syncthreads(); + if (threadIdx_x == 0) { + local_hess_hist[conflict_free_threadIdx_x] += kEpsilon; + } + local_gain[threadIdx_x] = kMinScore; + __syncthreads(); + /*if (inner_feature_index == 6) { + if (threadIdx_x == 0) { + for (unsigned i = 0; i < MAX_NUM_BIN_IN_FEATURE; ++i) { + printf("local_grad_hist[%d] = %f\n", i, local_grad_hist[i]); + } + } + }*/ + //__syncthreads(); + PrefixSumHist(local_grad_hist, MAX_NUM_BIN_IN_FEATURE); + PrefixSumHist(local_hess_hist, MAX_NUM_BIN_IN_FEATURE); + PrefixSumHistCnt(local_cnt_hist, MAX_NUM_BIN_IN_FEATURE); + __syncthreads(); +/*if (threadIdx_x == 0) { + printf("inner_feature_index = %d, feature_mfb_offset = %d, local_grad_hist[%d] = %f, local_hess_hist[%d] = %f, local_cnt_hist[%d] = %d, total_sum_grad = %f, total_sum_hess = %f\n", + inner_feature_index, feature_mfb_offset, + MAX_NUM_BIN_IN_FEATURE, local_grad_hist[MAX_NUM_BIN_IN_FEATURE], + MAX_NUM_BIN_IN_FEATURE, local_hess_hist[MAX_NUM_BIN_IN_FEATURE], + MAX_NUM_BIN_IN_FEATURE, local_cnt_hist[MAX_NUM_BIN_IN_FEATURE], + local_grad_hist[MAX_NUM_BIN_IN_FEATURE] + default_bin_grad, + local_hess_hist[MAX_NUM_BIN_IN_FEATURE] + default_bin_hess); + }*/ + const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(threadIdx_x + 1); + if (reverse) { + if (threadIdx_x >= na_as_missing && threadIdx_x <= feature_num_bin - 2 && !skip_split) { + const double sum_right_gradient = local_grad_hist[conflict_free_threadIdx_x_plus_1]; + const double sum_right_hessian = local_hess_hist[conflict_free_threadIdx_x_plus_1]; + const data_size_t right_count = local_cnt_hist[conflict_free_threadIdx_x_plus_1]; + const double sum_left_gradient = sum_gradients - sum_right_gradient; + const double sum_left_hessian = sum_hessians - sum_right_hessian; + const data_size_t left_count = num_data - right_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { + double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, lambda_l1, use_l1, lambda_l2); - // gain with split is worse than without split - if (current_gain <= min_gain_shift) { - continue; - } - *output_found = 1; - // better split point - if (current_gain > best_gain) { - best_left_count = left_count; - best_sum_left_gradient = sum_left_gradient; - best_sum_left_hessian = sum_left_hessian; - // left is <= threshold, right is > threshold. 
so this is t-1 - best_threshold = static_cast(t - 1 + feature_mfb_offset); - best_gain = current_gain; + // gain with split is worse than without split + if (current_gain <= min_gain_shift) { + threshold_found[threadIdx_x] = 0; + } else { + local_gain[threadIdx_x] = current_gain - min_gain_shift; + threshold_value[threadIdx_x] = static_cast(feature_num_bin - 2 - threadIdx_x); + threshold_found[threadIdx_x] = 1; + } + } else { + threshold_found[threadIdx_x] = 0; } + } else { + threshold_found[threadIdx_x] = 0; } } else { - double sum_left_gradient = 0.0f; - double sum_left_hessian = kEpsilon; - data_size_t left_count = 0; - - int t = 0; - const int t_end = feature_num_bin - 2 - feature_mfb_offset; - if (na_as_missing) { - if (feature_mfb_offset == 1) { - sum_left_gradient = sum_gradients; - sum_left_hessian = sum_hessians - kEpsilon; - left_count = num_data; - for (int i = 0; i < feature_num_bin - feature_mfb_offset; ++i) { - const auto grad = GET_GRAD(feature_hist_ptr, i); - const auto hess = GET_HESS(feature_hist_ptr, i); - data_size_t cnt = - static_cast(__double2int_rn(hess * cnt_factor)); - sum_left_gradient -= grad; - sum_left_hessian -= hess; - left_count -= cnt; + //printf("error!!!!! non reverse\n"); + if (threadIdx_x <= feature_num_bin_minus_offset - 2 /* TODO(shiyu1994): skip default */) { + const double sum_left_gradient = local_grad_hist[conflict_free_threadIdx_x_plus_1]; + const double sum_left_hessian = local_hess_hist[conflict_free_threadIdx_x_plus_1]; + const data_size_t left_count = local_cnt_hist[conflict_free_threadIdx_x_plus_1]; + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = num_data - left_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, + lambda_l2); + // gain with split is worse than without split + if (current_gain <= min_gain_shift) { + threshold_found[threadIdx_x] = 0; + } else { + local_gain[threadIdx_x] = current_gain - min_gain_shift; + threshold_value[threadIdx_x] = static_cast(threadIdx_x + feature_mfb_offset); + threshold_found[threadIdx_x] = 1; } - t = -1; + } else { + threshold_found[threadIdx_x] = 0; } + } else { + threshold_found[threadIdx_x] = 0; + } + } + __syncthreads(); + ReduceBestGain(local_gain, local_grad_hist, local_hess_hist, local_cnt_hist, threshold_found, threshold_value); + const uint8_t found = threshold_found[0]; + if (found && threadIdx_x == 0) { + *output_found = 1; + *output_threshold = threshold_value[0]; + *output_gain = local_gain[0]; + *output_default_left = reverse; + if (reverse) { + const double sum_right_gradient = local_grad_hist[1]; + const double sum_right_hessian = local_hess_hist[1]; + const data_size_t right_count = local_cnt_hist[1]; + const double sum_left_gradient = sum_gradients - sum_right_gradient; + const double sum_left_hessian = sum_hessians - sum_right_hessian; + const data_size_t left_count = num_data - right_count; + /*if (threadIdx_x == 0) { + printf("sum_left_gradient = %f, sum_left_hessian = %f, left_count = %d, sum_right_gradient = %f, sum_right_hessian = %f, right_count = %d\n", + sum_left_gradient, sum_left_hessian, left_count, sum_right_gradient, sum_right_hessian, right_count); + }*/ + *output_left_sum_gradients = 
sum_left_gradient; + *output_left_sum_hessians = sum_left_hessian; + *output_left_num_data = left_count; + *output_right_sum_gradients = sum_right_gradient; + *output_right_sum_hessians = sum_right_hessian; + *output_right_num_data = right_count; + *output_left_output = CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2); + *output_left_gain = GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2, *output_left_output); + *output_right_output = CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2); + *output_right_gain = GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2, *output_right_output); + } else { + const double sum_left_gradient = local_grad_hist[1]; + const double sum_left_hessian = local_hess_hist[1]; + const data_size_t left_count = local_cnt_hist[1]; + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = num_data - left_count; + *output_left_sum_gradients = sum_left_gradient; + *output_left_sum_hessians = sum_left_hessian; + *output_left_num_data = left_count; + *output_right_sum_gradients = sum_right_gradient; + *output_right_sum_hessians = sum_right_hessian; + *output_right_num_data = right_count; + *output_left_output = CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2); + *output_left_gain = GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2, *output_left_output); + *output_right_output = CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2); + *output_right_gain = GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2, *output_right_output); } + } +} - for (; t <= t_end; ++t) { - if (skip_default_bin) { - if ((t + feature_mfb_offset) == static_cast(feature_default_bin)) { +__device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, + const uint32_t feature_num_bin, const uint8_t feature_mfb_offset, + const uint32_t feature_default_bin, const uint8_t feature_missing_type, + const double lambda_l1, const double lambda_l2, const double parent_gain, const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, const double min_gain_to_split, + const double sum_gradients, const double sum_hessians, const data_size_t num_data, + const bool reverse, const bool skip_default_bin, const bool na_as_missing, + // output parameters + uint32_t* output_threshold, + double* output_gain, + uint8_t* output_default_left, + double* output_left_sum_gradients, + double* output_left_sum_hessians, + data_size_t* output_left_num_data, + double* output_left_gain, + double* output_left_output, + double* output_right_sum_gradients, + double* output_right_sum_hessians, + data_size_t* output_right_num_data, + double* output_right_gain, + double* output_right_output, + uint8_t* output_found, const int inner_feature_index) { + double best_sum_left_gradient = NAN; + double best_sum_left_hessian = NAN; + double best_gain = kMinScore; + data_size_t best_left_count = 0; + uint32_t best_threshold = feature_num_bin; + const double cnt_factor = num_data / sum_hessians; + const bool use_l1 = lambda_l1 > 0.0f; + const double min_gain_shift = parent_gain + min_gain_to_split; + + *output_found = 0; + + __shared__ hist_t local_hist[2 
* MAX_NUM_BIN_IN_FEATURE]; + uint32_t feature_num_bin_minus_offset = feature_num_bin - feature_mfb_offset; + const unsigned int threadIdx_x = threadIdx.x; + if (threadIdx_x < feature_num_bin_minus_offset * 2) { + local_hist[threadIdx_x] = feature_hist_ptr[threadIdx_x]; + } + __syncthreads(); + /*if (inner_feature_index == 6) { + if (threadIdx_x == 0) { + for (unsigned i = 0; i < MAX_NUM_BIN_IN_FEATURE; ++i) { + printf("local_grad_hist[%d] = %f\n", i, local_hist[2 * i]); + } + } + }*/ + //__syncthreads(); + if (threadIdx_x == 0) { + if (reverse) { + double sum_right_gradient = 0.0f; + double sum_right_hessian = kEpsilon; + data_size_t right_count = 0; + + int t = feature_num_bin - 1 - feature_mfb_offset - na_as_missing; + const int t_end = 1 - feature_mfb_offset; + + // from right to left, and we don't need data in bin0 + for (; t >= t_end; --t) { + // need to skip default bin + if (skip_default_bin) { + if ((t + feature_mfb_offset) == static_cast<int>(feature_default_bin)) { + continue; + } + } + const auto grad = GET_GRAD(local_hist, t); + const auto hess = GET_HESS(local_hist, t); + data_size_t cnt = + static_cast<data_size_t>(__double2int_rn(hess * cnt_factor)); + sum_right_gradient += grad; + sum_right_hessian += hess; + right_count += cnt; + // if data not enough, or sum hessian too small + if (right_count < min_data_in_leaf || + sum_right_hessian < min_sum_hessian_in_leaf) { continue; } + data_size_t left_count = num_data - right_count; + // if data not enough + if (left_count < min_data_in_leaf) { + break; + } + + double sum_left_hessian = sum_hessians - sum_right_hessian; + // if sum hessian too small + if (sum_left_hessian < min_sum_hessian_in_leaf) { + break; + } + + double sum_left_gradient = sum_gradients - sum_right_gradient; + /*if (inner_feature_index == 11) { + if (static_cast<uint32_t>(t - 1 + feature_mfb_offset) == 252) { + printf("*************** feature_index 11, threshold = 252, sum_left_gradient = %f, sum_left_hessian = %f, sum_right_gradient = %f, sum_right_hessian = %f\n", + sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian); + } + }*/ + // current split gain + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, + lambda_l2); + // gain with split is worse than without split + if (current_gain <= min_gain_shift) { + continue; + } + *output_found = 1; + // better split point + if (current_gain > best_gain) { + best_left_count = left_count; + best_sum_left_gradient = sum_left_gradient; + best_sum_left_hessian = sum_left_hessian; + // left is <= threshold, right is > threshold. 
so this is t-1 + best_threshold = static_cast<uint32_t>(t - 1 + feature_mfb_offset); + best_gain = current_gain; + } } - if (t >= 0) { - sum_left_gradient += GET_GRAD(feature_hist_ptr, t); - const hist_t hess = GET_HESS(feature_hist_ptr, t); - sum_left_hessian += hess; - left_count += static_cast<data_size_t>( - __double2int_rn(hess * cnt_factor)); - } - // if data not enough, or sum hessian too small - if (left_count < min_data_in_leaf || - sum_left_hessian < min_sum_hessian_in_leaf) { - continue; - } - data_size_t right_count = num_data - left_count; - // if data not enough - if (right_count < min_data_in_leaf) { - break; - } + } else { + double sum_left_gradient = 0.0f; + double sum_left_hessian = kEpsilon; + data_size_t left_count = 0; - double sum_right_hessian = sum_hessians - sum_left_hessian; - // if sum hessian too small - if (sum_right_hessian < min_sum_hessian_in_leaf) { - break; + int t = 0; + const int t_end = feature_num_bin - 2 - feature_mfb_offset; + if (na_as_missing) { + if (feature_mfb_offset == 1) { + sum_left_gradient = sum_gradients; + sum_left_hessian = sum_hessians - kEpsilon; + left_count = num_data; + for (int i = 0; i < feature_num_bin - feature_mfb_offset; ++i) { + const auto grad = GET_GRAD(local_hist, i); + const auto hess = GET_HESS(local_hist, i); + data_size_t cnt = + static_cast<data_size_t>(__double2int_rn(hess * cnt_factor)); + sum_left_gradient -= grad; + sum_left_hessian -= hess; + left_count -= cnt; + } + t = -1; + } } - double sum_right_gradient = sum_gradients - sum_left_gradient; + for (; t <= t_end; ++t) { + if (skip_default_bin) { + if ((t + feature_mfb_offset) == static_cast<int>(feature_default_bin)) { + continue; + } + } + if (t >= 0) { + sum_left_gradient += GET_GRAD(local_hist, t); + const hist_t hess = GET_HESS(local_hist, t); + sum_left_hessian += hess; + left_count += static_cast<data_size_t>( + __double2int_rn(hess * cnt_factor)); + } + // if data not enough, or sum hessian too small + if (left_count < min_data_in_leaf || + sum_left_hessian < min_sum_hessian_in_leaf) { + continue; + } + data_size_t right_count = num_data - left_count; + // if data not enough + if (right_count < min_data_in_leaf) { + break; + } + + double sum_right_hessian = sum_hessians - sum_left_hessian; + // if sum hessian too small + if (sum_right_hessian < min_sum_hessian_in_leaf) { + break; + } - // current split gain - double current_gain = GetSplitGains( - sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, - lambda_l2); - // gain with split is worse than without split - if (current_gain <= min_gain_shift) { - continue; - } - *output_found = 1; - // better split point - if (current_gain > best_gain) { - best_left_count = left_count; - best_sum_left_gradient = sum_left_gradient; - best_sum_left_hessian = sum_left_hessian; - best_threshold = static_cast<uint32_t>(t + feature_mfb_offset); - best_gain = current_gain; + double sum_right_gradient = sum_gradients - sum_left_gradient; + + // current split gain + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, + lambda_l2); + // gain with split is worse than without split + if (current_gain <= min_gain_shift) { + continue; + } + *output_found = 1; + // better split point + if (current_gain > best_gain) { + best_left_count = left_count; + best_sum_left_gradient = sum_left_gradient; + best_sum_left_hessian = sum_left_hessian; + best_threshold = static_cast<uint32_t>(t + feature_mfb_offset); + best_gain = current_gain; + } } } - } - if (*output_found) { - 
*output_threshold = best_threshold; - *output_gain = best_gain - min_gain_shift; - *output_default_left = reverse; - *output_left_sum_gradients = best_sum_left_gradient; - *output_left_sum_hessians = best_sum_left_hessian; - *output_left_num_data = best_left_count; - - const double best_sum_right_gradient = sum_gradients - best_sum_left_gradient; - const double best_sum_right_hessian = sum_hessians - best_sum_left_hessian; - *output_right_sum_gradients = best_sum_right_gradient; - *output_right_sum_hessians = best_sum_right_hessian; - *output_right_num_data = num_data - best_left_count; - - *output_left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, - best_sum_left_hessian, lambda_l1, use_l1, lambda_l2); - *output_left_gain = GetLeafGainGivenOutput(best_sum_left_gradient, - best_sum_left_hessian, lambda_l1, use_l1, lambda_l2, *output_left_output); - *output_right_output = CalculateSplittedLeafOutput(best_sum_right_gradient, - best_sum_right_hessian, lambda_l1, use_l1, lambda_l2); - *output_right_gain = GetLeafGainGivenOutput(best_sum_right_gradient, - best_sum_right_hessian, lambda_l1, use_l1, lambda_l2, *output_right_output); + if (*output_found) { + *output_threshold = best_threshold; + *output_gain = best_gain - min_gain_shift; + *output_default_left = reverse; + *output_left_sum_gradients = best_sum_left_gradient; + *output_left_sum_hessians = best_sum_left_hessian; + *output_left_num_data = best_left_count; + + const double best_sum_right_gradient = sum_gradients - best_sum_left_gradient; + const double best_sum_right_hessian = sum_hessians - best_sum_left_hessian; + *output_right_sum_gradients = best_sum_right_gradient; + *output_right_sum_hessians = best_sum_right_hessian; + *output_right_num_data = num_data - best_left_count; + + *output_left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, + best_sum_left_hessian, lambda_l1, use_l1, lambda_l2); + *output_left_gain = GetLeafGainGivenOutput(best_sum_left_gradient, + best_sum_left_hessian, lambda_l1, use_l1, lambda_l2, *output_left_output); + *output_right_output = CalculateSplittedLeafOutput(best_sum_right_gradient, + best_sum_right_hessian, lambda_l1, use_l1, lambda_l2); + *output_right_gain = GetLeafGainGivenOutput(best_sum_right_gradient, + best_sum_right_hessian, lambda_l1, use_l1, lambda_l2, *output_right_output); + } } } @@ -280,11 +644,11 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* double* cuda_best_split_right_sum_gradient, double* cuda_best_split_right_sum_hessian, data_size_t* cuda_best_split_right_count, double* cuda_best_split_right_gain, double* cuda_best_split_right_output, uint8_t* cuda_best_split_found) { - const unsigned int num_features = gridDim.x / 4; - const unsigned int inner_feature_index = (blockIdx.x / 2) % num_features; + const unsigned int num_features = gridDim.x / 2; + const unsigned int inner_feature_index = (blockIdx.x /*/ 2*/) % num_features; const unsigned int global_block_idx = blockIdx.x; - const bool reverse = blockIdx.x % 2 == 0 ? true : false; - const bool smaller_or_larger = static_cast(blockIdx.x / (2 * num_features) == 0); + const bool reverse = true;//blockIdx.x % 2 == 0 ? true : false; + const bool smaller_or_larger = static_cast(blockIdx.x / (/*2 **/ num_features) == 0); const int num_bin = feature_hist_offsets[inner_feature_index + 1] - feature_hist_offsets[inner_feature_index]; const uint8_t missing_type = feature_missing_types[inner_feature_index]; const int leaf_index = smaller_or_larger ? 
*smaller_leaf_id : *larger_leaf_id; @@ -293,7 +657,7 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* printf("parent_gain = %f\n", parent_gain); }*/ const double sum_gradients = smaller_or_larger ? *sum_gradients_in_smaller_leaf : *sum_gradients_in_larger_leaf; - const double sum_hessians = smaller_or_larger ? *sum_hessians_in_smaller_leaf : *sum_hessians_in_larger_leaf; + const double sum_hessians = (smaller_or_larger ? *sum_hessians_in_smaller_leaf : *sum_hessians_in_larger_leaf) + 2 * kEpsilon; const double num_data_in_leaf = smaller_or_larger ? *num_data_in_smaller_leaf : *num_data_in_larger_leaf; uint32_t* out_threshold = cuda_best_split_threshold + global_block_idx; double* out_left_sum_gradients = cuda_best_split_left_sum_gradient + global_block_idx; @@ -318,31 +682,31 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* *larger_leaf_hist + feature_hist_offsets[inner_feature_index] * 2;// cuda_hist + (cuda_num_total_bin_ref * leaf_index + feature_hist_offsets[inner_feature_index]) * 2; if (num_bin > 2 && missing_type != 0) { if (missing_type == 1) { - FindBestSplitsForLeafKernelInner(hist_ptr, + FindBestSplitsForLeafKernelInner2(hist_ptr, num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, parent_gain, *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, num_data_in_leaf, reverse, true, false, out_threshold, out_gain, out_default_left, out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_gain, out_left_output, - out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found); + out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found, inner_feature_index); } else { - FindBestSplitsForLeafKernelInner(hist_ptr, + FindBestSplitsForLeafKernelInner2(hist_ptr, num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, parent_gain, *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, num_data_in_leaf, reverse, false, true, out_threshold, out_gain, out_default_left, out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_gain, out_left_output, - out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found); + out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found, inner_feature_index); } } else { if (reverse) { - FindBestSplitsForLeafKernelInner(hist_ptr, + FindBestSplitsForLeafKernelInner2(hist_ptr, num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, parent_gain, *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, num_data_in_leaf, reverse, true, false, out_threshold, out_gain, out_default_left, out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_gain, out_left_output, - out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found); + out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found, 
inner_feature_index); } if (missing_type == 2) { *out_default_left = 0; @@ -356,8 +720,8 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel(const int* smaller_l const double* sum_gradients_in_larger_leaf, const double* sum_hessians_in_larger_leaf, const data_size_t* num_data_in_larger_leaf, hist_t** larger_leaf_hist) { // * 2 for smaller and larger leaves, * 2 for split direction - const int num_blocks = num_features_ * 4; - FindBestSplitsForLeafKernel<<>>(cuda_hist_, + const int num_blocks = num_features_ * 2; + FindBestSplitsForLeafKernel<<>>(cuda_hist_, cuda_num_total_bin_, cuda_feature_hist_offsets_, cuda_feature_mfb_offsets_, cuda_feature_default_bins_, cuda_feature_missing_type_, cuda_lambda_l1_, cuda_lambda_l2_, @@ -398,7 +762,8 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const const data_size_t* cuda_best_split_right_count, const double* cuda_best_split_right_gain, const double* cuda_best_split_right_output, - const uint8_t* cuda_best_split_found) { + const uint8_t* cuda_best_split_found, + const uint32_t* cuda_feature_default_bins) { if (blockIdx.x == 0 && threadIdx.x == 0) { const int num_features_ref = *cuda_num_features; const int smaller_leaf_index_ref = *smaller_leaf_index; @@ -436,9 +801,19 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const smaller_leaf_best_gain = kMinScore; larger_leaf_best_gain = kMinScore; - int larger_leaf_offset = 2 * num_features_ref; + int larger_leaf_offset = /*2 * */num_features_ref; + /*if (larger_leaf_index_ref == -1) { + for (int feature_index = 0; feature_index < num_features_ref; ++feature_index) { + const int smaller_reverse_index = 2 * feature_index; + const uint8_t smaller_reverse_found = cuda_best_split_found[smaller_reverse_index]; + const double gain = cuda_best_split_gain[smaller_reverse_index]; + const uint32_t threshold = cuda_best_split_threshold[smaller_reverse_index]; + printf("feature_index = %d, threshold = %d, gain = %f, found = %d, default_bin = %d\n", feature_index, threshold, gain, smaller_reverse_found, + cuda_feature_default_bins[feature_index]); + } + }*/ for (int feature_index = 0; feature_index < num_features_ref; ++feature_index) { - const int smaller_reverse_index = 2 * feature_index; + const int smaller_reverse_index = /*2 * */feature_index; const uint8_t smaller_reverse_found = cuda_best_split_found[smaller_reverse_index]; if (smaller_reverse_found) { const double smaller_reverse_gain = cuda_best_split_gain[smaller_reverse_index]; @@ -469,7 +844,7 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const smaller_leaf_index_ref, smaller_leaf_best_gain, smaller_leaf_best_split_right_sum_gradient, smaller_leaf_best_split_right_sum_hessian);*/ } } - const int smaller_non_reverse_index = 2 * feature_index + 1; + /*const int smaller_non_reverse_index = 2 * feature_index + 1; const uint8_t smaller_non_reverse_found = cuda_best_split_found[smaller_non_reverse_index]; if (smaller_non_reverse_found) { const double smaller_non_reverse_gain = cuda_best_split_gain[smaller_non_reverse_index]; @@ -493,16 +868,16 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const smaller_leaf_best_split_right_count = cuda_best_split_right_count[smaller_non_reverse_index]; smaller_leaf_best_split_right_gain = cuda_best_split_right_gain[smaller_non_reverse_index]; //printf("leaf index %d split right gain update to %f\n", smaller_leaf_index_ref, smaller_leaf_best_split_right_gain); - smaller_leaf_best_split_right_output 
= cuda_best_split_right_output[smaller_non_reverse_index]; + smaller_leaf_best_split_right_output = cuda_best_split_right_output[smaller_non_reverse_index];*/ /*printf("smaller_leaf_index = %d, smaller_leaf_best_gain = %f, smaller_leaf_best_split_left_sum_gradient = %f, smaller_leaf_best_split_left_sum_hessian = %f\n", smaller_leaf_index_ref, smaller_leaf_best_gain, smaller_leaf_best_split_left_sum_gradient, smaller_leaf_best_split_left_sum_hessian); printf("smaller_leaf_index = %d, smaller_leaf_best_gain = %f, smaller_leaf_best_split_right_sum_gradient = %f, smaller_leaf_best_split_right_sum_hessian = %f\n", smaller_leaf_index_ref, smaller_leaf_best_gain, smaller_leaf_best_split_right_sum_gradient, smaller_leaf_best_split_right_sum_hessian);*/ - } - } + //} + //} if (larger_leaf_index_ref >= 0) { - const int larger_reverse_index = 2 * feature_index + larger_leaf_offset; + const int larger_reverse_index = /*2 **/ feature_index + larger_leaf_offset; const uint8_t larger_reverse_found = cuda_best_split_found[larger_reverse_index]; if (larger_reverse_found) { const double larger_reverse_gain = cuda_best_split_gain[larger_reverse_index]; @@ -530,7 +905,7 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const larger_leaf_index_ref, larger_leaf_best_gain, larger_leaf_best_split_right_sum_gradient, larger_leaf_best_split_right_sum_hessian);*/ } } - const int larger_non_reverse_index = 2 * feature_index + 1 + larger_leaf_offset; + /*const int larger_non_reverse_index = 2 * feature_index + 1 + larger_leaf_offset; const uint8_t larger_non_reverse_found = cuda_best_split_found[larger_non_reverse_index]; if (larger_non_reverse_found) { const double larger_non_reverse_gain = cuda_best_split_gain[larger_non_reverse_index]; @@ -551,13 +926,13 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const larger_leaf_best_split_right_count = cuda_best_split_right_count[larger_non_reverse_index]; larger_leaf_best_split_right_gain = cuda_best_split_right_gain[larger_non_reverse_index]; //printf("leaf index %d split right gain update to %f\n", larger_leaf_index_ref, larger_leaf_best_split_right_gain); - larger_leaf_best_split_right_output = cuda_best_split_right_output[larger_non_reverse_index]; + larger_leaf_best_split_right_output = cuda_best_split_right_output[larger_non_reverse_index];*/ /*printf("larger_leaf_index = %d, larger_leaf_best_gain = %f, larger_leaf_best_split_left_sum_gradient = %f, larger_leaf_best_split_left_sum_hessian = %f\n", larger_leaf_index_ref, larger_leaf_best_gain, larger_leaf_best_split_left_sum_gradient, larger_leaf_best_split_left_sum_hessian); printf("larger_leaf_index = %d, larger_leaf_best_gain = %f, larger_leaf_best_split_right_sum_gradient = %f, larger_leaf_best_split_right_sum_hessian = %f\n", larger_leaf_index_ref, larger_leaf_best_gain, larger_leaf_best_split_right_sum_gradient, larger_leaf_best_split_right_sum_hessian);*/ - } - } + //} + //} } } } @@ -587,29 +962,76 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel(const int* smaller_le cuda_best_split_right_count_, cuda_best_split_right_gain_, cuda_best_split_right_output_, - cuda_best_split_found_); + cuda_best_split_found_, + cuda_feature_default_bins_); } __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, - const double* cuda_leaf_best_split_gain, int* out_best_leaf) { + const double* cuda_leaf_best_split_gain, int* out_best_leaf, + const int* cuda_leaf_best_split_feature, const uint32_t* cuda_leaf_best_split_threshold, + const 
uint32_t* cuda_feature_default_bins, + const double* cuda_leaf_best_split_left_sum_gradient, + const double* cuda_leaf_best_split_left_sum_hessian, + const double* cuda_leaf_best_split_right_sum_gradient, + const double* cuda_leaf_best_split_right_sum_hessian, + const data_size_t* cuda_leaf_best_split_left_count, + const data_size_t* cuda_leaf_best_split_right_count) { const int cuda_cur_num_leaves_ref = *cuda_cur_num_leaves; double best_gain = kMinScore; - for (int leaf_index = 0; leaf_index < cuda_cur_num_leaves_ref; ++leaf_index) { + __shared__ double thread_best_gain[NUM_THREADS_FIND_BEST_LEAF]; + __shared__ int thread_best_leaf[NUM_THREADS_FIND_BEST_LEAF]; + const unsigned int threadIdx_x = threadIdx.x; + thread_best_gain[threadIdx_x] = kMinScore; + thread_best_leaf[threadIdx_x] = -1; + const int num_leaves_per_thread = (cuda_cur_num_leaves_ref + NUM_THREADS_FIND_BEST_LEAF - 1) / NUM_THREADS_FIND_BEST_LEAF; + const int start = num_leaves_per_thread * threadIdx_x; + const int end = min(start + num_leaves_per_thread, cuda_cur_num_leaves_ref); + for (int leaf_index = start; leaf_index < end; ++leaf_index) { + const double leaf_best_gain = cuda_leaf_best_split_gain[leaf_index]; + if (leaf_best_gain > thread_best_gain[threadIdx_x]) { + thread_best_gain[threadIdx_x] = leaf_best_gain; + thread_best_leaf[threadIdx_x] = leaf_index; + } + } + __syncthreads(); + ReduceBestGainForLeaves(thread_best_gain, thread_best_leaf); + if (threadIdx_x == 0) { + *out_best_leaf = thread_best_leaf[0]; + best_gain = thread_best_gain[0]; + /*if (best_gain <= 0.0f) { + printf("error !!! too small best gain %f\n", best_gain); + }*/ + } + /*for (int leaf_index = 0; leaf_index < cuda_cur_num_leaves_ref; ++leaf_index) { const double leaf_best_gain = cuda_leaf_best_split_gain[leaf_index]; //printf("cuda_leaf_best_split_gain[%d] = %f\n", leaf_index, leaf_best_gain); if (leaf_best_gain > best_gain) { best_gain = leaf_best_gain; *out_best_leaf = leaf_index; } - } - if (best_gain <= 0.0f) { - printf("error !!! 
too smaller best gain %f\n", best_gain); - } + }*/ //printf("find best cuda_leaf_best_split_gain[%d] = %f\n", *out_best_leaf, best_gain); + //printf("split feature = %d, threshold = %d, default_bin = %d\n", + // cuda_leaf_best_split_feature[*out_best_leaf], cuda_leaf_best_split_threshold[*out_best_leaf], cuda_feature_default_bins[*out_best_leaf]); + /*printf("left_sum_gradient = %f, left_sum_hessian = %f, left_count = %d\n", + cuda_leaf_best_split_left_sum_gradient[*out_best_leaf], + cuda_leaf_best_split_left_sum_hessian[*out_best_leaf], + cuda_leaf_best_split_left_count[*out_best_leaf]);*/ + /*printf("right_sum_gradient = %f, right_sum_hessian = %f, right_count = %d\n", + cuda_leaf_best_split_right_sum_gradient[*out_best_leaf], + cuda_leaf_best_split_right_sum_hessian[*out_best_leaf], + cuda_leaf_best_split_right_count[*out_best_leaf]);*/ } void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves) { - FindBestFromAllSplitsKernel<<<1, 1>>>(cuda_cur_num_leaves, cuda_leaf_best_split_gain_, cuda_best_leaf_); + FindBestFromAllSplitsKernel<<<1, NUM_THREADS_FIND_BEST_LEAF>>>(cuda_cur_num_leaves, cuda_leaf_best_split_gain_, cuda_best_leaf_, + cuda_leaf_best_split_feature_, cuda_leaf_best_split_threshold_, cuda_feature_default_bins_, + cuda_leaf_best_split_left_sum_gradient_, + cuda_leaf_best_split_left_sum_hessian_, + cuda_leaf_best_split_right_sum_gradient_, + cuda_leaf_best_split_right_sum_hessian_, + cuda_leaf_best_split_left_count_, + cuda_leaf_best_split_right_count_); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 94591a6da17c..89f96936729a 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -17,7 +17,10 @@ #include -#define MAX_NUM_BIN_IN_FEATURE (256) +#define MAX_NUM_BIN_IN_FEATURE (512) +#define NUM_THREADS_FIND_BEST_LEAF (256) +#define LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER (4) +#define NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER (16) namespace LightGBM { diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 8ad843f74dd8..7cacf5a3eb6d 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -337,6 +337,7 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b } if (blockIdx.x == 0 && threadIdx.x == 0) { const int cur_max_leaf_index = (*cuda_cur_num_leaves); + //printf("left_leaf_index = %d, right_leaf_index = %d\n", leaf_index_ref, cur_max_leaf_index); block_to_left_offset_buffer[0] = 0; const unsigned int to_left_total_cnt = block_to_left_offset_buffer[num_blocks]; block_to_right_offset_buffer[0] = to_left_total_cnt; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 58effa0b9693..2b14b89c4c7a 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -333,15 +333,19 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, hessians_ = hessians; const auto start = std::chrono::steady_clock::now(); auto before_train_start = std::chrono::steady_clock::now(); + global_timer.Start("NewCUDATreeLearner::BeforeTrain"); BeforeTrain(); + global_timer.Stop("NewCUDATreeLearner::BeforeTrain"); auto before_train_end = std::chrono::steady_clock::now(); double construct_histogram_time = 0.0f; double find_best_split_time = 0.0f; + double 
find_best_split_from_all_leaves_time = 0.0f; double split_data_indices_time = 0.0f; double split_tree_time = 0.0f; //std::unique_ptr tree(new Tree(config_->num_leaves, false, false)); for (int i = 0; i < config_->num_leaves - 1; ++i) { //Log::Warning("Before ConstructHistogramForLeaf"); + global_timer.Start("NewCUDATreeLearner::ConstructHistogramForLeaf"); auto start = std::chrono::steady_clock::now(); cuda_histogram_constructor_->ConstructHistogramForLeaf( cuda_smaller_leaf_splits_->cuda_leaf_index(), @@ -360,22 +364,30 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, }*/ auto end = std::chrono::steady_clock::now(); auto duration = static_cast>(end - start); + global_timer.Stop("NewCUDATreeLearner::ConstructHistogramForLeaf"); construct_histogram_time += duration.count(); //Log::Warning("Before FindBestSplitsForLeaf"); + global_timer.Start("NewCUDATreeLearner::FindBestSplitsForLeaf"); start = std::chrono::steady_clock::now(); cuda_best_split_finder_->FindBestSplitsForLeaf(cuda_smaller_leaf_splits_.get(), cuda_larger_leaf_splits_.get()); //Log::Warning("Before FindBestFromAllSplits"); - cuda_best_split_finder_->FindBestFromAllSplits(cuda_data_partition_->cuda_cur_num_leaves()); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); + global_timer.Stop("NewCUDATreeLearner::FindBestSplitsForLeaf"); find_best_split_time += duration.count(); + start = std::chrono::steady_clock::now(); + cuda_best_split_finder_->FindBestFromAllSplits(cuda_data_partition_->cuda_cur_num_leaves()); + end = std::chrono::steady_clock::now(); + duration = static_cast>(end - start); + find_best_split_from_all_leaves_time += duration.count(); //Log::Warning("Before Split"); //start = std::chrono::steady_clock::now(); //SplitTree(tree.get()); //end = std::chrono::steady_clock::now(); //duration = static_cast>(end - start); //split_tree_time += duration.count(); + global_timer.Start("NewCUDATreeLearner::Split"); start = std::chrono::steady_clock::now(); cuda_data_partition_->Split(cuda_best_split_finder_->cuda_best_leaf(), cuda_best_split_finder_->cuda_leaf_best_split_gain(), @@ -412,6 +424,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, cuda_larger_leaf_splits_->cuda_hist_in_leaf_pointer_pointer()); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); + global_timer.Stop("NewCUDATreeLearner::Split"); split_data_indices_time += duration.count(); } const auto end = std::chrono::steady_clock::now(); @@ -425,10 +438,10 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, Log::Warning("before train time %f", static_cast>(before_train_end - before_train_start).count()); Log::Warning("construct histogram time %f", construct_histogram_time); Log::Warning("find best split time %f", find_best_split_time); + Log::Warning("find best split time from all leaves %f", find_best_split_from_all_leaves_time); Log::Warning("split data indices time %f", split_data_indices_time); //Log::Warning("split tree time %f", split_tree_time); Log::Warning("build tree time %f", build_tre_duration); - global_timer.Print(); /*cuda_data_partition_->Test(); cuda_histogram_constructor_->ConstructHistogramForLeaf( cuda_smaller_leaf_splits_->cuda_leaf_index(), From 60c7e4efa603749e2d0f130f19cab8022c1e155c Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 14 May 2021 07:12:13 +0000 Subject: [PATCH 011/166] data split --- src/treelearner/cuda/cuda_data_partition.cpp | 66 +- src/treelearner/cuda/cuda_data_partition.cu | 680 +++++++++++++++--- 
src/treelearner/cuda/cuda_data_partition.hpp | 29 +- .../cuda/cuda_histogram_constructor.cu | 19 - src/treelearner/cuda/new_cuda_utils.hpp | 16 + 5 files changed, 670 insertions(+), 140 deletions(-) diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 0b8b03983606..6c6fcb051e08 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -98,6 +98,7 @@ void CUDADataPartition::Init() { InitCUDAMemoryFromHostMemory(&cuda_feature_missing_is_na_, feature_missing_is_na_.data(), static_cast(num_features_)); InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_zero_, feature_mfb_is_zero_.data(), static_cast(num_features_)); InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_na_, feature_mfb_is_na_.data(), static_cast(num_features_)); + AllocateCUDAMemory(5, &cuda_split_info_buffer_); AllocateCUDAMemory(static_cast(num_leaves_), &tree_split_leaf_index_); AllocateCUDAMemory(static_cast(num_leaves_), &tree_inner_feature_index_); @@ -115,7 +116,16 @@ void CUDADataPartition::Init() { AllocateCUDAMemory(static_cast(num_data_), &train_data_score_tmp_); + AllocateCUDAMemory(static_cast(num_data_) * static_cast(num_features_), &cuda_data_col_wise_); + + CopyColWiseData(); + cpu_train_data_score_tmp_.resize(num_data_, 0.0f); + cpu_split_info_buffer_.resize(5, 0); +} + +void CUDADataPartition::CopyColWiseData() { + LaunchCopyColWiseDataKernel(); } void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { @@ -160,33 +170,33 @@ void CUDADataPartition::Split(const int* leaf_id, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, hist_t** larger_leaf_cuda_hist_pointer_pointer) { - int leaf_index_cpu = 0; + data_size_t cpu_num_data_in_leaf = 0; + int cpu_split_feature_index = 0; + uint32_t cpu_split_threshold = 0; + uint8_t cpu_split_default_left = 0; + data_size_t cpu_leaf_data_start = 0; global_timer.Start("GenDataToLeftBitVector"); - CopyFromCUDADeviceToHost(&leaf_index_cpu, leaf_id, 1); - const data_size_t num_data_in_leaf = num_data_in_leaf_[leaf_index_cpu]; + global_timer.Start("SplitInner Copy CUDA To Host"); + PrepareCUDASplitInforBuffer(leaf_id, best_split_feature, best_split_threshold, best_split_default_left); + CopyFromCUDADeviceToHost(cpu_split_info_buffer_.data(), cuda_split_info_buffer_, 5); + cpu_num_data_in_leaf = static_cast(cpu_split_info_buffer_[0]); + cpu_split_feature_index = static_cast(cpu_split_info_buffer_[1]); + cpu_split_threshold = static_cast(cpu_split_info_buffer_[2]); + cpu_split_default_left = static_cast(cpu_split_info_buffer_[3]); + cpu_leaf_data_start = static_cast(cpu_split_info_buffer_[4]); + global_timer.Stop("SplitInner Copy CUDA To Host"); auto start = std::chrono::steady_clock::now(); - GenDataToLeftBitVector(leaf_id, num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left); + //GenDataToLeftBitVector(leaf_id, cpu_num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left); + GenDataToLeftBitVector2(cpu_num_data_in_leaf, cpu_split_feature_index, cpu_split_threshold, cpu_split_default_left, cpu_leaf_data_start); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); global_timer.Stop("GenDataToLeftBitVector"); //Log::Warning("CUDADataPartition::GenDataToLeftBitVector time %f", duration); global_timer.Start("SplitInner"); - 
CopyFromCUDADeviceToCUDADevice(tree_split_leaf_index_ + cur_num_leaves_ - 1, leaf_id, 1); - CopyFromCUDADeviceToCUDADevice(tree_inner_feature_index_ + cur_num_leaves_ - 1, best_split_feature + leaf_index_cpu, 1); - CopyFromCUDADeviceToCUDADevice(tree_threshold_ + cur_num_leaves_ - 1, best_split_threshold + leaf_index_cpu, 1); - CopyFromCUDADeviceToCUDADevice(tree_left_output_ + cur_num_leaves_ - 1, best_left_leaf_value + leaf_index_cpu, 1); - CopyFromCUDADeviceToCUDADevice(tree_right_output_ + cur_num_leaves_ - 1, best_right_leaf_value + leaf_index_cpu, 1); - CopyFromCUDADeviceToCUDADevice(tree_left_count_ + cur_num_leaves_ - 1, best_left_count + leaf_index_cpu, 1); - CopyFromCUDADeviceToCUDADevice(tree_right_count_ + cur_num_leaves_ - 1, best_right_count + leaf_index_cpu, 1); - CopyFromCUDADeviceToCUDADevice(tree_left_sum_hessian_ + cur_num_leaves_ - 1, best_left_sum_hessians + leaf_index_cpu, 1); - CopyFromCUDADeviceToCUDADevice(tree_right_sum_hessian_ + cur_num_leaves_ - 1, best_right_sum_hessians + leaf_index_cpu, 1); - CopyFromCUDADeviceToCUDADevice(tree_gain_ + cur_num_leaves_ - 1, best_split_gain + leaf_index_cpu, 1); - CopyFromCUDADeviceToCUDADevice(tree_default_left_ + cur_num_leaves_ - 1, best_split_default_left + leaf_index_cpu, 1); - CopyFromCUDADeviceToCUDADevice(data_partition_leaf_output_ + leaf_index_cpu, best_left_leaf_value + leaf_index_cpu, 1); - CopyFromCUDADeviceToCUDADevice(data_partition_leaf_output_ + cur_num_leaves_, best_right_leaf_value + leaf_index_cpu, 1); start = std::chrono::steady_clock::now(); - SplitInner(leaf_id, num_data_in_leaf, + SplitInner(leaf_id, cpu_num_data_in_leaf, + best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, best_left_sum_gradients, best_left_sum_hessians, best_left_count, best_left_gain, best_left_leaf_value, best_right_sum_gradients, best_right_sum_hessians, best_right_count, @@ -200,7 +210,7 @@ void CUDADataPartition::Split(const int* leaf_id, larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer, leaf_index_cpu); + larger_leaf_cuda_hist_pointer_pointer); end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); global_timer.Stop("SplitInner"); @@ -215,7 +225,15 @@ void CUDADataPartition::GenDataToLeftBitVector(const int* leaf_id, LaunchGenDataToLeftBitVectorKernel(leaf_id, num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left); } +void CUDADataPartition::GenDataToLeftBitVector2(const data_size_t num_data_in_leaf, + const int split_feature_index, const uint32_t split_threshold, + const uint8_t split_default_left, const data_size_t leaf_data_start) { + LaunchGenDataToLeftBitVectorKernel2(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start); +} + void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_data_in_leaf, + const int* best_split_feature, const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left, const double* best_split_gain, const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, @@ -230,8 +248,9 @@ void 
CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, const int cpu_leaf_index) { + hist_t** larger_leaf_cuda_hist_pointer_pointer) { LaunchSplitInnerKernel(leaf_index, num_data_in_leaf, + best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, best_left_sum_gradients, best_left_sum_hessians, best_left_count, best_left_gain, best_left_leaf_value, best_right_sum_gradients, best_right_sum_hessians, best_right_count, @@ -245,7 +264,7 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_index); + larger_leaf_cuda_hist_pointer_pointer); ++cur_num_leaves_; } @@ -259,6 +278,11 @@ void CUDADataPartition::UpdateTrainScore(const double learning_rate, double* tra } } +void CUDADataPartition::PrepareCUDASplitInforBuffer(const int* leaf_id, const int* best_split_feature, const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left) { + LaunchPrepareCUDASplitInforBufferKernel(leaf_id, best_split_feature, best_split_threshold, best_split_default_left); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 7cacf5a3eb6d..1f3d0d879f74 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -51,6 +51,43 @@ __device__ void PrefixSum(uint32_t* elements, unsigned int n) { } } +__device__ void PrefixSum(uint16_t* elements, unsigned int n) { + unsigned int offset = 1; + unsigned int threadIdx_x = threadIdx.x; + const unsigned int conflict_free_n_minus_1 = CONFLICT_FREE_INDEX(n - 1); + const uint16_t last_element = elements[conflict_free_n_minus_1]; + __syncthreads(); + for (int d = (n >> 1); d > 0; d >>= 1) { + if (threadIdx_x < d) { + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; + } + offset <<= 1; + __syncthreads(); + } + if (threadIdx_x == 0) { + elements[conflict_free_n_minus_1] = 0; + } + __syncthreads(); + for (int d = 1; d < n; d <<= 1) { + offset >>= 1; + if (threadIdx_x < d) { + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); + const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); + const uint16_t src_val = elements[conflict_free_src_pos]; + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; + elements[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + } + if (threadIdx_x == 0) { + elements[CONFLICT_FREE_INDEX(n)] = elements[conflict_free_n_minus_1] + last_element; + } +} + __global__ void FillDataIndicesBeforeTrainKernel(const data_size_t* cuda_num_data, data_size_t* data_indices) { const data_size_t num_data_ref = 
*cuda_num_data; @@ -65,6 +102,319 @@ void CUDADataPartition::LaunchFillDataIndicesBeforeTrain() { FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_); } +// missing_is_zero = 0, missing_is_na = 0, min_bin_ref < max_bin_ref +__global__ void GenDataToLeftBitVectorKernel0_1_2_3(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint8_t split_default_to_left, const uint8_t /*split_missing_default_to_left*/, + uint8_t* cuda_data_to_left) { + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(cuda_data[global_data_index]); + if (bin < min_bin_ref || bin > max_bin_ref) { + cuda_data_to_left[local_data_index] = split_default_to_left; + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; + } else { + cuda_data_to_left[local_data_index] = 1; + } + } +} + +// missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 0, min_bin_ref < max_bin_ref +__global__ void GenDataToLeftBitVectorKernel4(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, + uint8_t* cuda_data_to_left) { + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(cuda_data[global_data_index]); + if (bin == t_zero_bin) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin < min_bin_ref || bin > max_bin_ref) { + cuda_data_to_left[local_data_index] = split_default_to_left; + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; + } else { + cuda_data_to_left[local_data_index] = 1; + } + } +} + +// missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 1, min_bin_ref < max_bin_ref +__global__ void GenDataToLeftBitVectorKernel5(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, + uint8_t* cuda_data_to_left) { + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if 
(local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(cuda_data[global_data_index]); + if (bin < min_bin_ref || bin > max_bin_ref) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; + } else { + cuda_data_to_left[local_data_index] = 1; + } + } +} + +// missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 0, min_bin_ref < max_bin_ref +__global__ void GenDataToLeftBitVectorKernel6(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, + uint8_t* cuda_data_to_left) { + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(cuda_data[global_data_index]); + if (bin == max_bin_ref) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin < min_bin_ref || bin > max_bin_ref) { + cuda_data_to_left[local_data_index] = split_default_to_left; + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; + } else { + cuda_data_to_left[local_data_index] = 1; + } + } +} + +// missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 1, min_bin_ref < max_bin_ref +__global__ void GenDataToLeftBitVectorKernel7(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, + uint8_t* cuda_data_to_left) { + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(cuda_data[global_data_index]); + if (bin < min_bin_ref || bin > max_bin_ref) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; + } else { + cuda_data_to_left[local_data_index] = 1; + } + } +} + +// missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 0, mfb_is_na = 0, min_bin_ref < max_bin_ref +__global__ void GenDataToLeftBitVectorKernel8(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + 
const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, + uint8_t* cuda_data_to_left) { + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(cuda_data[global_data_index]); + if (bin == t_zero_bin) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin < min_bin_ref || bin > max_bin_ref) { + cuda_data_to_left[local_data_index] = split_default_to_left; + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; + } else { + cuda_data_to_left[local_data_index] = 1; + } + } +} + +// missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 0, mfb_is_na = 1, min_bin_ref < max_bin_ref +__global__ void GenDataToLeftBitVectorKernel9(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, + uint8_t* cuda_data_to_left) { + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(cuda_data[global_data_index]); + if (bin == t_zero_bin) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin < min_bin_ref || bin > max_bin_ref) { + cuda_data_to_left[local_data_index] = split_default_to_left; + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; + } else { + cuda_data_to_left[local_data_index] = 1; + } + } +} + +// missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 1, mfb_is_na = 0, min_bin_ref < max_bin_ref +__global__ void GenDataToLeftBitVectorKernel10(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, + uint8_t* cuda_data_to_left) { + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(cuda_data[global_data_index]); + if (bin < min_bin_ref || bin > max_bin_ref) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; + } else { + cuda_data_to_left[local_data_index] = 1; + } + } +} + +// missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 1, mfb_is_na = 1, min_bin_ref < max_bin_ref +__global__ void 
GenDataToLeftBitVectorKernel11(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, + uint8_t* cuda_data_to_left) { + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(cuda_data[global_data_index]); + if (bin < min_bin_ref || bin > max_bin_ref) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; + } else { + cuda_data_to_left[local_data_index] = 1; + } + } +} + +// missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 0, min_bin_ref < max_bin_ref +__global__ void GenDataToLeftBitVectorKernel12(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, + uint8_t* cuda_data_to_left) { + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(cuda_data[global_data_index]); + if (bin == t_zero_bin || bin == max_bin_ref) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin < min_bin_ref || bin > max_bin_ref) { + cuda_data_to_left[local_data_index] = split_default_to_left; + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; + } else { + cuda_data_to_left[local_data_index] = 1; + } + } +} + +// missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 1, min_bin_ref < max_bin_ref +__global__ void GenDataToLeftBitVectorKernel13(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, + uint8_t* cuda_data_to_left) { + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(cuda_data[global_data_index]); + if (bin == t_zero_bin) { + 
cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin < min_bin_ref || bin > max_bin_ref) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; + } else { + cuda_data_to_left[local_data_index] = 1; + } + } +} + +// missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 0, min_bin_ref < max_bin_ref +__global__ void GenDataToLeftBitVectorKernel14(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, + uint8_t* cuda_data_to_left) { + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(cuda_data[global_data_index]); + if (bin == max_bin_ref) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin < min_bin_ref || bin > max_bin_ref) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; + } else { + cuda_data_to_left[local_data_index] = 1; + } + } +} + +// missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 1, min_bin_ref < max_bin_ref +__global__ void GenDataToLeftBitVectorKernel15(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, + uint8_t* cuda_data_to_left) { + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(cuda_data[global_data_index]); + if (bin < min_bin_ref || bin > max_bin_ref) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + } else if (bin > th) { + cuda_data_to_left[local_data_index] = 0; + } else { + cuda_data_to_left[local_data_index] = 1; + } + } +} + __global__ void GenDataToLeftBitVectorKernel(const int* leaf_index, const data_size_t* cuda_leaf_data_start, const data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, const int* best_split_feature, const uint32_t* best_split_threshold, const int* cuda_num_features, const uint8_t* cuda_data, @@ -72,13 +422,7 @@ __global__ void GenDataToLeftBitVectorKernel(const int* leaf_index, const data_s const uint32_t* min_bin, const uint32_t* max_bin, const uint8_t* missing_is_zero, const uint8_t* missing_is_na, const uint8_t* mfb_is_zero, const uint8_t* mfb_is_na, uint8_t* cuda_data_to_left) { - /*if 
(blockIdx.x == 0 && threadIdx.x == 0) { - printf("GenDataToLeftBitVectorKernel step 0\n"); - }*/ const int leaf_index_ref = *leaf_index; - /*if (blockIdx.x == 0 && threadIdx.x == 0) { - printf("GenDataToLeftBitVectorKernel leaf_index_ref = %d\n", leaf_index_ref); - }*/ const int best_split_feature_ref = best_split_feature[leaf_index_ref]; const int num_features_ref = *cuda_num_features; const uint32_t best_split_threshold_ref = best_split_threshold[leaf_index_ref]; @@ -98,9 +442,6 @@ __global__ void GenDataToLeftBitVectorKernel(const int* leaf_index, const data_s const uint8_t missing_is_na_ref = missing_is_na[best_split_feature_ref]; const uint8_t mfb_is_zero_ref = mfb_is_zero[best_split_feature_ref]; const uint8_t mfb_is_na_ref = mfb_is_na[best_split_feature_ref]; - /*if (blockIdx.x == 0 && threadIdx.x == 0) { - printf("GenDataToLeftBitVectorKernel step 1\n"); - }*/ uint32_t th = best_split_threshold_ref + min_bin_ref; uint32_t t_zero_bin = min_bin_ref + default_bin_ref; if (most_freq_bin_ref == 0) { @@ -117,19 +458,14 @@ __global__ void GenDataToLeftBitVectorKernel(const int* leaf_index, const data_s split_missing_default_to_left = 1; } } - /*if (blockIdx.x == 0 && threadIdx.x == 0) { - printf("GenDataToLeftBitVectorKernel step 2\n"); - }*/ if (local_data_index < static_cast(num_data_in_leaf)) { - /*if (blockIdx.x == 0 && threadIdx.x == 0) { - printf("GenDataToLeftBitVectorKernel step 3\n"); - }*/ const uint32_t bin = static_cast(cuda_data[global_feature_value_index]); if (min_bin_ref < max_bin_ref) { - if ((missing_is_zero_ref && !mfb_is_zero_ref && bin == t_zero_bin)) { + if ((missing_is_zero_ref && !mfb_is_zero_ref && bin == t_zero_bin) || + (missing_is_na_ref && !mfb_is_na_ref && bin == max_bin_ref)) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; } else if (bin < min_bin_ref || bin > max_bin_ref) { - if ((missing_is_na_ref || mfb_is_na_ref) || (missing_is_zero_ref || mfb_is_zero_ref)) { + if ((missing_is_na_ref && mfb_is_na_ref) || (missing_is_zero_ref && mfb_is_zero_ref)) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; } else { cuda_data_to_left[local_data_index] = split_default_to_left; @@ -140,7 +476,7 @@ __global__ void GenDataToLeftBitVectorKernel(const int* leaf_index, const data_s cuda_data_to_left[local_data_index] = 1; } } else { - if (missing_is_zero_ref || !mfb_is_zero_ref && bin == t_zero_bin) { + if (missing_is_zero_ref && !mfb_is_zero_ref && bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; } else if (bin != max_bin_ref) { if ((missing_is_na_ref && mfb_is_na_ref) || (missing_is_zero_ref && mfb_is_zero_ref)) { @@ -156,13 +492,135 @@ __global__ void GenDataToLeftBitVectorKernel(const int* leaf_index, const data_s } } } - /*if (blockIdx.x == 0 && threadIdx.x == 0) { - printf("GenDataToLeftBitVectorKernel step 4\n"); - }*/ } } } +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t num_data_in_leaf, + const int split_feature_index, const uint32_t split_threshold, + const uint8_t split_default_left, const data_size_t leaf_data_start) { + const int num_blocks = std::max(80, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; + int split_indices_block_size_data_partition_aligned = 1; + while (split_indices_block_size_data_partition > 0) { + split_indices_block_size_data_partition_aligned <<= 1; + 
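+  // keep doubling the aligned size while halving the per-block count: on exit,
+  // split_indices_block_size_data_partition_aligned is the smallest power of two that is at
+  // least ceil(num_data_in_leaf / num_blocks), since the prefix-sum based kernels assume a
+  // power-of-two block size.
+  // The GenDataToLeftBitVectorKernel variants dispatched below are numbered by the bit pattern
+  // 8 * missing_is_zero + 4 * missing_is_na + 2 * mfb_is_zero + mfb_is_na; for example, the
+  // flags (1, 0, 1, 1) select GenDataToLeftBitVectorKernel11 (8 + 0 + 2 + 1).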
split_indices_block_size_data_partition >>= 1; + } + const uint8_t missing_is_zero = feature_missing_is_zero_[split_feature_index]; + const uint8_t missing_is_na = feature_missing_is_na_[split_feature_index]; + const uint8_t mfb_is_zero = feature_mfb_is_zero_[split_feature_index]; + const uint8_t mfb_is_na = feature_mfb_is_na_[split_feature_index]; + const uint32_t default_bin = feature_default_bins_[split_feature_index]; + const uint32_t most_freq_bin = feature_most_freq_bins_[split_feature_index]; + const uint32_t min_bin = feature_min_bins_[split_feature_index]; + const uint32_t max_bin = feature_max_bins_[split_feature_index]; + + uint32_t th = split_threshold + min_bin; + uint32_t t_zero_bin = min_bin + default_bin; + if (most_freq_bin == 0) { + --th; + --t_zero_bin; + } + uint8_t split_default_to_left = 0; + uint8_t split_missing_default_to_left = 0; + if (most_freq_bin <= split_threshold) { + split_default_to_left = 1; + } + if (missing_is_zero || missing_is_na) { + if (split_default_left) { + split_missing_default_to_left = 1; + } + } + const uint8_t* cuda_data_col_wise_ptr = cuda_data_col_wise_ + split_feature_index * num_data_; + if (min_bin < max_bin) { + if (!missing_is_zero && !missing_is_na) { + GenDataToLeftBitVectorKernel0_1_2_3<<>>( + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, + th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ + cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, + split_missing_default_to_left, cuda_data_to_left_); + } else { + if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { + GenDataToLeftBitVectorKernel4<<>>( + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, + th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ + cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, + split_missing_default_to_left, cuda_data_to_left_); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { + GenDataToLeftBitVectorKernel5<<>>( + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, + th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ + cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, + split_missing_default_to_left, cuda_data_to_left_); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { + GenDataToLeftBitVectorKernel6<<>>( + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, + th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ + cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, + split_missing_default_to_left, cuda_data_to_left_); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { + GenDataToLeftBitVectorKernel7<<>>( + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, + th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ + cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, + split_missing_default_to_left, cuda_data_to_left_); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { + GenDataToLeftBitVectorKernel8<<>>( + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, + th, num_features_, /* TODO(shiyu1994): the case when num_features != 
num_groups*/ + cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, + split_missing_default_to_left, cuda_data_to_left_); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { + GenDataToLeftBitVectorKernel9<<>>( + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, + th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ + cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, + split_missing_default_to_left, cuda_data_to_left_); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { + GenDataToLeftBitVectorKernel10<<>>( + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, + th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ + cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, + split_missing_default_to_left, cuda_data_to_left_); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { + GenDataToLeftBitVectorKernel11<<>>( + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, + th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ + cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, + split_missing_default_to_left, cuda_data_to_left_); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { + GenDataToLeftBitVectorKernel12<<>>( + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, + th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ + cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, + split_missing_default_to_left, cuda_data_to_left_); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { + GenDataToLeftBitVectorKernel13<<>>( + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, + th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ + cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, + split_missing_default_to_left, cuda_data_to_left_); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { + GenDataToLeftBitVectorKernel14<<>>( + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, + th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ + cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, + split_missing_default_to_left, cuda_data_to_left_); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { + GenDataToLeftBitVectorKernel15<<>>( + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, + th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ + cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, + split_missing_default_to_left, cuda_data_to_left_); + } + } + } else { + Log::Fatal("Unsupported for max_bin == min_bin"); + } + + SynchronizeCUDADevice(); +} + void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const int* leaf_index, const data_size_t num_data_in_leaf, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left) { const int num_blocks = std::max(80, (num_data_in_leaf + 
SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); @@ -188,7 +646,7 @@ __global__ void PrepareOffsetKernel(const int* leaf_index, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, const int split_indices_block_size_data_partition) { const unsigned int blockDim_x = blockDim.x; - __shared__ uint32_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; //__shared__ uint32_t thread_to_right_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + // (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; @@ -239,6 +697,8 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, int* cuda_cur_num_leaves, + const int* best_split_feature, const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left, const double* best_split_gain, const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, @@ -255,7 +715,12 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, hist_t** larger_leaf_cuda_hist_pointer_pointer, const int* cuda_num_total_bin, - hist_t* cuda_hist, hist_t** cuda_hist_pool, const int split_indices_block_size_data_partition) { + hist_t* cuda_hist, hist_t** cuda_hist_pool, const int split_indices_block_size_data_partition, + + int* tree_split_leaf_index, int* tree_inner_feature_index, uint32_t* tree_threshold, + double* tree_left_output, double* tree_right_output, data_size_t* tree_left_count, data_size_t* tree_right_count, + double* tree_left_sum_hessian, double* tree_right_sum_hessian, double* tree_gain, uint8_t* tree_default_left, + double* data_partition_leaf_output) { __shared__ uint32_t block_to_left_offset[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; __shared__ uint32_t block_to_right_offset[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2 + @@ -348,6 +813,21 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b cuda_leaf_data_end[cur_max_leaf_index] = old_leaf_data_end; cuda_leaf_num_data[cur_max_leaf_index] = block_to_right_offset_buffer[num_blocks] - to_left_total_cnt; const int cuda_num_total_bin_ref = *cuda_num_total_bin; + + tree_split_leaf_index[cur_max_leaf_index - 1] = leaf_index_ref; + tree_inner_feature_index[cur_max_leaf_index - 1] = best_split_feature[leaf_index_ref]; + tree_threshold[cur_max_leaf_index - 1] = best_split_threshold[leaf_index_ref]; + tree_left_output[cur_max_leaf_index - 1] = best_left_leaf_value[leaf_index_ref]; + tree_right_output[cur_max_leaf_index - 1] = best_right_leaf_value[leaf_index_ref]; + tree_left_count[cur_max_leaf_index - 1] = best_left_count[leaf_index_ref]; + tree_right_count[cur_max_leaf_index - 1] = best_right_count[leaf_index_ref]; + tree_left_sum_hessian[cur_max_leaf_index - 1] = 
best_left_sum_hessians[leaf_index_ref]; + tree_right_sum_hessian[cur_max_leaf_index - 1] = best_right_sum_hessians[leaf_index_ref]; + tree_gain[cur_max_leaf_index - 1] = best_split_gain[leaf_index_ref]; + tree_default_left[cur_max_leaf_index - 1] = best_split_default_left[leaf_index_ref]; + data_partition_leaf_output[leaf_index_ref] = best_left_leaf_value[leaf_index_ref]; + data_partition_leaf_output[cur_max_leaf_index] = best_right_leaf_value[leaf_index_ref]; + ++(*cuda_cur_num_leaves); if (cuda_leaf_num_data[leaf_index_ref] < cuda_leaf_num_data[cur_max_leaf_index]) { *smaller_leaf_cuda_leaf_index_pointer = leaf_index_ref; @@ -401,10 +881,9 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ const data_size_t* block_to_left_offset_buffer, const data_size_t* block_to_right_offset_buffer, data_size_t* out_data_indices_in_leaf, const int split_indices_block_size_data_partition) { __shared__ uint8_t thread_split_to_left_bit_vector[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; - __shared__ uint32_t thread_to_left_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; - __shared__ uint32_t thread_to_right_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + __shared__ uint16_t thread_to_left_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; + __shared__ uint16_t thread_to_right_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; const int leaf_index_ref = *leaf_index; const data_size_t leaf_num_data_offset = cuda_leaf_data_start[leaf_index_ref]; const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[leaf_index_ref] + cuda_leaf_num_data[(*cuda_cur_num_leaves) - 1]; @@ -417,11 +896,9 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ const uint8_t bit = split_to_left_bit_vector[global_thread_index]; thread_split_to_left_bit_vector[threadIdx_x] = bit; thread_to_left_pos[conflict_free_threadIdx_x_plus_1] = bit; - thread_to_right_pos[conflict_free_threadIdx_x_plus_1] = 1 - bit; } else { thread_split_to_left_bit_vector[threadIdx_x] = 0; thread_to_left_pos[conflict_free_threadIdx_x_plus_1] = 0; - thread_to_right_pos[conflict_free_threadIdx_x_plus_1] = 0; } const unsigned int conflict_free_threadIdx_x_plus_blockDim_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + blockDim_x + 1); const unsigned int global_thread_index_plus_blockDim_x = global_thread_index + blockDim_x; @@ -429,35 +906,44 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ const uint8_t bit = split_to_left_bit_vector[global_thread_index_plus_blockDim_x]; thread_split_to_left_bit_vector[threadIdx_x + blockDim_x] = bit; thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] = bit; - thread_to_right_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] = 1 - bit; } else { thread_split_to_left_bit_vector[threadIdx_x + blockDim_x] = 0; thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] = 0; - thread_to_right_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] = 0; } __syncthreads(); + const uint32_t to_right_block_offset = block_to_right_offset_buffer[blockIdx.x]; + const uint32_t to_left_block_offset = block_to_left_offset_buffer[blockIdx.x]; if (threadIdx_x == 0) { - const uint32_t to_right_block_offset = block_to_right_offset_buffer[blockIdx.x]; - const uint32_t to_left_block_offset = block_to_left_offset_buffer[blockIdx.x]; - thread_to_left_pos[0] = to_left_block_offset; - 
thread_to_right_pos[0] = to_right_block_offset; + thread_to_left_pos[0] = 0; + // thread_to_left_pos[0] = to_left_block_offset; } __syncthreads(); PrefixSum(thread_to_left_pos, split_indices_block_size_data_partition); - PrefixSum(thread_to_right_pos, split_indices_block_size_data_partition); + //thread_to_right_pos[threadIdx_x] = to_right_block_offset; + //thread_to_right_pos[threadIdx_x + blockDim_x] = to_right_block_offset; + __syncthreads(); + if (threadIdx_x > 0) { + thread_to_right_pos[threadIdx_x] = (threadIdx_x /*+ to_left_block_offset*/ - thread_to_left_pos[conflict_free_threadIdx_x_plus_1]); + } else { + thread_to_right_pos[threadIdx_x] = 0; + } + thread_to_right_pos[threadIdx_x + blockDim_x] = (threadIdx_x + blockDim_x /*+ to_left_block_offset*/ - thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]); __syncthreads(); + //PrefixSum(thread_to_right_pos, split_indices_block_size_data_partition); + data_size_t* left_out_data_indices_in_leaf = out_data_indices_in_leaf + to_left_block_offset; + data_size_t* right_out_data_indices_in_leaf = out_data_indices_in_leaf + to_right_block_offset; if (global_thread_index < num_data_in_leaf_ref) { if (thread_split_to_left_bit_vector[threadIdx_x] == 1) { - out_data_indices_in_leaf[thread_to_left_pos[conflict_free_threadIdx_x_plus_1]] = cuda_data_indices_in_leaf[global_thread_index]; + left_out_data_indices_in_leaf[thread_to_left_pos[conflict_free_threadIdx_x_plus_1]] = cuda_data_indices_in_leaf[global_thread_index]; } else { - out_data_indices_in_leaf[thread_to_right_pos[conflict_free_threadIdx_x_plus_1]] = cuda_data_indices_in_leaf[global_thread_index]; + right_out_data_indices_in_leaf[thread_to_right_pos[threadIdx_x]] = cuda_data_indices_in_leaf[global_thread_index]; } } if (global_thread_index_plus_blockDim_x < num_data_in_leaf_ref) { if (thread_split_to_left_bit_vector[threadIdx_x + blockDim_x] == 1) { - out_data_indices_in_leaf[thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x]; + left_out_data_indices_in_leaf[thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x]; } else { - out_data_indices_in_leaf[thread_to_right_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x]; + right_out_data_indices_in_leaf[thread_to_right_pos[threadIdx_x + blockDim_x]] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x]; } } /*if (thread_to_left_pos[conflict_free_threadIdx_x_plus_1] == 0) { @@ -478,57 +964,6 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ }*/ } -/*__global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_leaves, - const data_size_t* cuda_leaf_data_start, const data_size_t* cuda_leaf_num_data, - const data_size_t* cuda_data_indices, const uint8_t* split_to_left_bit_vector, - const data_size_t* block_to_left_offset_buffer, const data_size_t* block_to_right_offset_buffer, - data_size_t* out_data_indices_in_leaf) { - __shared__ uint8_t thread_split_to_left_bit_vector[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; - __shared__ uint32_t thread_to_left_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; - __shared__ uint32_t thread_to_right_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; - const int leaf_index_ref = *leaf_index; - const data_size_t leaf_num_data_offset = cuda_leaf_data_start[leaf_index_ref]; - const data_size_t num_data_in_leaf_ref = 
cuda_leaf_num_data[leaf_index_ref] + cuda_leaf_num_data[(*cuda_cur_num_leaves) - 1]; - const unsigned int threadIdx_x = threadIdx.x; - const unsigned int global_thread_index = blockIdx.x * blockDim.x + threadIdx_x; - const data_size_t* cuda_data_indices_in_leaf = cuda_data_indices + leaf_num_data_offset; - if (global_thread_index < num_data_in_leaf_ref) { - thread_split_to_left_bit_vector[threadIdx_x] = split_to_left_bit_vector[global_thread_index]; - } else { - thread_split_to_left_bit_vector[threadIdx_x] = 0; - } - __syncthreads(); - if (threadIdx_x == 0) { - const uint32_t to_right_block_offset = block_to_right_offset_buffer[blockIdx.x]; - const uint32_t to_left_block_offset = block_to_left_offset_buffer[blockIdx.x]; - thread_to_left_pos[0] = to_left_block_offset; - thread_to_right_pos[0] = to_right_block_offset; - for (unsigned int i = 0; i < blockDim.x - 1; ++i) { - const unsigned int tmp_global_thread_index = blockIdx.x * blockDim.x + i; - if (tmp_global_thread_index < num_data_in_leaf_ref) { - if (thread_split_to_left_bit_vector[i] == 0) { - thread_to_right_pos[i + 1] = thread_to_right_pos[i] + 1; - thread_to_left_pos[i + 1] = thread_to_left_pos[i]; - } else { - thread_to_left_pos[i + 1] = thread_to_left_pos[i] + 1; - thread_to_right_pos[i + 1] = thread_to_right_pos[i]; - } - } else { - thread_to_left_pos[i + 1] = thread_to_left_pos[i]; - thread_to_right_pos[i + 1] = thread_to_right_pos[i]; - } - } - } - __syncthreads(); - if (global_thread_index < num_data_in_leaf_ref) { - if (thread_split_to_left_bit_vector[threadIdx_x] == 1) { - out_data_indices_in_leaf[thread_to_left_pos[threadIdx_x]] = cuda_data_indices_in_leaf[global_thread_index]; - } else { - out_data_indices_in_leaf[thread_to_right_pos[threadIdx_x]] = cuda_data_indices_in_leaf[global_thread_index]; - } - } -}*/ - __global__ void CopyDataIndicesKernel(const int* leaf_index, const int* cuda_cur_num_leaves, const data_size_t* cuda_leaf_data_start, @@ -547,6 +982,8 @@ __global__ void CopyDataIndicesKernel(const int* leaf_index, } void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data_size_t num_data_in_leaf, + const int* best_split_feature, const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left, const double* best_split_gain, const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, @@ -561,7 +998,7 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, const int cpu_leaf_index) { + hist_t** larger_leaf_cuda_hist_pointer_pointer) { //Log::Warning("num_data_in_leaf = %d", num_data_in_leaf); const int num_blocks = std::max(80, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; @@ -571,6 +1008,7 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data split_indices_block_size_data_partition >>= 1; } //Log::Warning("num_blocks = 
%d, split_indices_block_size_data_partition_aligned = %d", num_blocks, split_indices_block_size_data_partition_aligned); + global_timer.Start("CUDADataPartition::PrepareOffsetKernel"); auto start = std::chrono::steady_clock::now(); const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; //Log::Warning("num_blocks_final = %d", num_blocks_final); @@ -580,13 +1018,15 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data SynchronizeCUDADevice(); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); + global_timer.Stop("CUDADataPartition::PrepareOffsetKernel"); + global_timer.Start("CUDADataPartition::AggregateBlockOffsetKernel"); //Log::Warning("CUDADataPartition::PrepareOffsetKernel time %f", duration); start = std::chrono::steady_clock::now(); AggregateBlockOffsetKernel<<<1, split_indices_block_size_data_partition_aligned / 2>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, cuda_cur_num_leaves_, - + best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, best_left_sum_gradients, best_left_sum_hessians, best_left_count, best_left_gain, best_left_leaf_value, best_right_sum_gradients, best_right_sum_hessians, best_right_count, @@ -604,16 +1044,18 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data larger_leaf_cuda_hist_pointer_pointer, cuda_num_total_bin_, cuda_hist_, - cuda_hist_pool_, split_indices_block_size_data_partition_aligned); - const auto copy_start = std::chrono::steady_clock::now(); - CopyFromCUDADeviceToHost(num_data_in_leaf_.data(), cuda_leaf_num_data_, num_leaves_); + cuda_hist_pool_, split_indices_block_size_data_partition_aligned, + + tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, + tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, + tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, + data_partition_leaf_output_); SynchronizeCUDADevice(); - const auto copy_end = std::chrono::steady_clock::now(); - const auto copy_duration = (static_cast>(copy_end - copy_start)).count(); - //Log::Warning("CUDADataPartition::CopyFromCUDADeviceToHost time %f", copy_duration); end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); + global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); //Log::Warning("CUDADataPartition::AggregateBlockOffsetKernel time %f", duration); + global_timer.Start("CUDADataPartition::SplitInnerKernel"); start = std::chrono::steady_clock::now(); SplitInnerKernel<<>>( leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, @@ -622,13 +1064,16 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data SynchronizeCUDADevice(); end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); + global_timer.Stop("CUDADataPartition::SplitInnerKernel"); //Log::Warning("CUDADataPartition::SplitInnerKernel time %f", duration); + global_timer.Start("CUDADataPartition::CopyDataIndicesKernel"); start = std::chrono::steady_clock::now(); CopyDataIndicesKernel<<>>( leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, cuda_data_indices_); 
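+  // CopyDataIndicesKernel writes the reordered indices from the temporary
+  // cuda_out_data_indices_in_leaf_ buffer back into this leaf's range of cuda_data_indices_,
+  // so that subsequent histogram construction and splits see the partitioned order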
SynchronizeCUDADevice(); end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); + global_timer.Stop("CUDADataPartition::CopyDataIndicesKernel"); //Log::Warning("CUDADataPartition::CopyDataIndicesKernel time %f", duration); } @@ -679,6 +1124,45 @@ void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double learning_r SynchronizeCUDADevice(); } +__global__ void PrepareCUDASplitInforBufferKernel(const int* leaf_index, const int* best_split_feature, const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left, + const data_size_t* cuda_leaf_num_data, const data_size_t* cuda_leaf_data_start, + int* cuda_split_info_buffer) { + const int leaf_index_ref = *leaf_index; + cuda_split_info_buffer[0] = cuda_leaf_num_data[leaf_index_ref]; + cuda_split_info_buffer[1] = best_split_feature[leaf_index_ref]; + cuda_split_info_buffer[2] = best_split_threshold[leaf_index_ref]; + cuda_split_info_buffer[3] = best_split_default_left[leaf_index_ref]; + cuda_split_info_buffer[4] = cuda_leaf_data_start[leaf_index_ref]; +} + +void CUDADataPartition::LaunchPrepareCUDASplitInforBufferKernel(const int* leaf_id, const int* best_split_feature, const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left) { + PrepareCUDASplitInforBufferKernel<<<1, 1>>>(leaf_id, best_split_feature, best_split_threshold, best_split_default_left, + cuda_leaf_num_data_, cuda_leaf_data_start_, + cuda_split_info_buffer_); + SynchronizeCUDADevice(); +} + +__global__ void CopyColWiseDataKernel(const uint8_t* row_wise_data, + const data_size_t num_data, const int num_features, + uint8_t* col_wise_data) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data) { + const data_size_t read_offset = data_index * num_features; + for (int feature_index = 0; feature_index < num_features; ++feature_index) { + const data_size_t write_pos = feature_index * num_data + data_index; + col_wise_data[write_pos] = row_wise_data[read_offset + feature_index]; + } + } +} + +void CUDADataPartition::LaunchCopyColWiseDataKernel() { + const int block_size = 1024; + const int num_blocks = (num_data_ + block_size - 1) / block_size; + CopyColWiseDataKernel<<>>(cuda_data_, num_data_, num_features_, cuda_data_col_wise_); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 5b05c618927a..ee26e1a469b2 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -171,10 +171,26 @@ class CUDADataPartition { const double* train_data_score_tmp() const { return train_data_score_tmp_; } private: + void CopyColWiseData(); + + void LaunchCopyColWiseDataKernel(); + + void PrepareCUDASplitInforBuffer(const int* leaf_id, const int* best_split_feature, const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left); + + void LaunchPrepareCUDASplitInforBufferKernel(const int* leaf_id, const int* best_split_feature, const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left); + void GenDataToLeftBitVector(const int* leaf_id, const data_size_t num_data_in_leaf, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); + void GenDataToLeftBitVector2(const data_size_t num_data_in_leaf, + const int split_feature_index, const uint32_t split_threshold, + const uint8_t split_default_left, const data_size_t leaf_data_start); + 
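+  // GenDataToLeftBitVector2 takes the split information as host-side values rather than
+  // device pointers, so the launcher can dispatch to the bit-vector kernel specialized for
+  // the feature's missing-value configuration on the CPU side
+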
void SplitInner(const int* leaf_index, const data_size_t num_data_in_leaf, + const int* best_split_feature, const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left, const double* best_split_gain, const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, @@ -189,12 +205,14 @@ class CUDADataPartition { double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, const int cpu_leaf_index); + hist_t** larger_leaf_cuda_hist_pointer_pointer); // kernel launch functions void LaunchFillDataIndicesBeforeTrain(); void LaunchSplitInnerKernel(const int* leaf_index, const data_size_t num_data_in_leaf, + const int* best_split_feature, const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left, const double* best_split_gain, const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, @@ -209,11 +227,15 @@ class CUDADataPartition { double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, const int cpu_leaf_index); + hist_t** larger_leaf_cuda_hist_pointer_pointer); void LaunchGenDataToLeftBitVectorKernel(const int* leaf_index, const data_size_t num_data_in_leaf, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); + void LaunchGenDataToLeftBitVectorKernel2(const data_size_t num_data_in_leaf, + const int split_feature_index, const uint32_t split_threshold, + const uint8_t split_default_left, const data_size_t leaf_data_start); + void LaunchPrefixSumKernel(uint32_t* cuda_elements); void LaunchAddPredictionToScoreKernel(const double learning_rate); @@ -236,6 +258,7 @@ class CUDADataPartition { std::vector num_data_in_leaf_; int cur_num_leaves_; std::vector cpu_train_data_score_tmp_; + std::vector cpu_split_info_buffer_; // CUDA memory, held by this object data_size_t* cuda_data_indices_; @@ -257,6 +280,7 @@ class CUDADataPartition { uint8_t* cuda_feature_mfb_is_zero_; uint8_t* cuda_feature_mfb_is_na_; int* cuda_num_total_bin_; + int* cuda_split_info_buffer_; // prepared to be copied to cpu // for histogram pool hist_t** cuda_hist_pool_; // for tree structure @@ -274,6 +298,7 @@ class CUDADataPartition { double* data_partition_leaf_output_; // for train data update double* train_data_score_tmp_; + uint8_t* cuda_data_col_wise_; // CUDA memory, held by other object const data_size_t* cuda_num_data_; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 033c103a6b53..94fb814fa854 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ 
b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -56,20 +56,12 @@ __global__ void CUDAConstructHistogramKernel(const int* leaf_index, const int num_feature_groups_ref = *num_feature_groups; const int leaf_index_ref = *leaf_index; const int dim_y = gridDim.y * blockDim.y; - //const int cuda_num_total_bin_ref = *cuda_num_total_bin; hist_t* feature_histogram_ptr = *feature_histogram; - /*if (blockIdx.x == 0 && threadIdx.x == 0 && blockIdx.y == 0 && threadIdx.y == 0) { - printf("construct histogram for leaf %d\n", leaf_index_ref); - }*/ const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf_ref + dim_y - 1) / dim_y; const data_size_t* data_indices_ref = *data_indices_ptr; __shared__ float shared_hist[SHRAE_HIST_SIZE]; // 256 * 24 * 2, can use 24 features uint32_t num_bins_in_col_group = feature_group_offsets[blockDim.x]; - /*if (blockIdx.x == 0 && threadIdx.x == 0 && blockIdx.y == 0 && threadIdx.y == 0) { - printf("cuda_num_total_bin_ref = %d\n", cuda_num_total_bin_ref); - printf("num_bins_in_col_group %d\n", num_bins_in_col_group); - }*/ const uint32_t num_items_per_thread = (2 * num_bins_in_col_group + NUM_THRADS_PER_BLOCK - 1) / NUM_THRADS_PER_BLOCK; const int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; const uint32_t thread_start = thread_idx * num_items_per_thread; @@ -84,17 +76,6 @@ __global__ void CUDAConstructHistogramKernel(const int* leaf_index, const data_size_t start = (threadIdx_y + blockIdx_y * blockDim.y) * num_data_per_thread; const data_size_t end = start + num_data_per_thread > num_data_in_smaller_leaf_ref ? num_data_in_smaller_leaf_ref : start + num_data_per_thread; - /*if (blockIdx.x == 0 && threadIdx.x == 0 && blockIdx.y == 0 && threadIdx.y == 0) { - if (leaf_index_ref == 2) { - for (data_size_t i = 0; i < 10; ++i) { - printf("leaf 2 data index %d = %d\n", i, data_indices_ref[i]); - } - printf("===========================================\n"); - for (data_size_t i = 1030726 - 10; i < 1030726; ++i) { - printf("leaf 2 data index %d = %d\n", i, data_indices_ref[i]); - } - } - }*/ for (data_size_t i = start; i < end; ++i) { const data_size_t data_index = data_indices_ref[i]; const score_t grad = cuda_gradients[data_index]; diff --git a/src/treelearner/cuda/new_cuda_utils.hpp b/src/treelearner/cuda/new_cuda_utils.hpp index 98820187ed9f..846bf43c85e2 100644 --- a/src/treelearner/cuda/new_cuda_utils.hpp +++ b/src/treelearner/cuda/new_cuda_utils.hpp @@ -54,6 +54,14 @@ void CopyFromCUDADeviceToHost(T* dst_ptr, const T* src_ptr, size_t size) { CUDASUCCESS_OR_FATAL(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost)); } +template +void CopyFromCUDADeviceToHostAsync(T* dst_ptr, const T* src_ptr, size_t size) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost)); +} + template void CopyFromCUDADeviceToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size) { void* void_dst_ptr = reinterpret_cast(dst_ptr); @@ -62,6 +70,14 @@ void CopyFromCUDADeviceToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size) { CUDASUCCESS_OR_FATAL(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice)); } +template +void CopyFromCUDADeviceToCUDADeviceAsync(T* dst_ptr, const T* src_ptr, size_t size) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const 
void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice)); +} + void SynchronizeCUDADevice(); template From 57547fb4d0e3324237c110a0f412af222ba08cf2 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 17 May 2021 05:57:42 +0000 Subject: [PATCH 012/166] move boosting into cuda --- CMakeLists.txt | 6 ++ src/boosting/gbdt.cpp | 2 +- .../cuda/cuda_binary_objective.cpp | 31 ++++++++ src/treelearner/cuda/cuda_binary_objective.cu | 79 +++++++++++++++++++ .../cuda/cuda_binary_objective.hpp | 47 +++++++++++ .../cuda/cuda_centralized_info.cpp | 4 +- .../cuda/cuda_centralized_info.hpp | 9 ++- src/treelearner/cuda/cuda_data_partition.cpp | 8 +- src/treelearner/cuda/cuda_data_partition.cu | 30 +++---- src/treelearner/cuda/cuda_data_partition.hpp | 4 +- .../cuda/cuda_histogram_constructor.cpp | 5 ++ .../cuda/cuda_histogram_constructor.cu | 30 ++++++- .../cuda/cuda_histogram_constructor.hpp | 6 ++ src/treelearner/cuda/cuda_objective.cpp | 17 ++++ src/treelearner/cuda/cuda_objective.hpp | 30 +++++++ src/treelearner/cuda/cuda_score_updater.cpp | 30 +++++++ src/treelearner/cuda/cuda_score_updater.cu | 40 ++++++++++ src/treelearner/cuda/cuda_score_updater.hpp | 46 +++++++++++ .../cuda/new_cuda_tree_learner.cpp | 21 ++++- .../cuda/new_cuda_tree_learner.hpp | 6 ++ 20 files changed, 418 insertions(+), 33 deletions(-) create mode 100644 src/treelearner/cuda/cuda_binary_objective.cpp create mode 100644 src/treelearner/cuda/cuda_binary_objective.cu create mode 100644 src/treelearner/cuda/cuda_binary_objective.hpp create mode 100644 src/treelearner/cuda/cuda_objective.cpp create mode 100644 src/treelearner/cuda/cuda_objective.hpp create mode 100644 src/treelearner/cuda/cuda_score_updater.cpp create mode 100644 src/treelearner/cuda/cuda_score_updater.cu create mode 100644 src/treelearner/cuda/cuda_score_updater.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 05fb49e13ddd..65696c0f4b5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -244,6 +244,12 @@ if(USE_CUDA) add_library(cuda_best_split_finder OBJECT src/treelearner/cuda/cuda_best_split_finder.cu) set_target_properties(cuda_best_split_finder PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + + add_library(cuda_binary_objective OBJECT src/treelearner/cuda/cuda_binary_objective.cu) + set_target_properties(cuda_binary_objective PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + + add_library(cuda_score_updater OBJECT src/treelearner/cuda/cuda_score_updater.cu) + set_target_properties(cuda_score_updater PROPERTIES CUDA_SEPARABLE_COMPILATION ON) endif(USE_CUDA) if(USE_HDFS) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index cf49060f74e1..aae328d0f819 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -374,7 +374,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true); } - Boosting(); + //Boosting(); gradients = gradients_.data(); hessians = hessians_.data(); } diff --git a/src/treelearner/cuda/cuda_binary_objective.cpp b/src/treelearner/cuda/cuda_binary_objective.cpp new file mode 100644 index 000000000000..aaedf907e454 --- /dev/null +++ b/src/treelearner/cuda/cuda_binary_objective.cpp @@ -0,0 +1,31 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. 
See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA + +#include "cuda_binary_objective.hpp" + +namespace LightGBM { + +CUDABinaryObjective::CUDABinaryObjective(const data_size_t num_data, const label_t* cuda_labels, const double sigmoid): +CUDAObjective(num_data), cuda_labels_(cuda_labels), sigmoid_(sigmoid) {} + +void CUDABinaryObjective::Init() { + AllocateCUDAMemory(1, &cuda_init_score_); + SetCUDAMemory(cuda_init_score_, 0, 1); +} + +void CUDABinaryObjective::GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + LaunchGetGradientsKernel(cuda_scores, cuda_out_gradients, cuda_out_hessians); +} + +void CUDABinaryObjective::CalcInitScore() { + LaunchCalcInitScoreKernel(); +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_binary_objective.cu b/src/treelearner/cuda/cuda_binary_objective.cu new file mode 100644 index 000000000000..be0bb25ef062 --- /dev/null +++ b/src/treelearner/cuda/cuda_binary_objective.cu @@ -0,0 +1,79 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA + +#include "cuda_binary_objective.hpp" + +namespace LightGBM { + +__global__ void CalcInitScoreKernel_1(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_init_score) { + __shared__ label_t shared_label[CALC_INIT_SCORE_BLOCK_SIZE]; + const unsigned int tid = threadIdx.x; + const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE; + shared_label[tid] = 0.0f; + __syncthreads(); + for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_CALC_INIT_SCORE; ++j) { + if (i + j < num_data) { + shared_label[tid] += cuda_labels[i + j]; + } + } + __syncthreads(); + for (unsigned int s = 1; s < blockDim.x; s *= 2) { + if (tid % (2 * s) == 0 && (tid + s) < CALC_INIT_SCORE_BLOCK_SIZE) { + shared_label[tid] += shared_label[tid + s]; + } + __syncthreads(); + } + if (tid == 0) { + atomicAdd_system(out_cuda_init_score, shared_label[0]); + } +} + +__global__ void CalcInitScoreKernel_2(double* out_cuda_init_score, const data_size_t num_data, const double sigmoid) { + const double suml = *out_cuda_init_score; + const double sumw = static_cast(num_data); + const double pavg = suml / sumw; + const double init_score = log(pavg / (1.0f - pavg)) / sigmoid; + *out_cuda_init_score = init_score; + printf("cuda init score suml = %f\n", suml); + printf("cuda init score sumw = %f\n", sumw); + printf("cuda init score pavg = %f\n", pavg); + printf("cuda init score = %f\n", init_score); +} + +void CUDABinaryObjective::LaunchCalcInitScoreKernel() { + const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE; + const int num_blocks = (num_data_ + num_data_per_block - 1) / num_data_per_block; + CalcInitScoreKernel_1<<>>(cuda_labels_, num_data_, cuda_init_score_); + SynchronizeCUDADevice(); + CalcInitScoreKernel_2<<<1, 1>>>(cuda_init_score_, num_data_, sigmoid_); + SynchronizeCUDADevice(); +} + +__global__ void GetGradientsKernel(const double* cuda_scores, const label_t* cuda_labels, + const double sigmoid, const data_size_t num_data, + score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + if (data_index < num_data) { + const label_t cuda_label = static_cast(cuda_labels[data_index]); + const int label = 
cuda_label == 0 ? -1 : 1; + const double response = -label * sigmoid / (1.0f + std::exp(label * sigmoid * cuda_scores[data_index])); + const double abs_response = fabs(response); + cuda_out_gradients[data_index] = static_cast(response); + cuda_out_hessians[data_index] = static_cast(abs_response * (sigmoid - abs_response)); + } +} + +void CUDABinaryObjective::LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE - 1) / GET_GRADIENTS_BLOCK_SIZE; + GetGradientsKernel<<>>(cuda_scores, cuda_labels_, sigmoid_, num_data_, + cuda_out_gradients, cuda_out_hessians); +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_binary_objective.hpp b/src/treelearner/cuda/cuda_binary_objective.hpp new file mode 100644 index 000000000000..580ea67b6290 --- /dev/null +++ b/src/treelearner/cuda/cuda_binary_objective.hpp @@ -0,0 +1,47 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifndef LIGHTGBM_NEW_CUDA_BINARY_OBJECTIVE_HPP_ +#define LIGHTGBM_NEW_CUDA_BINARY_OBJECTIVE_HPP_ + +#ifdef USE_CUDA + +#define GET_GRADIENTS_BLOCK_SIZE (1024) +#define CALC_INIT_SCORE_BLOCK_SIZE (1024) +#define NUM_DATA_THREAD_ADD_CALC_INIT_SCORE (6) + +#include "cuda_objective.hpp" + +namespace LightGBM { + +class CUDABinaryObjective : public CUDAObjective { + public: + CUDABinaryObjective(const data_size_t num_data, const label_t* cuda_label, const double sigmoid); + + void Init(); + + void CalcInitScore(); + + const double* cuda_init_score() const { + return cuda_init_score_; + } + + void GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) override; + + private: + void LaunchCalcInitScoreKernel(); + + void LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians); + + const label_t* cuda_labels_; + double* cuda_init_score_; + const double sigmoid_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_NEW_CUDA_BINARY_OBJECTIVE_HPP_ diff --git a/src/treelearner/cuda/cuda_centralized_info.cpp b/src/treelearner/cuda/cuda_centralized_info.cpp index 45998445c4ac..50553a7cc5c2 100644 --- a/src/treelearner/cuda/cuda_centralized_info.cpp +++ b/src/treelearner/cuda/cuda_centralized_info.cpp @@ -13,13 +13,15 @@ namespace LightGBM { CUDACentralizedInfo::CUDACentralizedInfo(const data_size_t num_data, const int num_leaves, const int num_features): num_data_(num_data), num_leaves_(num_leaves), num_features_(num_features) {} -void CUDACentralizedInfo::Init() { +void CUDACentralizedInfo::Init(const score_t* labels) { InitCUDAMemoryFromHostMemory(&cuda_num_data_, &num_data_, 1); InitCUDAMemoryFromHostMemory(&cuda_num_leaves_, &num_leaves_, 1); InitCUDAMemoryFromHostMemory(&cuda_num_features_, &num_features_, 1); AllocateCUDAMemory(static_cast(num_data_), &cuda_gradients_); AllocateCUDAMemory(static_cast(num_data_), &cuda_hessians_); + + InitCUDAMemoryFromHostMemory(&cuda_labels_, labels, num_data_); } void CUDACentralizedInfo::BeforeTrain(const score_t* gradients, const score_t* hessians) { diff --git a/src/treelearner/cuda/cuda_centralized_info.hpp b/src/treelearner/cuda/cuda_centralized_info.hpp index 97844e2cb19b..d005975cdccb 100644 --- a/src/treelearner/cuda/cuda_centralized_info.hpp +++ b/src/treelearner/cuda/cuda_centralized_info.hpp @@ 
-21,7 +21,7 @@ class CUDACentralizedInfo { public: CUDACentralizedInfo(const data_size_t num_data, const int num_leaves, const int num_features); - void Init(); + void Init(const label_t* labels); void BeforeTrain(const score_t* gradients, const score_t* hessians); @@ -35,6 +35,12 @@ class CUDACentralizedInfo { const score_t* cuda_hessians() const { return cuda_hessians_; } + const label_t* cuda_labels() const { return cuda_labels_; } + + score_t* cuda_gradients_ref() { return cuda_gradients_; } + + score_t* cuda_hessians_ref() { return cuda_hessians_; } + void Test() { data_size_t test_num_data = 0; int test_num_leaves = 0; @@ -60,6 +66,7 @@ class CUDACentralizedInfo { int* cuda_num_features_; score_t* cuda_gradients_; score_t* cuda_hessians_; + label_t* cuda_labels_; }; } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 6c6fcb051e08..d051f5f7a16c 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -270,12 +270,12 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ Tree* CUDADataPartition::GetCPUTree() {} -void CUDADataPartition::UpdateTrainScore(const double learning_rate, double* train_score) { - LaunchAddPredictionToScoreKernel(learning_rate); - #pragma omp parallel for schedule(static) num_threads(num_threads_) +void CUDADataPartition::UpdateTrainScore(const double learning_rate, double* train_score, double* cuda_scores) { + LaunchAddPredictionToScoreKernel(learning_rate, cuda_scores); + /*#pragma omp parallel for schedule(static) num_threads(num_threads_) for (data_size_t i = 0; i < num_data_; ++i) { train_score[i] += cpu_train_data_score_tmp_[i]; - } + }*/ } void CUDADataPartition::PrepareCUDASplitInforBuffer(const int* leaf_id, const int* best_split_feature, const uint32_t* best_split_threshold, diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 1f3d0d879f74..0e930acfce96 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -648,35 +648,25 @@ __global__ void PrepareOffsetKernel(const int* leaf_index, const unsigned int blockDim_x = blockDim.x; __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; - //__shared__ uint32_t thread_to_right_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - // (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const unsigned int threadIdx_x = threadIdx.x; const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); const unsigned int global_read_index = blockIdx.x * blockDim.x * 2 + threadIdx_x; const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[*leaf_index]; - /*if (blockIdx.x == 0 && threadIdx_x == 0) { - printf("PrepareOffsetKernel leaf_index = %d, num_data_in_leaf = %d\n", (*leaf_index), num_data_in_leaf_ref); - }*/ if (global_read_index < num_data_in_leaf_ref) { const uint8_t bit = split_to_left_bit_vector[global_read_index]; thread_to_left_offset_cnt[conflict_free_threadIdx_x] = bit; - //thread_to_right_offset_cnt[conflict_free_threadIdx_x] = 1 - bit; } else { thread_to_left_offset_cnt[conflict_free_threadIdx_x] = 0; - //thread_to_right_offset_cnt[conflict_free_threadIdx_x] = 0; } const unsigned int conflict_free_threadIdx_x_offseted = CONFLICT_FREE_INDEX(threadIdx_x + blockDim_x); if 
(global_read_index + blockDim_x < num_data_in_leaf_ref) { const uint8_t bit = split_to_left_bit_vector[global_read_index + blockDim_x]; thread_to_left_offset_cnt[conflict_free_threadIdx_x_offseted] = bit; - //thread_to_right_offset_cnt[conflict_free_threadIdx_x_offseted] = 1 - bit; } else { thread_to_left_offset_cnt[conflict_free_threadIdx_x_offseted] = 0; - //thread_to_right_offset_cnt[conflict_free_threadIdx_x_offseted] = 0; } __syncthreads(); PrefixSum(thread_to_left_offset_cnt, split_indices_block_size_data_partition); - //PrefixSum(thread_to_right_offset_cnt, split_indices_block_size_data_partition); __syncthreads(); if (threadIdx_x == 0) { const unsigned int conflict_free_blockDim_x_times_2 = CONFLICT_FREE_INDEX(blockDim_x << 1); @@ -1097,31 +1087,33 @@ void CUDADataPartition::LaunchPrefixSumKernel(uint32_t* cuda_elements) { __global__ void AddPredictionToScoreKernel(const double* data_partition_leaf_output, const data_size_t* num_data_in_leaf, const data_size_t* data_indices_in_leaf, - const data_size_t* leaf_data_start, const double learning_rate, double* output_score) { + const data_size_t* leaf_data_start, const double learning_rate, double* output_score, double* cuda_scores) { const unsigned int threadIdx_x = threadIdx.x; const unsigned int blockIdx_x = blockIdx.x; const unsigned int blockDim_x = blockDim.x; const data_size_t num_data = num_data_in_leaf[blockIdx_x]; const data_size_t* data_indices = data_indices_in_leaf + leaf_data_start[blockIdx_x]; const double leaf_prediction_value = data_partition_leaf_output[blockIdx_x] * learning_rate; - /*if (threadIdx_x == 0) { - printf("leaf index = %d, leaf_prediction_value = %f\n", blockIdx_x, leaf_prediction_value); - }*/ for (unsigned int offset = 0; offset < static_cast(num_data); offset += blockDim_x) { const data_size_t inner_data_index = static_cast(offset + threadIdx_x); if (inner_data_index < num_data) { const data_size_t data_index = data_indices[inner_data_index]; - output_score[data_index] = leaf_prediction_value; + //output_score[data_index] = leaf_prediction_value; + cuda_scores[data_index] += leaf_prediction_value; } } } -void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double learning_rate) { +void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double learning_rate, double* cuda_scores) { + global_timer.Start("CUDADataPartition::AddPredictionToScoreKernel"); AddPredictionToScoreKernel<<>>(data_partition_leaf_output_, - cuda_leaf_num_data_, cuda_data_indices_, cuda_leaf_data_start_, learning_rate, train_data_score_tmp_); - SynchronizeCUDADevice(); - CopyFromCUDADeviceToHost(cpu_train_data_score_tmp_.data(), train_data_score_tmp_, static_cast(num_data_)); + cuda_leaf_num_data_, cuda_data_indices_, cuda_leaf_data_start_, learning_rate, train_data_score_tmp_, cuda_scores); SynchronizeCUDADevice(); + global_timer.Stop("CUDADataPartition::AddPredictionToScoreKernel"); + global_timer.Start("CUDADataPartition::Copy Score"); + //CopyFromCUDADeviceToHost(cpu_train_data_score_tmp_.data(), train_data_score_tmp_, static_cast(num_data_)); + //SynchronizeCUDADevice(); + global_timer.Stop("CUDADataPartition::Copy Score"); } __global__ void PrepareCUDASplitInforBufferKernel(const int* leaf_index, const int* best_split_feature, const uint32_t* best_split_threshold, diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index ee26e1a469b2..78c22b167261 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ 
-132,7 +132,7 @@ class CUDADataPartition { } } - void UpdateTrainScore(const double learning_rate, double* train_score); + void UpdateTrainScore(const double learning_rate, double* train_score, double* cuda_scores); const data_size_t* cuda_leaf_data_start() const { return cuda_leaf_data_start_; } @@ -238,7 +238,7 @@ class CUDADataPartition { void LaunchPrefixSumKernel(uint32_t* cuda_elements); - void LaunchAddPredictionToScoreKernel(const double learning_rate); + void LaunchAddPredictionToScoreKernel(const double learning_rate, double* cuda_scores); // Host memory const data_size_t num_data_; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 3e869ffc0081..f22a0596731f 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -54,6 +54,9 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data) { AllocateCUDAMemory(num_total_bin_ * 2 * num_leaves_, &cuda_hist_); SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_); + AllocateCUDAMemory(num_data_, &cuda_ordered_gradients_); + AllocateCUDAMemory(num_data_, &cuda_ordered_hessians_); + InitCUDAMemoryFromHostMemory(&cuda_num_total_bin_, &num_total_bin_, 1); InitCUDAMemoryFromHostMemory(&cuda_num_feature_groups_, &num_feature_groups_, 1); @@ -113,12 +116,14 @@ void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); //Log::Warning("LaunchConstructHistogramKernel time %f", duration); + global_timer.Start("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); start = std::chrono::steady_clock::now(); LaunchSubtractHistogramKernel(cuda_smaller_leaf_index, cuda_larger_leaf_index, cuda_smaller_leaf_sum_gradients, cuda_smaller_leaf_sum_hessians, cuda_larger_leaf_sum_gradients, cuda_larger_leaf_sum_hessians, cuda_smaller_leaf_hist, cuda_larger_leaf_hist); end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); + global_timer.Stop("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); //Log::Warning("LaunchSubtractHistogramKernel time %f", duration); /*PrintLastCUDAError(); std::vector cpu_hist(6143 * 2, 0.0f); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 94fb814fa854..03a4b7c8df66 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -67,6 +67,7 @@ __global__ void CUDAConstructHistogramKernel(const int* leaf_index, const uint32_t thread_start = thread_idx * num_items_per_thread; const uint32_t thread_end = thread_start + num_items_per_thread > num_bins_in_col_group * 2 ? 
num_bins_in_col_group * 2 : thread_start + num_items_per_thread; + const uint32_t feature_group_offset = feature_group_offsets[threadIdx_x]; for (uint32_t i = thread_start; i < thread_end; ++i) { shared_hist[i] = 0.0f; } @@ -80,8 +81,7 @@ __global__ void CUDAConstructHistogramKernel(const int* leaf_index, const data_size_t data_index = data_indices_ref[i]; const score_t grad = cuda_gradients[data_index]; const score_t hess = cuda_hessians[data_index]; - const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + threadIdx_x]) + - feature_group_offsets[threadIdx_x]; + const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + threadIdx_x]) + feature_group_offset; const uint32_t pos = bin << 1; float* pos_ptr = shared_hist + pos; atomicAdd_system(pos_ptr, grad); @@ -99,11 +99,13 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( const data_size_t* cuda_leaf_num_data, hist_t** cuda_leaf_hist) { int smaller_leaf_index = 0; + global_timer.Start("CUDAHistogramConstructor::LaunchConstructHistogramKernel::CopyFromCUDADeviceToHost"); CopyFromCUDADeviceToHost(&smaller_leaf_index, cuda_smaller_leaf_index, 1); SynchronizeCUDADevice(); data_size_t smaller_leaf_num_data = 0; CopyFromCUDADeviceToHost(&smaller_leaf_num_data, cuda_leaf_num_data + smaller_leaf_index, 1); SynchronizeCUDADevice(); + global_timer.Stop("CUDAHistogramConstructor::LaunchConstructHistogramKernel::CopyFromCUDADeviceToHost"); const int block_dim_x = num_features_; // TODO(shiyu1994): only supports the case when the whole histogram can be loaded into shared memory const int block_dim_y = NUM_THRADS_PER_BLOCK / block_dim_x; const int min_grid_dim_y = 80; @@ -223,6 +225,30 @@ void CUDAHistogramConstructor::LaunchSubtractHistogramKernel(const int* cuda_sma //Log::Warning("After FixHistogramKernel"); } +__global__ void GetOrderedGradientsKernel(const data_size_t num_data_in_leaf, const data_size_t** cuda_data_indices_in_leaf, + const score_t* cuda_gradients, const score_t* cuda_hessians, + score_t* cuda_ordered_gradients, score_t* cuda_ordered_hessians) { + const data_size_t* cuda_data_indices_in_leaf_ref = *cuda_data_indices_in_leaf; + const unsigned int local_data_index = threadIdx.x + blockIdx.x * blockDim.x; + if (local_data_index < static_cast(num_data_in_leaf)) { + const data_size_t global_data_index = cuda_data_indices_in_leaf_ref[local_data_index]; + cuda_ordered_gradients[local_data_index] = cuda_gradients[global_data_index]; + cuda_ordered_hessians[local_data_index] = cuda_hessians[global_data_index]; + } +} + +void CUDAHistogramConstructor::LaunchGetOrderedGradientsKernel( + const data_size_t num_data_in_leaf, + const data_size_t** cuda_data_indices_in_leaf) { + if (num_data_in_leaf < num_data_) { + const int num_data_per_block = 1024; + const int num_blocks = (num_data_in_leaf + num_data_per_block - 1) / num_data_per_block; + GetOrderedGradientsKernel<<>>(num_data_in_leaf, cuda_data_indices_in_leaf, + cuda_gradients_, cuda_hessians_, cuda_ordered_gradients_, cuda_ordered_hessians_); + SynchronizeCUDADevice(); + } +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 73b7cdd0cf45..11fb506e0859 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -77,6 +77,10 @@ class CUDAHistogramConstructor { } private: + void LaunchGetOrderedGradientsKernel( + const data_size_t num_data_in_leaf, + const 
data_size_t** cuda_data_indices_in_leaf); + void LaunchConstructHistogramKernel(const int* cuda_leaf_index, const data_size_t** cuda_data_indices_in_leaf, const data_size_t* cuda_leaf_num_data, @@ -117,6 +121,8 @@ class CUDAHistogramConstructor { int* cuda_num_feature_groups_; uint8_t* cuda_data_; int* cuda_num_features_; + score_t* cuda_ordered_gradients_; + score_t* cuda_ordered_hessians_; // CUDA memory, held by other objects const score_t* cuda_gradients_; diff --git a/src/treelearner/cuda/cuda_objective.cpp b/src/treelearner/cuda/cuda_objective.cpp new file mode 100644 index 000000000000..4996396a108e --- /dev/null +++ b/src/treelearner/cuda/cuda_objective.cpp @@ -0,0 +1,17 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA + +#include "cuda_objective.hpp" + +namespace LightGBM { + +CUDAObjective::CUDAObjective(const data_size_t num_data): num_data_(num_data) {} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_objective.hpp b/src/treelearner/cuda/cuda_objective.hpp new file mode 100644 index 000000000000..f15fa74e89b4 --- /dev/null +++ b/src/treelearner/cuda/cuda_objective.hpp @@ -0,0 +1,30 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifndef LIGHTGBM_NEW_CUDA_OBJECTIVE_HPP_ +#define LIGHTGBM_NEW_CUDA_OBJECTIVE_HPP_ + +#ifdef USE_CUDA + +#include "new_cuda_utils.hpp" +#include + +namespace LightGBM { + +class CUDAObjective { + public: + CUDAObjective(const data_size_t num_data); + + virtual void GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) = 0; + + protected: + const data_size_t num_data_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_NEW_CUDA_OBJECTIVE_HPP_ \ No newline at end of file diff --git a/src/treelearner/cuda/cuda_score_updater.cpp b/src/treelearner/cuda/cuda_score_updater.cpp new file mode 100644 index 000000000000..18aa3b9beaf2 --- /dev/null +++ b/src/treelearner/cuda/cuda_score_updater.cpp @@ -0,0 +1,30 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA + +#include "cuda_score_updater.hpp" + +namespace LightGBM { + +CUDAScoreUpdater::CUDAScoreUpdater(const data_size_t num_data): +num_data_(num_data) {} + +void CUDAScoreUpdater::Init() { + AllocateCUDAMemory(static_cast(num_data_), &cuda_scores_); +} + +void CUDAScoreUpdater::SetInitScore(const double* cuda_init_score) { + LaunchSetInitScoreKernel(cuda_init_score); +} + +void CUDAScoreUpdater::AddScore(const double* cuda_score_to_add) { + LaunchAddScoreKernel(cuda_score_to_add); +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_score_updater.cu b/src/treelearner/cuda/cuda_score_updater.cu new file mode 100644 index 000000000000..9013e8e4d5e6 --- /dev/null +++ b/src/treelearner/cuda/cuda_score_updater.cu @@ -0,0 +1,40 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
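The CUDAObjective base class introduced above only fixes the interface: GetGradients must fill per-row gradients and hessians on the device, reading the device-resident scores. As a rough illustration of what a binary-classification implementation might compute (a generic logistic-loss gradient assuming 0/1 labels and a sigmoid scaling factor, not necessarily the exact formula used by the project's CUDABinaryObjective):

__global__ void BinaryLoglossGradientsKernel(const int num_data, const double sigmoid,
                                             const float* labels,  // assumed to be 0 or 1
                                             const double* scores,
                                             float* out_gradients, float* out_hessians) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per data row
  if (i < num_data) {
    const double p = 1.0 / (1.0 + exp(-sigmoid * scores[i]));
    out_gradients[i] = static_cast<float>(sigmoid * (p - labels[i]));
    out_hessians[i] = static_cast<float>(sigmoid * sigmoid * p * (1.0 - p));
  }
}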
+ */ + +#ifdef USE_CUDA + +#include "cuda_score_updater.hpp" + +namespace LightGBM { + +__global__ void SetInitScoreKernel(double* cuda_scores, const double* cuda_init_score, const data_size_t num_data) { + const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + const double init_score = *cuda_init_score; + if (data_index < num_data) { + cuda_scores[data_index] = init_score; + } +} + +void CUDAScoreUpdater::LaunchSetInitScoreKernel(const double* cuda_init_score) { + const int num_blocks = (num_data_ + SET_INIT_SCORE_BLOCK_SIZE - 1) / SET_INIT_SCORE_BLOCK_SIZE; + SetInitScoreKernel<<>>(cuda_scores_, cuda_init_score, num_data_); +} + +__global__ void AddScoreKernel(double* cuda_scores, const double* cuda_scores_to_add, const data_size_t num_data) { + const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + if (data_index < num_data) { + cuda_scores[data_index] += cuda_scores_to_add[data_index]; + } +} + +void CUDAScoreUpdater::LaunchAddScoreKernel(const double* cuda_scores_to_add) { + const int num_blocks = (num_data_ + SET_INIT_SCORE_BLOCK_SIZE - 1) / SET_INIT_SCORE_BLOCK_SIZE; + AddScoreKernel<<>>(cuda_scores_, cuda_scores_to_add, num_data_); +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_score_updater.hpp b/src/treelearner/cuda/cuda_score_updater.hpp new file mode 100644 index 000000000000..8af36a38d603 --- /dev/null +++ b/src/treelearner/cuda/cuda_score_updater.hpp @@ -0,0 +1,46 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ +#ifndef LIGHTGBM_NEW_CUDA_SCORE_UPDATER_HPP_ +#define LIGHTGBM_NEW_CUDA_SCORE_UPDATER_HPP_ + +#ifdef USE_CUDA + +#include +#include "new_cuda_utils.hpp" + +#include + +#define SET_INIT_SCORE_BLOCK_SIZE (1024) + +namespace LightGBM { + +class CUDAScoreUpdater { + public: + CUDAScoreUpdater(const data_size_t num_data); + + void Init(); + + void SetInitScore(const double* cuda_init_score); + + void AddScore(const double* cuda_score_to_add); + + const double* cuda_scores() const { return cuda_scores_; } + + double* cuda_score_ref() { return cuda_scores_; } + + private: + void LaunchSetInitScoreKernel(const double* cuda_init_score); + + void LaunchAddScoreKernel(const double* cuda_scores_to_add); + + const data_size_t num_data_; + double* cuda_scores_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_NEW_CUDA_SCORE_UPDATER_HPP_ diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 2b14b89c4c7a..271e90dca459 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -22,8 +22,9 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia SerialTreeLearner::Init(train_data, is_constant_hessian); num_threads_ = OMP_NUM_THREADS(); CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + const label_t* labels = train_data->metadata().label(); cuda_centralized_info_.reset(new CUDACentralizedInfo(num_data_, this->config_->num_leaves, num_features_)); - cuda_centralized_info_->Init(); + cuda_centralized_info_->Init(labels); //cuda_centralized_info_->Test(); cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_, 0, cuda_centralized_info_->cuda_gradients(), @@ -50,6 +51,16 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia this->config_->min_sum_hessian_in_leaf, 
this->config_->min_gain_to_split, cuda_centralized_info_->cuda_num_features())); cuda_best_split_finder_->Init(); + + cuda_score_updater_.reset(new CUDAScoreUpdater(num_data_)); + cuda_score_updater_->Init(); + + cuda_binary_objective_.reset(new CUDABinaryObjective(num_data_, + cuda_centralized_info_->cuda_labels(), config_->sigmoid)); + cuda_binary_objective_->Init(); + cuda_binary_objective_->CalcInitScore(); + + cuda_score_updater_->SetInitScore(cuda_binary_objective_->cuda_init_score()); //cuda_best_split_finder_->TestAfterInit(); } @@ -59,10 +70,14 @@ void NewCUDATreeLearner::BeforeTrain() { auto end = std::chrono::steady_clock::now(); auto duration = static_cast>(end - start); //Log::Warning("cuda_data_partition_->BeforeTrain duration = %f", duration.count()); + global_timer.Start("CUDACentralizedInfo::BeforeTrain"); start = std::chrono::steady_clock::now(); - cuda_centralized_info_->BeforeTrain(gradients_, hessians_); + //cuda_centralized_info_->BeforeTrain(gradients_, hessians_); + cuda_binary_objective_->GetGradients(cuda_score_updater_->cuda_scores(), + cuda_centralized_info_->cuda_gradients_ref(), cuda_centralized_info_->cuda_hessians_ref()); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); + global_timer.Stop("CUDACentralizedInfo::BeforeTrain"); //Log::Warning("cuda_centralized_info_->BeforeTrain duration = %f", duration.count()); cuda_smaller_leaf_splits_->InitValues(cuda_data_partition_->cuda_data_indices(), cuda_histogram_constructor_->cuda_hist_pointer()); cuda_larger_leaf_splits_->InitValues(); @@ -259,7 +274,7 @@ void NewCUDATreeLearner::Split(Tree* /*tree*/, int /*best_leaf*/, void NewCUDATreeLearner::AddPredictionToScore(const Tree* /*tree*/, double* out_score) const { const auto start = std::chrono::steady_clock::now(); - cuda_data_partition_->UpdateTrainScore(config_->learning_rate, out_score); + cuda_data_partition_->UpdateTrainScore(config_->learning_rate, out_score, cuda_score_updater_->cuda_score_ref()); const auto end = std::chrono::steady_clock::now(); const auto duration = static_cast>(end - start).count(); Log::Warning("AddPredictionToScore time %f", duration); diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index 0b181b9bbf14..4a8622bf0606 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -14,6 +14,8 @@ #include "cuda_data_partition.hpp" #include "cuda_best_split_finder.hpp" #include "cuda_centralized_info.hpp" +#include "cuda_score_updater.hpp" +#include "cuda_binary_objective.hpp" namespace LightGBM { @@ -73,6 +75,10 @@ class NewCUDATreeLearner: public SerialTreeLearner { // for best split information finding, given the histograms std::unique_ptr cuda_best_split_finder_; + std::unique_ptr cuda_score_updater_; + + std::unique_ptr cuda_binary_objective_; + /* // full data indices on CUDA devices, as the data indices of data_partition_ in CPU version std::vector device_data_indices_; From 608fd701223e230c3b04052b0f3f49a567b085a6 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 27 May 2021 14:57:57 +0000 Subject: [PATCH 013/166] parallel synchronize best split point --- .../cuda/cuda_best_split_finder.cpp | 2 + .../cuda/cuda_best_split_finder.cu | 348 +++++++----------- .../cuda/cuda_best_split_finder.hpp | 3 +- src/treelearner/cuda/cuda_data_partition.cpp | 4 + src/treelearner/cuda/cuda_data_partition.cu | 131 ++++--- src/treelearner/cuda/cuda_data_partition.hpp | 3 + .../cuda/cuda_histogram_constructor.cpp 
| 6 +- .../cuda/cuda_histogram_constructor.cu | 31 +- .../cuda/cuda_histogram_constructor.hpp | 3 +- .../cuda/new_cuda_tree_learner.cpp | 1 + 10 files changed, 254 insertions(+), 278 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index f2543efacac8..84be6f4839ac 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -129,8 +129,10 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_le larger_leaf_splits->cuda_num_data_in_leaf(), larger_leaf_splits->cuda_hist_in_leaf_pointer_pointer()); SynchronizeCUDADevice(); + global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); LaunchSyncBestSplitForLeafKernel(smaller_leaf_splits->cuda_leaf_index(), larger_leaf_splits->cuda_leaf_index()); SynchronizeCUDADevice(); + global_timer.Stop("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); //Log::Warning("FindBestSplitsForLeaf time %f", duration); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 1f72c275a870..d5eeac62c8b9 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -89,7 +89,7 @@ __device__ void PrefixSumHistCnt(data_size_t* elements, unsigned int n) { } __device__ void ReduceBestGain(double* gain, hist_t* sum_gradients, - hist_t* sum_hessians, data_size_t* num_data, uint8_t* found, + hist_t* sum_hessians, /*data_size_t* num_data,*/ uint8_t* found, uint32_t* threshold_value) { const unsigned int tid = threadIdx.x; const unsigned int conflict_free_tid_plus_1 = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(tid + 1); @@ -101,7 +101,7 @@ __device__ void ReduceBestGain(double* gain, hist_t* sum_gradients, gain[tid] = gain[tid_s]; sum_gradients[conflict_free_tid_plus_1] = sum_gradients[conflict_free_tid_s_plus_1]; sum_hessians[conflict_free_tid_plus_1] = sum_hessians[conflict_free_tid_s_plus_1]; - num_data[conflict_free_tid_plus_1] = num_data[conflict_free_tid_s_plus_1]; + //num_data[conflict_free_tid_plus_1] = num_data[conflict_free_tid_s_plus_1]; found[tid] = found[tid_s]; threshold_value[tid] = threshold_value[tid_s]; } @@ -211,7 +211,7 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr __shared__ hist_t local_grad_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; __shared__ hist_t local_hess_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; - __shared__ data_size_t local_cnt_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; + //__shared__ data_size_t local_cnt_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; __shared__ double local_gain[MAX_NUM_BIN_IN_FEATURE]; __shared__ uint8_t threshold_found[MAX_NUM_BIN_IN_FEATURE]; __shared__ uint32_t threshold_value[MAX_NUM_BIN_IN_FEATURE]; @@ -220,28 +220,18 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr const bool skip_sum = (skip_default_bin && (threadIdx_x + feature_mfb_offset) == static_cast(feature_default_bin)); const uint32_t feature_num_bin_minus_offset = feature_num_bin - feature_mfb_offset; const bool skip_split = 
(skip_default_bin && (feature_num_bin_minus_offset - 1 - threadIdx_x + feature_mfb_offset == static_cast(feature_default_bin))); - /*if (threadIdx_x == 0) { - printf("feature_num_bin_minus_offset = %d\n", feature_num_bin_minus_offset); - }*/ const unsigned int bin_offset = threadIdx_x << 1; - /*hist_t default_bin_grad = 0.0f; - hist_t default_bin_hess = 0.0f; - if (feature_default_bin >= feature_mfb_offset) { - const uint32_t default_bin_pos = feature_default_bin - feature_mfb_offset; - default_bin_grad = feature_hist_ptr[default_bin_pos << 1]; - default_bin_hess = feature_hist_ptr[(default_bin_pos << 1) + 1]; - }*/ const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(threadIdx_x); if (!reverse) { if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { local_grad_hist[conflict_free_threadIdx_x] = feature_hist_ptr[bin_offset]; const hist_t hess = feature_hist_ptr[bin_offset + 1]; local_hess_hist[conflict_free_threadIdx_x] = hess; - local_cnt_hist[conflict_free_threadIdx_x] = static_cast(__double2int_rn(hess * cnt_factor)); + //local_cnt_hist[conflict_free_threadIdx_x] = static_cast(__double2int_rn(hess * cnt_factor)); } else { local_grad_hist[conflict_free_threadIdx_x] = 0.0f; local_hess_hist[conflict_free_threadIdx_x] = 0.0f; - local_cnt_hist[conflict_free_threadIdx_x] = 0; + //local_cnt_hist[conflict_free_threadIdx_x] = 0; } } else { if (threadIdx_x < feature_num_bin_minus_offset) { @@ -251,17 +241,17 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr local_grad_hist[conflict_free_write_index] = feature_hist_ptr[bin_offset]; const hist_t hess = feature_hist_ptr[bin_offset + 1]; local_hess_hist[conflict_free_write_index] = hess; - local_cnt_hist[conflict_free_write_index] = static_cast(__double2int_rn(hess * cnt_factor)); + //local_cnt_hist[conflict_free_write_index] = static_cast(__double2int_rn(hess * cnt_factor)); } else { //printf("unwrite gradient = %f, hessian = %f\n", feature_hist_ptr[bin_offset], feature_hist_ptr[bin_offset + 1]); local_grad_hist[conflict_free_write_index] = 0.0f; local_hess_hist[conflict_free_write_index] = 0.0f; - local_cnt_hist[conflict_free_write_index] = 0; + // /local_cnt_hist[conflict_free_write_index] = 0; } } else { local_grad_hist[conflict_free_threadIdx_x] = 0.0f; local_hess_hist[conflict_free_threadIdx_x] = 0.0f; - local_cnt_hist[conflict_free_threadIdx_x] = 0; + //local_cnt_hist[conflict_free_threadIdx_x] = 0; } } __syncthreads(); @@ -270,33 +260,16 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr } local_gain[threadIdx_x] = kMinScore; __syncthreads(); - /*if (inner_feature_index == 6) { - if (threadIdx_x == 0) { - for (unsigned i = 0; i < MAX_NUM_BIN_IN_FEATURE; ++i) { - printf("local_grad_hist[%d] = %f\n", i, local_grad_hist[i]); - } - } - }*/ - //__syncthreads(); PrefixSumHist(local_grad_hist, MAX_NUM_BIN_IN_FEATURE); PrefixSumHist(local_hess_hist, MAX_NUM_BIN_IN_FEATURE); - PrefixSumHistCnt(local_cnt_hist, MAX_NUM_BIN_IN_FEATURE); + //PrefixSumHistCnt(local_cnt_hist, MAX_NUM_BIN_IN_FEATURE); __syncthreads(); -/*if (threadIdx_x == 0) { - printf("inner_feature_index = %d, feature_mfb_offset = %d, local_grad_hist[%d] = %f, local_hess_hist[%d] = %f, local_cnt_hist[%d] = %d, total_sum_grad = %f, total_sum_hess = %f\n", - inner_feature_index, feature_mfb_offset, - MAX_NUM_BIN_IN_FEATURE, local_grad_hist[MAX_NUM_BIN_IN_FEATURE], - MAX_NUM_BIN_IN_FEATURE, local_hess_hist[MAX_NUM_BIN_IN_FEATURE], - MAX_NUM_BIN_IN_FEATURE, 
local_cnt_hist[MAX_NUM_BIN_IN_FEATURE], - local_grad_hist[MAX_NUM_BIN_IN_FEATURE] + default_bin_grad, - local_hess_hist[MAX_NUM_BIN_IN_FEATURE] + default_bin_hess); - }*/ const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(threadIdx_x + 1); if (reverse) { if (threadIdx_x >= na_as_missing && threadIdx_x <= feature_num_bin - 2 && !skip_split) { const double sum_right_gradient = local_grad_hist[conflict_free_threadIdx_x_plus_1]; const double sum_right_hessian = local_hess_hist[conflict_free_threadIdx_x_plus_1]; - const data_size_t right_count = local_cnt_hist[conflict_free_threadIdx_x_plus_1]; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor));// local_cnt_hist[conflict_free_threadIdx_x_plus_1]; const double sum_left_gradient = sum_gradients - sum_right_gradient; const double sum_left_hessian = sum_hessians - sum_right_hessian; const data_size_t left_count = num_data - right_count; @@ -321,11 +294,10 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr threshold_found[threadIdx_x] = 0; } } else { - //printf("error!!!!! non reverse\n"); if (threadIdx_x <= feature_num_bin_minus_offset - 2 /* TODO(shiyu1994): skip default */) { const double sum_left_gradient = local_grad_hist[conflict_free_threadIdx_x_plus_1]; const double sum_left_hessian = local_hess_hist[conflict_free_threadIdx_x_plus_1]; - const data_size_t left_count = local_cnt_hist[conflict_free_threadIdx_x_plus_1]; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor));//local_cnt_hist[conflict_free_threadIdx_x_plus_1]; const double sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = num_data - left_count; @@ -351,7 +323,7 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr } } __syncthreads(); - ReduceBestGain(local_gain, local_grad_hist, local_hess_hist, local_cnt_hist, threshold_found, threshold_value); + ReduceBestGain(local_gain, local_grad_hist, local_hess_hist, /*local_cnt_hist, */threshold_found, threshold_value); const uint8_t found = threshold_found[0]; if (found && threadIdx_x == 0) { *output_found = 1; @@ -361,14 +333,11 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr if (reverse) { const double sum_right_gradient = local_grad_hist[1]; const double sum_right_hessian = local_hess_hist[1]; - const data_size_t right_count = local_cnt_hist[1]; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + //const data_size_t right_count = local_cnt_hist[1]; const double sum_left_gradient = sum_gradients - sum_right_gradient; const double sum_left_hessian = sum_hessians - sum_right_hessian; const data_size_t left_count = num_data - right_count; - /*if (threadIdx_x == 0) { - printf("sum_left_gradient = %f, sum_left_hessian = %f, left_count = %d, sum_right_gradient = %f, sum_right_hessian = %f, right_count = %d\n", - sum_left_gradient, sum_left_hessian, left_count, sum_right_gradient, sum_right_hessian, right_count); - }*/ *output_left_sum_gradients = sum_left_gradient; *output_left_sum_hessians = sum_left_hessian; *output_left_num_data = left_count; @@ -386,7 +355,8 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr } else { const double sum_left_gradient = local_grad_hist[1]; const double sum_left_hessian = local_hess_hist[1]; - const data_size_t 
left_count = local_cnt_hist[1]; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + //const data_size_t left_count = local_cnt_hist[1]; const double sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = num_data - left_count; @@ -680,7 +650,7 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* const int cuda_num_total_bin_ref = *cuda_num_total_bin; const hist_t* hist_ptr = smaller_or_larger ? *smaller_leaf_hist + feature_hist_offsets[inner_feature_index] * 2 : *larger_leaf_hist + feature_hist_offsets[inner_feature_index] * 2;// cuda_hist + (cuda_num_total_bin_ref * leaf_index + feature_hist_offsets[inner_feature_index]) * 2; - if (num_bin > 2 && missing_type != 0) { + /*if (num_bin > 2 && missing_type != 0) { if (missing_type == 1) { FindBestSplitsForLeafKernelInner2(hist_ptr, num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], @@ -698,7 +668,7 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_gain, out_left_output, out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found, inner_feature_index); } - } else { + } else {*/ if (reverse) { FindBestSplitsForLeafKernelInner2(hist_ptr, num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], @@ -711,7 +681,7 @@ __global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* if (missing_type == 2) { *out_default_left = 0; } - } + //} } void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel(const int* smaller_leaf_id, const int* larger_leaf_id, @@ -738,6 +708,39 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel(const int* smaller_l cuda_best_split_found_); } +__device__ void ReduceBestSplit(uint8_t* found, int* feature, double* gain, uint8_t* default_left, + uint32_t* threshold, double* left_sum_gradient, double* left_sum_hessian, + data_size_t* left_count, double* left_gain, double* left_output, + double* right_sum_gradient, double* right_sum_hessian, + data_size_t* right_count, double* right_gain, double* right_output, + uint32_t num_features_aligned, uint32_t thread_offset) { + const uint32_t threadIdx_x = threadIdx.x - thread_offset; + for (unsigned int s = 1; s < num_features_aligned; s <<= 1) { + if (threadIdx_x % (2 * s) == 0 && (threadIdx_x + s) < num_features_aligned) { + const uint32_t pos_to_compare = threadIdx_x + s; + if ((!found[threadIdx_x] && found[pos_to_compare]) || + (found[threadIdx_x] && found[pos_to_compare] && gain[threadIdx_x] < gain[pos_to_compare])) { + found[threadIdx_x] = found[pos_to_compare]; + feature[threadIdx_x] = feature[pos_to_compare]; + gain[threadIdx_x] = gain[pos_to_compare]; + default_left[threadIdx_x] = default_left[pos_to_compare]; + threshold[threadIdx_x] = threshold[pos_to_compare]; + left_sum_gradient[threadIdx_x] = left_sum_gradient[pos_to_compare]; + left_sum_hessian[threadIdx_x] = left_sum_hessian[pos_to_compare]; + left_count[threadIdx_x] = left_count[pos_to_compare]; + left_gain[threadIdx_x] = left_gain[pos_to_compare]; + left_output[threadIdx_x] = left_output[pos_to_compare]; + right_sum_gradient[threadIdx_x] = right_sum_gradient[pos_to_compare]; + right_sum_hessian[threadIdx_x] = right_sum_hessian[pos_to_compare]; + right_count[threadIdx_x] = 
right_count[pos_to_compare]; + right_gain[threadIdx_x] = right_gain[pos_to_compare]; + right_output[threadIdx_x] = right_output[pos_to_compare]; + } + } + __syncthreads(); + } +} + __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const int* larger_leaf_index, const int* cuda_num_features, int* cuda_leaf_best_split_feature, uint8_t* cuda_leaf_best_split_default_left, uint32_t* cuda_leaf_best_split_threshold, double* cuda_leaf_best_split_gain, @@ -764,182 +767,91 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const const double* cuda_best_split_right_output, const uint8_t* cuda_best_split_found, const uint32_t* cuda_feature_default_bins) { - if (blockIdx.x == 0 && threadIdx.x == 0) { - const int num_features_ref = *cuda_num_features; - const int smaller_leaf_index_ref = *smaller_leaf_index; - const int larger_leaf_index_ref = *larger_leaf_index; - - double& smaller_leaf_best_gain = cuda_leaf_best_split_gain[smaller_leaf_index_ref]; - int& smaller_leaf_best_split_feature = cuda_leaf_best_split_feature[smaller_leaf_index_ref]; - uint8_t& smaller_leaf_best_split_default_left = cuda_leaf_best_split_default_left[smaller_leaf_index_ref]; - uint32_t& smaller_leaf_best_split_threshold = cuda_leaf_best_split_threshold[smaller_leaf_index_ref]; - double& smaller_leaf_best_split_left_sum_gradient = cuda_leaf_best_split_left_sum_gradient[smaller_leaf_index_ref]; - double& smaller_leaf_best_split_left_sum_hessian = cuda_leaf_best_split_left_sum_hessian[smaller_leaf_index_ref]; - data_size_t& smaller_leaf_best_split_left_count = cuda_leaf_best_split_left_count[smaller_leaf_index_ref]; - double& smaller_leaf_best_split_left_gain = cuda_leaf_best_split_left_gain[smaller_leaf_index_ref]; - double& smaller_leaf_best_split_left_output = cuda_leaf_best_split_left_output[smaller_leaf_index_ref]; - double& smaller_leaf_best_split_right_sum_gradient = cuda_leaf_best_split_right_sum_gradient[smaller_leaf_index_ref]; - double& smaller_leaf_best_split_right_sum_hessian = cuda_leaf_best_split_right_sum_hessian[smaller_leaf_index_ref]; - data_size_t& smaller_leaf_best_split_right_count = cuda_leaf_best_split_right_count[smaller_leaf_index_ref]; - double& smaller_leaf_best_split_right_gain = cuda_leaf_best_split_right_gain[smaller_leaf_index_ref]; - double& smaller_leaf_best_split_right_output = cuda_leaf_best_split_right_output[smaller_leaf_index_ref]; - double& larger_leaf_best_gain = cuda_leaf_best_split_gain[larger_leaf_index_ref]; - int& larger_leaf_best_split_feature = cuda_leaf_best_split_feature[larger_leaf_index_ref]; - uint8_t& larger_leaf_best_split_default_left = cuda_leaf_best_split_default_left[larger_leaf_index_ref]; - uint32_t& larger_leaf_best_split_threshold = cuda_leaf_best_split_threshold[larger_leaf_index_ref]; - double& larger_leaf_best_split_left_sum_gradient = cuda_leaf_best_split_left_sum_gradient[larger_leaf_index_ref]; - double& larger_leaf_best_split_left_sum_hessian = cuda_leaf_best_split_left_sum_hessian[larger_leaf_index_ref]; - data_size_t& larger_leaf_best_split_left_count = cuda_leaf_best_split_left_count[larger_leaf_index_ref]; - double& larger_leaf_best_split_left_gain = cuda_leaf_best_split_left_gain[larger_leaf_index_ref]; - double& larger_leaf_best_split_left_output = cuda_leaf_best_split_left_output[larger_leaf_index_ref]; - double& larger_leaf_best_split_right_sum_gradient = cuda_leaf_best_split_right_sum_gradient[larger_leaf_index_ref]; - double& larger_leaf_best_split_right_sum_hessian = 
cuda_leaf_best_split_right_sum_hessian[larger_leaf_index_ref]; - data_size_t& larger_leaf_best_split_right_count = cuda_leaf_best_split_right_count[larger_leaf_index_ref]; - double& larger_leaf_best_split_right_gain = cuda_leaf_best_split_right_gain[larger_leaf_index_ref]; - double& larger_leaf_best_split_right_output = cuda_leaf_best_split_right_output[larger_leaf_index_ref]; - - smaller_leaf_best_gain = kMinScore; - larger_leaf_best_gain = kMinScore; - int larger_leaf_offset = /*2 * */num_features_ref; - /*if (larger_leaf_index_ref == -1) { - for (int feature_index = 0; feature_index < num_features_ref; ++feature_index) { - const int smaller_reverse_index = 2 * feature_index; - const uint8_t smaller_reverse_found = cuda_best_split_found[smaller_reverse_index]; - const double gain = cuda_best_split_gain[smaller_reverse_index]; - const uint32_t threshold = cuda_best_split_threshold[smaller_reverse_index]; - printf("feature_index = %d, threshold = %d, gain = %f, found = %d, default_bin = %d\n", feature_index, threshold, gain, smaller_reverse_found, - cuda_feature_default_bins[feature_index]); - } - }*/ - for (int feature_index = 0; feature_index < num_features_ref; ++feature_index) { - const int smaller_reverse_index = /*2 * */feature_index; - const uint8_t smaller_reverse_found = cuda_best_split_found[smaller_reverse_index]; - if (smaller_reverse_found) { - const double smaller_reverse_gain = cuda_best_split_gain[smaller_reverse_index]; - if (smaller_reverse_gain > smaller_leaf_best_gain) { - //printf("reverse smaller leaf new best, feature_index = %d, split_gain = %f, default_left = %d, threshold = %d\n", - // feature_index, smaller_reverse_gain, cuda_best_split_default_left[smaller_reverse_index], - // cuda_best_split_threshold[smaller_reverse_index]); - //printf("leaf index %d gain update to %f\n", smaller_leaf_index_ref, smaller_reverse_gain); - smaller_leaf_best_gain = smaller_reverse_gain; - smaller_leaf_best_split_feature = feature_index; - smaller_leaf_best_split_default_left = cuda_best_split_default_left[smaller_reverse_index]; - smaller_leaf_best_split_threshold = cuda_best_split_threshold[smaller_reverse_index]; - smaller_leaf_best_split_left_sum_gradient = cuda_best_split_left_sum_gradient[smaller_reverse_index]; - smaller_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[smaller_reverse_index]; - smaller_leaf_best_split_left_count = cuda_best_split_left_count[smaller_reverse_index]; - smaller_leaf_best_split_left_gain = cuda_best_split_left_gain[smaller_reverse_index]; - //printf("leaf index %d split left gain update to %f\n", smaller_leaf_index_ref, smaller_leaf_best_split_left_gain); - smaller_leaf_best_split_left_output = cuda_best_split_left_output[smaller_reverse_index]; - smaller_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[smaller_reverse_index]; - smaller_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[smaller_reverse_index]; - smaller_leaf_best_split_right_count = cuda_best_split_right_count[smaller_reverse_index]; - smaller_leaf_best_split_right_gain = cuda_best_split_right_gain[smaller_reverse_index]; - //printf("leaf index %d split right gain update to %f\n", smaller_leaf_index_ref, smaller_leaf_best_split_right_gain); - smaller_leaf_best_split_right_output = cuda_best_split_right_output[smaller_reverse_index]; - /*printf("smaller_leaf_index = %d, smaller_leaf_best_gain = %f, smaller_leaf_best_split_left_sum_gradient = %f, smaller_leaf_best_split_left_sum_hessian = %f\n", - smaller_leaf_index_ref, 
smaller_leaf_best_gain, smaller_leaf_best_split_left_sum_gradient, smaller_leaf_best_split_left_sum_hessian); - printf("smaller_leaf_index = %d, smaller_leaf_best_gain = %f, smaller_leaf_best_split_right_sum_gradient = %f, smaller_leaf_best_split_right_sum_hessian = %f\n", - smaller_leaf_index_ref, smaller_leaf_best_gain, smaller_leaf_best_split_right_sum_gradient, smaller_leaf_best_split_right_sum_hessian);*/ - } - } - /*const int smaller_non_reverse_index = 2 * feature_index + 1; - const uint8_t smaller_non_reverse_found = cuda_best_split_found[smaller_non_reverse_index]; - if (smaller_non_reverse_found) { - const double smaller_non_reverse_gain = cuda_best_split_gain[smaller_non_reverse_index]; - if (smaller_non_reverse_gain > smaller_leaf_best_gain) { - //printf("non reverse smaller leaf new best, feature_index = %d, split_gain = %f, default_left = %d, threshold = %d\n", - // feature_index, smaller_non_reverse_gain, cuda_best_split_default_left[smaller_non_reverse_index], - // cuda_best_split_threshold[smaller_non_reverse_index]); - //printf("leaf index %d gain update to %f\n", smaller_leaf_index_ref, smaller_non_reverse_gain); - smaller_leaf_best_gain = smaller_non_reverse_gain; - smaller_leaf_best_split_feature = feature_index; - smaller_leaf_best_split_default_left = cuda_best_split_default_left[smaller_non_reverse_index]; - smaller_leaf_best_split_threshold = cuda_best_split_threshold[smaller_non_reverse_index]; - smaller_leaf_best_split_left_sum_gradient = cuda_best_split_left_sum_gradient[smaller_non_reverse_index]; - smaller_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[smaller_non_reverse_index]; - smaller_leaf_best_split_left_count = cuda_best_split_left_count[smaller_non_reverse_index]; - smaller_leaf_best_split_left_gain = cuda_best_split_left_gain[smaller_non_reverse_index]; - //printf("leaf index %d split left gain update to %f\n", smaller_leaf_index_ref, smaller_leaf_best_split_left_gain); - smaller_leaf_best_split_left_output = cuda_best_split_left_output[smaller_non_reverse_index]; - smaller_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[smaller_non_reverse_index]; - smaller_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[smaller_non_reverse_index]; - smaller_leaf_best_split_right_count = cuda_best_split_right_count[smaller_non_reverse_index]; - smaller_leaf_best_split_right_gain = cuda_best_split_right_gain[smaller_non_reverse_index]; - //printf("leaf index %d split right gain update to %f\n", smaller_leaf_index_ref, smaller_leaf_best_split_right_gain); - smaller_leaf_best_split_right_output = cuda_best_split_right_output[smaller_non_reverse_index];*/ - /*printf("smaller_leaf_index = %d, smaller_leaf_best_gain = %f, smaller_leaf_best_split_left_sum_gradient = %f, smaller_leaf_best_split_left_sum_hessian = %f\n", - smaller_leaf_index_ref, smaller_leaf_best_gain, smaller_leaf_best_split_left_sum_gradient, smaller_leaf_best_split_left_sum_hessian); - printf("smaller_leaf_index = %d, smaller_leaf_best_gain = %f, smaller_leaf_best_split_right_sum_gradient = %f, smaller_leaf_best_split_right_sum_hessian = %f\n", - smaller_leaf_index_ref, smaller_leaf_best_gain, smaller_leaf_best_split_right_sum_gradient, smaller_leaf_best_split_right_sum_hessian);*/ - //} - //} + const uint32_t threadIdx_x = threadIdx.x; + const uint32_t blockIdx_x = blockIdx.x; + int num_features_ref = *cuda_num_features; + int num_features_aligned = 1; + num_features_ref -= 1; + while (num_features_ref > 0) { + num_features_aligned <<= 1; 
+ num_features_ref >>= 1; + } + num_features_ref = *cuda_num_features; + + __shared__ uint8_t best_found[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ int best_feature[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ double best_gain[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ uint8_t best_default_left[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ uint32_t best_threshold[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ double best_left_sum_gradient[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ double best_left_sum_hessian[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ data_size_t best_left_count[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ double best_left_gain[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ double best_left_output[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ double best_right_sum_gradient[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ double best_right_sum_hessian[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ data_size_t best_right_count[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ double best_right_gain[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ double best_right_output[NUM_FEATURES_PER_SYNC_BLOCK]; + + const bool is_smaller = (blockIdx_x == 0); + const int feature_index = static_cast(threadIdx_x); + const uint32_t read_index = is_smaller ? threadIdx_x : threadIdx_x + num_features_ref; + if (feature_index < num_features_ref) { + best_found[feature_index] = cuda_best_split_found[read_index]; + best_feature[feature_index] = feature_index; + best_gain[feature_index] = cuda_best_split_gain[read_index]; + best_default_left[feature_index] = cuda_best_split_default_left[read_index]; + best_threshold[feature_index] = cuda_best_split_threshold[read_index]; + best_left_sum_gradient[feature_index] = cuda_best_split_left_sum_gradient[read_index]; + best_left_sum_hessian[feature_index] = cuda_best_split_left_sum_hessian[read_index]; + best_left_count[feature_index] = cuda_best_split_left_count[read_index]; + best_left_gain[feature_index] = cuda_best_split_left_gain[read_index]; + best_left_output[feature_index] = cuda_best_split_left_output[read_index]; + best_right_sum_gradient[feature_index] = cuda_best_split_right_sum_gradient[read_index]; + best_right_sum_hessian[feature_index] = cuda_best_split_right_sum_hessian[read_index]; + best_right_count[feature_index] = cuda_best_split_right_count[read_index]; + best_right_gain[feature_index] = cuda_best_split_right_gain[read_index]; + best_right_output[feature_index] = cuda_best_split_right_output[read_index]; + } else { + best_found[feature_index] = 0; + } - if (larger_leaf_index_ref >= 0) { - const int larger_reverse_index = /*2 **/ feature_index + larger_leaf_offset; - const uint8_t larger_reverse_found = cuda_best_split_found[larger_reverse_index]; - if (larger_reverse_found) { - const double larger_reverse_gain = cuda_best_split_gain[larger_reverse_index]; - if (larger_reverse_gain > larger_leaf_best_gain) { - //printf("leaf index %d gain update to %f\n", larger_leaf_index_ref, larger_reverse_gain); - larger_leaf_best_gain = larger_reverse_gain; - larger_leaf_best_split_feature = feature_index; - larger_leaf_best_split_default_left = cuda_best_split_default_left[larger_reverse_index]; - larger_leaf_best_split_threshold = cuda_best_split_threshold[larger_reverse_index]; - larger_leaf_best_split_left_sum_gradient = cuda_best_split_left_sum_gradient[larger_reverse_index]; - larger_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[larger_reverse_index]; - larger_leaf_best_split_left_count = cuda_best_split_left_count[larger_reverse_index]; - 
larger_leaf_best_split_left_gain = cuda_best_split_left_gain[larger_reverse_index]; - //printf("leaf index %d split left gain update to %f\n", larger_leaf_index_ref, larger_leaf_best_split_left_gain); - larger_leaf_best_split_left_output = cuda_best_split_left_output[larger_reverse_index]; - larger_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[larger_reverse_index]; - larger_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[larger_reverse_index]; - larger_leaf_best_split_right_count = cuda_best_split_right_count[larger_reverse_index]; - larger_leaf_best_split_right_gain = cuda_best_split_right_gain[larger_reverse_index]; - //printf("leaf index %d split right gain update to %f\n", larger_leaf_index_ref, larger_leaf_best_split_right_gain); - larger_leaf_best_split_right_output = cuda_best_split_right_output[larger_reverse_index]; - /*printf("larger_leaf_index = %d, larger_leaf_best_gain = %f, larger_leaf_best_split_left_sum_gradient = %f, larger_leaf_best_split_left_sum_hessian = %f\n", - larger_leaf_index_ref, larger_leaf_best_gain, larger_leaf_best_split_left_sum_gradient, larger_leaf_best_split_left_sum_hessian); - printf("larger_leaf_index = %d, larger_leaf_best_gain = %f, larger_leaf_best_split_right_sum_gradient = %f, larger_leaf_best_split_right_sum_hessian = %f\n", - larger_leaf_index_ref, larger_leaf_best_gain, larger_leaf_best_split_right_sum_gradient, larger_leaf_best_split_right_sum_hessian);*/ - } - } - /*const int larger_non_reverse_index = 2 * feature_index + 1 + larger_leaf_offset; - const uint8_t larger_non_reverse_found = cuda_best_split_found[larger_non_reverse_index]; - if (larger_non_reverse_found) { - const double larger_non_reverse_gain = cuda_best_split_gain[larger_non_reverse_index]; - if (larger_non_reverse_gain > larger_leaf_best_gain) { - //printf("leaf index %d gain update to %f\n", larger_leaf_index_ref, larger_non_reverse_gain); - larger_leaf_best_gain = larger_non_reverse_gain; - larger_leaf_best_split_feature = feature_index; - larger_leaf_best_split_default_left = cuda_best_split_default_left[larger_non_reverse_index]; - larger_leaf_best_split_threshold = cuda_best_split_threshold[larger_non_reverse_index]; - larger_leaf_best_split_left_sum_gradient = cuda_best_split_left_sum_gradient[larger_non_reverse_index]; - larger_leaf_best_split_left_sum_hessian = cuda_best_split_left_sum_hessian[larger_non_reverse_index]; - larger_leaf_best_split_left_count = cuda_best_split_left_count[larger_non_reverse_index]; - larger_leaf_best_split_left_gain = cuda_best_split_left_gain[larger_non_reverse_index]; - //printf("leaf index %d split left gain update to %f\n", larger_leaf_index_ref, larger_leaf_best_split_left_gain); - larger_leaf_best_split_left_output = cuda_best_split_left_output[larger_non_reverse_index]; - larger_leaf_best_split_right_sum_gradient = cuda_best_split_right_sum_gradient[larger_non_reverse_index]; - larger_leaf_best_split_right_sum_hessian = cuda_best_split_right_sum_hessian[larger_non_reverse_index]; - larger_leaf_best_split_right_count = cuda_best_split_right_count[larger_non_reverse_index]; - larger_leaf_best_split_right_gain = cuda_best_split_right_gain[larger_non_reverse_index]; - //printf("leaf index %d split right gain update to %f\n", larger_leaf_index_ref, larger_leaf_best_split_right_gain); - larger_leaf_best_split_right_output = cuda_best_split_right_output[larger_non_reverse_index];*/ - /*printf("larger_leaf_index = %d, larger_leaf_best_gain = %f, larger_leaf_best_split_left_sum_gradient = %f, 
larger_leaf_best_split_left_sum_hessian = %f\n", - larger_leaf_index_ref, larger_leaf_best_gain, larger_leaf_best_split_left_sum_gradient, larger_leaf_best_split_left_sum_hessian); - printf("larger_leaf_index = %d, larger_leaf_best_gain = %f, larger_leaf_best_split_right_sum_gradient = %f, larger_leaf_best_split_right_sum_hessian = %f\n", - larger_leaf_index_ref, larger_leaf_best_gain, larger_leaf_best_split_right_sum_gradient, larger_leaf_best_split_right_sum_hessian);*/ - //} - //} - } + __syncthreads(); + ReduceBestSplit(best_found, best_feature, best_gain, + best_default_left, best_threshold, + best_left_sum_gradient, best_left_sum_hessian, + best_left_count, best_left_gain, best_left_output, + best_right_sum_gradient, best_right_sum_hessian, + best_right_count, best_right_gain, best_right_output, + num_features_aligned, 0); + + if (threadIdx.x == 0) { + const int leaf_index_ref = is_smaller ? *smaller_leaf_index : *larger_leaf_index; + if (best_found[0]) { + cuda_leaf_best_split_gain[leaf_index_ref] = best_gain[0]; + cuda_leaf_best_split_feature[leaf_index_ref] = best_feature[0]; + cuda_leaf_best_split_default_left[leaf_index_ref] = best_default_left[0]; + cuda_leaf_best_split_threshold[leaf_index_ref] = best_threshold[0]; + cuda_leaf_best_split_left_sum_gradient[leaf_index_ref] = best_left_sum_gradient[0]; + cuda_leaf_best_split_left_sum_hessian[leaf_index_ref] = best_left_sum_hessian[0]; + cuda_leaf_best_split_left_count[leaf_index_ref] = best_left_count[0]; + cuda_leaf_best_split_left_gain[leaf_index_ref] = best_left_gain[0]; + cuda_leaf_best_split_left_output[leaf_index_ref] = best_left_output[0]; + cuda_leaf_best_split_right_sum_gradient[leaf_index_ref] = best_right_sum_gradient[0]; + cuda_leaf_best_split_right_sum_hessian[leaf_index_ref] = best_right_sum_hessian[0]; + cuda_leaf_best_split_right_count[leaf_index_ref] = best_right_count[0]; + cuda_leaf_best_split_right_gain[leaf_index_ref] = best_right_gain[0]; + cuda_leaf_best_split_right_output[leaf_index_ref] = best_right_output[0]; + } else { + cuda_leaf_best_split_gain[leaf_index_ref] = kMinScore; } } } void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel(const int* smaller_leaf_index, const int* larger_leaf_index) { - SyncBestSplitForLeafKernel<<<1, 1>>>(smaller_leaf_index, larger_leaf_index, + SyncBestSplitForLeafKernel<<<2, NUM_FEATURES_PER_SYNC_BLOCK>>>(smaller_leaf_index, larger_leaf_index, cuda_num_features_, cuda_leaf_best_split_feature_, cuda_leaf_best_split_default_left_, cuda_leaf_best_split_threshold_, cuda_leaf_best_split_gain_, cuda_leaf_best_split_left_sum_gradient_, cuda_leaf_best_split_left_sum_hessian_, diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 89f96936729a..d3e836ab8612 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -17,10 +17,11 @@ #include -#define MAX_NUM_BIN_IN_FEATURE (512) +#define MAX_NUM_BIN_IN_FEATURE (256) #define NUM_THREADS_FIND_BEST_LEAF (256) #define LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER (4) #define NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER (16) +#define NUM_FEATURES_PER_SYNC_BLOCK (32) namespace LightGBM { diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index d051f5f7a16c..5ac3bd424e21 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -122,6 +122,10 @@ void CUDADataPartition::Init() { 
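The rewritten SyncBestSplitForLeafKernel above replaces the single-thread scan over all features with a two-block launch (<<<2, NUM_FEATURES_PER_SYNC_BLOCK>>>): one block handles the smaller leaf and the other the larger leaf, each loading its per-feature split candidates into shared memory and reducing them in parallel with ReduceBestSplit. A minimal sketch of that argmax-style reduction, assuming the candidate count has been rounded up to a power of two no larger than the block size, as the kernel arranges:

__device__ void ReduceBestCandidate(double* gain, int* feature, unsigned char* found, const int n) {
  // pairwise tree reduction over shared-memory arrays of per-feature candidates
  for (int s = 1; s < n; s <<= 1) {
    const int i = static_cast<int>(threadIdx.x);
    if (i % (2 * s) == 0 && i + s < n) {
      const int j = i + s;
      // prefer any valid split over none; otherwise keep the larger gain
      if ((!found[i] && found[j]) || (found[i] && found[j] && gain[i] < gain[j])) {
        found[i] = found[j];
        gain[i] = gain[j];
        feature[i] = feature[j];
      }
    }
    __syncthreads();
  }
  // after the loop, index 0 holds the winning candidate for this block's leaf
}

In the real kernel the shared arrays are sized by NUM_FEATURES_PER_SYNC_BLOCK, which also serves as the block size of the launch.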
cpu_train_data_score_tmp_.resize(num_data_, 0.0f); cpu_split_info_buffer_.resize(5, 0); + + cuda_streams_.resize(2); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); } void CUDADataPartition::CopyColWiseData() { diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 0e930acfce96..8ba7c2f24bf3 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -506,6 +506,7 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t nu split_indices_block_size_data_partition_aligned <<= 1; split_indices_block_size_data_partition >>= 1; } + const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; const uint8_t missing_is_zero = feature_missing_is_zero_[split_feature_index]; const uint8_t missing_is_na = feature_missing_is_na_[split_feature_index]; const uint8_t mfb_is_zero = feature_mfb_is_zero_[split_feature_index]; @@ -534,80 +535,80 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t nu const uint8_t* cuda_data_col_wise_ptr = cuda_data_col_wise_ + split_feature_index * num_data_; if (min_bin < max_bin) { if (!missing_is_zero && !missing_is_na) { - GenDataToLeftBitVectorKernel0_1_2_3<<>>( + GenDataToLeftBitVectorKernel0_1_2_3<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, split_missing_default_to_left, cuda_data_to_left_); } else { if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - GenDataToLeftBitVectorKernel4<<>>( + GenDataToLeftBitVectorKernel4<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, split_missing_default_to_left, cuda_data_to_left_); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - GenDataToLeftBitVectorKernel5<<>>( + GenDataToLeftBitVectorKernel5<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, split_missing_default_to_left, cuda_data_to_left_); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - GenDataToLeftBitVectorKernel6<<>>( + GenDataToLeftBitVectorKernel6<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, split_missing_default_to_left, cuda_data_to_left_); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - GenDataToLeftBitVectorKernel7<<>>( + GenDataToLeftBitVectorKernel7<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, 
split_missing_default_to_left, cuda_data_to_left_); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { - GenDataToLeftBitVectorKernel8<<>>( + GenDataToLeftBitVectorKernel8<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, split_missing_default_to_left, cuda_data_to_left_); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - GenDataToLeftBitVectorKernel9<<>>( + GenDataToLeftBitVectorKernel9<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, split_missing_default_to_left, cuda_data_to_left_); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - GenDataToLeftBitVectorKernel10<<>>( + GenDataToLeftBitVectorKernel10<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, split_missing_default_to_left, cuda_data_to_left_); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - GenDataToLeftBitVectorKernel11<<>>( + GenDataToLeftBitVectorKernel11<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, split_missing_default_to_left, cuda_data_to_left_); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - GenDataToLeftBitVectorKernel12<<>>( + GenDataToLeftBitVectorKernel12<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, split_missing_default_to_left, cuda_data_to_left_); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - GenDataToLeftBitVectorKernel13<<>>( + GenDataToLeftBitVectorKernel13<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, split_missing_default_to_left, cuda_data_to_left_); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - GenDataToLeftBitVectorKernel14<<>>( + GenDataToLeftBitVectorKernel14<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, split_missing_default_to_left, cuda_data_to_left_); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - GenDataToLeftBitVectorKernel15<<>>( + GenDataToLeftBitVectorKernel15<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ 
cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, @@ -791,7 +792,43 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b __syncthreads(); } if (blockIdx.x == 0 && threadIdx.x == 0) { - const int cur_max_leaf_index = (*cuda_cur_num_leaves); + ++(*cuda_cur_num_leaves); + } +} + +__global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* block_to_left_offset_buffer, + data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, + data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, + int* cuda_cur_num_leaves, + const int* best_split_feature, const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left, const double* best_split_gain, + const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, + const double* best_left_gain, const double* best_left_leaf_value, + const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, + const double* best_right_gain, const double* best_right_leaf_value, + // for leaf splits information update + int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, + double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, + double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, + const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** smaller_leaf_cuda_hist_pointer_pointer, + int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, + double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, + double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** larger_leaf_cuda_hist_pointer_pointer, + const int* cuda_num_total_bin, + hist_t* cuda_hist, hist_t** cuda_hist_pool, const int split_indices_block_size_data_partition, + + int* tree_split_leaf_index, int* tree_inner_feature_index, uint32_t* tree_threshold, + double* tree_left_output, double* tree_right_output, data_size_t* tree_left_count, data_size_t* tree_right_count, + double* tree_left_sum_hessian, double* tree_right_sum_hessian, double* tree_gain, uint8_t* tree_default_left, + double* data_partition_leaf_output) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + const int leaf_index_ref = *leaf_index; + const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref]; + const uint32_t num_blocks = (num_data_in_leaf + split_indices_block_size_data_partition - 1) / split_indices_block_size_data_partition; + const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; //printf("left_leaf_index = %d, right_leaf_index = %d\n", leaf_index_ref, cur_max_leaf_index); block_to_left_offset_buffer[0] = 0; const unsigned int to_left_total_cnt = block_to_left_offset_buffer[num_blocks]; @@ -818,7 +855,6 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b data_partition_leaf_output[leaf_index_ref] = best_left_leaf_value[leaf_index_ref]; data_partition_leaf_output[cur_max_leaf_index] = best_right_leaf_value[leaf_index_ref]; - ++(*cuda_cur_num_leaves); if (cuda_leaf_num_data[leaf_index_ref] < cuda_leaf_num_data[cur_max_leaf_index]) { *smaller_leaf_cuda_leaf_index_pointer = 
leaf_index_ref; *smaller_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; @@ -905,21 +941,17 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ const uint32_t to_left_block_offset = block_to_left_offset_buffer[blockIdx.x]; if (threadIdx_x == 0) { thread_to_left_pos[0] = 0; - // thread_to_left_pos[0] = to_left_block_offset; } __syncthreads(); PrefixSum(thread_to_left_pos, split_indices_block_size_data_partition); - //thread_to_right_pos[threadIdx_x] = to_right_block_offset; - //thread_to_right_pos[threadIdx_x + blockDim_x] = to_right_block_offset; __syncthreads(); if (threadIdx_x > 0) { - thread_to_right_pos[threadIdx_x] = (threadIdx_x /*+ to_left_block_offset*/ - thread_to_left_pos[conflict_free_threadIdx_x_plus_1]); + thread_to_right_pos[threadIdx_x] = (threadIdx_x - thread_to_left_pos[conflict_free_threadIdx_x_plus_1]); } else { thread_to_right_pos[threadIdx_x] = 0; } - thread_to_right_pos[threadIdx_x + blockDim_x] = (threadIdx_x + blockDim_x /*+ to_left_block_offset*/ - thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]); + thread_to_right_pos[threadIdx_x + blockDim_x] = (threadIdx_x + blockDim_x - thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]); __syncthreads(); - //PrefixSum(thread_to_right_pos, split_indices_block_size_data_partition); data_size_t* left_out_data_indices_in_leaf = out_data_indices_in_leaf + to_left_block_offset; data_size_t* right_out_data_indices_in_leaf = out_data_indices_in_leaf + to_right_block_offset; if (global_thread_index < num_data_in_leaf_ref) { @@ -936,22 +968,6 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ right_out_data_indices_in_leaf[thread_to_right_pos[threadIdx_x + blockDim_x]] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x]; } } - /*if (thread_to_left_pos[conflict_free_threadIdx_x_plus_1] == 0) { - printf("thread_to_left_pos[%d] = %d, global_thread_index = %d, thread_split_to_left_bit_vector[%d] = %d\n", - conflict_free_threadIdx_x_plus_1, thread_to_left_pos[conflict_free_threadIdx_x_plus_1], global_thread_index, threadIdx_x, thread_split_to_left_bit_vector[threadIdx_x]); - } - if (thread_to_right_pos[conflict_free_threadIdx_x_plus_1] == 0) { - printf("thread_to_right_pos[%d] = %d, global_thread_index = %d, thread_split_to_left_bit_vector[%d] = %d\n", - conflict_free_threadIdx_x_plus_1, thread_to_left_pos[conflict_free_threadIdx_x_plus_1], global_thread_index, threadIdx_x, thread_split_to_left_bit_vector[threadIdx_x]); - } - if (thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] == 0) { - printf("thread_to_left_pos[%d] = %d, global_thread_index = %d, thread_split_to_left_bit_vector[%d + %ds] = %d\n", - conflict_free_threadIdx_x_plus_blockDim_x_plus_1, thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1], global_thread_index_plus_blockDim_x, threadIdx_x, blockDim_x, thread_split_to_left_bit_vector[threadIdx_x + blockDim_x]); - } - if (thread_to_right_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] == 0) { - printf("thread_to_right_pos[%d] = %d, global_thread_index = %d, thread_split_to_left_bit_vector[%d + %d] = %d\n", - conflict_free_threadIdx_x_plus_blockDim_x_plus_1, thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1], global_thread_index_plus_blockDim_x, threadIdx_x, blockDim_x, thread_split_to_left_bit_vector[threadIdx_x + blockDim_x]); - }*/ } __global__ void CopyDataIndicesKernel(const int* leaf_index, @@ -1047,7 +1063,38 @@ void 
CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data //Log::Warning("CUDADataPartition::AggregateBlockOffsetKernel time %f", duration); global_timer.Start("CUDADataPartition::SplitInnerKernel"); start = std::chrono::steady_clock::now(); - SplitInnerKernel<<>>( + global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); + SplitTreeStructureKernel<<<1, 1, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, + cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, + cuda_leaf_num_data_, cuda_data_indices_, + cuda_cur_num_leaves_, + best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, + best_left_sum_gradients, best_left_sum_hessians, best_left_count, + best_left_gain, best_left_leaf_value, + best_right_sum_gradients, best_right_sum_hessians, best_right_count, + best_right_gain, best_right_leaf_value, + + smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, + smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, + smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, + smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + smaller_leaf_cuda_hist_pointer_pointer, + larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, + larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, + larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, + larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + larger_leaf_cuda_hist_pointer_pointer, + cuda_num_total_bin_, + cuda_hist_, + cuda_hist_pool_, split_indices_block_size_data_partition_aligned, + + tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, + tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, + tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, + data_partition_leaf_output_); + global_timer.Stop("CUDADataPartition::SplitTreeStructureKernel"); + + SplitInnerKernel<<>>( leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_out_data_indices_in_leaf_, split_indices_block_size_data_partition_aligned); @@ -1130,10 +1177,10 @@ __global__ void PrepareCUDASplitInforBufferKernel(const int* leaf_index, const i void CUDADataPartition::LaunchPrepareCUDASplitInforBufferKernel(const int* leaf_id, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left) { - PrepareCUDASplitInforBufferKernel<<<1, 1>>>(leaf_id, best_split_feature, best_split_threshold, best_split_default_left, - cuda_leaf_num_data_, cuda_leaf_data_start_, - cuda_split_info_buffer_); - SynchronizeCUDADevice(); + PrepareCUDASplitInforBufferKernel<<<1, 1>>>(leaf_id, best_split_feature, best_split_threshold, best_split_default_left, + cuda_leaf_num_data_, cuda_leaf_data_start_, + cuda_split_info_buffer_); + SynchronizeCUDADevice(); } __global__ void CopyColWiseDataKernel(const uint8_t* row_wise_data, diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 78c22b167261..dcab8ddd331d 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -260,6 +260,9 @@ class CUDADataPartition { std::vector cpu_train_data_score_tmp_; std::vector cpu_split_info_buffer_; + // CUDA streams + std::vector 
cuda_streams_; + // CUDA memory, held by this object data_size_t* cuda_data_indices_; data_size_t* cuda_leaf_data_start_; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index f22a0596731f..7f6cb21fbe71 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -105,13 +105,13 @@ void CUDAHistogramConstructor::PushOneData(const uint32_t feature_bin_value, data_[index] = feature_bin_value_uint8; } -void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, - const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** cuda_data_indices_in_larger_leaf, +void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const data_size_t* cuda_num_data_in_smaller_leaf, + const int* cuda_larger_leaf_index, const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** cuda_data_indices_in_larger_leaf, const double* cuda_smaller_leaf_sum_gradients, const double* cuda_smaller_leaf_sum_hessians, hist_t** cuda_smaller_leaf_hist, const double* cuda_larger_leaf_sum_gradients, const double* cuda_larger_leaf_sum_hessians, hist_t** cuda_larger_leaf_hist, const data_size_t* cuda_leaf_num_data) { auto start = std::chrono::steady_clock::now(); - LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data, cuda_smaller_leaf_hist); + LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_num_data_in_smaller_leaf, cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data, cuda_smaller_leaf_hist); SynchronizeCUDADevice(); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 03a4b7c8df66..8e81d366f4c8 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -55,7 +55,8 @@ __global__ void CUDAConstructHistogramKernel(const int* leaf_index, const unsigned int threadIdx_x = threadIdx.x; const int num_feature_groups_ref = *num_feature_groups; const int leaf_index_ref = *leaf_index; - const int dim_y = gridDim.y * blockDim.y; + const unsigned int blockDim_y = blockDim.y; + const int dim_y = gridDim.y * blockDim_y; hist_t* feature_histogram_ptr = *feature_histogram; const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf_ref + dim_y - 1) / dim_y; @@ -71,21 +72,27 @@ __global__ void CUDAConstructHistogramKernel(const int* leaf_index, for (uint32_t i = thread_start; i < thread_end; ++i) { shared_hist[i] = 0.0f; } + float* shared_hist_ptr = shared_hist + (feature_group_offset << 1); __syncthreads(); const unsigned int threadIdx_y = threadIdx.y; const unsigned int blockIdx_y = blockIdx.y; - const data_size_t start = (threadIdx_y + blockIdx_y * blockDim.y) * num_data_per_thread; - const data_size_t end = start + num_data_per_thread > num_data_in_smaller_leaf_ref ? 
- num_data_in_smaller_leaf_ref : start + num_data_per_thread; - for (data_size_t i = start; i < end; ++i) { - const data_size_t data_index = data_indices_ref[i]; + const data_size_t block_start = (blockIdx_y * blockDim_y) * num_data_per_thread; + const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf_ref - block_start, num_data_per_thread * static_cast(blockDim_y))); + const data_size_t num_iteration_total = (block_num_data + blockDim_y - 1) / blockDim_y; + const data_size_t remainder = block_num_data % blockDim_y; + const data_size_t num_iteration_this = remainder == 0 ? num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); + data_size_t inner_data_index = static_cast(threadIdx_y); + for (data_size_t i = 0; i < num_iteration_this; ++i) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; const score_t grad = cuda_gradients[data_index]; const score_t hess = cuda_hessians[data_index]; - const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + threadIdx_x]) + feature_group_offset; + const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + threadIdx_x]); const uint32_t pos = bin << 1; - float* pos_ptr = shared_hist + pos; + float* pos_ptr = shared_hist_ptr + pos; atomicAdd_system(pos_ptr, grad); atomicAdd_system(pos_ptr + 1, hess); + inner_data_index += blockDim_y; } __syncthreads(); for (uint32_t i = thread_start; i < thread_end; ++i) { @@ -95,20 +102,18 @@ __global__ void CUDAConstructHistogramKernel(const int* leaf_index, void CUDAHistogramConstructor::LaunchConstructHistogramKernel( const int* cuda_smaller_leaf_index, + const data_size_t* cuda_smaller_leaf_num_data, const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t* cuda_leaf_num_data, hist_t** cuda_leaf_hist) { - int smaller_leaf_index = 0; global_timer.Start("CUDAHistogramConstructor::LaunchConstructHistogramKernel::CopyFromCUDADeviceToHost"); - CopyFromCUDADeviceToHost(&smaller_leaf_index, cuda_smaller_leaf_index, 1); - SynchronizeCUDADevice(); data_size_t smaller_leaf_num_data = 0; - CopyFromCUDADeviceToHost(&smaller_leaf_num_data, cuda_leaf_num_data + smaller_leaf_index, 1); + CopyFromCUDADeviceToHost(&smaller_leaf_num_data, cuda_smaller_leaf_num_data, 1); SynchronizeCUDADevice(); global_timer.Stop("CUDAHistogramConstructor::LaunchConstructHistogramKernel::CopyFromCUDADeviceToHost"); const int block_dim_x = num_features_; // TODO(shiyu1994): only supports the case when the whole histogram can be loaded into shared memory const int block_dim_y = NUM_THRADS_PER_BLOCK / block_dim_x; - const int min_grid_dim_y = 80; + const int min_grid_dim_y = 160; const int grid_dim_y = std::max(min_grid_dim_y, ((smaller_leaf_num_data + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + block_dim_y - 1) / block_dim_y); const int grid_dim_x = (static_cast(num_feature_groups_ + NUM_FEATURE_PER_THREAD_GROUP - 1) / NUM_FEATURE_PER_THREAD_GROUP); //Log::Warning("smaller_leaf_num_data = %d", smaller_leaf_num_data); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 11fb506e0859..880342079a57 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -34,7 +34,7 @@ class CUDAHistogramConstructor { void Init(const Dataset* train_data); - void ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const int* 
cuda_larger_leaf_index, + void ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const data_size_t* cuda_num_data_in_smaller_leaf, const int* cuda_larger_leaf_index, const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** cuda_data_indices_in_larger_leaf, const double* cuda_smaller_leaf_sum_gradients, const double* cuda_smaller_leaf_sum_hessians, hist_t** cuda_smaller_leaf_hist, const double* cuda_larger_leaf_sum_gradients, const double* cuda_larger_leaf_sum_hessians, hist_t** cuda_larger_leaf_hist, @@ -82,6 +82,7 @@ class CUDAHistogramConstructor { const data_size_t** cuda_data_indices_in_leaf); void LaunchConstructHistogramKernel(const int* cuda_leaf_index, + const data_size_t* cuda_smaller_leaf_num_data, const data_size_t** cuda_data_indices_in_leaf, const data_size_t* cuda_leaf_num_data, hist_t** cuda_leaf_hist); diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 271e90dca459..ec24311ab7af 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -364,6 +364,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, auto start = std::chrono::steady_clock::now(); cuda_histogram_constructor_->ConstructHistogramForLeaf( cuda_smaller_leaf_splits_->cuda_leaf_index(), + cuda_smaller_leaf_splits_->cuda_num_data_in_leaf(), cuda_larger_leaf_splits_->cuda_leaf_index(), cuda_smaller_leaf_splits_->cuda_data_indices_in_leaf(), cuda_larger_leaf_splits_->cuda_data_indices_in_leaf(), From 277be8b373bffb4c6f8f4f477ee2f32aacd26f68 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 1 Jun 2021 02:25:42 +0000 Subject: [PATCH 014/166] merge split data kernels --- src/treelearner/cuda/cuda_data_partition.cpp | 2 + src/treelearner/cuda/cuda_data_partition.cu | 277 ++++++++++++++++--- src/treelearner/cuda/cuda_data_partition.hpp | 2 +- 3 files changed, 237 insertions(+), 44 deletions(-) diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 5ac3bd424e21..dc22c88b340a 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -182,7 +182,9 @@ void CUDADataPartition::Split(const int* leaf_id, global_timer.Start("GenDataToLeftBitVector"); global_timer.Start("SplitInner Copy CUDA To Host"); PrepareCUDASplitInforBuffer(leaf_id, best_split_feature, best_split_threshold, best_split_default_left); + global_timer.Start("PrepareCUDASplitInforBuffer"); CopyFromCUDADeviceToHost(cpu_split_info_buffer_.data(), cuda_split_info_buffer_, 5); + global_timer.Stop("PrepareCUDASplitInforBuffer"); cpu_num_data_in_leaf = static_cast(cpu_split_info_buffer_[0]); cpu_split_feature_index = static_cast(cpu_split_info_buffer_[1]); cpu_split_threshold = static_cast(cpu_split_info_buffer_[2]); diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 8ba7c2f24bf3..0f87fc44b90a 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -88,6 +88,16 @@ __device__ void PrefixSum(uint16_t* elements, unsigned int n) { } } +__device__ void ReduceSum(uint16_t* array, const size_t size) { + const unsigned int threadIdx_x = threadIdx.x; + for (int s = 1; s < size; s <<= 1) { + if (threadIdx_x % (2 * s) == 0 && (threadIdx_x + s) < size) { + array[CONFLICT_FREE_INDEX(threadIdx_x)] += array[CONFLICT_FREE_INDEX(threadIdx_x + s)]; + } + __syncthreads(); + } +} + __global__ void 
FillDataIndicesBeforeTrainKernel(const data_size_t* cuda_num_data, data_size_t* data_indices) { const data_size_t num_data_ref = *cuda_num_data; @@ -102,6 +112,29 @@ void CUDADataPartition::LaunchFillDataIndicesBeforeTrain() { FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_); } +__device__ void PrepareOffset(const data_size_t num_data_in_leaf_ref, const uint8_t* split_to_left_bit_vector, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition, + uint16_t* thread_to_left_offset_cnt) { + const unsigned int threadIdx_x = threadIdx.x; + const unsigned int blockDim_x = blockDim.x / 2; + __syncthreads(); + ReduceSum(thread_to_left_offset_cnt, split_indices_block_size_data_partition); + __syncthreads(); + if (threadIdx_x == 0) { + const data_size_t num_data_in_block = (blockIdx.x + 1) * blockDim_x * 2 <= num_data_in_leaf_ref ? static_cast(blockDim_x * 2) : + num_data_in_leaf_ref - static_cast(blockIdx.x * blockDim_x * 2); + if (num_data_in_block > 0) { + const data_size_t data_to_left = static_cast(thread_to_left_offset_cnt[0]); + block_to_left_offset_buffer[blockIdx.x + 1] = data_to_left; + block_to_right_offset_buffer[blockIdx.x + 1] = num_data_in_block - data_to_left; + } else { + block_to_left_offset_buffer[blockIdx.x + 1] = 0; + block_to_right_offset_buffer[blockIdx.x + 1] = 0; + } + } +} + // missing_is_zero = 0, missing_is_na = 0, min_bin_ref < max_bin_ref __global__ void GenDataToLeftBitVectorKernel0_1_2_3(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, @@ -109,7 +142,11 @@ __global__ void GenDataToLeftBitVectorKernel0_1_2_3(const int best_split_feature // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t /*split_missing_default_to_left*/, - uint8_t* cuda_data_to_left) { + uint8_t* cuda_data_to_left, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -117,12 +154,20 @@ __global__ void GenDataToLeftBitVectorKernel0_1_2_3(const int best_split_feature const uint32_t bin = static_cast(cuda_data[global_data_index]); if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } else { cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; } + } else { + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } + __syncthreads(); + PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, + split_indices_block_size_data_partition, thread_to_left_offset_cnt); } // missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 0, 
mfb_is_na = 0, min_bin_ref < max_bin_ref @@ -132,7 +177,11 @@ __global__ void GenDataToLeftBitVectorKernel4(const int best_split_feature_ref, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left) { + uint8_t* cuda_data_to_left, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -140,14 +189,23 @@ __global__ void GenDataToLeftBitVectorKernel4(const int best_split_feature_ref, const uint32_t bin = static_cast(cuda_data[global_data_index]); if (bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; } else if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } else { cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; } + } else { + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } + __syncthreads(); + PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, + split_indices_block_size_data_partition, thread_to_left_offset_cnt); } // missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 1, min_bin_ref < max_bin_ref @@ -157,7 +215,11 @@ __global__ void GenDataToLeftBitVectorKernel5(const int best_split_feature_ref, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left) { + uint8_t* cuda_data_to_left, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -165,12 +227,20 @@ __global__ void GenDataToLeftBitVectorKernel5(const int best_split_feature_ref, const uint32_t bin = static_cast(cuda_data[global_data_index]); if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } else 
{ cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; } + } else { + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } + __syncthreads(); + PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, + split_indices_block_size_data_partition, thread_to_left_offset_cnt); } // missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 0, min_bin_ref < max_bin_ref @@ -180,7 +250,11 @@ __global__ void GenDataToLeftBitVectorKernel6(const int best_split_feature_ref, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left) { + uint8_t* cuda_data_to_left, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -188,14 +262,23 @@ __global__ void GenDataToLeftBitVectorKernel6(const int best_split_feature_ref, const uint32_t bin = static_cast(cuda_data[global_data_index]); if (bin == max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; } else if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } else { cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; } + } else { + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } + __syncthreads(); + PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, + split_indices_block_size_data_partition, thread_to_left_offset_cnt); } // missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 1, min_bin_ref < max_bin_ref @@ -205,7 +288,11 @@ __global__ void GenDataToLeftBitVectorKernel7(const int best_split_feature_ref, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left) { + uint8_t* cuda_data_to_left, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -213,12 +300,20 @@ __global__ void 
GenDataToLeftBitVectorKernel7(const int best_split_feature_ref, const uint32_t bin = static_cast(cuda_data[global_data_index]); if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } else { cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; } + } else { + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } + __syncthreads(); + PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, + split_indices_block_size_data_partition, thread_to_left_offset_cnt); } // missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 0, mfb_is_na = 0, min_bin_ref < max_bin_ref @@ -228,7 +323,11 @@ __global__ void GenDataToLeftBitVectorKernel8(const int best_split_feature_ref, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left) { + uint8_t* cuda_data_to_left, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -238,12 +337,20 @@ __global__ void GenDataToLeftBitVectorKernel8(const int best_split_feature_ref, cuda_data_to_left[local_data_index] = split_missing_default_to_left; } else if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } else { cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; } + } else { + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } + __syncthreads(); + PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, + split_indices_block_size_data_partition, thread_to_left_offset_cnt); } // missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 0, mfb_is_na = 1, min_bin_ref < max_bin_ref @@ -253,7 +360,11 @@ __global__ void GenDataToLeftBitVectorKernel9(const int best_split_feature_ref, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left) { + uint8_t* cuda_data_to_left, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / 
NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -261,14 +372,23 @@ __global__ void GenDataToLeftBitVectorKernel9(const int best_split_feature_ref, const uint32_t bin = static_cast(cuda_data[global_data_index]); if (bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; } else if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } else { cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; } + } else { + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } + __syncthreads(); + PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, + split_indices_block_size_data_partition, thread_to_left_offset_cnt); } // missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 1, mfb_is_na = 0, min_bin_ref < max_bin_ref @@ -278,7 +398,11 @@ __global__ void GenDataToLeftBitVectorKernel10(const int best_split_feature_ref, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left) { + uint8_t* cuda_data_to_left, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -286,12 +410,20 @@ __global__ void GenDataToLeftBitVectorKernel10(const int best_split_feature_ref, const uint32_t bin = static_cast(cuda_data[global_data_index]); if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } else { cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; } + } else { + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } + __syncthreads(); + PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, + split_indices_block_size_data_partition, thread_to_left_offset_cnt); } // missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 1, mfb_is_na = 1, min_bin_ref < max_bin_ref @@ -301,7 +433,11 @@ __global__ void GenDataToLeftBitVectorKernel11(const int best_split_feature_ref, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, 
const uint32_t min_bin_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left) { + uint8_t* cuda_data_to_left, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -309,12 +445,20 @@ __global__ void GenDataToLeftBitVectorKernel11(const int best_split_feature_ref, const uint32_t bin = static_cast(cuda_data[global_data_index]); if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } else { cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; } + } else { + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } + __syncthreads(); + PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, + split_indices_block_size_data_partition, thread_to_left_offset_cnt); } // missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 0, min_bin_ref < max_bin_ref @@ -324,7 +468,11 @@ __global__ void GenDataToLeftBitVectorKernel12(const int best_split_feature_ref, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left) { + uint8_t* cuda_data_to_left, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -332,14 +480,23 @@ __global__ void GenDataToLeftBitVectorKernel12(const int best_split_feature_ref, const uint32_t bin = static_cast(cuda_data[global_data_index]); if (bin == t_zero_bin || bin == max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; } else if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } else { cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; } + } else { + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } + __syncthreads(); + 
PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, + split_indices_block_size_data_partition, thread_to_left_offset_cnt); } // missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 1, min_bin_ref < max_bin_ref @@ -349,7 +506,11 @@ __global__ void GenDataToLeftBitVectorKernel13(const int best_split_feature_ref, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left) { + uint8_t* cuda_data_to_left, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -357,14 +518,23 @@ __global__ void GenDataToLeftBitVectorKernel13(const int best_split_feature_ref, const uint32_t bin = static_cast(cuda_data[global_data_index]); if (bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; } else if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } else { cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; } + } else { + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } + __syncthreads(); + PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, + split_indices_block_size_data_partition, thread_to_left_offset_cnt); } // missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 0, min_bin_ref < max_bin_ref @@ -374,7 +544,11 @@ __global__ void GenDataToLeftBitVectorKernel14(const int best_split_feature_ref, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left) { + uint8_t* cuda_data_to_left, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -382,14 +556,23 @@ __global__ void GenDataToLeftBitVectorKernel14(const int best_split_feature_ref, const uint32_t bin = static_cast(cuda_data[global_data_index]); if (bin == max_bin_ref) { cuda_data_to_left[local_data_index] = 
split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; } else if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } else { cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; } + } else { + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } + __syncthreads(); + PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, + split_indices_block_size_data_partition, thread_to_left_offset_cnt); } // missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 1, min_bin_ref < max_bin_ref @@ -399,7 +582,11 @@ __global__ void GenDataToLeftBitVectorKernel15(const int best_split_feature_ref, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left) { + uint8_t* cuda_data_to_left, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -407,12 +594,20 @@ __global__ void GenDataToLeftBitVectorKernel15(const int best_split_feature_ref, const uint32_t bin = static_cast(cuda_data[global_data_index]); if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } else { cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; } + } else { + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } + __syncthreads(); + PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, + split_indices_block_size_data_partition, thread_to_left_offset_cnt); } __global__ void GenDataToLeftBitVectorKernel(const int* leaf_index, const data_size_t* cuda_leaf_data_start, @@ -539,80 +734,93 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t nu split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_); + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + split_indices_block_size_data_partition_aligned); } else { if (!missing_is_zero && missing_is_na && !mfb_is_zero && 
!mfb_is_na) { GenDataToLeftBitVectorKernel4<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_); + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + split_indices_block_size_data_partition_aligned); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { GenDataToLeftBitVectorKernel5<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_); + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + split_indices_block_size_data_partition_aligned); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { GenDataToLeftBitVectorKernel6<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_); + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + split_indices_block_size_data_partition_aligned); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { GenDataToLeftBitVectorKernel7<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_); + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + split_indices_block_size_data_partition_aligned); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { GenDataToLeftBitVectorKernel8<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_); + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + split_indices_block_size_data_partition_aligned); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { GenDataToLeftBitVectorKernel9<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_); + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + split_indices_block_size_data_partition_aligned); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { 
GenDataToLeftBitVectorKernel10<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_); + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + split_indices_block_size_data_partition_aligned); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { GenDataToLeftBitVectorKernel11<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_); + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + split_indices_block_size_data_partition_aligned); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { GenDataToLeftBitVectorKernel12<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_); + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + split_indices_block_size_data_partition_aligned); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { GenDataToLeftBitVectorKernel13<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_); + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + split_indices_block_size_data_partition_aligned); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { GenDataToLeftBitVectorKernel14<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_); + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + split_indices_block_size_data_partition_aligned); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { GenDataToLeftBitVectorKernel15<<>>( split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_); + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + split_indices_block_size_data_partition_aligned); } } } else { @@ -829,7 +1037,6 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, 
data_size_t* blo const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref]; const uint32_t num_blocks = (num_data_in_leaf + split_indices_block_size_data_partition - 1) / split_indices_block_size_data_partition; const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; - //printf("left_leaf_index = %d, right_leaf_index = %d\n", leaf_index_ref, cur_max_leaf_index); block_to_left_offset_buffer[0] = 0; const unsigned int to_left_total_cnt = block_to_left_offset_buffer[num_blocks]; block_to_right_offset_buffer[0] = to_left_total_cnt; @@ -1005,7 +1212,6 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, hist_t** larger_leaf_cuda_hist_pointer_pointer) { - //Log::Warning("num_data_in_leaf = %d", num_data_in_leaf); const int num_blocks = std::max(80, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; int split_indices_block_size_data_partition_aligned = 1; @@ -1013,20 +1219,13 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data split_indices_block_size_data_partition_aligned <<= 1; split_indices_block_size_data_partition >>= 1; } - //Log::Warning("num_blocks = %d, split_indices_block_size_data_partition_aligned = %d", num_blocks, split_indices_block_size_data_partition_aligned); global_timer.Start("CUDADataPartition::PrepareOffsetKernel"); auto start = std::chrono::steady_clock::now(); const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; - //Log::Warning("num_blocks_final = %d", num_blocks_final); - PrepareOffsetKernel<<>>( - leaf_index, cuda_leaf_num_data_, cuda_data_to_left_, - cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, split_indices_block_size_data_partition_aligned); - SynchronizeCUDADevice(); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDADataPartition::PrepareOffsetKernel"); global_timer.Start("CUDADataPartition::AggregateBlockOffsetKernel"); - //Log::Warning("CUDADataPartition::PrepareOffsetKernel time %f", duration); start = std::chrono::steady_clock::now(); AggregateBlockOffsetKernel<<<1, split_indices_block_size_data_partition_aligned / 2>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, @@ -1060,7 +1259,6 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); - //Log::Warning("CUDADataPartition::AggregateBlockOffsetKernel time %f", duration); global_timer.Start("CUDADataPartition::SplitInnerKernel"); start = std::chrono::steady_clock::now(); global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); @@ -1102,7 +1300,6 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDADataPartition::SplitInnerKernel"); - //Log::Warning("CUDADataPartition::SplitInnerKernel time %f", duration); 
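// [Editor's sketch, not part of the original patch] The block-size alignment loop in
// LaunchSplitInnerKernel above doubles split_indices_block_size_data_partition_aligned once per
// bit of the unaligned size, i.e. it yields the smallest power of two strictly greater than its
// input; the preceding "- 1" therefore maps a value that is already a power of two to itself,
// presumably so the shared-memory prefix sum used by the split kernels can assume a
// power-of-two work size. A minimal standalone equivalent (hypothetical helper name, shown
// only for illustration):
auto align_block_size_to_power_of_two = [](int unaligned) {
  int aligned = 1;
  while (unaligned > 0) {
    aligned <<= 1;    // double the candidate block size
    unaligned >>= 1;  // consume one bit of the unaligned size
  }
  return aligned;     // e.g. 511 -> 512, 512 -> 1024
};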
global_timer.Start("CUDADataPartition::CopyDataIndicesKernel"); start = std::chrono::steady_clock::now(); CopyDataIndicesKernel<<>>( @@ -1111,7 +1308,6 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDADataPartition::CopyDataIndicesKernel"); - //Log::Warning("CUDADataPartition::CopyDataIndicesKernel time %f", duration); } __global__ void PrefixSumKernel(uint32_t* cuda_elements) { @@ -1145,7 +1341,6 @@ __global__ void AddPredictionToScoreKernel(const double* data_partition_leaf_out const data_size_t inner_data_index = static_cast(offset + threadIdx_x); if (inner_data_index < num_data) { const data_size_t data_index = data_indices[inner_data_index]; - //output_score[data_index] = leaf_prediction_value; cuda_scores[data_index] += leaf_prediction_value; } } @@ -1157,10 +1352,6 @@ void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double learning_r cuda_leaf_num_data_, cuda_data_indices_, cuda_leaf_data_start_, learning_rate, train_data_score_tmp_, cuda_scores); SynchronizeCUDADevice(); global_timer.Stop("CUDADataPartition::AddPredictionToScoreKernel"); - global_timer.Start("CUDADataPartition::Copy Score"); - //CopyFromCUDADeviceToHost(cpu_train_data_score_tmp_.data(), train_data_score_tmp_, static_cast(num_data_)); - //SynchronizeCUDADevice(); - global_timer.Stop("CUDADataPartition::Copy Score"); } __global__ void PrepareCUDASplitInforBufferKernel(const int* leaf_index, const int* best_split_feature, const uint32_t* best_split_threshold, diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index dcab8ddd331d..161201411662 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -14,7 +14,7 @@ #include "new_cuda_utils.hpp" #define FILL_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) -#define SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) +#define SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION (512) #define NUM_BANKS_DATA_PARTITION (32) #define LOG_NUM_BANKS_DATA_PARTITION (5) From ffcf76580a7d0646df79a994841d304d199939f2 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 1 Jun 2021 10:51:22 +0000 Subject: [PATCH 015/166] before code refactor --- .../cuda/cuda_best_split_finder.cpp | 134 ++-- .../cuda/cuda_best_split_finder.cu | 705 +++++++----------- .../cuda/cuda_best_split_finder.hpp | 41 +- src/treelearner/cuda/cuda_data_partition.cpp | 56 +- src/treelearner/cuda/cuda_data_partition.cu | 162 ++-- src/treelearner/cuda/cuda_data_partition.hpp | 23 +- .../cuda/cuda_histogram_constructor.cpp | 5 +- .../cuda/cuda_histogram_constructor.cu | 19 +- .../cuda/cuda_histogram_constructor.hpp | 4 +- .../cuda/new_cuda_tree_learner.cpp | 266 +------ .../cuda/new_cuda_tree_learner.hpp | 9 + src/treelearner/cuda/new_cuda_utils.hpp | 4 +- 12 files changed, 529 insertions(+), 899 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 84be6f4839ac..bc5feb15b64d 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -23,6 +23,7 @@ CUDABestSplitFinder::CUDABestSplitFinder(const hist_t* cuda_hist, const Dataset* feature_missing_type_.resize(num_features_); feature_mfb_offsets_.resize(num_features_); feature_default_bins_.resize(num_features_); + feature_num_bins_.resize(num_features_); max_num_bin_in_feature_ = 0; for (int 
inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { const BinMapper* bin_mapper = train_data->FeatureBinMapper(inner_feature_index); @@ -30,9 +31,10 @@ CUDABestSplitFinder::CUDABestSplitFinder(const hist_t* cuda_hist, const Dataset* feature_missing_type_[inner_feature_index] = missing_type; feature_mfb_offsets_[inner_feature_index] = static_cast(bin_mapper->GetMostFreqBin() == 0); feature_default_bins_[inner_feature_index] = bin_mapper->GetDefaultBin(); - const int num_bin = bin_mapper->num_bin() - feature_mfb_offsets_[inner_feature_index]; - if (num_bin > max_num_bin_in_feature_) { - max_num_bin_in_feature_ = num_bin; + feature_num_bins_[inner_feature_index] = static_cast(bin_mapper->num_bin()); + const int num_bin_hist = bin_mapper->num_bin() - feature_mfb_offsets_[inner_feature_index]; + if (num_bin_hist > max_num_bin_in_feature_) { + max_num_bin_in_feature_ = num_bin_hist; } } if (max_num_bin_in_feature_ > MAX_NUM_BIN_IN_FEATURE) { @@ -72,62 +74,94 @@ void CUDABestSplitFinder::Init() { AllocateCUDAMemory(num_features_, &cuda_feature_missing_type_); CopyFromHostToCUDADevice(cuda_feature_missing_type_, feature_missing_type_.data(), static_cast(num_features_)); - AllocateCUDAMemory(1, &cuda_lambda_l1_); - CopyFromHostToCUDADevice(cuda_lambda_l1_, &lambda_l1_, 1); - - InitCUDAMemoryFromHostMemory(&cuda_lambda_l2_, &lambda_l2_, 1); + InitCUDAMemoryFromHostMemory(&cuda_feature_num_bins_, feature_num_bins_.data(), static_cast(num_features_)); AllocateCUDAMemory(num_total_bin_ * 2, &prefix_sum_hist_left_); AllocateCUDAMemory(num_total_bin_ * 2, &prefix_sum_hist_right_); - // * 2 for smaller and larger leaves, * 2 for default left or not - const size_t feature_best_split_info_buffer_size = static_cast(num_features_) * 4; - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_feature_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_default_left_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_threshold_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_gain_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_sum_gradient_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_sum_hessian_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_count_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_gain_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_left_output_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_sum_gradient_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_sum_hessian_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_count_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_gain_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_right_output_); - AllocateCUDAMemory(feature_best_split_info_buffer_size, &cuda_best_split_found_); - - AllocateCUDAMemory(1, &cuda_min_data_in_leaf_); - CopyFromHostToCUDADevice(cuda_min_data_in_leaf_, &min_data_in_leaf_, 1); - AllocateCUDAMemory(1, &cuda_min_sum_hessian_in_leaf_); - CopyFromHostToCUDADevice(cuda_min_sum_hessian_in_leaf_, &min_sum_hessian_in_leaf_, 1); - AllocateCUDAMemory(1, &cuda_min_gain_to_split_); - CopyFromHostToCUDADevice(cuda_min_gain_to_split_, &min_gain_to_split_, 1); + num_tasks_ = 0; + 
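The constructor above records, for every feature, whether its most frequent bin is bin 0 (feature_mfb_offsets_), its default bin, its missing type, and its bin count, and tracks the widest histogram so it can be checked against MAX_NUM_BIN_IN_FEATURE. The mfb offset matters because the per-feature histogram appears to drop the skipped bin, so stored entry i corresponds to real bin i + offset, with each stored bin holding an interleaved (gradient, hessian) pair, matching the feature_hist_ptr[bin_offset] / feature_hist_ptr[bin_offset + 1] accesses in the kernels below. A small sketch of reading one bin under that convention follows; ReadBinEntry and BinEntry are hypothetical names used only for illustration.

// Illustration of the most-frequent-bin (mfb) offset recorded above: when a
// feature's most frequent bin is bin 0, feature_mfb_offsets_[f] is 1 and the
// histogram stores only the remaining bins, so stored entry i holds real bin
// (i + offset). Each stored bin is a (gradient, hessian) pair of doubles.
// ReadBinEntry is a hypothetical helper, not part of the patch.
#include <cstdint>
#include <cstdio>

struct BinEntry {
  double gradient;
  double hessian;
};

__host__ __device__ inline BinEntry ReadBinEntry(const double* feature_hist,
                                                 uint32_t real_bin,
                                                 uint8_t feature_mfb_offset) {
  const uint32_t stored_bin = real_bin - feature_mfb_offset;  // shift past the skipped bin 0
  return BinEntry{feature_hist[2 * stored_bin], feature_hist[2 * stored_bin + 1]};
}

int main() {
  // a feature with 4 real bins whose most frequent bin (bin 0) is skipped:
  // only bins 1..3 are stored, as 3 interleaved (gradient, hessian) pairs
  const double hist[6] = {0.1, 1.0, -0.4, 2.0, 0.3, 1.5};
  const BinEntry e = ReadBinEntry(hist, /*real_bin=*/2, /*feature_mfb_offset=*/1);
  std::printf("bin 2: gradient = %f, hessian = %f\n", e.gradient, e.hessian);
  return 0;
}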
for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { + const uint32_t num_bin = feature_num_bins_[inner_feature_index]; + const uint8_t missing_type = feature_missing_type_[inner_feature_index]; + if (num_bin > 2 && missing_type != 0) { + if (missing_type == 1) { + cpu_task_reverse_.emplace_back(0); + cpu_task_reverse_.emplace_back(1); + cpu_task_skip_default_bin_.emplace_back(1); + cpu_task_skip_default_bin_.emplace_back(1); + cpu_task_na_as_missing_.emplace_back(0); + cpu_task_na_as_missing_.emplace_back(0); + cpu_task_feature_index_.emplace_back(inner_feature_index); + cpu_task_feature_index_.emplace_back(inner_feature_index); + cpu_task_out_default_left_.emplace_back(0); + cpu_task_out_default_left_.emplace_back(1); + num_tasks_ += 2; + } else { + cpu_task_reverse_.emplace_back(0); + cpu_task_reverse_.emplace_back(1); + cpu_task_skip_default_bin_.emplace_back(0); + cpu_task_skip_default_bin_.emplace_back(0); + cpu_task_na_as_missing_.emplace_back(1); + cpu_task_na_as_missing_.emplace_back(1); + cpu_task_feature_index_.emplace_back(inner_feature_index); + cpu_task_feature_index_.emplace_back(inner_feature_index); + cpu_task_out_default_left_.emplace_back(0); + cpu_task_out_default_left_.emplace_back(1); + num_tasks_ += 2; + } + } else { + cpu_task_reverse_.emplace_back(1); + cpu_task_skip_default_bin_.emplace_back(0); + cpu_task_na_as_missing_.emplace_back(0); + cpu_task_feature_index_.emplace_back(inner_feature_index); + if (missing_type != 2) { + cpu_task_out_default_left_.emplace_back(1); + } else { + cpu_task_out_default_left_.emplace_back(0); + } + ++num_tasks_; + } + } + InitCUDAMemoryFromHostMemory(&cuda_task_feature_index_, cpu_task_feature_index_.data(), cpu_task_feature_index_.size()); + InitCUDAMemoryFromHostMemory(&cuda_task_reverse_, cpu_task_reverse_.data(), cpu_task_reverse_.size()); + InitCUDAMemoryFromHostMemory(&cuda_task_skip_default_bin_, cpu_task_skip_default_bin_.data(), cpu_task_skip_default_bin_.size()); + InitCUDAMemoryFromHostMemory(&cuda_task_na_as_missing_, cpu_task_na_as_missing_.data(), cpu_task_na_as_missing_.size()); + InitCUDAMemoryFromHostMemory(&cuda_task_out_default_left_, cpu_task_out_default_left_.data(), cpu_task_out_default_left_.size()); + + const size_t output_buffer_size = 2 * static_cast(num_tasks_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_feature_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_default_left_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_threshold_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_gain_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_left_sum_gradient_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_left_sum_hessian_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_left_count_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_left_gain_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_left_output_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_right_sum_gradient_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_right_sum_hessian_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_right_count_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_right_gain_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_right_output_); + AllocateCUDAMemory(output_buffer_size, &cuda_best_split_found_); + + AllocateCUDAMemory(9, &cuda_best_split_info_buffer_); + cuda_streams_.resize(2); + 
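The loop above replaces the old one-block-per-feature layout with an explicit task list: a feature with more than two bins and a missing type other than None gets two tasks (a forward and a reverse scan, with either the default bin skipped or the NaN bin treated as missing), every other feature gets a single reverse scan, and the output buffers are then sized as 2 * num_tasks_ so the smaller and larger leaf each get one slot per task. Below is a self-contained host-side sketch of that enumeration, using the missing-type encoding noted in cuda_best_split_finder.hpp (None = 0, Zero = 1, NaN = 2); SplitFindTask and EnumerateSplitFindTasks are illustrative names, not the patch's own.

// Sketch of the split-finding task enumeration above.
// Missing type encoding (from cuda_best_split_finder.hpp): None = 0, Zero = 1, NaN = 2.
// SplitFindTask / EnumerateSplitFindTasks are hypothetical names.
#include <cstdint>
#include <cstdio>
#include <vector>

struct SplitFindTask {
  int feature_index;
  uint8_t reverse;            // 1: scan bins from right to left
  uint8_t skip_default_bin;   // 1: skip the default bin (zeros handled as missing)
  uint8_t na_as_missing;      // 1: treat the NaN bin as missing
  uint8_t out_default_left;   // default direction reported if this task wins
};

std::vector<SplitFindTask> EnumerateSplitFindTasks(
    const std::vector<uint32_t>& feature_num_bins,
    const std::vector<uint8_t>& feature_missing_types) {
  std::vector<SplitFindTask> tasks;
  for (int i = 0; i < static_cast<int>(feature_num_bins.size()); ++i) {
    const uint8_t missing_type = feature_missing_types[i];
    if (feature_num_bins[i] > 2 && missing_type != 0) {
      // two tasks: forward and reverse scan over the same feature
      for (uint8_t reverse = 0; reverse <= 1; ++reverse) {
        SplitFindTask task;
        task.feature_index = i;
        task.reverse = reverse;
        task.skip_default_bin = (missing_type == 1);  // Zero handled as missing
        task.na_as_missing = (missing_type == 2);     // NaN handled as missing
        task.out_default_left = reverse;  // mirrors the pairing above: 0 with forward, 1 with reverse
        tasks.push_back(task);
      }
    } else {
      // no usable missing values or too few bins: a single reverse scan is enough
      SplitFindTask task;
      task.feature_index = i;
      task.reverse = 1;
      task.skip_default_bin = 0;
      task.na_as_missing = 0;
      task.out_default_left = (missing_type != 2);
      tasks.push_back(task);
    }
  }
  return tasks;
}

int main() {
  // e.g. three features: no missing values, zeros-as-missing, NaN-as-missing
  const std::vector<uint32_t> num_bins = {2, 16, 16};
  const std::vector<uint8_t> missing_types = {0, 1, 2};
  const std::vector<SplitFindTask> tasks = EnumerateSplitFindTasks(num_bins, missing_types);
  std::printf("num_tasks = %d\n", static_cast<int>(tasks.size()));  // 1 + 2 + 2 = 5
  return 0;
}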
CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); } void CUDABestSplitFinder::BeforeTrain() { const size_t feature_best_split_info_buffer_size = static_cast(num_features_) * 4; SetCUDAMemory(cuda_leaf_best_split_gain_, 0, static_cast(num_leaves_)); - SetCUDAMemory(cuda_best_split_found_, 0, feature_best_split_info_buffer_size); - SetCUDAMemory(cuda_best_split_gain_, 0, feature_best_split_info_buffer_size); + SetCUDAMemory(cuda_best_split_found_, 0, static_cast(num_tasks_)); + SetCUDAMemory(cuda_best_split_gain_, 0, static_cast(num_tasks_)); } void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_leaf_splits, - const CUDALeafSplits* larger_leaf_splits) { + const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index) { auto start = std::chrono::steady_clock::now(); - LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits->cuda_leaf_index(), - larger_leaf_splits->cuda_leaf_index(), - smaller_leaf_splits->cuda_gain(), - larger_leaf_splits->cuda_gain(), - smaller_leaf_splits->cuda_sum_of_gradients(), - smaller_leaf_splits->cuda_sum_of_hessians(), - smaller_leaf_splits->cuda_num_data_in_leaf(), - smaller_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), - larger_leaf_splits->cuda_sum_of_gradients(), - larger_leaf_splits->cuda_sum_of_hessians(), - larger_leaf_splits->cuda_num_data_in_leaf(), - larger_leaf_splits->cuda_hist_in_leaf_pointer_pointer()); + LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits, smaller_leaf_index, larger_leaf_index); SynchronizeCUDADevice(); global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); LaunchSyncBestSplitForLeafKernel(smaller_leaf_splits->cuda_leaf_index(), larger_leaf_splits->cuda_leaf_index()); @@ -138,13 +172,15 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_le //Log::Warning("FindBestSplitsForLeaf time %f", duration); } -void CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves) { +void CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int* smaller_leaf_index, + const int* larger_leaf_index, std::vector* leaf_best_split_feature, + std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index) { auto start = std::chrono::steady_clock::now(); - LaunchFindBestFromAllSplitsKernel(cuda_cur_num_leaves); + LaunchFindBestFromAllSplitsKernel(cuda_cur_num_leaves, smaller_leaf_index, larger_leaf_index, + leaf_best_split_feature, leaf_best_split_threshold, leaf_best_split_default_left, best_leaf_index); SynchronizeCUDADevice(); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); - //Log::Warning("FindBestFromAllSplits time %f", duration); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index d5eeac62c8b9..a6f61a188fe6 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -110,10 +110,10 @@ __device__ void ReduceBestGain(double* gain, hist_t* sum_gradients, } } -__device__ void ReduceBestGainForLeaves(double* gain, int* leaves) { +__device__ void ReduceBestGainForLeaves(double* gain, int* leaves, int cuda_cur_num_leaves) { const unsigned int tid = threadIdx.x; - for (unsigned int s = 1; s < NUM_THREADS_FIND_BEST_LEAF; s *= 2) { - if (tid % (2 * s) == 0 && (tid + s) < 
MAX_NUM_BIN_IN_FEATURE) { + for (unsigned int s = 1; s < cuda_cur_num_leaves; s *= 2) { + if (tid % (2 * s) == 0 && (tid + s) < cuda_cur_num_leaves) { const uint32_t tid_s = tid + s; if ((leaves[tid] == -1 && leaves[tid_s] != -1) || (leaves[tid] != -1 && leaves[tid_s] != -1 && gain[tid_s] > gain[tid])) { gain[tid] = gain[tid_s]; @@ -180,13 +180,13 @@ __device__ double GetSplitGains(double sum_left_gradients, l1, use_l1, l2); } -__device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr, +__device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, const uint32_t feature_num_bin, const uint8_t feature_mfb_offset, const uint32_t feature_default_bin, const uint8_t feature_missing_type, const double lambda_l1, const double lambda_l2, const double parent_gain, const data_size_t min_data_in_leaf, const double min_sum_hessian_in_leaf, const double min_gain_to_split, const double sum_gradients, const double sum_hessians, const data_size_t num_data, - const bool reverse, const bool skip_default_bin, const bool na_as_missing, + const bool reverse, const bool skip_default_bin, const bool na_as_missing, const uint8_t assume_out_default_left, // output parameters uint32_t* output_threshold, double* output_gain, @@ -211,7 +211,6 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr __shared__ hist_t local_grad_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; __shared__ hist_t local_hess_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; - //__shared__ data_size_t local_cnt_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; __shared__ double local_gain[MAX_NUM_BIN_IN_FEATURE]; __shared__ uint8_t threshold_found[MAX_NUM_BIN_IN_FEATURE]; __shared__ uint32_t threshold_value[MAX_NUM_BIN_IN_FEATURE]; @@ -227,11 +226,9 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr local_grad_hist[conflict_free_threadIdx_x] = feature_hist_ptr[bin_offset]; const hist_t hess = feature_hist_ptr[bin_offset + 1]; local_hess_hist[conflict_free_threadIdx_x] = hess; - //local_cnt_hist[conflict_free_threadIdx_x] = static_cast(__double2int_rn(hess * cnt_factor)); } else { local_grad_hist[conflict_free_threadIdx_x] = 0.0f; local_hess_hist[conflict_free_threadIdx_x] = 0.0f; - //local_cnt_hist[conflict_free_threadIdx_x] = 0; } } else { if (threadIdx_x < feature_num_bin_minus_offset) { @@ -241,17 +238,13 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr local_grad_hist[conflict_free_write_index] = feature_hist_ptr[bin_offset]; const hist_t hess = feature_hist_ptr[bin_offset + 1]; local_hess_hist[conflict_free_write_index] = hess; - //local_cnt_hist[conflict_free_write_index] = static_cast(__double2int_rn(hess * cnt_factor)); } else { - //printf("unwrite gradient = %f, hessian = %f\n", feature_hist_ptr[bin_offset], feature_hist_ptr[bin_offset + 1]); local_grad_hist[conflict_free_write_index] = 0.0f; local_hess_hist[conflict_free_write_index] = 0.0f; - // /local_cnt_hist[conflict_free_write_index] = 0; } } else { local_grad_hist[conflict_free_threadIdx_x] = 0.0f; local_hess_hist[conflict_free_threadIdx_x] = 0.0f; - //local_cnt_hist[conflict_free_threadIdx_x] = 0; } } __syncthreads(); @@ -262,14 +255,13 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr __syncthreads(); 
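ReduceBestGainForLeaves above is a shared-memory tree reduction over (gain, leaf index) pairs: each round halves the number of active slots, a slot keeps the better of itself and its partner, and after the loop slot 0 holds the best leaf; the change in this hunk only bounds the loop by the live leaf count instead of a fixed thread constant. For reference, a standalone kernel in the same pattern, assuming a power-of-two block size of 1024 and at most 1024 leaves; BestGainReduceKernel and its buffers are illustrative names, not part of the patch.

// Minimal sketch of the (gain, leaf) argmax reduction pattern used by
// ReduceBestGainForLeaves. BestGainReduceKernel is an illustrative name.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void BestGainReduceKernel(const double* leaf_gain, int num_leaves,
                                     int* out_best_leaf, double* out_best_gain) {
  __shared__ double gain[1024];
  __shared__ int leaf[1024];
  const unsigned int tid = threadIdx.x;
  // load one leaf per thread; pad the rest with invalid entries
  if (tid < static_cast<unsigned int>(num_leaves)) {
    gain[tid] = leaf_gain[tid];
    leaf[tid] = static_cast<int>(tid);
  } else {
    gain[tid] = -1e300;
    leaf[tid] = -1;
  }
  __syncthreads();
  // tree reduction: the stride doubles each round, winners stay in the lower slot
  for (unsigned int s = 1; s < blockDim.x; s <<= 1) {
    if (tid % (2 * s) == 0 && tid + s < blockDim.x) {
      if (leaf[tid + s] != -1 && (leaf[tid] == -1 || gain[tid + s] > gain[tid])) {
        gain[tid] = gain[tid + s];
        leaf[tid] = leaf[tid + s];
      }
    }
    __syncthreads();
  }
  if (tid == 0) {
    *out_best_leaf = leaf[0];
    *out_best_gain = gain[0];
  }
}

int main() {
  const int num_leaves = 5;
  const double host_gain[num_leaves] = {0.5, 2.0, -1.0, 3.5, 0.0};
  double* dev_gain; int* dev_leaf; double* dev_best_gain;
  cudaMalloc(&dev_gain, num_leaves * sizeof(double));
  cudaMalloc(&dev_leaf, sizeof(int));
  cudaMalloc(&dev_best_gain, sizeof(double));
  cudaMemcpy(dev_gain, host_gain, num_leaves * sizeof(double), cudaMemcpyHostToDevice);
  BestGainReduceKernel<<<1, 1024>>>(dev_gain, num_leaves, dev_leaf, dev_best_gain);
  int best_leaf = -1; double best_gain = 0.0;
  cudaMemcpy(&best_leaf, dev_leaf, sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&best_gain, dev_best_gain, sizeof(double), cudaMemcpyDeviceToHost);
  std::printf("best leaf = %d, gain = %f\n", best_leaf, best_gain);
  cudaFree(dev_gain); cudaFree(dev_leaf); cudaFree(dev_best_gain);
  return 0;
}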
PrefixSumHist(local_grad_hist, MAX_NUM_BIN_IN_FEATURE); PrefixSumHist(local_hess_hist, MAX_NUM_BIN_IN_FEATURE); - //PrefixSumHistCnt(local_cnt_hist, MAX_NUM_BIN_IN_FEATURE); __syncthreads(); const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(threadIdx_x + 1); if (reverse) { if (threadIdx_x >= na_as_missing && threadIdx_x <= feature_num_bin - 2 && !skip_split) { const double sum_right_gradient = local_grad_hist[conflict_free_threadIdx_x_plus_1]; const double sum_right_hessian = local_hess_hist[conflict_free_threadIdx_x_plus_1]; - const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor));// local_cnt_hist[conflict_free_threadIdx_x_plus_1]; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); const double sum_left_gradient = sum_gradients - sum_right_gradient; const double sum_left_hessian = sum_hessians - sum_right_hessian; const data_size_t left_count = num_data - right_count; @@ -297,7 +289,7 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr if (threadIdx_x <= feature_num_bin_minus_offset - 2 /* TODO(shiyu1994): skip default */) { const double sum_left_gradient = local_grad_hist[conflict_free_threadIdx_x_plus_1]; const double sum_left_hessian = local_hess_hist[conflict_free_threadIdx_x_plus_1]; - const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor));//local_cnt_hist[conflict_free_threadIdx_x_plus_1]; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); const double sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = num_data - left_count; @@ -323,18 +315,17 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr } } __syncthreads(); - ReduceBestGain(local_gain, local_grad_hist, local_hess_hist, /*local_cnt_hist, */threshold_found, threshold_value); + ReduceBestGain(local_gain, local_grad_hist, local_hess_hist, threshold_found, threshold_value); const uint8_t found = threshold_found[0]; if (found && threadIdx_x == 0) { *output_found = 1; *output_threshold = threshold_value[0]; *output_gain = local_gain[0]; - *output_default_left = reverse; + *output_default_left = assume_out_default_left; if (reverse) { const double sum_right_gradient = local_grad_hist[1]; const double sum_right_hessian = local_hess_hist[1]; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); - //const data_size_t right_count = local_cnt_hist[1]; const double sum_left_gradient = sum_gradients - sum_right_gradient; const double sum_left_hessian = sum_hessians - sum_right_hessian; const data_size_t left_count = num_data - right_count; @@ -356,7 +347,6 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr const double sum_left_gradient = local_grad_hist[1]; const double sum_left_hessian = local_hess_hist[1]; const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); - //const data_size_t left_count = local_cnt_hist[1]; const double sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = num_data - left_count; @@ -378,341 +368,179 @@ __device__ void FindBestSplitsForLeafKernelInner2(const hist_t* feature_hist_ptr } } -__device__ void FindBestSplitsForLeafKernelInner(const 
hist_t* feature_hist_ptr, - const uint32_t feature_num_bin, const uint8_t feature_mfb_offset, - const uint32_t feature_default_bin, const uint8_t feature_missing_type, - const double lambda_l1, const double lambda_l2, const double parent_gain, const data_size_t min_data_in_leaf, - const double min_sum_hessian_in_leaf, const double min_gain_to_split, - const double sum_gradients, const double sum_hessians, const data_size_t num_data, - const bool reverse, const bool skip_default_bin, const bool na_as_missing, - // output parameters - uint32_t* output_threshold, - double* output_gain, - uint8_t* output_default_left, - double* output_left_sum_gradients, - double* output_left_sum_hessians, - data_size_t* output_left_num_data, - double* output_left_gain, - double* output_left_output, - double* output_right_sum_gradients, - double* output_right_sum_hessians, - data_size_t* output_right_num_data, - double* output_right_gain, - double* output_right_output, - uint8_t* output_found, const int inner_feature_index) { - double best_sum_left_gradient = NAN; - double best_sum_left_hessian = NAN; - double best_gain = kMinScore; - data_size_t best_left_count = 0; - uint32_t best_threshold = feature_num_bin; - const double cnt_factor = num_data / sum_hessians; - const bool use_l1 = lambda_l1 > 0.0f; - const double min_gain_shift = parent_gain + min_gain_to_split; - - *output_found = 0; - - __shared__ hist_t local_hist[2 * MAX_NUM_BIN_IN_FEATURE]; - uint32_t feature_num_bin_minus_offset = feature_num_bin - feature_mfb_offset; - const unsigned int threadIdx_x = threadIdx.x; - if (threadIdx_x < feature_num_bin_minus_offset * 2) { - local_hist[threadIdx_x] = feature_hist_ptr[threadIdx_x]; - } - __syncthreads(); - /*if (inner_feature_index == 6) { - if (threadIdx_x == 0) { - for (unsigned i = 0; i < MAX_NUM_BIN_IN_FEATURE; ++i) { - printf("local_grad_hist[%d] = %f\n", i, local_hist[2 * i]); - } - } - }*/ - //__syncthreads(); - if (threadIdx_x == 0) { - if (reverse) { - double sum_right_gradient = 0.0f; - double sum_right_hessian = kEpsilon; - data_size_t right_count = 0; - - int t = feature_num_bin - 1 - feature_mfb_offset - na_as_missing; - const int t_end = 1 - feature_mfb_offset; - - // from right to left, and we don't need data in bin0 - for (; t >= t_end; --t) { - // need to skip default bin - if (skip_default_bin) { - if ((t + feature_mfb_offset) == static_cast(feature_default_bin)) { - continue; - } - } - const auto grad = GET_GRAD(local_hist, t); - const auto hess = GET_HESS(local_hist, t); - data_size_t cnt = - static_cast(__double2int_rn(hess * cnt_factor)); - sum_right_gradient += grad; - sum_right_hessian += hess; - right_count += cnt; - // if data not enough, or sum hessian too small - if (right_count < min_data_in_leaf || - sum_right_hessian < min_sum_hessian_in_leaf) { - continue; - } - data_size_t left_count = num_data - right_count; - // if data not enough - if (left_count < min_data_in_leaf) { - break; - } - - double sum_left_hessian = sum_hessians - sum_right_hessian; - // if sum hessian too small - if (sum_left_hessian < min_sum_hessian_in_leaf) { - break; - } - - double sum_left_gradient = sum_gradients - sum_right_gradient; - /*if (inner_feature_index == 11) { - if (static_cast(t - 1 + feature_mfb_offset) == 252) { - printf("*************** feature_index 11, threshold = 252, sum_left_gradient = %f, sum_left_hessian = %f, sum_right_gradient = %f, sum_right_hessian = %f\n", - sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian); - } - }*/ - // current split gain - 
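For reference, the removed scan above accumulates the right-side gradient and hessian sums bin by bin, derives the left-side sums by subtraction, enforces min_data_in_leaf and min_sum_hessian_in_leaf, and then scores the candidate threshold with the GetSplitGains(...) call that continues just below. GetSplitGains itself is not shown in this patch; a plausible formulation, mirroring the CPU tree learner (L1 applied by soft-thresholding the gradient sums, and the result compared against parent_gain + min_gain_to_split), is sketched here with illustrative names.

// Plausible sketch of the gain helpers the removed scan relies on.
// ThresholdL1 / LeafGain / SplitGain / LeafOutput are illustrative names;
// the actual GetSplitGains implementation is not part of this hunk.
#include <math.h>
#include <cstdio>

__host__ __device__ inline double ThresholdL1(double g, double l1) {
  // soft-threshold the gradient sum by lambda_l1
  const double reg = fmax(0.0, fabs(g) - l1);
  return g > 0.0 ? reg : -reg;
}

__host__ __device__ inline double LeafGain(double sum_gradient, double sum_hessian,
                                           double l1, double l2) {
  const double g = ThresholdL1(sum_gradient, l1);
  return (g * g) / (sum_hessian + l2);
}

// gain of splitting a leaf into two children; the caller accepts the split only
// if this exceeds parent_gain + min_gain_to_split
__host__ __device__ inline double SplitGain(double sum_left_gradient, double sum_left_hessian,
                                            double sum_right_gradient, double sum_right_hessian,
                                            double l1, double l2) {
  return LeafGain(sum_left_gradient, sum_left_hessian, l1, l2) +
         LeafGain(sum_right_gradient, sum_right_hessian, l1, l2);
}

// the corresponding leaf output (prediction value) for one side
__host__ __device__ inline double LeafOutput(double sum_gradient, double sum_hessian,
                                             double l1, double l2) {
  return -ThresholdL1(sum_gradient, l1) / (sum_hessian + l2);
}

int main() {
  // example: a parent leaf split into a left and a right child
  const double gain = SplitGain(/*GL=*/-3.0, /*HL=*/10.0, /*GR=*/2.0, /*HR=*/8.0,
                                /*l1=*/0.0, /*l2=*/1.0);
  const double parent = LeafGain(-1.0, 18.0, 0.0, 1.0);
  std::printf("split gain = %f, parent gain = %f\n", gain, parent);
  return 0;
}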
double current_gain = GetSplitGains( - sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, - lambda_l2); - // gain with split is worse than without split - if (current_gain <= min_gain_shift) { - continue; - } - *output_found = 1; - // better split point - if (current_gain > best_gain) { - best_left_count = left_count; - best_sum_left_gradient = sum_left_gradient; - best_sum_left_hessian = sum_left_hessian; - // left is <= threshold, right is > threshold. so this is t-1 - best_threshold = static_cast(t - 1 + feature_mfb_offset); - best_gain = current_gain; - } - } - } else { - double sum_left_gradient = 0.0f; - double sum_left_hessian = kEpsilon; - data_size_t left_count = 0; - - int t = 0; - const int t_end = feature_num_bin - 2 - feature_mfb_offset; - if (na_as_missing) { - if (feature_mfb_offset == 1) { - sum_left_gradient = sum_gradients; - sum_left_hessian = sum_hessians - kEpsilon; - left_count = num_data; - for (int i = 0; i < feature_num_bin - feature_mfb_offset; ++i) { - const auto grad = GET_GRAD(local_hist, i); - const auto hess = GET_HESS(local_hist, i); - data_size_t cnt = - static_cast(__double2int_rn(hess * cnt_factor)); - sum_left_gradient -= grad; - sum_left_hessian -= hess; - left_count -= cnt; - } - t = -1; - } - } - - for (; t <= t_end; ++t) { - if (skip_default_bin) { - if ((t + feature_mfb_offset) == static_cast(feature_default_bin)) { - continue; - } - } - if (t >= 0) { - sum_left_gradient += GET_GRAD(local_hist, t); - const hist_t hess = GET_HESS(local_hist, t); - sum_left_hessian += hess; - left_count += static_cast( - __double2int_rn(hess * cnt_factor)); - } - // if data not enough, or sum hessian too small - if (left_count < min_data_in_leaf || - sum_left_hessian < min_sum_hessian_in_leaf) { - continue; - } - data_size_t right_count = num_data - left_count; - // if data not enough - if (right_count < min_data_in_leaf) { - break; - } - - double sum_right_hessian = sum_hessians - sum_left_hessian; - // if sum hessian too small - if (sum_right_hessian < min_sum_hessian_in_leaf) { - break; - } - - double sum_right_gradient = sum_gradients - sum_left_gradient; - - // current split gain - double current_gain = GetSplitGains( - sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, - lambda_l2); - // gain with split is worse than without split - if (current_gain <= min_gain_shift) { - continue; - } - *output_found = 1; - // better split point - if (current_gain > best_gain) { - best_left_count = left_count; - best_sum_left_gradient = sum_left_gradient; - best_sum_left_hessian = sum_left_hessian; - best_threshold = static_cast(t + feature_mfb_offset); - best_gain = current_gain; - } - } - } - - if (*output_found) { - *output_threshold = best_threshold; - *output_gain = best_gain - min_gain_shift; - *output_default_left = reverse; - *output_left_sum_gradients = best_sum_left_gradient; - *output_left_sum_hessians = best_sum_left_hessian; - *output_left_num_data = best_left_count; - - const double best_sum_right_gradient = sum_gradients - best_sum_left_gradient; - const double best_sum_right_hessian = sum_hessians - best_sum_left_hessian; - *output_right_sum_gradients = best_sum_right_gradient; - *output_right_sum_hessians = best_sum_right_hessian; - *output_right_num_data = num_data - best_left_count; - - *output_left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, - best_sum_left_hessian, lambda_l1, use_l1, lambda_l2); - *output_left_gain = 
GetLeafGainGivenOutput(best_sum_left_gradient, - best_sum_left_hessian, lambda_l1, use_l1, lambda_l2, *output_left_output); - *output_right_output = CalculateSplittedLeafOutput(best_sum_right_gradient, - best_sum_right_hessian, lambda_l1, use_l1, lambda_l2); - *output_right_gain = GetLeafGainGivenOutput(best_sum_right_gradient, - best_sum_right_hessian, lambda_l1, use_l1, lambda_l2, *output_right_output); - } - } -} - -__global__ void FindBestSplitsForLeafKernel(const hist_t* cuda_hist, const int* cuda_num_total_bin, - const uint32_t* feature_hist_offsets, const uint8_t* feature_mfb_offsets, const uint32_t* feature_default_bins, - const uint8_t* feature_missing_types, const double* lambda_l1, const double* lambda_l2, const int* smaller_leaf_id, - const int* larger_leaf_id, const double* smaller_leaf_gain, const double* larger_leaf_gain, const double* sum_gradients_in_smaller_leaf, - const double* sum_hessians_in_smaller_leaf, const data_size_t* num_data_in_smaller_leaf, hist_t** smaller_leaf_hist, - const double* sum_gradients_in_larger_leaf, const double* sum_hessians_in_larger_leaf, - const data_size_t* num_data_in_larger_leaf, hist_t** larger_leaf_hist, const data_size_t* min_data_in_leaf, - const double* min_sum_hessian_in_leaf, const double* min_gain_to_split, +__global__ void FindBestSplitsForOneLeafKernel( + // input feature information + const uint32_t* feature_hist_offsets, + const uint8_t* feature_mfb_offsets, + const uint32_t* feature_default_bins, + const uint8_t* feature_missing_types, + const uint32_t* feature_num_bins, + // input task information + const int* task_feature_index, + const uint8_t* task_reverse, + const uint8_t* task_skip_default_bin, + const uint8_t* task_na_as_missing, + const uint8_t* task_out_default_left, + // input leaf information + const bool is_larger, + const int leaf_index, + const double* leaf_gain, + const double* sum_gradients_in_leaf, + const double* sum_hessians_in_leaf, + const data_size_t* num_data_in_leaf, + hist_t** leaf_hist, + // input config parameter values + const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const double min_gain_to_split, + const double lambda_l1, + const double lambda_l2, // output - uint32_t* cuda_best_split_threshold, uint8_t* cuda_best_split_default_left, - double* cuda_best_split_gain, double* cuda_best_split_left_sum_gradient, - double* cuda_best_split_left_sum_hessian, data_size_t* cuda_best_split_left_count, - double* cuda_best_split_left_gain, double* cuda_best_split_left_output, - double* cuda_best_split_right_sum_gradient, double* cuda_best_split_right_sum_hessian, - data_size_t* cuda_best_split_right_count, double* cuda_best_split_right_gain, - double* cuda_best_split_right_output, uint8_t* cuda_best_split_found) { - const unsigned int num_features = gridDim.x / 2; - const unsigned int inner_feature_index = (blockIdx.x /*/ 2*/) % num_features; - const unsigned int global_block_idx = blockIdx.x; - const bool reverse = true;//blockIdx.x % 2 == 0 ? true : false; - const bool smaller_or_larger = static_cast(blockIdx.x / (/*2 **/ num_features) == 0); - const int num_bin = feature_hist_offsets[inner_feature_index + 1] - feature_hist_offsets[inner_feature_index]; - const uint8_t missing_type = feature_missing_types[inner_feature_index]; - const int leaf_index = smaller_or_larger ? *smaller_leaf_id : *larger_leaf_id; - const double parent_gain = smaller_or_larger ? 
*smaller_leaf_gain : *larger_leaf_gain; - /*if (blockIdx.x == 0 && threadIdx.x == 0) { - printf("parent_gain = %f\n", parent_gain); - }*/ - const double sum_gradients = smaller_or_larger ? *sum_gradients_in_smaller_leaf : *sum_gradients_in_larger_leaf; - const double sum_hessians = (smaller_or_larger ? *sum_hessians_in_smaller_leaf : *sum_hessians_in_larger_leaf) + 2 * kEpsilon; - const double num_data_in_leaf = smaller_or_larger ? *num_data_in_smaller_leaf : *num_data_in_larger_leaf; - uint32_t* out_threshold = cuda_best_split_threshold + global_block_idx; - double* out_left_sum_gradients = cuda_best_split_left_sum_gradient + global_block_idx; - double* out_left_sum_hessians = cuda_best_split_left_sum_hessian + global_block_idx; - double* out_right_sum_gradients = cuda_best_split_right_sum_gradient + global_block_idx; - double* out_right_sum_hessians = cuda_best_split_right_sum_hessian + global_block_idx; - data_size_t* out_left_num_data = cuda_best_split_left_count + global_block_idx; - data_size_t* out_right_num_data = cuda_best_split_right_count + global_block_idx; - double* out_left_output = cuda_best_split_left_output + global_block_idx; - double* out_right_output = cuda_best_split_right_output + global_block_idx; - double* out_left_gain = cuda_best_split_left_gain + global_block_idx; - double* out_right_gain = cuda_best_split_right_gain + global_block_idx; - uint8_t* out_found = cuda_best_split_found + global_block_idx; - uint8_t* out_default_left = cuda_best_split_default_left + global_block_idx; - double* out_gain = cuda_best_split_gain + global_block_idx; - if (leaf_index < 0) { - *out_found = 0; - return; - } - const int cuda_num_total_bin_ref = *cuda_num_total_bin; - const hist_t* hist_ptr = smaller_or_larger ? *smaller_leaf_hist + feature_hist_offsets[inner_feature_index] * 2 : - *larger_leaf_hist + feature_hist_offsets[inner_feature_index] * 2;// cuda_hist + (cuda_num_total_bin_ref * leaf_index + feature_hist_offsets[inner_feature_index]) * 2; - /*if (num_bin > 2 && missing_type != 0) { - if (missing_type == 1) { - FindBestSplitsForLeafKernelInner2(hist_ptr, - num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], - feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, parent_gain, - *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, - num_data_in_leaf, reverse, true, false, out_threshold, out_gain, out_default_left, - out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_gain, out_left_output, - out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found, inner_feature_index); - } else { - FindBestSplitsForLeafKernelInner2(hist_ptr, - num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], - feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, parent_gain, - *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, - num_data_in_leaf, reverse, false, true, out_threshold, out_gain, out_default_left, - out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_gain, out_left_output, - out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found, inner_feature_index); - } - } else {*/ - if (reverse) { - FindBestSplitsForLeafKernelInner2(hist_ptr, - num_bin, feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], - 
feature_missing_types[inner_feature_index], *lambda_l1, *lambda_l2, parent_gain, - *min_data_in_leaf, *min_sum_hessian_in_leaf, *min_gain_to_split, sum_gradients, sum_hessians, - num_data_in_leaf, reverse, true, false, out_threshold, out_gain, out_default_left, - out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_gain, out_left_output, - out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found, inner_feature_index); - } - if (missing_type == 2) { - *out_default_left = 0; - } - //} + uint32_t* cuda_best_split_threshold, + uint8_t* cuda_best_split_default_left, + double* cuda_best_split_gain, + double* cuda_best_split_left_sum_gradient, + double* cuda_best_split_left_sum_hessian, + data_size_t* cuda_best_split_left_count, + double* cuda_best_split_left_gain, + double* cuda_best_split_left_output, + double* cuda_best_split_right_sum_gradient, + double* cuda_best_split_right_sum_hessian, + data_size_t* cuda_best_split_right_count, + double* cuda_best_split_right_gain, + double* cuda_best_split_right_output, + uint8_t* cuda_best_split_found) { + + const unsigned int task_index = blockIdx.x; + const int inner_feature_index = task_feature_index[task_index]; + const bool reverse = static_cast(task_reverse[task_index]); + const bool skip_default_bin = static_cast(task_skip_default_bin[task_index]); + const bool na_as_missing = static_cast(task_na_as_missing[task_index]); + const bool assume_out_default_left = task_out_default_left[task_index]; + uint8_t* out_default_left = cuda_best_split_default_left + task_index; + const double parent_gain = *leaf_gain; + const double sum_gradients = *sum_gradients_in_leaf; + const double sum_hessians = (*sum_hessians_in_leaf) + 2 * kEpsilon; + const double num_data = *num_data_in_leaf; + const unsigned int output_offset = is_larger ? 
(task_index + gridDim.x) : task_index; + uint32_t* out_threshold = cuda_best_split_threshold + output_offset; + double* out_left_sum_gradients = cuda_best_split_left_sum_gradient + output_offset; + double* out_left_sum_hessians = cuda_best_split_left_sum_hessian + output_offset; + double* out_right_sum_gradients = cuda_best_split_right_sum_gradient + output_offset; + double* out_right_sum_hessians = cuda_best_split_right_sum_hessian + output_offset; + data_size_t* out_left_num_data = cuda_best_split_left_count + output_offset; + data_size_t* out_right_num_data = cuda_best_split_right_count + output_offset; + double* out_left_output = cuda_best_split_left_output + output_offset; + double* out_right_output = cuda_best_split_right_output + output_offset; + double* out_left_gain = cuda_best_split_left_gain + output_offset; + double* out_right_gain = cuda_best_split_right_gain + output_offset; + uint8_t* out_found = cuda_best_split_found + output_offset; + double* out_gain = cuda_best_split_gain + output_offset; + const hist_t* hist_ptr = *leaf_hist + feature_hist_offsets[inner_feature_index] * 2; + FindBestSplitsForLeafKernelInner(hist_ptr, + feature_num_bins[inner_feature_index], feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], + feature_missing_types[inner_feature_index], lambda_l1, lambda_l2, parent_gain, + min_data_in_leaf, min_sum_hessian_in_leaf, min_gain_to_split, sum_gradients, sum_hessians, + num_data, reverse, skip_default_bin, na_as_missing, assume_out_default_left, out_threshold, out_gain, out_default_left, + out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_gain, out_left_output, + out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found, inner_feature_index); } -void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel(const int* smaller_leaf_id, const int* larger_leaf_id, - const double* smaller_leaf_gain, const double* larger_leaf_gain, const double* sum_gradients_in_smaller_leaf, - const double* sum_hessians_in_smaller_leaf, const data_size_t* num_data_in_smaller_leaf, hist_t** smaller_leaf_hist, - const double* sum_gradients_in_larger_leaf, const double* sum_hessians_in_larger_leaf, - const data_size_t* num_data_in_larger_leaf, hist_t** larger_leaf_hist) { - // * 2 for smaller and larger leaves, * 2 for split direction - const int num_blocks = num_features_ * 2; - FindBestSplitsForLeafKernel<<>>(cuda_hist_, - cuda_num_total_bin_, cuda_feature_hist_offsets_, - cuda_feature_mfb_offsets_, cuda_feature_default_bins_, - cuda_feature_missing_type_, cuda_lambda_l1_, cuda_lambda_l2_, - smaller_leaf_id, larger_leaf_id, smaller_leaf_gain, larger_leaf_gain, - sum_gradients_in_smaller_leaf, sum_hessians_in_smaller_leaf, num_data_in_smaller_leaf, smaller_leaf_hist, - sum_gradients_in_larger_leaf, sum_hessians_in_larger_leaf, num_data_in_larger_leaf, larger_leaf_hist, - cuda_min_data_in_leaf_, cuda_min_sum_hessian_in_leaf_, cuda_min_gain_to_split_, - - cuda_best_split_threshold_, cuda_best_split_default_left_, cuda_best_split_gain_, - cuda_best_split_left_sum_gradient_, cuda_best_split_left_sum_hessian_, - cuda_best_split_left_count_, cuda_best_split_left_gain_, cuda_best_split_left_output_, - cuda_best_split_right_sum_gradient_, cuda_best_split_right_sum_hessian_, - cuda_best_split_right_count_, cuda_best_split_right_gain_, cuda_best_split_right_output_, +void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( + const CUDALeafSplits* smaller_leaf_splits, + const 
CUDALeafSplits* larger_leaf_splits, + const int smaller_leaf_index, + const int larger_leaf_index) { + FindBestSplitsForOneLeafKernel<<>>( + // input feature information + cuda_feature_hist_offsets_, + cuda_feature_mfb_offsets_, + cuda_feature_default_bins_, + cuda_feature_missing_type_, + cuda_feature_num_bins_, + // input task information + cuda_task_feature_index_, + cuda_task_reverse_, + cuda_task_skip_default_bin_, + cuda_task_na_as_missing_, + cuda_task_out_default_left_, + // input leaf information + false, + smaller_leaf_index, + smaller_leaf_splits->cuda_gain(), + smaller_leaf_splits->cuda_sum_of_gradients(), + smaller_leaf_splits->cuda_sum_of_hessians(), + smaller_leaf_splits->cuda_num_data_in_leaf(), + smaller_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), + // configuration parameter values + min_data_in_leaf_, + min_sum_hessian_in_leaf_, + min_gain_to_split_, + lambda_l1_, + lambda_l2_, + // output parameters + cuda_best_split_threshold_, + cuda_best_split_default_left_, + cuda_best_split_gain_, + cuda_best_split_left_sum_gradient_, + cuda_best_split_left_sum_hessian_, + cuda_best_split_left_count_, + cuda_best_split_left_gain_, + cuda_best_split_left_output_, + cuda_best_split_right_sum_gradient_, + cuda_best_split_right_sum_hessian_, + cuda_best_split_right_count_, + cuda_best_split_right_gain_, + cuda_best_split_right_output_, cuda_best_split_found_); + + if (larger_leaf_index >= 0) { + FindBestSplitsForOneLeafKernel<<>>( + // input feature information + cuda_feature_hist_offsets_, + cuda_feature_mfb_offsets_, + cuda_feature_default_bins_, + cuda_feature_missing_type_, + cuda_feature_num_bins_, + // input task information + cuda_task_feature_index_, + cuda_task_reverse_, + cuda_task_skip_default_bin_, + cuda_task_na_as_missing_, + cuda_task_out_default_left_, + // input leaf information + true, + larger_leaf_index, + larger_leaf_splits->cuda_gain(), + larger_leaf_splits->cuda_sum_of_gradients(), + larger_leaf_splits->cuda_sum_of_hessians(), + larger_leaf_splits->cuda_num_data_in_leaf(), + larger_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), + // configuration parameter values + min_data_in_leaf_, + min_sum_hessian_in_leaf_, + min_gain_to_split_, + lambda_l1_, + lambda_l2_, + // output parameters + cuda_best_split_threshold_, + cuda_best_split_default_left_, + cuda_best_split_gain_, + cuda_best_split_left_sum_gradient_, + cuda_best_split_left_sum_hessian_, + cuda_best_split_left_count_, + cuda_best_split_left_gain_, + cuda_best_split_left_output_, + cuda_best_split_right_sum_gradient_, + cuda_best_split_right_sum_hessian_, + cuda_best_split_right_count_, + cuda_best_split_right_gain_, + cuda_best_split_right_output_, + cuda_best_split_found_); + } } -__device__ void ReduceBestSplit(uint8_t* found, int* feature, double* gain, uint8_t* default_left, - uint32_t* threshold, double* left_sum_gradient, double* left_sum_hessian, - data_size_t* left_count, double* left_gain, double* left_output, - double* right_sum_gradient, double* right_sum_hessian, - data_size_t* right_count, double* right_gain, double* right_output, +__device__ void ReduceBestSplit(uint8_t* found, double* gain, uint32_t* shared_read_index, uint32_t num_features_aligned, uint32_t thread_offset) { const uint32_t threadIdx_x = threadIdx.x - thread_offset; for (unsigned int s = 1; s < num_features_aligned; s <<= 1) { @@ -721,20 +549,8 @@ __device__ void ReduceBestSplit(uint8_t* found, int* feature, double* gain, uint if ((!found[threadIdx_x] && found[pos_to_compare]) || (found[threadIdx_x] && 
found[pos_to_compare] && gain[threadIdx_x] < gain[pos_to_compare])) { found[threadIdx_x] = found[pos_to_compare]; - feature[threadIdx_x] = feature[pos_to_compare]; gain[threadIdx_x] = gain[pos_to_compare]; - default_left[threadIdx_x] = default_left[pos_to_compare]; - threshold[threadIdx_x] = threshold[pos_to_compare]; - left_sum_gradient[threadIdx_x] = left_sum_gradient[pos_to_compare]; - left_sum_hessian[threadIdx_x] = left_sum_hessian[pos_to_compare]; - left_count[threadIdx_x] = left_count[pos_to_compare]; - left_gain[threadIdx_x] = left_gain[pos_to_compare]; - left_output[threadIdx_x] = left_output[pos_to_compare]; - right_sum_gradient[threadIdx_x] = right_sum_gradient[pos_to_compare]; - right_sum_hessian[threadIdx_x] = right_sum_hessian[pos_to_compare]; - right_count[threadIdx_x] = right_count[pos_to_compare]; - right_gain[threadIdx_x] = right_gain[pos_to_compare]; - right_output[threadIdx_x] = right_output[pos_to_compare]; + shared_read_index[threadIdx_x] = shared_read_index[pos_to_compare]; } } __syncthreads(); @@ -766,84 +582,51 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const const double* cuda_best_split_right_gain, const double* cuda_best_split_right_output, const uint8_t* cuda_best_split_found, - const uint32_t* cuda_feature_default_bins) { + const uint32_t* cuda_feature_default_bins, + const int num_features_ref, + const int num_features_aligned) { const uint32_t threadIdx_x = threadIdx.x; const uint32_t blockIdx_x = blockIdx.x; - int num_features_ref = *cuda_num_features; - int num_features_aligned = 1; - num_features_ref -= 1; - while (num_features_ref > 0) { - num_features_aligned <<= 1; - num_features_ref >>= 1; - } - num_features_ref = *cuda_num_features; __shared__ uint8_t best_found[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ int best_feature[NUM_FEATURES_PER_SYNC_BLOCK]; __shared__ double best_gain[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ uint8_t best_default_left[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ uint32_t best_threshold[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ double best_left_sum_gradient[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ double best_left_sum_hessian[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ data_size_t best_left_count[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ double best_left_gain[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ double best_left_output[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ double best_right_sum_gradient[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ double best_right_sum_hessian[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ data_size_t best_right_count[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ double best_right_gain[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ double best_right_output[NUM_FEATURES_PER_SYNC_BLOCK]; + __shared__ uint32_t shared_read_index[NUM_FEATURES_PER_SYNC_BLOCK]; const bool is_smaller = (blockIdx_x == 0); const int feature_index = static_cast(threadIdx_x); const uint32_t read_index = is_smaller ? 
threadIdx_x : threadIdx_x + num_features_ref; if (feature_index < num_features_ref) { best_found[feature_index] = cuda_best_split_found[read_index]; - best_feature[feature_index] = feature_index; best_gain[feature_index] = cuda_best_split_gain[read_index]; - best_default_left[feature_index] = cuda_best_split_default_left[read_index]; - best_threshold[feature_index] = cuda_best_split_threshold[read_index]; - best_left_sum_gradient[feature_index] = cuda_best_split_left_sum_gradient[read_index]; - best_left_sum_hessian[feature_index] = cuda_best_split_left_sum_hessian[read_index]; - best_left_count[feature_index] = cuda_best_split_left_count[read_index]; - best_left_gain[feature_index] = cuda_best_split_left_gain[read_index]; - best_left_output[feature_index] = cuda_best_split_left_output[read_index]; - best_right_sum_gradient[feature_index] = cuda_best_split_right_sum_gradient[read_index]; - best_right_sum_hessian[feature_index] = cuda_best_split_right_sum_hessian[read_index]; - best_right_count[feature_index] = cuda_best_split_right_count[read_index]; - best_right_gain[feature_index] = cuda_best_split_right_gain[read_index]; - best_right_output[feature_index] = cuda_best_split_right_output[read_index]; + shared_read_index[feature_index] = read_index; } else { best_found[feature_index] = 0; } __syncthreads(); - ReduceBestSplit(best_found, best_feature, best_gain, - best_default_left, best_threshold, - best_left_sum_gradient, best_left_sum_hessian, - best_left_count, best_left_gain, best_left_output, - best_right_sum_gradient, best_right_sum_hessian, - best_right_count, best_right_gain, best_right_output, + ReduceBestSplit(best_found, best_gain, shared_read_index, num_features_aligned, 0); if (threadIdx.x == 0) { const int leaf_index_ref = is_smaller ? *smaller_leaf_index : *larger_leaf_index; + const uint32_t best_read_index = shared_read_index[0]; if (best_found[0]) { cuda_leaf_best_split_gain[leaf_index_ref] = best_gain[0]; - cuda_leaf_best_split_feature[leaf_index_ref] = best_feature[0]; - cuda_leaf_best_split_default_left[leaf_index_ref] = best_default_left[0]; - cuda_leaf_best_split_threshold[leaf_index_ref] = best_threshold[0]; - cuda_leaf_best_split_left_sum_gradient[leaf_index_ref] = best_left_sum_gradient[0]; - cuda_leaf_best_split_left_sum_hessian[leaf_index_ref] = best_left_sum_hessian[0]; - cuda_leaf_best_split_left_count[leaf_index_ref] = best_left_count[0]; - cuda_leaf_best_split_left_gain[leaf_index_ref] = best_left_gain[0]; - cuda_leaf_best_split_left_output[leaf_index_ref] = best_left_output[0]; - cuda_leaf_best_split_right_sum_gradient[leaf_index_ref] = best_right_sum_gradient[0]; - cuda_leaf_best_split_right_sum_hessian[leaf_index_ref] = best_right_sum_hessian[0]; - cuda_leaf_best_split_right_count[leaf_index_ref] = best_right_count[0]; - cuda_leaf_best_split_right_gain[leaf_index_ref] = best_right_gain[0]; - cuda_leaf_best_split_right_output[leaf_index_ref] = best_right_output[0]; + cuda_leaf_best_split_feature[leaf_index_ref] = is_smaller ? 
static_cast(best_read_index) : + static_cast(best_read_index) - num_features_ref; + cuda_leaf_best_split_default_left[leaf_index_ref] = cuda_best_split_default_left[best_read_index]; + cuda_leaf_best_split_threshold[leaf_index_ref] = cuda_best_split_threshold[best_read_index]; + cuda_leaf_best_split_left_sum_gradient[leaf_index_ref] = cuda_best_split_left_sum_gradient[best_read_index]; + cuda_leaf_best_split_left_sum_hessian[leaf_index_ref] = cuda_best_split_left_sum_hessian[best_read_index]; + cuda_leaf_best_split_left_count[leaf_index_ref] = cuda_best_split_left_count[best_read_index]; + cuda_leaf_best_split_left_gain[leaf_index_ref] = cuda_best_split_left_gain[best_read_index]; + cuda_leaf_best_split_left_output[leaf_index_ref] = cuda_best_split_left_output[best_read_index]; + cuda_leaf_best_split_right_sum_gradient[leaf_index_ref] = cuda_best_split_right_sum_gradient[best_read_index]; + cuda_leaf_best_split_right_sum_hessian[leaf_index_ref] = cuda_best_split_right_sum_hessian[best_read_index]; + cuda_leaf_best_split_right_count[leaf_index_ref] = cuda_best_split_right_count[best_read_index]; + cuda_leaf_best_split_right_gain[leaf_index_ref] = cuda_best_split_right_gain[best_read_index]; + cuda_leaf_best_split_right_output[leaf_index_ref] = cuda_best_split_right_output[best_read_index]; } else { cuda_leaf_best_split_gain[leaf_index_ref] = kMinScore; } @@ -851,6 +634,13 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const } void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel(const int* smaller_leaf_index, const int* larger_leaf_index) { + int num_features = num_features_; + int num_features_aligned = 1; + num_features -= 1; + while (num_features > 0) { + num_features_aligned <<= 1; + num_features >>= 1; + } SyncBestSplitForLeafKernel<<<2, NUM_FEATURES_PER_SYNC_BLOCK>>>(smaller_leaf_index, larger_leaf_index, cuda_num_features_, cuda_leaf_best_split_feature_, cuda_leaf_best_split_default_left_, cuda_leaf_best_split_threshold_, cuda_leaf_best_split_gain_, @@ -875,7 +665,9 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel(const int* smaller_le cuda_best_split_right_gain_, cuda_best_split_right_output_, cuda_best_split_found_, - cuda_feature_default_bins_); + cuda_feature_default_bins_, + num_features_, + num_features_aligned); } __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, @@ -887,9 +679,9 @@ __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, const double* cuda_leaf_best_split_right_sum_gradient, const double* cuda_leaf_best_split_right_sum_hessian, const data_size_t* cuda_leaf_best_split_left_count, - const data_size_t* cuda_leaf_best_split_right_count) { + const data_size_t* cuda_leaf_best_split_right_count, + int* cuda_best_split_info_buffer) { const int cuda_cur_num_leaves_ref = *cuda_cur_num_leaves; - double best_gain = kMinScore; __shared__ double thread_best_gain[NUM_THREADS_FIND_BEST_LEAF]; __shared__ int thread_best_leaf[NUM_THREADS_FIND_BEST_LEAF]; const unsigned int threadIdx_x = threadIdx.x; @@ -906,44 +698,67 @@ __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, } } __syncthreads(); - ReduceBestGainForLeaves(thread_best_gain, thread_best_leaf); + ReduceBestGainForLeaves(thread_best_gain, thread_best_leaf, cuda_cur_num_leaves_ref); if (threadIdx_x == 0) { *out_best_leaf = thread_best_leaf[0]; - best_gain = thread_best_gain[0]; - /*if (best_gain <= 0.0f) { - printf("error !!! 
too smaller best gain %f\n", best_gain); - }*/ + cuda_best_split_info_buffer[8] = thread_best_leaf[0]; } - /*for (int leaf_index = 0; leaf_index < cuda_cur_num_leaves_ref; ++leaf_index) { - const double leaf_best_gain = cuda_leaf_best_split_gain[leaf_index]; - //printf("cuda_leaf_best_split_gain[%d] = %f\n", leaf_index, leaf_best_gain); - if (leaf_best_gain > best_gain) { - best_gain = leaf_best_gain; - *out_best_leaf = leaf_index; - } - }*/ - //printf("find best cuda_leaf_best_split_gain[%d] = %f\n", *out_best_leaf, best_gain); - //printf("split feature = %d, threshold = %d, default_bin = %d\n", - // cuda_leaf_best_split_feature[*out_best_leaf], cuda_leaf_best_split_threshold[*out_best_leaf], cuda_feature_default_bins[*out_best_leaf]); - /*printf("left_sum_gradient = %f, left_sum_hessian = %f, left_count = %d\n", - cuda_leaf_best_split_left_sum_gradient[*out_best_leaf], - cuda_leaf_best_split_left_sum_hessian[*out_best_leaf], - cuda_leaf_best_split_left_count[*out_best_leaf]);*/ - /*printf("right_sum_gradient = %f, right_sum_hessian = %f, right_count = %d\n", - cuda_leaf_best_split_right_sum_gradient[*out_best_leaf], - cuda_leaf_best_split_right_sum_hessian[*out_best_leaf], - cuda_leaf_best_split_right_count[*out_best_leaf]);*/ } -void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves) { - FindBestFromAllSplitsKernel<<<1, NUM_THREADS_FIND_BEST_LEAF>>>(cuda_cur_num_leaves, cuda_leaf_best_split_gain_, cuda_best_leaf_, +__global__ void PrepareLeafBestSplitInfo(const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, + int* cuda_best_split_info_buffer, const int* cuda_leaf_best_split_feature, + const uint32_t* cuda_leaf_best_split_threshold, const uint8_t* cuda_leaf_best_split_default_left) { + const int smaller_leaf_index_ref = *cuda_smaller_leaf_index; + const int larger_leaf_index_ref = *cuda_larger_leaf_index; + const unsigned int threadIdx_x = threadIdx.x; + if (threadIdx_x == 0) { + cuda_best_split_info_buffer[0] = smaller_leaf_index_ref; + } else if (threadIdx_x == 1) { + cuda_best_split_info_buffer[1] = cuda_leaf_best_split_feature[smaller_leaf_index_ref]; + } else if (threadIdx_x == 2) { + cuda_best_split_info_buffer[2] = cuda_leaf_best_split_threshold[smaller_leaf_index_ref]; + } else if (threadIdx_x == 3) { + cuda_best_split_info_buffer[3] = cuda_leaf_best_split_default_left[smaller_leaf_index_ref]; + } else if (threadIdx_x == 4) { + cuda_best_split_info_buffer[4] = larger_leaf_index_ref; + } else if (threadIdx_x == 5) { + cuda_best_split_info_buffer[5] = cuda_leaf_best_split_feature[larger_leaf_index_ref]; + } else if (threadIdx_x == 6) { + cuda_best_split_info_buffer[6] = cuda_leaf_best_split_threshold[larger_leaf_index_ref]; + } else if (threadIdx_x == 7) { + cuda_best_split_info_buffer[7] = cuda_leaf_best_split_default_left[larger_leaf_index_ref]; + } +} + +void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, + const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, std::vector* leaf_best_split_feature, + std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index) { + FindBestFromAllSplitsKernel<<<1, NUM_THREADS_FIND_BEST_LEAF, 0, cuda_streams_[1]>>>(cuda_cur_num_leaves, cuda_leaf_best_split_gain_, cuda_best_leaf_, cuda_leaf_best_split_feature_, cuda_leaf_best_split_threshold_, cuda_feature_default_bins_, cuda_leaf_best_split_left_sum_gradient_, cuda_leaf_best_split_left_sum_hessian_, cuda_leaf_best_split_right_sum_gradient_, 
cuda_leaf_best_split_right_sum_hessian_, cuda_leaf_best_split_left_count_, - cuda_leaf_best_split_right_count_); + cuda_leaf_best_split_right_count_, + cuda_best_split_info_buffer_); + PrepareLeafBestSplitInfo<<<1, 8, 0, cuda_streams_[0]>>>(cuda_smaller_leaf_index, cuda_larger_leaf_index, + cuda_best_split_info_buffer_, cuda_leaf_best_split_feature_, + cuda_leaf_best_split_threshold_, cuda_leaf_best_split_default_left_); + std::vector cpu_leaf_best_split_info_buffer(9); + SynchronizeCUDADevice(); + CopyFromCUDADeviceToHost(cpu_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 9); + const int smaller_leaf_index = cpu_leaf_best_split_info_buffer[0]; + const int larger_leaf_index = cpu_leaf_best_split_info_buffer[4]; + (*leaf_best_split_feature)[smaller_leaf_index] = cpu_leaf_best_split_info_buffer[1]; + (*leaf_best_split_threshold)[smaller_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[2]); + (*leaf_best_split_default_left)[smaller_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[3]); + if (larger_leaf_index >= 0) { + (*leaf_best_split_feature)[larger_leaf_index] = cpu_leaf_best_split_info_buffer[5]; + (*leaf_best_split_threshold)[larger_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[6]); + (*leaf_best_split_default_left)[larger_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[7]); + } + *best_leaf_index = cpu_leaf_best_split_info_buffer[8]; } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index d3e836ab8612..6496d08b3fda 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -37,9 +37,12 @@ class CUDABestSplitFinder { void BeforeTrain(); - void FindBestSplitsForLeaf(const CUDALeafSplits* smaller_leaf_splits, const CUDALeafSplits* larger_leaf_splits); + void FindBestSplitsForLeaf(const CUDALeafSplits* smaller_leaf_splits, const CUDALeafSplits* larger_leaf_splits, + const int smaller_leaf_index, const int larger_leaf_index); - void FindBestFromAllSplits(const int* cuda_cur_num_leaves); + void FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int* smaller_leaf_index, + const int* larger_leaf_index, std::vector* leaf_best_split_feature, + std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index); const int* cuda_best_leaf() const { return cuda_best_leaf_; } @@ -95,15 +98,14 @@ class CUDABestSplitFinder { } private: - void LaunchFindBestSplitsForLeafKernel(const int* smaller_leaf_id, const int* larger_leaf_id, - const double* smaller_leaf_gain, const double* larger_leaf_gain, const double* sum_gradients_in_smaller_leaf, - const double* sum_hessians_in_smaller_leaf, const data_size_t* num_data_in_smaller_leaf, hist_t** smaller_leaf_hist, - const double* sum_gradients_in_larger_leaf, const double* sum_hessians_in_larger_leaf, - const data_size_t* num_data_in_larger_leaf, hist_t** larger_leaf_hist); + void LaunchFindBestSplitsForLeafKernel(const CUDALeafSplits* smaller_leaf_splits, + const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index); void LaunchSyncBestSplitForLeafKernel(const int* smaller_leaf_index, const int* larger_leaf_index); - void LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves); + void LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, const int* cuda_smaller_leaf_index, + const int* cuda_larger_leaf_index, std::vector* leaf_best_split_feature, + 
std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index); // Host memory const int num_features_; @@ -113,6 +115,7 @@ class CUDABestSplitFinder { std::vector feature_hist_offsets_; std::vector feature_mfb_offsets_; std::vector feature_default_bins_; + std::vector feature_num_bins_; // None --> 0, Zero --> 1, NaN --> 2 std::vector feature_missing_type_; const double lambda_l1_; @@ -120,6 +123,14 @@ class CUDABestSplitFinder { const data_size_t min_data_in_leaf_; const double min_sum_hessian_in_leaf_; const double min_gain_to_split_; + std::vector cuda_streams_; + // for best split find tasks + std::vector cpu_task_feature_index_; + std::vector cpu_task_reverse_; + std::vector cpu_task_skip_default_bin_; + std::vector cpu_task_na_as_missing_; + std::vector cpu_task_out_default_left_; + int num_tasks_; // CUDA memory, held by this object // for per leaf best split information @@ -163,11 +174,15 @@ class CUDABestSplitFinder { uint8_t* cuda_feature_mfb_offsets_; uint32_t* cuda_feature_default_bins_; uint8_t* cuda_feature_missing_type_; - double* cuda_lambda_l1_; - double* cuda_lambda_l2_; - data_size_t* cuda_min_data_in_leaf_; - double* cuda_min_sum_hessian_in_leaf_; - double* cuda_min_gain_to_split_; + uint32_t* cuda_feature_num_bins_; + // best split information buffer, to be copied to CPU + int* cuda_best_split_info_buffer_; + // find best split task information + int* cuda_task_feature_index_; + uint8_t* cuda_task_reverse_; + uint8_t* cuda_task_skip_default_bin_; + uint8_t* cuda_task_na_as_missing_; + uint8_t* cuda_task_out_default_left_; // CUDA memory, held by other object const hist_t* cuda_hist_; diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index dc22c88b340a..220dc95baec7 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -98,7 +98,7 @@ void CUDADataPartition::Init() { InitCUDAMemoryFromHostMemory(&cuda_feature_missing_is_na_, feature_missing_is_na_.data(), static_cast(num_features_)); InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_zero_, feature_mfb_is_zero_.data(), static_cast(num_features_)); InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_na_, feature_mfb_is_na_.data(), static_cast(num_features_)); - AllocateCUDAMemory(5, &cuda_split_info_buffer_); + AllocateCUDAMemory(8, &cuda_split_info_buffer_); AllocateCUDAMemory(static_cast(num_leaves_), &tree_split_leaf_index_); AllocateCUDAMemory(static_cast(num_leaves_), &tree_inner_feature_index_); @@ -121,11 +121,14 @@ void CUDADataPartition::Init() { CopyColWiseData(); cpu_train_data_score_tmp_.resize(num_data_, 0.0f); - cpu_split_info_buffer_.resize(5, 0); + cpu_split_info_buffer_.resize(6, 0); - cuda_streams_.resize(2); + cuda_streams_.resize(5); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[2])); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[3])); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[4])); } void CUDADataPartition::CopyColWiseData() { @@ -173,27 +176,25 @@ void CUDADataPartition::Split(const int* leaf_id, double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** 
larger_leaf_cuda_hist_pointer_pointer) { - data_size_t cpu_num_data_in_leaf = 0; - int cpu_split_feature_index = 0; - uint32_t cpu_split_threshold = 0; - uint8_t cpu_split_default_left = 0; - data_size_t cpu_leaf_data_start = 0; + hist_t** larger_leaf_cuda_hist_pointer_pointer, + std::vector* cpu_leaf_num_data, + std::vector* cpu_leaf_data_start, + const std::vector& cpu_leaf_best_split_feature, + const std::vector& cpu_leaf_best_split_threshold, + const std::vector& cpu_leaf_best_split_default_left, + int* smaller_leaf_index, int* larger_leaf_index, + const int cpu_leaf_index) { global_timer.Start("GenDataToLeftBitVector"); global_timer.Start("SplitInner Copy CUDA To Host"); - PrepareCUDASplitInforBuffer(leaf_id, best_split_feature, best_split_threshold, best_split_default_left); - global_timer.Start("PrepareCUDASplitInforBuffer"); - CopyFromCUDADeviceToHost(cpu_split_info_buffer_.data(), cuda_split_info_buffer_, 5); - global_timer.Stop("PrepareCUDASplitInforBuffer"); - cpu_num_data_in_leaf = static_cast(cpu_split_info_buffer_[0]); - cpu_split_feature_index = static_cast(cpu_split_info_buffer_[1]); - cpu_split_threshold = static_cast(cpu_split_info_buffer_[2]); - cpu_split_default_left = static_cast(cpu_split_info_buffer_[3]); - cpu_leaf_data_start = static_cast(cpu_split_info_buffer_[4]); + const data_size_t num_data_in_leaf = cpu_leaf_num_data->at(cpu_leaf_index); + const int split_feature_index = cpu_leaf_best_split_feature[cpu_leaf_index]; + const uint32_t split_threshold = cpu_leaf_best_split_threshold[cpu_leaf_index]; + const uint8_t split_default_left = cpu_leaf_best_split_default_left[cpu_leaf_index]; + const data_size_t leaf_data_start = cpu_leaf_data_start->at(cpu_leaf_index); global_timer.Stop("SplitInner Copy CUDA To Host"); auto start = std::chrono::steady_clock::now(); //GenDataToLeftBitVector(leaf_id, cpu_num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left); - GenDataToLeftBitVector2(cpu_num_data_in_leaf, cpu_split_feature_index, cpu_split_threshold, cpu_split_default_left, cpu_leaf_data_start); + GenDataToLeftBitVector2(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); global_timer.Stop("GenDataToLeftBitVector"); @@ -201,7 +202,7 @@ void CUDADataPartition::Split(const int* leaf_id, global_timer.Start("SplitInner"); start = std::chrono::steady_clock::now(); - SplitInner(leaf_id, cpu_num_data_in_leaf, + SplitInner(leaf_id, num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, best_left_sum_gradients, best_left_sum_hessians, best_left_count, best_left_gain, best_left_leaf_value, @@ -216,7 +217,8 @@ void CUDADataPartition::Split(const int* leaf_id, larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer); + larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_num_data, cpu_leaf_data_start, + smaller_leaf_index, larger_leaf_index); end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); global_timer.Stop("SplitInner"); @@ -254,7 +256,9 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* 
larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer) { + hist_t** larger_leaf_cuda_hist_pointer_pointer, + std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + int* smaller_leaf_index, int* larger_leaf_index) { LaunchSplitInnerKernel(leaf_index, num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, best_left_sum_gradients, best_left_sum_hessians, best_left_count, @@ -270,7 +274,8 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer); + larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_num_data, cpu_leaf_data_start, + smaller_leaf_index, larger_leaf_index); ++cur_num_leaves_; } @@ -284,11 +289,6 @@ void CUDADataPartition::UpdateTrainScore(const double learning_rate, double* tra }*/ } -void CUDADataPartition::PrepareCUDASplitInforBuffer(const int* leaf_id, const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left) { - LaunchPrepareCUDASplitInforBufferKernel(leaf_id, best_split_feature, best_split_threshold, best_split_default_left); -} - } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 0f87fc44b90a..b66a2c18ee0f 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -850,48 +850,6 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const int* leaf_index SynchronizeCUDADevice(); } -__global__ void PrepareOffsetKernel(const int* leaf_index, - const data_size_t* cuda_leaf_num_data, const uint8_t* split_to_left_bit_vector, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { - const unsigned int blockDim_x = blockDim.x; - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; - const unsigned int threadIdx_x = threadIdx.x; - const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); - const unsigned int global_read_index = blockIdx.x * blockDim.x * 2 + threadIdx_x; - const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[*leaf_index]; - if (global_read_index < num_data_in_leaf_ref) { - const uint8_t bit = split_to_left_bit_vector[global_read_index]; - thread_to_left_offset_cnt[conflict_free_threadIdx_x] = bit; - } else { - thread_to_left_offset_cnt[conflict_free_threadIdx_x] = 0; - } - const unsigned int conflict_free_threadIdx_x_offseted = CONFLICT_FREE_INDEX(threadIdx_x + blockDim_x); - if (global_read_index + blockDim_x < num_data_in_leaf_ref) { - const uint8_t bit = split_to_left_bit_vector[global_read_index + blockDim_x]; - thread_to_left_offset_cnt[conflict_free_threadIdx_x_offseted] = bit; - } else { - thread_to_left_offset_cnt[conflict_free_threadIdx_x_offseted] = 0; - } - __syncthreads(); - PrefixSum(thread_to_left_offset_cnt, split_indices_block_size_data_partition); - __syncthreads(); - if (threadIdx_x == 0) { - const unsigned int conflict_free_blockDim_x_times_2 = 
CONFLICT_FREE_INDEX(blockDim_x << 1); - const data_size_t num_data_in_block = (blockIdx.x + 1) * blockDim.x * 2 <= num_data_in_leaf_ref ? static_cast(blockDim_x * 2) : - num_data_in_leaf_ref - static_cast(blockIdx.x * blockDim.x * 2); - if (num_data_in_block > 0) { - const data_size_t data_to_left = static_cast(thread_to_left_offset_cnt[conflict_free_blockDim_x_times_2]); - block_to_left_offset_buffer[blockIdx.x + 1] = data_to_left; - block_to_right_offset_buffer[blockIdx.x + 1] = num_data_in_block - data_to_left; - } else { - block_to_left_offset_buffer[blockIdx.x + 1] = 0; - block_to_right_offset_buffer[blockIdx.x + 1] = 0; - } - } -} - __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, @@ -1001,6 +959,17 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b } if (blockIdx.x == 0 && threadIdx.x == 0) { ++(*cuda_cur_num_leaves); + const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref]; + const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; + block_to_left_offset_buffer[0] = 0; + const unsigned int to_left_total_cnt = block_to_left_offset_buffer[num_blocks]; + block_to_right_offset_buffer[0] = to_left_total_cnt; + const data_size_t old_leaf_data_end = cuda_leaf_data_end[leaf_index_ref]; + cuda_leaf_data_end[leaf_index_ref] = cuda_leaf_data_start[leaf_index_ref] + static_cast(to_left_total_cnt); + cuda_leaf_num_data[leaf_index_ref] = static_cast(to_left_total_cnt); + cuda_leaf_data_start[cur_max_leaf_index] = cuda_leaf_data_end[leaf_index_ref]; + cuda_leaf_data_end[cur_max_leaf_index] = old_leaf_data_end; + cuda_leaf_num_data[cur_max_leaf_index] = block_to_right_offset_buffer[num_blocks] - to_left_total_cnt; } } @@ -1031,21 +1000,12 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo int* tree_split_leaf_index, int* tree_inner_feature_index, uint32_t* tree_threshold, double* tree_left_output, double* tree_right_output, data_size_t* tree_left_count, data_size_t* tree_right_count, double* tree_left_sum_hessian, double* tree_right_sum_hessian, double* tree_gain, uint8_t* tree_default_left, - double* data_partition_leaf_output) { + double* data_partition_leaf_output, + int* cuda_split_info_buffer) { if (blockIdx.x == 0 && threadIdx.x == 0) { const int leaf_index_ref = *leaf_index; - const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref]; - const uint32_t num_blocks = (num_data_in_leaf + split_indices_block_size_data_partition - 1) / split_indices_block_size_data_partition; const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; - block_to_left_offset_buffer[0] = 0; - const unsigned int to_left_total_cnt = block_to_left_offset_buffer[num_blocks]; - block_to_right_offset_buffer[0] = to_left_total_cnt; - const data_size_t old_leaf_data_end = cuda_leaf_data_end[leaf_index_ref]; - cuda_leaf_data_end[leaf_index_ref] = cuda_leaf_data_start[leaf_index_ref] + static_cast(to_left_total_cnt); - cuda_leaf_num_data[leaf_index_ref] = static_cast(to_left_total_cnt); - cuda_leaf_data_start[cur_max_leaf_index] = cuda_leaf_data_end[leaf_index_ref]; - cuda_leaf_data_end[cur_max_leaf_index] = old_leaf_data_end; - cuda_leaf_num_data[cur_max_leaf_index] = block_to_right_offset_buffer[num_blocks] - to_left_total_cnt; + const unsigned int to_left_total_cnt = 
cuda_leaf_num_data[leaf_index_ref]; const int cuda_num_total_bin_ref = *cuda_num_total_bin; tree_split_leaf_index[cur_max_leaf_index - 1] = leaf_index_ref; @@ -1062,6 +1022,13 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo data_partition_leaf_output[leaf_index_ref] = best_left_leaf_value[leaf_index_ref]; data_partition_leaf_output[cur_max_leaf_index] = best_right_leaf_value[leaf_index_ref]; + cuda_split_info_buffer[0] = leaf_index_ref; + cuda_split_info_buffer[1] = cuda_leaf_num_data[leaf_index_ref]; + cuda_split_info_buffer[2] = cuda_leaf_data_start[leaf_index_ref]; + cuda_split_info_buffer[3] = cur_max_leaf_index; + cuda_split_info_buffer[4] = cuda_leaf_num_data[cur_max_leaf_index]; + cuda_split_info_buffer[5] = cuda_leaf_data_start[cur_max_leaf_index]; + if (cuda_leaf_num_data[leaf_index_ref] < cuda_leaf_num_data[cur_max_leaf_index]) { *smaller_leaf_cuda_leaf_index_pointer = leaf_index_ref; *smaller_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; @@ -1084,6 +1051,8 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo cuda_hist_pool[leaf_index_ref] = cuda_hist + 2 * cur_max_leaf_index * cuda_num_total_bin_ref; *smaller_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[leaf_index_ref]; *larger_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[cur_max_leaf_index]; + cuda_split_info_buffer[6] = leaf_index_ref; + cuda_split_info_buffer[7] = cur_max_leaf_index; } else { *larger_leaf_cuda_leaf_index_pointer = leaf_index_ref; *larger_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; @@ -1104,6 +1073,8 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo cuda_hist_pool[cur_max_leaf_index] = cuda_hist + 2 * cur_max_leaf_index * cuda_num_total_bin_ref; *smaller_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[cur_max_leaf_index]; *larger_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[leaf_index_ref]; + cuda_split_info_buffer[6] = cur_max_leaf_index; + cuda_split_info_buffer[7] = leaf_index_ref; } } } @@ -1211,7 +1182,9 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer) { + hist_t** larger_leaf_cuda_hist_pointer_pointer, + std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + int* smaller_leaf_index, int* larger_leaf_index) { const int num_blocks = std::max(80, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; int split_indices_block_size_data_partition_aligned = 1; @@ -1219,14 +1192,9 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data split_indices_block_size_data_partition_aligned <<= 1; split_indices_block_size_data_partition >>= 1; } - global_timer.Start("CUDADataPartition::PrepareOffsetKernel"); - auto start = std::chrono::steady_clock::now(); const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; - auto end = std::chrono::steady_clock::now(); - double duration = (static_cast>(end - start)).count(); - 
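For readability, SplitTreeStructureKernel above also publishes the post-split bookkeeping through the 8-slot cuda_split_info_buffer_, which the host reads back asynchronously further below. A sketch of the slot meanings, assuming the left/right naming used here; the enum and enumerator names are illustrative only:

enum SplitInfoBufferSlot {
  kLeftLeafIndex      = 0,  // left child keeps the parent's leaf index
  kLeftLeafNumData    = 1,
  kLeftLeafDataStart  = 2,
  kRightLeafIndex     = 3,  // right child is the newest leaf (cur_max_leaf_index)
  kRightLeafNumData   = 4,
  kRightLeafDataStart = 5,
  kSmallerChildIndex  = 6,  // child with fewer rows; its histogram is constructed next
  kLargerChildIndex   = 7   // the other child; its histogram is obtained by subtraction
};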
global_timer.Stop("CUDADataPartition::PrepareOffsetKernel"); global_timer.Start("CUDADataPartition::AggregateBlockOffsetKernel"); - start = std::chrono::steady_clock::now(); + auto start = std::chrono::steady_clock::now(); AggregateBlockOffsetKernel<<<1, split_indices_block_size_data_partition_aligned / 2>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, @@ -1256,10 +1224,26 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, data_partition_leaf_output_); SynchronizeCUDADevice(); - end = std::chrono::steady_clock::now(); - duration = (static_cast>(end - start)).count(); + auto end = std::chrono::steady_clock::now(); + auto duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); global_timer.Start("CUDADataPartition::SplitInnerKernel"); + + SplitInnerKernel<<>>( + leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, + cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + cuda_out_data_indices_in_leaf_, split_indices_block_size_data_partition_aligned); + end = std::chrono::steady_clock::now(); + duration = (static_cast>(end - start)).count(); + global_timer.Stop("CUDADataPartition::SplitInnerKernel"); + global_timer.Start("CUDADataPartition::CopyDataIndicesKernel"); + start = std::chrono::steady_clock::now(); + CopyDataIndicesKernel<<>>( + leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, cuda_data_indices_); + end = std::chrono::steady_clock::now(); + duration = (static_cast>(end - start)).count(); + global_timer.Stop("CUDADataPartition::CopyDataIndicesKernel"); + start = std::chrono::steady_clock::now(); global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); SplitTreeStructureKernel<<<1, 1, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, @@ -1289,25 +1273,23 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, - data_partition_leaf_output_); + data_partition_leaf_output_, cuda_split_info_buffer_); global_timer.Stop("CUDADataPartition::SplitTreeStructureKernel"); - - SplitInnerKernel<<>>( - leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, - cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - cuda_out_data_indices_in_leaf_, split_indices_block_size_data_partition_aligned); + std::vector cpu_split_info_buffer(8); + CopyFromCUDADeviceToHostAsync(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 8, cuda_streams_[0]); SynchronizeCUDADevice(); - end = std::chrono::steady_clock::now(); - duration = (static_cast>(end - start)).count(); - global_timer.Stop("CUDADataPartition::SplitInnerKernel"); - global_timer.Start("CUDADataPartition::CopyDataIndicesKernel"); - start = std::chrono::steady_clock::now(); - CopyDataIndicesKernel<<>>( - leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, cuda_data_indices_); - SynchronizeCUDADevice(); - end = 
std::chrono::steady_clock::now(); - duration = (static_cast>(end - start)).count(); - global_timer.Stop("CUDADataPartition::CopyDataIndicesKernel"); + const int left_leaf_index = cpu_split_info_buffer[0]; + const data_size_t left_leaf_num_data = cpu_split_info_buffer[1]; + const data_size_t left_leaf_data_start = cpu_split_info_buffer[2]; + const int right_leaf_index = cpu_split_info_buffer[3]; + const data_size_t right_leaf_num_data = cpu_split_info_buffer[4]; + const data_size_t right_leaf_data_start = cpu_split_info_buffer[5]; + (*cpu_leaf_num_data)[left_leaf_index] = left_leaf_num_data; + (*cpu_leaf_data_start)[left_leaf_index] = left_leaf_data_start; + (*cpu_leaf_num_data)[right_leaf_index] = right_leaf_num_data; + (*cpu_leaf_data_start)[right_leaf_index] = right_leaf_data_start; + *smaller_leaf_index = cpu_split_info_buffer[6]; + *larger_leaf_index = cpu_split_info_buffer[7]; } __global__ void PrefixSumKernel(uint32_t* cuda_elements) { @@ -1354,26 +1336,6 @@ void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double learning_r global_timer.Stop("CUDADataPartition::AddPredictionToScoreKernel"); } -__global__ void PrepareCUDASplitInforBufferKernel(const int* leaf_index, const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left, - const data_size_t* cuda_leaf_num_data, const data_size_t* cuda_leaf_data_start, - int* cuda_split_info_buffer) { - const int leaf_index_ref = *leaf_index; - cuda_split_info_buffer[0] = cuda_leaf_num_data[leaf_index_ref]; - cuda_split_info_buffer[1] = best_split_feature[leaf_index_ref]; - cuda_split_info_buffer[2] = best_split_threshold[leaf_index_ref]; - cuda_split_info_buffer[3] = best_split_default_left[leaf_index_ref]; - cuda_split_info_buffer[4] = cuda_leaf_data_start[leaf_index_ref]; -} - -void CUDADataPartition::LaunchPrepareCUDASplitInforBufferKernel(const int* leaf_id, const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left) { - PrepareCUDASplitInforBufferKernel<<<1, 1>>>(leaf_id, best_split_feature, best_split_threshold, best_split_default_left, - cuda_leaf_num_data_, cuda_leaf_data_start_, - cuda_split_info_buffer_); - SynchronizeCUDADevice(); -} - __global__ void CopyColWiseDataKernel(const uint8_t* row_wise_data, const data_size_t num_data, const int num_features, uint8_t* col_wise_data) { diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 161201411662..d078da4871cc 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -47,7 +47,14 @@ class CUDADataPartition { double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer); + hist_t** larger_leaf_cuda_hist_pointer_pointer, + std::vector* cpu_leaf_num_data, + std::vector* cpu_leaf_data_start, + const std::vector& cpu_leaf_best_split_feature, + const std::vector& cpu_leaf_best_split_threshold, + const std::vector& cpu_leaf_best_split_default_left, + int* smaller_leaf_index, int* larger_leaf_index, + const int cpu_leaf_index); Tree* GetCPUTree(); @@ -175,12 +182,6 @@ class CUDADataPartition { void LaunchCopyColWiseDataKernel(); - void PrepareCUDASplitInforBuffer(const int* leaf_id, const int* best_split_feature, const 
uint32_t* best_split_threshold, - const uint8_t* best_split_default_left); - - void LaunchPrepareCUDASplitInforBufferKernel(const int* leaf_id, const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left); - void GenDataToLeftBitVector(const int* leaf_id, const data_size_t num_data_in_leaf, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); @@ -205,7 +206,9 @@ class CUDADataPartition { double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer); + hist_t** larger_leaf_cuda_hist_pointer_pointer, + std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + int* smaller_leaf_index, int* larger_leaf_index); // kernel launch functions void LaunchFillDataIndicesBeforeTrain(); @@ -227,7 +230,9 @@ class CUDADataPartition { double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer); + hist_t** larger_leaf_cuda_hist_pointer_pointer, + std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + int* smaller_leaf_index, int* larger_leaf_index); void LaunchGenDataToLeftBitVectorKernel(const int* leaf_index, const data_size_t num_data_in_leaf, const int* best_split_feature, const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 7f6cb21fbe71..a4dc752247e7 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -109,9 +109,10 @@ void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller const int* cuda_larger_leaf_index, const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** cuda_data_indices_in_larger_leaf, const double* cuda_smaller_leaf_sum_gradients, const double* cuda_smaller_leaf_sum_hessians, hist_t** cuda_smaller_leaf_hist, const double* cuda_larger_leaf_sum_gradients, const double* cuda_larger_leaf_sum_hessians, hist_t** cuda_larger_leaf_hist, - const data_size_t* cuda_leaf_num_data) { + const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf) { auto start = std::chrono::steady_clock::now(); - LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_num_data_in_smaller_leaf, cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data, cuda_smaller_leaf_hist); + LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_num_data_in_smaller_leaf, + cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data, cuda_smaller_leaf_hist, num_data_in_smaller_leaf); SynchronizeCUDADevice(); auto end = std::chrono::steady_clock::now(); double duration = (static_cast>(end - start)).count(); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 8e81d366f4c8..ae4c51b85893 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -105,16 +105,11 @@ void 
CUDAHistogramConstructor::LaunchConstructHistogramKernel( const data_size_t* cuda_smaller_leaf_num_data, const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t* cuda_leaf_num_data, - hist_t** cuda_leaf_hist) { - global_timer.Start("CUDAHistogramConstructor::LaunchConstructHistogramKernel::CopyFromCUDADeviceToHost"); - data_size_t smaller_leaf_num_data = 0; - CopyFromCUDADeviceToHost(&smaller_leaf_num_data, cuda_smaller_leaf_num_data, 1); - SynchronizeCUDADevice(); - global_timer.Stop("CUDAHistogramConstructor::LaunchConstructHistogramKernel::CopyFromCUDADeviceToHost"); + hist_t** cuda_leaf_hist, const data_size_t num_data_in_smaller_leaf) { const int block_dim_x = num_features_; // TODO(shiyu1994): only supports the case when the whole histogram can be loaded into shared memory const int block_dim_y = NUM_THRADS_PER_BLOCK / block_dim_x; const int min_grid_dim_y = 160; - const int grid_dim_y = std::max(min_grid_dim_y, ((smaller_leaf_num_data + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + block_dim_y - 1) / block_dim_y); + const int grid_dim_y = std::max(min_grid_dim_y, ((num_data_in_smaller_leaf + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + block_dim_y - 1) / block_dim_y); const int grid_dim_x = (static_cast(num_feature_groups_ + NUM_FEATURE_PER_THREAD_GROUP - 1) / NUM_FEATURE_PER_THREAD_GROUP); //Log::Warning("smaller_leaf_num_data = %d", smaller_leaf_num_data); //Log::Warning("block_dim_x = %d, block_dim_y = %d", block_dim_x, block_dim_y); @@ -188,9 +183,6 @@ __global__ void FixHistogramKernel(const int* cuda_smaller_leaf_index, num_bin_to_shift >>= 1; num_bin_aligned <<= 1; } - /*if (threadIdx.x == 0) { - printf("num_bin_aligned = %d\n", num_bin_aligned); - }*/ __syncthreads(); PrefixSum(hist_gradients, num_bin_aligned); PrefixSum(hist_hessians, num_bin_aligned); @@ -199,10 +191,6 @@ __global__ void FixHistogramKernel(const int* cuda_smaller_leaf_index, feature_hist[most_freq_bin << 1] = leaf_sum_gradients - hist_gradients[num_bin_aligned]; feature_hist[(most_freq_bin << 1) + 1] = leaf_sum_hessians - hist_hessians[num_bin_aligned]; } - if (threadIdx.x == 0) { - //printf("fix most freq bin: feature_hist_offset %d + most_freq_bin %d = %d, num_bin_aligned = %d, leaf_sum_gradients = %f, leaf_sum_hessians = %f, hist_gradients[num_bin_aligned] = %f, hist_hessians[num_bin_aligned] = %f, feature_hist[most_freq_bin << 1] = %f, feature_hist[(most_freq_bin << 1) + 1] = %f\n", - // feature_hist_offset, most_freq_bin, feature_hist_offset + most_freq_bin, num_bin_aligned, leaf_sum_gradients, leaf_sum_hessians, hist_gradients[num_bin_aligned], hist_hessians[num_bin_aligned], feature_hist[most_freq_bin << 1], feature_hist[(most_freq_bin << 1) + 1]); - } } } } @@ -213,12 +201,10 @@ void CUDAHistogramConstructor::LaunchSubtractHistogramKernel(const int* cuda_sma hist_t** cuda_smaller_leaf_hist, hist_t** cuda_larger_leaf_hist) { const int num_subtract_threads = 2 * num_total_bin_; const int num_subtract_blocks = (num_subtract_threads + SUBTRACT_BLOCK_SIZE - 1) / SUBTRACT_BLOCK_SIZE; - //Log::Warning("Before SubtractHistogramKernel"); SubtractHistogramKernel<<>>( cuda_smaller_leaf_index, cuda_larger_leaf_index, cuda_feature_mfb_offsets_, cuda_feature_num_bins_, cuda_num_total_bin_, cuda_smaller_leaf_hist, cuda_larger_leaf_hist); SynchronizeCUDADevice(); - //Log::Warning("After SubtractHistogramKernel"); FixHistogramKernel<<<2 * num_features_, FIX_HISTOGRAM_BLOCK_SIZE>>>( cuda_smaller_leaf_index, cuda_larger_leaf_index, cuda_feature_num_bins_, cuda_num_features_, @@ -227,7 +213,6 
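A minimal sketch of the launch-geometry computation used above, now driven by the smaller leaf's row count that the host already tracks rather than a device-to-host copy. The helper name and the constant values in the example are assumptions for illustration, not the values defined in this patch:

static int GridDimY(const int num_data_in_smaller_leaf, const int block_dim_y,
                    const int num_data_per_thread, const int min_grid_dim_y) {
  // one thread covers num_data_per_thread rows; round up twice and clamp from below
  const int data_threads =
      (num_data_in_smaller_leaf + num_data_per_thread - 1) / num_data_per_thread;
  const int grid_dim_y = (data_threads + block_dim_y - 1) / block_dim_y;
  return grid_dim_y > min_grid_dim_y ? grid_dim_y : min_grid_dim_y;
}
// Example with assumed values: 1,000,000 rows, block_dim_y = 8, num_data_per_thread = 400,
// min_grid_dim_y = 160 gives data_threads = 2500 and grid_dim_y = max(160, 313) = 313.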
@@ void CUDAHistogramConstructor::LaunchSubtractHistogramKernel(const int* cuda_sma larger_leaf_sum_gradients, larger_leaf_sum_hessians, cuda_smaller_leaf_hist, cuda_larger_leaf_hist); SynchronizeCUDADevice(); - //Log::Warning("After FixHistogramKernel"); } __global__ void GetOrderedGradientsKernel(const data_size_t num_data_in_leaf, const data_size_t** cuda_data_indices_in_leaf, diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 880342079a57..235c5ac18a20 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -38,7 +38,7 @@ class CUDAHistogramConstructor { const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** cuda_data_indices_in_larger_leaf, const double* cuda_smaller_leaf_sum_gradients, const double* cuda_smaller_leaf_sum_hessians, hist_t** cuda_smaller_leaf_hist, const double* cuda_larger_leaf_sum_gradients, const double* cuda_larger_leaf_sum_hessians, hist_t** cuda_larger_leaf_hist, - const data_size_t* cuda_leaf_num_data); + const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf); void BeforeTrain(); @@ -85,7 +85,7 @@ class CUDAHistogramConstructor { const data_size_t* cuda_smaller_leaf_num_data, const data_size_t** cuda_data_indices_in_leaf, const data_size_t* cuda_leaf_num_data, - hist_t** cuda_leaf_hist); + hist_t** cuda_leaf_hist, const data_size_t num_data_in_smaller_leaf); void LaunchSubtractHistogramKernel(const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, const double* smaller_leaf_sum_gradients, const double* smaller_leaf_sum_hessians, diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index ec24311ab7af..2f128f5fe9bf 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -62,6 +62,12 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia cuda_score_updater_->SetInitScore(cuda_binary_objective_->cuda_init_score()); //cuda_best_split_finder_->TestAfterInit(); + + leaf_best_split_feature_.resize(config_->num_leaves, -1); + leaf_best_split_threshold_.resize(config_->num_leaves, 0); + leaf_best_split_default_left_.resize(config_->num_leaves, 0); + leaf_num_data_.resize(config_->num_leaves, 0); + leaf_data_start_.resize(config_->num_leaves, 0); } void NewCUDATreeLearner::BeforeTrain() { @@ -93,184 +99,28 @@ void NewCUDATreeLearner::BeforeTrain() { duration = static_cast>(end - start); //Log::Warning("cuda_best_split_finder_->BeforeTrain() duration = %f", duration.count()); //cuda_data_partition_->Test(); - - //SerialTreeLearner::BeforeTrain(); - /*#pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int device_id = 0; device_id < num_gpus_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - device_leaf_splits_initializers_[device_id]->Init(); - }*/ + leaf_num_data_[0] = num_data_; + leaf_data_start_[0] = 0; + smaller_leaf_index_ = 0; + larger_leaf_index_ = -1; } -void NewCUDATreeLearner::AllocateMemory(const bool is_constant_hessian) { - /*device_data_indices_.resize(num_gpus_, nullptr); - device_gradients_.resize(num_gpus_, nullptr); - device_gradients_and_hessians_.resize(num_gpus_, nullptr); - if (!is_constant_hessian) { - device_hessians_.resize(num_gpus_, nullptr); - } - device_histograms_.resize(num_gpus_, nullptr); - const int num_total_bin_from_dataset = 
train_data_->NumTotalBin(); - const int num_total_bin_from_share_states = share_state_->num_hist_total_bin(); - const int num_total_bin = std::max(num_total_bin_from_dataset, num_total_bin_from_share_states); - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int device_id = 0; device_id < num_gpus_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - if (device_data_indices_[device_id] != nullptr) { - CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); - } - void* data_indices_ptr = reinterpret_cast(device_data_indices_[device_id]); - CUDASUCCESS_OR_FATAL(cudaMalloc(&data_indices_ptr, num_data_ * sizeof(data_size_t))); - device_data_indices_[device_id] = reinterpret_cast(data_indices_ptr); - if (device_gradients_[device_id] != nullptr) { - CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id])); - } - void* gradients_ptr = reinterpret_cast(device_gradients_[device_id]); - CUDASUCCESS_OR_FATAL(cudaMalloc(&gradients_ptr, num_data_ * sizeof(float))); - device_gradients_[device_id] = reinterpret_cast(gradients_ptr); - AllocateCUDAMemory(2 * num_data_ * sizeof(score_t), &device_gradients_and_hessians_[device_id]); - if (!is_constant_hessian) { - if (device_hessians_[device_id] != nullptr) { - CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id])); - } - void* hessians_ptr = reinterpret_cast(device_hessians_[device_id]); - CUDASUCCESS_OR_FATAL(cudaMalloc(&hessians_ptr, num_data_ * sizeof(float))); - device_hessians_[device_id] = reinterpret_cast(hessians_ptr); - } - if (device_histograms_[device_id] != nullptr) { - CUDASUCCESS_OR_FATAL(cudaFree(device_histograms_[device_id])); - } - void* histograms_ptr = reinterpret_cast(device_histograms_[device_id]); - CUDASUCCESS_OR_FATAL(cudaMalloc(&histograms_ptr, num_total_bin * 2 * sizeof(double))); - device_histograms_[device_id] = reinterpret_cast(histograms_ptr); - }*/ -} +void NewCUDATreeLearner::AllocateMemory(const bool is_constant_hessian) {} -void NewCUDATreeLearner::CreateCUDAHistogramConstructors() { - /*Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors num_gpus_ = %d", num_gpus_); - device_histogram_constructors_.resize(num_gpus_); - device_leaf_splits_initializers_.resize(num_gpus_); - device_best_split_finders_.resize(num_gpus_); - device_splitters_.resize(num_gpus_); - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int device_id = 0; device_id < num_gpus_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 1", num_gpus_); - device_leaf_splits_initializers_[device_id].reset( - new CUDALeafSplitsInit(device_gradients_[device_id], device_hessians_[device_id], num_data_)); - Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 2", num_gpus_); - device_histogram_constructors_[device_id].reset( - new CUDAHistogramConstructor(device_feature_groups_[device_id], - train_data_, config_->num_leaves, device_histograms_[device_id])); - Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 3", num_gpus_); - device_best_split_finders_[device_id].reset( - new CUDABestSplitFinder(device_histogram_constructors_[device_id]->cuda_hist(), - train_data_, device_feature_groups_[device_id], config_->num_leaves)); - Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 4", num_gpus_); - device_splitters_[device_id].reset( - new CUDADataSplitter(num_data_, config_->num_leaves)); - 
Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 5", num_gpus_); - } - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int device_id = 0; device_id < num_gpus_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - device_leaf_splits_initializers_[device_id]->Init(); - device_histogram_constructors_[device_id]->Init(); - device_splitters_[device_id]->Init(); - } - Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 6", num_gpus_); - PushDataIntoDeviceHistogramConstructors(); - Log::Warning("NewCUDATreeLearner::CreateCUDAHistogramConstructors step 7", num_gpus_);*/ -} +void NewCUDATreeLearner::CreateCUDAHistogramConstructors() {} -void NewCUDATreeLearner::PushDataIntoDeviceHistogramConstructors() { - /*#pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int device_id = 0; device_id < num_gpus_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - CUDAHistogramConstructor* cuda_histogram_constructor = device_histogram_constructors_[device_id].get(); - for (int group_id : device_feature_groups_[device_id]) { - BinIterator* iter = train_data_->FeatureGroupIterator(group_id); - iter->Reset(0); - for (data_size_t data_index = 0; data_index < num_data_; ++data_index) { - const uint32_t bin = static_cast(iter->RawGet(data_index)); - CHECK_LE(bin, 255); - cuda_histogram_constructor->PushOneData(bin, group_id, data_index); - } - } - // call finish load to tranfer data from CPU to GPU - cuda_histogram_constructor->FinishLoad(); - }*/ -} +void NewCUDATreeLearner::PushDataIntoDeviceHistogramConstructors() {} -void NewCUDATreeLearner::FindBestSplits(const Tree* tree) { - /*std::vector is_feature_used(num_features_, 1); - ConstructHistograms(is_feature_used, true); - FindBestSplitsFromHistograms(is_feature_used, true, tree);*/ -} +void NewCUDATreeLearner::FindBestSplits(const Tree* tree) {} void NewCUDATreeLearner::ConstructHistograms(const std::vector& /*is_feature_used*/, - bool /*use_subtract*/) { - /*#pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int device_id = 0; device_id < num_gpus_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - - }*/ -} + bool /*use_subtract*/) {} void NewCUDATreeLearner::FindBestSplitsFromHistograms(const std::vector& /*is_feature_used*/, - bool /*use_subtract*/, const Tree* /*tree*/) { - /*#pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int device_id = 0; device_id < num_gpus_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - device_best_split_finders_[device_id]->FindBestSplitsForLeaf( - device_leaf_splits_initializers_[device_id]->smaller_leaf_index()); - device_best_split_finders_[device_id]->FindBestSplitsForLeaf( - device_leaf_splits_initializers_[device_id]->larger_leaf_index()); - device_best_split_finders_[device_id]->FindBestFromAllSplits(); - }*/ -} + bool /*use_subtract*/, const Tree* /*tree*/) {} void NewCUDATreeLearner::Split(Tree* /*tree*/, int /*best_leaf*/, - int* /*left_leaf*/, int* /*right_leaf*/) { - /*#pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int device_id = 0; device_id < num_gpus_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - device_splitters_[device_id]->Split( - device_best_split_finders_[device_id]->best_leaf(), - device_best_split_finders_[device_id]->best_split_feature_index(), - device_best_split_finders_[device_id]->best_split_threshold()); - }*/ -} - -/*void 
NewCUDATreeLearner::SplitTree(Tree* tree) { - int leaf_index = 0; - int inner_feature_index = 0; - uint32_t threshold = 0; - double left_output = 0.0f; - double right_output = 0.0f; - data_size_t left_count = 0; - data_size_t right_count = 0; - double left_sum_hessian = 0.0f; - double right_sum_hessian = 0.0f; - double gain = 0.0f; - uint8_t default_left = 0; - CopyFromCUDADeviceToHost(&leaf_index, cuda_best_split_finder_->cuda_best_leaf(), 1); - CopyFromCUDADeviceToHost(&inner_feature_index, cuda_best_split_finder_->cuda_leaf_best_split_feature() + leaf_index, 1); - CopyFromCUDADeviceToHost(&threshold, cuda_best_split_finder_->cuda_leaf_best_split_threshold() + leaf_index, 1); - CopyFromCUDADeviceToHost(&left_output, cuda_best_split_finder_->cuda_leaf_best_split_left_output() + leaf_index, 1); - CopyFromCUDADeviceToHost(&right_output, cuda_best_split_finder_->cuda_leaf_best_split_right_output() + leaf_index, 1); - CopyFromCUDADeviceToHost(&left_count, cuda_best_split_finder_->cuda_leaf_best_split_left_count() + leaf_index, 1); - CopyFromCUDADeviceToHost(&right_count, cuda_best_split_finder_->cuda_leaf_best_split_right_count() + leaf_index, 1); - CopyFromCUDADeviceToHost(&left_sum_hessian, cuda_best_split_finder_->cuda_leaf_best_split_left_sum_hessian() + leaf_index, 1); - CopyFromCUDADeviceToHost(&right_sum_hessian, cuda_best_split_finder_->cuda_leaf_best_split_right_sum_hessian() + leaf_index, 1); - CopyFromCUDADeviceToHost(&gain, cuda_best_split_finder_->cuda_leaf_best_split_gain() + leaf_index, 1); - CopyFromCUDADeviceToHost(&default_left, cuda_best_split_finder_->cuda_leaf_best_split_default_left() + leaf_index, 1); - SynchronizeCUDADevice(); - int real_feature_index = train_data_->RealFeatureIndex(inner_feature_index); - double real_split_threshold = train_data_->RealThreshold(inner_feature_index, threshold); - tree->Split(leaf_index, inner_feature_index, real_feature_index, threshold, real_split_threshold, left_output, right_output, left_count, right_count, - left_sum_hessian, right_sum_hessian, gain, train_data_->FeatureBinMapper(inner_feature_index)->missing_type(), static_cast(default_left)); -}*/ + int* /*left_leaf*/, int* /*right_leaf*/) {} void NewCUDATreeLearner::AddPredictionToScore(const Tree* /*tree*/, double* out_score) const { const auto start = std::chrono::steady_clock::now(); @@ -293,7 +143,6 @@ Tree* NewCUDATreeLearner::BuildTree() { std::vector right_sum_hessian(config_->num_leaves); std::vector gain(config_->num_leaves); std::vector default_left(config_->num_leaves); - //Log::Warning("BuildTree step 0"); CopyFromCUDADeviceToHost(leaf_index.data(), cuda_data_partition_->tree_split_leaf_index(), config_->num_leaves); CopyFromCUDADeviceToHost(inner_feature_index.data(), cuda_data_partition_->tree_inner_feature_index(), config_->num_leaves); CopyFromCUDADeviceToHost(threshold.data(), cuda_data_partition_->tree_threshold(), config_->num_leaves); @@ -305,23 +154,7 @@ Tree* NewCUDATreeLearner::BuildTree() { CopyFromCUDADeviceToHost(right_sum_hessian.data(), cuda_data_partition_->tree_right_sum_hessian(), config_->num_leaves); CopyFromCUDADeviceToHost(gain.data(), cuda_data_partition_->tree_gain(), config_->num_leaves); CopyFromCUDADeviceToHost(default_left.data(), cuda_data_partition_->tree_default_left(), config_->num_leaves); - //Log::Warning("BuildTree step 1"); for (int i = 0; i < config_->num_leaves - 1; ++i) { - /*Log::Warning("BuildTree step 2"); - Log::Warning("leaf_index[i] = %d", leaf_index[i]); - Log::Warning("inner_feature_index[i] = %d", 
inner_feature_index[i]); - Log::Warning("train_data_->RealFeatureIndex(inner_feature_index[i]) = %d", train_data_->RealFeatureIndex(inner_feature_index[i])); - Log::Warning("threshold[i] = %d", threshold[i]); - Log::Warning("train_data_->RealThreshold(inner_feature_index[i], threshold[i]) = %f", train_data_->RealThreshold(inner_feature_index[i], threshold[i])); - Log::Warning("left_output[i] = %f", left_output[i]); - Log::Warning("right_output[i] = %f", right_output[i]); - Log::Warning("left_count[i] = %d", left_count[i]); - Log::Warning("right_count[i] = %d", right_count[i]); - Log::Warning("left_sum_hessian[i] = %f", left_sum_hessian[i]); - Log::Warning("right_sum_hessian[i] = %f", right_sum_hessian[i]); - Log::Warning("gain[i] = %f", gain[i]); - Log::Warning("train_data_->FeatureBinMapper(inner_feature_index[i])->missing_type() = %d", train_data_->FeatureBinMapper(inner_feature_index[i])->missing_type()); - Log::Warning("default_left[i] = %d", default_left[i]);*/ tree->Split( leaf_index[i], inner_feature_index[i], @@ -338,7 +171,6 @@ Tree* NewCUDATreeLearner::BuildTree() { train_data_->FeatureBinMapper(inner_feature_index[i])->missing_type(), static_cast(default_left[i])); } - //Log::Warning("BuildTree step 3"); return tree.release(); } @@ -374,7 +206,8 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, cuda_larger_leaf_splits_->cuda_sum_of_gradients_pointer(), cuda_larger_leaf_splits_->cuda_sum_of_hessians_pointer(), cuda_larger_leaf_splits_->cuda_hist_in_leaf_pointer_pointer(), - cuda_data_partition_->cuda_leaf_num_data()); + cuda_data_partition_->cuda_leaf_num_data(), + leaf_num_data_[smaller_leaf_index_]); /*if (i == 0) { cuda_histogram_constructor_->TestAfterConstructHistogram(); }*/ @@ -386,23 +219,21 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, global_timer.Start("NewCUDATreeLearner::FindBestSplitsForLeaf"); start = std::chrono::steady_clock::now(); cuda_best_split_finder_->FindBestSplitsForLeaf(cuda_smaller_leaf_splits_.get(), - cuda_larger_leaf_splits_.get()); + cuda_larger_leaf_splits_.get(), smaller_leaf_index_, larger_leaf_index_); //Log::Warning("Before FindBestFromAllSplits"); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); global_timer.Stop("NewCUDATreeLearner::FindBestSplitsForLeaf"); find_best_split_time += duration.count(); start = std::chrono::steady_clock::now(); - cuda_best_split_finder_->FindBestFromAllSplits(cuda_data_partition_->cuda_cur_num_leaves()); + global_timer.Start("NewCUDATreeLearner::FindBestFromAllSplits"); + cuda_best_split_finder_->FindBestFromAllSplits(cuda_data_partition_->cuda_cur_num_leaves(), + cuda_smaller_leaf_splits_->cuda_leaf_index(), cuda_larger_leaf_splits_->cuda_leaf_index(), + &leaf_best_split_feature_, &leaf_best_split_threshold_, &leaf_best_split_default_left_, &best_leaf_index_); + global_timer.Stop("NewCUDATreeLearner::FindBestFromAllSplits"); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); find_best_split_from_all_leaves_time += duration.count(); - //Log::Warning("Before Split"); - //start = std::chrono::steady_clock::now(); - //SplitTree(tree.get()); - //end = std::chrono::steady_clock::now(); - //duration = static_cast>(end - start); - //split_tree_time += duration.count(); global_timer.Start("NewCUDATreeLearner::Split"); start = std::chrono::steady_clock::now(); cuda_data_partition_->Split(cuda_best_split_finder_->cuda_best_leaf(), @@ -437,7 +268,15 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, 
cuda_larger_leaf_splits_->cuda_gain_pointer(), cuda_larger_leaf_splits_->cuda_leaf_value_pointer(), cuda_larger_leaf_splits_->cuda_data_indices_in_leaf_pointer_pointer(), - cuda_larger_leaf_splits_->cuda_hist_in_leaf_pointer_pointer()); + cuda_larger_leaf_splits_->cuda_hist_in_leaf_pointer_pointer(), + &leaf_num_data_, + &leaf_data_start_, + leaf_best_split_feature_, + leaf_best_split_threshold_, + leaf_best_split_default_left_, + &smaller_leaf_index_, + &larger_leaf_index_, + best_leaf_index_); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); global_timer.Stop("NewCUDATreeLearner::Split"); @@ -458,43 +297,6 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, Log::Warning("split data indices time %f", split_data_indices_time); //Log::Warning("split tree time %f", split_tree_time); Log::Warning("build tree time %f", build_tre_duration); - /*cuda_data_partition_->Test(); - cuda_histogram_constructor_->ConstructHistogramForLeaf( - cuda_smaller_leaf_splits_->cuda_leaf_index(), - cuda_larger_leaf_splits_->cuda_leaf_index(), - cuda_smaller_leaf_splits_->cuda_data_indices_in_leaf(), - cuda_larger_leaf_splits_->cuda_data_indices_in_leaf(), - cuda_data_partition_->cuda_leaf_num_data()); - cuda_best_split_finder_->FindBestSplitsForLeaf(cuda_smaller_leaf_splits_.get(), - cuda_larger_leaf_splits_.get()); - cuda_best_split_finder_->FindBestFromAllSplits(cuda_data_partition_->cuda_cur_num_leaves()); - cuda_best_split_finder_->TestAfterFindBestSplits();*/ - //cuda_data_partition_->TestPrefixSum(); - /*cuda_data_partition_->Split(cuda_best_split_finder_->cuda_best_leaf(), - cuda_best_split_finder_->cuda_leaf_best_split_feature(), - cuda_best_split_finder_->cuda_leaf_best_split_threshold(), - cuda_best_split_finder_->cuda_leaf_best_split_default_left()); - cuda_data_partition_->TestAfterSplit();*/ - //cuda_histogram_constructor_->TestAfterConstructHistogram(); - /*CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); - CUDASUCCESS_OR_FATAL(cudaMemcpy(device_gradients_[0], gradients, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice)); - CUDASUCCESS_OR_FATAL(cudaMemcpy(device_hessians_[0], hessians, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice)); - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (data_size_t i = 0; i < num_data_; ++i) { - gradients_and_hessians_[2 * i] = gradients[i]; - gradients_and_hessians_[2 * i + 1] = hessians[i]; - } - CopyFromHostToCUDADevice(device_gradients_and_hessians_[0], gradients_and_hessians_.data(), 2 * static_cast(num_data_)); - Log::Warning("before initialization of leaf splits"); - device_leaf_splits_initializers_[0]->Compute(); - Log::Warning("after initialization of leaf splits"); - device_splitters_[0]->BeforeTrain(nullptr); - Log::Warning("after initialization of data indices"); - device_histogram_constructors_[0]->ConstructHistogramForLeaf(device_leaf_splits_initializers_[0]->smaller_leaf_index(), - device_leaf_splits_initializers_[0]->larger_leaf_index(), - device_splitters_[0]->leaf_num_data(), device_splitters_[0]->leaf_num_data_offsets(), - device_splitters_[0]->data_indices(), device_gradients_[0], device_hessians_[0], device_gradients_and_hessians_[0]); - Log::Warning("after construction of root histograms");*/ return tree.release(); } diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index 4a8622bf0606..31d34e83b114 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -79,6 +79,15 
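A simplified view of the per-iteration data flow that the members declared below make possible: the per-leaf bookkeeping stays on the host, and only the small info buffers cross the device boundary. This is a summary of the BeforeTrain()/Train() changes above, not new behavior:

// After BeforeTrain() sets leaf_num_data_[0] = num_data_, leaf_data_start_[0] = 0,
// smaller_leaf_index_ = 0 and larger_leaf_index_ = -1, each split proceeds as:
//   ConstructHistogramForLeaf(..., leaf_num_data_[smaller_leaf_index_]);  // host count sizes the grid
//   FindBestSplitsForLeaf(smaller, larger, smaller_leaf_index_, larger_leaf_index_);
//   FindBestFromAllSplits(...);  // fills leaf_best_split_{feature,threshold,default_left}_
//                                // and best_leaf_index_ from the 9-slot buffer
//   Split(...);                  // updates leaf_num_data_, leaf_data_start_,
//                                // smaller_leaf_index_, larger_leaf_index_ from the 8-slot buffer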
@@ class NewCUDATreeLearner: public SerialTreeLearner { std::unique_ptr cuda_binary_objective_; + std::vector leaf_best_split_feature_; + std::vector leaf_best_split_threshold_; + std::vector leaf_best_split_default_left_; + std::vector leaf_num_data_; + std::vector leaf_data_start_; + int smaller_leaf_index_; + int larger_leaf_index_; + int best_leaf_index_; + /* // full data indices on CUDA devices, as the data indices of data_partition_ in CPU version std::vector device_data_indices_; diff --git a/src/treelearner/cuda/new_cuda_utils.hpp b/src/treelearner/cuda/new_cuda_utils.hpp index 846bf43c85e2..0e7f041c80c8 100644 --- a/src/treelearner/cuda/new_cuda_utils.hpp +++ b/src/treelearner/cuda/new_cuda_utils.hpp @@ -55,11 +55,11 @@ void CopyFromCUDADeviceToHost(T* dst_ptr, const T* src_ptr, size_t size) { } template -void CopyFromCUDADeviceToHostAsync(T* dst_ptr, const T* src_ptr, size_t size) { +void CopyFromCUDADeviceToHostAsync(T* dst_ptr, const T* src_ptr, size_t size, cudaStream_t stream) { void* void_dst_ptr = reinterpret_cast(dst_ptr); const void* void_src_ptr = reinterpret_cast(src_ptr); size_t size_in_bytes = size * sizeof(T); - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost)); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost, stream)); } template From a58c1e19df00028049bdaad76b3ea86b59b00eed Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 2 Jun 2021 03:47:44 +0000 Subject: [PATCH 016/166] use tasks instead of features as units for split finding --- .../cuda/cuda_best_split_finder.cpp | 3 +- .../cuda/cuda_best_split_finder.cu | 277 ++++++++++-------- .../cuda/cuda_best_split_finder.hpp | 7 +- src/treelearner/cuda/cuda_binary_objective.cu | 4 - 4 files changed, 157 insertions(+), 134 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index bc5feb15b64d..d42e010d31a7 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -129,7 +129,6 @@ void CUDABestSplitFinder::Init() { InitCUDAMemoryFromHostMemory(&cuda_task_out_default_left_, cpu_task_out_default_left_.data(), cpu_task_out_default_left_.size()); const size_t output_buffer_size = 2 * static_cast(num_tasks_); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_feature_); AllocateCUDAMemory(output_buffer_size, &cuda_best_split_default_left_); AllocateCUDAMemory(output_buffer_size, &cuda_best_split_threshold_); AllocateCUDAMemory(output_buffer_size, &cuda_best_split_gain_); @@ -164,7 +163,7 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_le LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits, smaller_leaf_index, larger_leaf_index); SynchronizeCUDADevice(); global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); - LaunchSyncBestSplitForLeafKernel(smaller_leaf_splits->cuda_leaf_index(), larger_leaf_splits->cuda_leaf_index()); + LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index); SynchronizeCUDADevice(); global_timer.Stop("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); auto end = std::chrono::steady_clock::now(); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index a6f61a188fe6..ff2cccd7072b 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -376,19 +376,25 
@@ __global__ void FindBestSplitsForOneLeafKernel( const uint8_t* feature_missing_types, const uint32_t* feature_num_bins, // input task information + const int num_tasks, const int* task_feature_index, const uint8_t* task_reverse, const uint8_t* task_skip_default_bin, const uint8_t* task_na_as_missing, const uint8_t* task_out_default_left, // input leaf information - const bool is_larger, - const int leaf_index, - const double* leaf_gain, - const double* sum_gradients_in_leaf, - const double* sum_hessians_in_leaf, - const data_size_t* num_data_in_leaf, - hist_t** leaf_hist, + const int smaller_leaf_index, + const double* smaller_leaf_gain, + const double* smaller_sum_gradients_in_leaf, + const double* smaller_sum_hessians_in_leaf, + const data_size_t* smaller_num_data_in_leaf, + hist_t** smaller_leaf_hist, + const int larger_leaf_index, + const double* larger_leaf_gain, + const double* larger_sum_gradients_in_leaf, + const double* larger_sum_hessians_in_leaf, + const data_size_t* larger_num_data_in_leaf, + hist_t** larger_leaf_hist, // input config parameter values const data_size_t min_data_in_leaf, const double min_sum_hessian_in_leaf, @@ -411,18 +417,19 @@ __global__ void FindBestSplitsForOneLeafKernel( double* cuda_best_split_right_output, uint8_t* cuda_best_split_found) { - const unsigned int task_index = blockIdx.x; + const unsigned int task_index = blockIdx.x % num_tasks; + const bool is_larger = static_cast(blockIdx.x >= num_tasks); const int inner_feature_index = task_feature_index[task_index]; const bool reverse = static_cast(task_reverse[task_index]); const bool skip_default_bin = static_cast(task_skip_default_bin[task_index]); const bool na_as_missing = static_cast(task_na_as_missing[task_index]); const bool assume_out_default_left = task_out_default_left[task_index]; - uint8_t* out_default_left = cuda_best_split_default_left + task_index; - const double parent_gain = *leaf_gain; - const double sum_gradients = *sum_gradients_in_leaf; - const double sum_hessians = (*sum_hessians_in_leaf) + 2 * kEpsilon; - const double num_data = *num_data_in_leaf; - const unsigned int output_offset = is_larger ? (task_index + gridDim.x) : task_index; + const double parent_gain = is_larger ? *larger_leaf_gain : *smaller_leaf_gain; + const double sum_gradients = is_larger ? *larger_sum_gradients_in_leaf : *smaller_sum_gradients_in_leaf; + const double sum_hessians = (is_larger ? *larger_sum_hessians_in_leaf : *smaller_sum_hessians_in_leaf) + 2 * kEpsilon; + const double num_data = is_larger ? *larger_num_data_in_leaf : *smaller_num_data_in_leaf; + const unsigned int output_offset = is_larger ? (task_index + num_tasks) : task_index; + uint8_t* out_default_left = cuda_best_split_default_left + output_offset; uint32_t* out_threshold = cuda_best_split_threshold + output_offset; double* out_left_sum_gradients = cuda_best_split_left_sum_gradient + output_offset; double* out_left_sum_hessians = cuda_best_split_left_sum_hessian + output_offset; @@ -436,7 +443,7 @@ __global__ void FindBestSplitsForOneLeafKernel( double* out_right_gain = cuda_best_split_right_gain + output_offset; uint8_t* out_found = cuda_best_split_found + output_offset; double* out_gain = cuda_best_split_gain + output_offset; - const hist_t* hist_ptr = *leaf_hist + feature_hist_offsets[inner_feature_index] * 2; + const hist_t* hist_ptr = (is_larger ? 
*larger_leaf_hist : *smaller_leaf_hist) + feature_hist_offsets[inner_feature_index] * 2; FindBestSplitsForLeafKernelInner(hist_ptr, feature_num_bins[inner_feature_index], feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], feature_missing_types[inner_feature_index], lambda_l1, lambda_l2, parent_gain, @@ -451,7 +458,8 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index) { - FindBestSplitsForOneLeafKernel<<>>( + const int num_blocks = larger_leaf_index >= 0 ? num_tasks_ * 2 : num_tasks_; + FindBestSplitsForOneLeafKernel<<>>( // input feature information cuda_feature_hist_offsets_, cuda_feature_mfb_offsets_, @@ -459,19 +467,25 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_feature_missing_type_, cuda_feature_num_bins_, // input task information + num_tasks_, cuda_task_feature_index_, cuda_task_reverse_, cuda_task_skip_default_bin_, cuda_task_na_as_missing_, cuda_task_out_default_left_, // input leaf information - false, smaller_leaf_index, smaller_leaf_splits->cuda_gain(), smaller_leaf_splits->cuda_sum_of_gradients(), smaller_leaf_splits->cuda_sum_of_hessians(), smaller_leaf_splits->cuda_num_data_in_leaf(), smaller_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), + larger_leaf_index, + larger_leaf_splits->cuda_gain(), + larger_leaf_splits->cuda_sum_of_gradients(), + larger_leaf_splits->cuda_sum_of_hessians(), + larger_leaf_splits->cuda_num_data_in_leaf(), + larger_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), // configuration parameter values min_data_in_leaf_, min_sum_hessian_in_leaf_, @@ -493,51 +507,6 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_best_split_right_gain_, cuda_best_split_right_output_, cuda_best_split_found_); - - if (larger_leaf_index >= 0) { - FindBestSplitsForOneLeafKernel<<>>( - // input feature information - cuda_feature_hist_offsets_, - cuda_feature_mfb_offsets_, - cuda_feature_default_bins_, - cuda_feature_missing_type_, - cuda_feature_num_bins_, - // input task information - cuda_task_feature_index_, - cuda_task_reverse_, - cuda_task_skip_default_bin_, - cuda_task_na_as_missing_, - cuda_task_out_default_left_, - // input leaf information - true, - larger_leaf_index, - larger_leaf_splits->cuda_gain(), - larger_leaf_splits->cuda_sum_of_gradients(), - larger_leaf_splits->cuda_sum_of_hessians(), - larger_leaf_splits->cuda_num_data_in_leaf(), - larger_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), - // configuration parameter values - min_data_in_leaf_, - min_sum_hessian_in_leaf_, - min_gain_to_split_, - lambda_l1_, - lambda_l2_, - // output parameters - cuda_best_split_threshold_, - cuda_best_split_default_left_, - cuda_best_split_gain_, - cuda_best_split_left_sum_gradient_, - cuda_best_split_left_sum_hessian_, - cuda_best_split_left_count_, - cuda_best_split_left_gain_, - cuda_best_split_left_output_, - cuda_best_split_right_sum_gradient_, - cuda_best_split_right_sum_hessian_, - cuda_best_split_right_count_, - cuda_best_split_right_gain_, - cuda_best_split_right_output_, - cuda_best_split_found_); - } } __device__ void ReduceBestSplit(uint8_t* found, double* gain, uint32_t* shared_read_index, @@ -557,7 +526,7 @@ __device__ void ReduceBestSplit(uint8_t* found, double* gain, uint32_t* shared_r } } -__global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const int* larger_leaf_index, +__global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, 
const int larger_leaf_index, const int* cuda_num_features, int* cuda_leaf_best_split_feature, uint8_t* cuda_leaf_best_split_default_left, uint32_t* cuda_leaf_best_split_threshold, double* cuda_leaf_best_split_gain, double* cuda_leaf_best_split_left_sum_gradient, double* cuda_leaf_best_split_left_sum_hessian, @@ -567,7 +536,7 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const data_size_t* cuda_leaf_best_split_right_count, double* cuda_leaf_best_split_right_gain, double* cuda_leaf_best_split_right_output, // input parameters - const int* cuda_best_split_feature, + const int* cuda_task_feature_index, const uint8_t* cuda_best_split_default_left, const uint32_t* cuda_best_split_threshold, const double* cuda_best_split_gain, @@ -583,38 +552,39 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const const double* cuda_best_split_right_output, const uint8_t* cuda_best_split_found, const uint32_t* cuda_feature_default_bins, - const int num_features_ref, - const int num_features_aligned) { + const int num_tasks, + const int num_tasks_aligned, + const int num_blocks_per_leaf) { const uint32_t threadIdx_x = threadIdx.x; const uint32_t blockIdx_x = blockIdx.x; - __shared__ uint8_t best_found[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ double best_gain[NUM_FEATURES_PER_SYNC_BLOCK]; - __shared__ uint32_t shared_read_index[NUM_FEATURES_PER_SYNC_BLOCK]; - - const bool is_smaller = (blockIdx_x == 0); - const int feature_index = static_cast(threadIdx_x); - const uint32_t read_index = is_smaller ? threadIdx_x : threadIdx_x + num_features_ref; - if (feature_index < num_features_ref) { - best_found[feature_index] = cuda_best_split_found[read_index]; - best_gain[feature_index] = cuda_best_split_gain[read_index]; - shared_read_index[feature_index] = read_index; + __shared__ uint8_t best_found[NUM_TASKS_PER_SYNC_BLOCK]; + __shared__ double best_gain[NUM_TASKS_PER_SYNC_BLOCK]; + __shared__ uint32_t shared_read_index[NUM_TASKS_PER_SYNC_BLOCK]; + + const bool is_smaller = (blockIdx_x < static_cast(num_blocks_per_leaf)); + const uint32_t leaf_block_index = is_smaller ? blockIdx_x : (blockIdx_x - static_cast(num_blocks_per_leaf)); + const int task_index = static_cast(leaf_block_index * num_blocks_per_leaf + threadIdx_x); + const uint32_t read_index = is_smaller ? static_cast(task_index) : static_cast(task_index + num_tasks); + if (task_index < num_tasks) { + best_found[task_index] = cuda_best_split_found[read_index]; + best_gain[task_index] = cuda_best_split_gain[read_index]; + shared_read_index[task_index] = read_index; } else { - best_found[feature_index] = 0; + best_found[task_index] = 0; } __syncthreads(); - ReduceBestSplit(best_found, best_gain, shared_read_index, - num_features_aligned, 0); + ReduceBestSplit(best_found, best_gain, shared_read_index, num_tasks_aligned, 0); if (threadIdx.x == 0) { - const int leaf_index_ref = is_smaller ? *smaller_leaf_index : *larger_leaf_index; + const int leaf_index_ref = is_smaller ? smaller_leaf_index : larger_leaf_index; const uint32_t best_read_index = shared_read_index[0]; if (best_found[0]) { cuda_leaf_best_split_gain[leaf_index_ref] = best_gain[0]; - cuda_leaf_best_split_feature[leaf_index_ref] = is_smaller ? static_cast(best_read_index) : - static_cast(best_read_index) - num_features_ref; + cuda_leaf_best_split_feature[leaf_index_ref] = is_smaller ? 
cuda_task_feature_index[best_read_index] : + cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; cuda_leaf_best_split_default_left[leaf_index_ref] = cuda_best_split_default_left[best_read_index]; cuda_leaf_best_split_threshold[leaf_index_ref] = cuda_best_split_threshold[best_read_index]; cuda_leaf_best_split_left_sum_gradient[leaf_index_ref] = cuda_best_split_left_sum_gradient[best_read_index]; @@ -633,41 +603,95 @@ __global__ void SyncBestSplitForLeafKernel(const int* smaller_leaf_index, const } } -void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel(const int* smaller_leaf_index, const int* larger_leaf_index) { - int num_features = num_features_; - int num_features_aligned = 1; - num_features -= 1; - while (num_features > 0) { - num_features_aligned <<= 1; - num_features >>= 1; +void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( + const int cpu_smaller_leaf_index, + const int cpu_larger_leaf_index) { + + int num_tasks = num_tasks_; + int num_tasks_aligned = 1; + num_tasks -= 1; + while (num_tasks > 0) { + num_tasks_aligned <<= 1; + num_tasks >>= 1; + } + const int num_blocks_per_leaf = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; + if (cpu_larger_leaf_index >= 0) { + SyncBestSplitForLeafKernel<<<2 * num_blocks_per_leaf, NUM_TASKS_PER_SYNC_BLOCK>>>( + cpu_smaller_leaf_index, + cpu_larger_leaf_index, + cuda_num_features_, + cuda_leaf_best_split_feature_, + cuda_leaf_best_split_default_left_, + cuda_leaf_best_split_threshold_, + cuda_leaf_best_split_gain_, + cuda_leaf_best_split_left_sum_gradient_, + cuda_leaf_best_split_left_sum_hessian_, + cuda_leaf_best_split_left_count_, + cuda_leaf_best_split_left_gain_, + cuda_leaf_best_split_left_output_, + cuda_leaf_best_split_right_sum_gradient_, + cuda_leaf_best_split_right_sum_hessian_, + cuda_leaf_best_split_right_count_, + cuda_leaf_best_split_right_gain_, + cuda_leaf_best_split_right_output_, + cuda_task_feature_index_, + cuda_best_split_default_left_, + cuda_best_split_threshold_, + cuda_best_split_gain_, + cuda_best_split_left_sum_gradient_, + cuda_best_split_left_sum_hessian_, + cuda_best_split_left_count_, + cuda_best_split_left_gain_, + cuda_best_split_left_output_, + cuda_best_split_right_sum_gradient_, + cuda_best_split_right_sum_hessian_, + cuda_best_split_right_count_, + cuda_best_split_right_gain_, + cuda_best_split_right_output_, + cuda_best_split_found_, + cuda_feature_default_bins_, + num_tasks_, + num_tasks_aligned, + num_blocks_per_leaf); + } else { + SyncBestSplitForLeafKernel<<>>( + cpu_smaller_leaf_index, + cpu_larger_leaf_index, + cuda_num_features_, + cuda_leaf_best_split_feature_, + cuda_leaf_best_split_default_left_, + cuda_leaf_best_split_threshold_, + cuda_leaf_best_split_gain_, + cuda_leaf_best_split_left_sum_gradient_, + cuda_leaf_best_split_left_sum_hessian_, + cuda_leaf_best_split_left_count_, + cuda_leaf_best_split_left_gain_, + cuda_leaf_best_split_left_output_, + cuda_leaf_best_split_right_sum_gradient_, + cuda_leaf_best_split_right_sum_hessian_, + cuda_leaf_best_split_right_count_, + cuda_leaf_best_split_right_gain_, + cuda_leaf_best_split_right_output_, + cuda_task_feature_index_, + cuda_best_split_default_left_, + cuda_best_split_threshold_, + cuda_best_split_gain_, + cuda_best_split_left_sum_gradient_, + cuda_best_split_left_sum_hessian_, + cuda_best_split_left_count_, + cuda_best_split_left_gain_, + cuda_best_split_left_output_, + cuda_best_split_right_sum_gradient_, + cuda_best_split_right_sum_hessian_, + cuda_best_split_right_count_, + 
cuda_best_split_right_gain_, + cuda_best_split_right_output_, + cuda_best_split_found_, + cuda_feature_default_bins_, + num_tasks_, + num_tasks_aligned, + num_blocks_per_leaf); } - SyncBestSplitForLeafKernel<<<2, NUM_FEATURES_PER_SYNC_BLOCK>>>(smaller_leaf_index, larger_leaf_index, - cuda_num_features_, cuda_leaf_best_split_feature_, cuda_leaf_best_split_default_left_, - cuda_leaf_best_split_threshold_, cuda_leaf_best_split_gain_, - cuda_leaf_best_split_left_sum_gradient_, cuda_leaf_best_split_left_sum_hessian_, - cuda_leaf_best_split_left_count_, cuda_leaf_best_split_left_gain_, - cuda_leaf_best_split_left_output_, - cuda_leaf_best_split_right_sum_gradient_, cuda_leaf_best_split_right_sum_hessian_, - cuda_leaf_best_split_right_count_, cuda_leaf_best_split_right_gain_, - cuda_leaf_best_split_right_output_, - cuda_best_split_feature_, - cuda_best_split_default_left_, - cuda_best_split_threshold_, - cuda_best_split_gain_, - cuda_best_split_left_sum_gradient_, - cuda_best_split_left_sum_hessian_, - cuda_best_split_left_count_, - cuda_best_split_left_gain_, - cuda_best_split_left_output_, - cuda_best_split_right_sum_gradient_, - cuda_best_split_right_sum_hessian_, - cuda_best_split_right_count_, - cuda_best_split_right_gain_, - cuda_best_split_right_output_, - cuda_best_split_found_, - cuda_feature_default_bins_, - num_features_, - num_features_aligned); } __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, @@ -688,17 +712,20 @@ __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, thread_best_gain[threadIdx_x] = kMinScore; thread_best_leaf[threadIdx_x] = -1; const int num_leaves_per_thread = (cuda_cur_num_leaves_ref + NUM_THREADS_FIND_BEST_LEAF - 1) / NUM_THREADS_FIND_BEST_LEAF; - const int start = num_leaves_per_thread * threadIdx_x; - const int end = min(start + num_leaves_per_thread, cuda_cur_num_leaves_ref); - for (int leaf_index = start; leaf_index < end; ++leaf_index) { - const double leaf_best_gain = cuda_leaf_best_split_gain[leaf_index]; - if (leaf_best_gain > thread_best_gain[threadIdx_x]) { - thread_best_gain[threadIdx_x] = leaf_best_gain; - thread_best_leaf[threadIdx_x] = leaf_index; + const int cur_num_valid_threads = (cuda_cur_num_leaves_ref + num_leaves_per_thread - 1) / num_leaves_per_thread; + if (threadIdx_x < static_cast(cur_num_valid_threads)) { + const int start = num_leaves_per_thread * threadIdx_x; + const int end = min(start + num_leaves_per_thread, cuda_cur_num_leaves_ref); + for (int leaf_index = start; leaf_index < end; ++leaf_index) { + const double leaf_best_gain = cuda_leaf_best_split_gain[leaf_index]; + if (leaf_best_gain > thread_best_gain[threadIdx_x]) { + thread_best_gain[threadIdx_x] = leaf_best_gain; + thread_best_leaf[threadIdx_x] = leaf_index; + } } } __syncthreads(); - ReduceBestGainForLeaves(thread_best_gain, thread_best_leaf, cuda_cur_num_leaves_ref); + ReduceBestGainForLeaves(thread_best_gain, thread_best_leaf, cur_num_valid_threads); if (threadIdx_x == 0) { *out_best_leaf = thread_best_leaf[0]; cuda_best_split_info_buffer[8] = thread_best_leaf[0]; diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 6496d08b3fda..2713d7224598 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -21,7 +21,7 @@ #define NUM_THREADS_FIND_BEST_LEAF (256) #define LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER (4) #define NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER (16) -#define 
NUM_FEATURES_PER_SYNC_BLOCK (32) +#define NUM_TASKS_PER_SYNC_BLOCK (256) namespace LightGBM { @@ -101,7 +101,9 @@ class CUDABestSplitFinder { void LaunchFindBestSplitsForLeafKernel(const CUDALeafSplits* smaller_leaf_splits, const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index); - void LaunchSyncBestSplitForLeafKernel(const int* smaller_leaf_index, const int* larger_leaf_index); + void LaunchSyncBestSplitForLeafKernel( + const int cpu_smaller_leaf_index, + const int cpu_larger_leaf_index); void LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, std::vector* leaf_best_split_feature, @@ -150,7 +152,6 @@ class CUDABestSplitFinder { double* cuda_leaf_best_split_right_gain_; double* cuda_leaf_best_split_right_output_; // for best split information when finding best split - int* cuda_best_split_feature_; uint8_t* cuda_best_split_default_left_; uint32_t* cuda_best_split_threshold_; double* cuda_best_split_gain_; diff --git a/src/treelearner/cuda/cuda_binary_objective.cu b/src/treelearner/cuda/cuda_binary_objective.cu index be0bb25ef062..2034614638e8 100644 --- a/src/treelearner/cuda/cuda_binary_objective.cu +++ b/src/treelearner/cuda/cuda_binary_objective.cu @@ -39,10 +39,6 @@ __global__ void CalcInitScoreKernel_2(double* out_cuda_init_score, const data_si const double pavg = suml / sumw; const double init_score = log(pavg / (1.0f - pavg)) / sigmoid; *out_cuda_init_score = init_score; - printf("cuda init score suml = %f\n", suml); - printf("cuda init score sumw = %f\n", sumw); - printf("cuda init score pavg = %f\n", pavg); - printf("cuda init score = %f\n", init_score); } void CUDABinaryObjective::LaunchCalcInitScoreKernel() { From 72d41c966b9b262a4a17e62111339464070c0edc Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 2 Jun 2021 06:49:08 +0000 Subject: [PATCH 017/166] refactor cuda best split finder --- .../cuda/cuda_best_split_finder.cpp | 6 +- .../cuda/cuda_best_split_finder.cu | 183 +++++++++++------- .../cuda/cuda_best_split_finder.hpp | 8 +- src/treelearner/cuda/cuda_leaf_splits.cu | 4 +- .../cuda/new_cuda_tree_learner.cpp | 2 +- 5 files changed, 123 insertions(+), 80 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index d42e010d31a7..e0893b53ef4d 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -144,7 +144,7 @@ void CUDABestSplitFinder::Init() { AllocateCUDAMemory(output_buffer_size, &cuda_best_split_right_output_); AllocateCUDAMemory(output_buffer_size, &cuda_best_split_found_); - AllocateCUDAMemory(9, &cuda_best_split_info_buffer_); + AllocateCUDAMemory(7, &cuda_best_split_info_buffer_); cuda_streams_.resize(2); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); @@ -171,8 +171,8 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_le //Log::Warning("FindBestSplitsForLeaf time %f", duration); } -void CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int* smaller_leaf_index, - const int* larger_leaf_index, std::vector* leaf_best_split_feature, +void CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int smaller_leaf_index, + const int larger_leaf_index, std::vector* leaf_best_split_feature, std::vector* leaf_best_split_threshold, std::vector* 
leaf_best_split_default_left, int* best_leaf_index) { auto start = std::chrono::steady_clock::now(); LaunchFindBestFromAllSplitsKernel(cuda_cur_num_leaves, smaller_leaf_index, larger_leaf_index, diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index ff2cccd7072b..243af12411a6 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -37,8 +37,6 @@ __device__ void PrefixSumHist(hist_t* elements, unsigned int n) { if (threadIdx_x < d) { const unsigned int dst_pos = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(offset * (2 * threadIdx_x + 2) - 1); const unsigned int src_pos = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(offset * (2 * threadIdx_x + 1) - 1); - //const unsigned int conflict_free_dst_pos = (dst_pos); - //const unsigned int conflict_free_src_pos = (src_pos); const hist_t src_val = elements[src_pos]; elements[src_pos] = elements[dst_pos]; elements[dst_pos] += src_val; @@ -75,8 +73,6 @@ __device__ void PrefixSumHistCnt(data_size_t* elements, unsigned int n) { if (threadIdx_x < d) { const unsigned int dst_pos = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(offset * (2 * threadIdx_x + 2) - 1); const unsigned int src_pos = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(offset * (2 * threadIdx_x + 1) - 1); - //const unsigned int conflict_free_dst_pos = (dst_pos); - //const unsigned int conflict_free_src_pos = (src_pos); const data_size_t src_val = elements[src_pos]; elements[src_pos] = elements[dst_pos]; elements[dst_pos] += src_val; @@ -101,7 +97,6 @@ __device__ void ReduceBestGain(double* gain, hist_t* sum_gradients, gain[tid] = gain[tid_s]; sum_gradients[conflict_free_tid_plus_1] = sum_gradients[conflict_free_tid_s_plus_1]; sum_hessians[conflict_free_tid_plus_1] = sum_hessians[conflict_free_tid_s_plus_1]; - //num_data[conflict_free_tid_plus_1] = num_data[conflict_free_tid_s_plus_1]; found[tid] = found[tid_s]; threshold_value[tid] = threshold_value[tid_s]; } @@ -180,13 +175,30 @@ __device__ double GetSplitGains(double sum_left_gradients, l1, use_l1, l2); } -__device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, - const uint32_t feature_num_bin, const uint8_t feature_mfb_offset, - const uint32_t feature_default_bin, const uint8_t feature_missing_type, - const double lambda_l1, const double lambda_l2, const double parent_gain, const data_size_t min_data_in_leaf, - const double min_sum_hessian_in_leaf, const double min_gain_to_split, - const double sum_gradients, const double sum_hessians, const data_size_t num_data, - const bool reverse, const bool skip_default_bin, const bool na_as_missing, const uint8_t assume_out_default_left, +__device__ void FindBestSplitsForLeafKernelInner( + // input feature information + const hist_t* feature_hist_ptr, + const uint32_t feature_num_bin, + const uint8_t feature_mfb_offset, + const uint32_t feature_default_bin, + const uint8_t feature_missing_type, + const int inner_feature_index, + // input config parameter values + const double lambda_l1, + const double lambda_l2, + const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const double min_gain_to_split, + // input parent node information + const double parent_gain, + const double sum_gradients, + const double sum_hessians, + const data_size_t num_data, + // input task information + const bool reverse, + const bool skip_default_bin, + const bool na_as_missing, + const uint8_t assume_out_default_left, // output parameters uint32_t* output_threshold, double* 
output_gain, @@ -201,7 +213,7 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, data_size_t* output_right_num_data, double* output_right_gain, double* output_right_output, - uint8_t* output_found, const int inner_feature_index) { + uint8_t* output_found) { const double cnt_factor = num_data / sum_hessians; const bool use_l1 = lambda_l1 > 0.0f; @@ -258,7 +270,7 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, __syncthreads(); const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(threadIdx_x + 1); if (reverse) { - if (threadIdx_x >= na_as_missing && threadIdx_x <= feature_num_bin - 2 && !skip_split) { + if (threadIdx_x >= static_cast(na_as_missing) && threadIdx_x <= feature_num_bin - 2 && !skip_split) { const double sum_right_gradient = local_grad_hist[conflict_free_threadIdx_x_plus_1]; const double sum_right_hessian = local_hess_hist[conflict_free_threadIdx_x_plus_1]; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); @@ -324,51 +336,55 @@ __device__ void FindBestSplitsForLeafKernelInner(const hist_t* feature_hist_ptr, *output_default_left = assume_out_default_left; if (reverse) { const double sum_right_gradient = local_grad_hist[1]; - const double sum_right_hessian = local_hess_hist[1]; + const double sum_right_hessian = local_hess_hist[1] - kEpsilon; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); const double sum_left_gradient = sum_gradients - sum_right_gradient; - const double sum_left_hessian = sum_hessians - sum_right_hessian; + const double sum_left_hessian = sum_hessians - sum_right_hessian - kEpsilon; const data_size_t left_count = num_data - right_count; + const double left_output = CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2); + const double right_output = CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2); *output_left_sum_gradients = sum_left_gradient; *output_left_sum_hessians = sum_left_hessian; *output_left_num_data = left_count; *output_right_sum_gradients = sum_right_gradient; *output_right_sum_hessians = sum_right_hessian; *output_right_num_data = right_count; - *output_left_output = CalculateSplittedLeafOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2); + *output_left_output = left_output; *output_left_gain = GetLeafGainGivenOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2, *output_left_output); - *output_right_output = CalculateSplittedLeafOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2); + sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + *output_right_output = right_output; *output_right_gain = GetLeafGainGivenOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2, *output_right_output); + sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); } else { const double sum_left_gradient = local_grad_hist[1]; - const double sum_left_hessian = local_hess_hist[1]; + const double sum_left_hessian = local_hess_hist[1] - kEpsilon; const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); const double sum_right_gradient = sum_gradients - sum_left_gradient; - const double sum_right_hessian = sum_hessians - sum_left_hessian; + const double sum_right_hessian = sum_hessians - sum_left_hessian - kEpsilon; const data_size_t right_count 
= num_data - left_count; + const double left_output = CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2); + const double right_output = CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2); *output_left_sum_gradients = sum_left_gradient; *output_left_sum_hessians = sum_left_hessian; *output_left_num_data = left_count; *output_right_sum_gradients = sum_right_gradient; *output_right_sum_hessians = sum_right_hessian; *output_right_num_data = right_count; - *output_left_output = CalculateSplittedLeafOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2); + *output_left_output = left_output; *output_left_gain = GetLeafGainGivenOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2, *output_left_output); - *output_right_output = CalculateSplittedLeafOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2); + sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + *output_right_output = right_output; *output_right_gain = GetLeafGainGivenOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2, *output_right_output); + sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); } } } -__global__ void FindBestSplitsForOneLeafKernel( +__global__ void FindBestSplitsForLeafKernel( // input feature information const uint32_t* feature_hist_offsets, const uint8_t* feature_mfb_offsets, @@ -444,13 +460,45 @@ __global__ void FindBestSplitsForOneLeafKernel( uint8_t* out_found = cuda_best_split_found + output_offset; double* out_gain = cuda_best_split_gain + output_offset; const hist_t* hist_ptr = (is_larger ? *larger_leaf_hist : *smaller_leaf_hist) + feature_hist_offsets[inner_feature_index] * 2; - FindBestSplitsForLeafKernelInner(hist_ptr, - feature_num_bins[inner_feature_index], feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], - feature_missing_types[inner_feature_index], lambda_l1, lambda_l2, parent_gain, - min_data_in_leaf, min_sum_hessian_in_leaf, min_gain_to_split, sum_gradients, sum_hessians, - num_data, reverse, skip_default_bin, na_as_missing, assume_out_default_left, out_threshold, out_gain, out_default_left, - out_left_sum_gradients, out_left_sum_hessians, out_left_num_data, out_left_gain, out_left_output, - out_right_sum_gradients, out_right_sum_hessians, out_right_num_data, out_right_gain, out_right_output, out_found, inner_feature_index); + FindBestSplitsForLeafKernelInner( + // input feature information + hist_ptr, + feature_num_bins[inner_feature_index], + feature_mfb_offsets[inner_feature_index], + feature_default_bins[inner_feature_index], + feature_missing_types[inner_feature_index], + inner_feature_index, + // input config parameter values + lambda_l1, + lambda_l2, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients, + sum_hessians, + num_data, + // input task information + reverse, + skip_default_bin, + na_as_missing, + assume_out_default_left, + // output parameters + out_threshold, + out_gain, + out_default_left, + out_left_sum_gradients, + out_left_sum_hessians, + out_left_num_data, + out_left_gain, + out_left_output, + out_right_sum_gradients, + out_right_sum_hessians, + out_right_num_data, + out_right_gain, + out_right_output, + out_found); } void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( @@ -459,7 +507,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( 
const int smaller_leaf_index, const int larger_leaf_index) { const int num_blocks = larger_leaf_index >= 0 ? num_tasks_ * 2 : num_tasks_; - FindBestSplitsForOneLeafKernel<<>>( + FindBestSplitsForLeafKernel<<>>( // input feature information cuda_feature_hist_offsets_, cuda_feature_mfb_offsets_, @@ -728,37 +776,34 @@ __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, ReduceBestGainForLeaves(thread_best_gain, thread_best_leaf, cur_num_valid_threads); if (threadIdx_x == 0) { *out_best_leaf = thread_best_leaf[0]; - cuda_best_split_info_buffer[8] = thread_best_leaf[0]; + cuda_best_split_info_buffer[6] = thread_best_leaf[0]; } } -__global__ void PrepareLeafBestSplitInfo(const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, +__global__ void PrepareLeafBestSplitInfo(const int smaller_leaf_index, const int larger_leaf_index, int* cuda_best_split_info_buffer, const int* cuda_leaf_best_split_feature, const uint32_t* cuda_leaf_best_split_threshold, const uint8_t* cuda_leaf_best_split_default_left) { - const int smaller_leaf_index_ref = *cuda_smaller_leaf_index; - const int larger_leaf_index_ref = *cuda_larger_leaf_index; const unsigned int threadIdx_x = threadIdx.x; if (threadIdx_x == 0) { - cuda_best_split_info_buffer[0] = smaller_leaf_index_ref; + cuda_best_split_info_buffer[0] = cuda_leaf_best_split_feature[smaller_leaf_index]; } else if (threadIdx_x == 1) { - cuda_best_split_info_buffer[1] = cuda_leaf_best_split_feature[smaller_leaf_index_ref]; + cuda_best_split_info_buffer[1] = cuda_leaf_best_split_threshold[smaller_leaf_index]; } else if (threadIdx_x == 2) { - cuda_best_split_info_buffer[2] = cuda_leaf_best_split_threshold[smaller_leaf_index_ref]; - } else if (threadIdx_x == 3) { - cuda_best_split_info_buffer[3] = cuda_leaf_best_split_default_left[smaller_leaf_index_ref]; - } else if (threadIdx_x == 4) { - cuda_best_split_info_buffer[4] = larger_leaf_index_ref; - } else if (threadIdx_x == 5) { - cuda_best_split_info_buffer[5] = cuda_leaf_best_split_feature[larger_leaf_index_ref]; - } else if (threadIdx_x == 6) { - cuda_best_split_info_buffer[6] = cuda_leaf_best_split_threshold[larger_leaf_index_ref]; - } else if (threadIdx_x == 7) { - cuda_best_split_info_buffer[7] = cuda_leaf_best_split_default_left[larger_leaf_index_ref]; - } + cuda_best_split_info_buffer[2] = cuda_leaf_best_split_default_left[smaller_leaf_index]; + } + if (larger_leaf_index >= 0) { + if (threadIdx_x == 3) { + cuda_best_split_info_buffer[3] = cuda_leaf_best_split_feature[larger_leaf_index]; + } else if (threadIdx_x == 4) { + cuda_best_split_info_buffer[4] = cuda_leaf_best_split_threshold[larger_leaf_index]; + } else if (threadIdx_x == 5) { + cuda_best_split_info_buffer[5] = cuda_leaf_best_split_default_left[larger_leaf_index]; + } + } } void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, - const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, std::vector* leaf_best_split_feature, + const int smaller_leaf_index, const int larger_leaf_index, std::vector* leaf_best_split_feature, std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index) { FindBestFromAllSplitsKernel<<<1, NUM_THREADS_FIND_BEST_LEAF, 0, cuda_streams_[1]>>>(cuda_cur_num_leaves, cuda_leaf_best_split_gain_, cuda_best_leaf_, cuda_leaf_best_split_feature_, cuda_leaf_best_split_threshold_, cuda_feature_default_bins_, @@ -769,23 +814,21 @@ void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_ 
cuda_leaf_best_split_left_count_, cuda_leaf_best_split_right_count_, cuda_best_split_info_buffer_); - PrepareLeafBestSplitInfo<<<1, 8, 0, cuda_streams_[0]>>>(cuda_smaller_leaf_index, cuda_larger_leaf_index, + PrepareLeafBestSplitInfo<<<1, 6, 0, cuda_streams_[0]>>>(smaller_leaf_index, larger_leaf_index, cuda_best_split_info_buffer_, cuda_leaf_best_split_feature_, cuda_leaf_best_split_threshold_, cuda_leaf_best_split_default_left_); - std::vector cpu_leaf_best_split_info_buffer(9); + std::vector cpu_leaf_best_split_info_buffer(7); SynchronizeCUDADevice(); - CopyFromCUDADeviceToHost(cpu_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 9); - const int smaller_leaf_index = cpu_leaf_best_split_info_buffer[0]; - const int larger_leaf_index = cpu_leaf_best_split_info_buffer[4]; - (*leaf_best_split_feature)[smaller_leaf_index] = cpu_leaf_best_split_info_buffer[1]; - (*leaf_best_split_threshold)[smaller_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[2]); - (*leaf_best_split_default_left)[smaller_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[3]); + CopyFromCUDADeviceToHost(cpu_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 7); + (*leaf_best_split_feature)[smaller_leaf_index] = cpu_leaf_best_split_info_buffer[0]; + (*leaf_best_split_threshold)[smaller_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[1]); + (*leaf_best_split_default_left)[smaller_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[2]); if (larger_leaf_index >= 0) { - (*leaf_best_split_feature)[larger_leaf_index] = cpu_leaf_best_split_info_buffer[5]; - (*leaf_best_split_threshold)[larger_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[6]); - (*leaf_best_split_default_left)[larger_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[7]); + (*leaf_best_split_feature)[larger_leaf_index] = cpu_leaf_best_split_info_buffer[3]; + (*leaf_best_split_threshold)[larger_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[4]); + (*leaf_best_split_default_left)[larger_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[5]); } - *best_leaf_index = cpu_leaf_best_split_info_buffer[8]; + *best_leaf_index = cpu_leaf_best_split_info_buffer[6]; } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 2713d7224598..a748db5b2183 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -40,8 +40,8 @@ class CUDABestSplitFinder { void FindBestSplitsForLeaf(const CUDALeafSplits* smaller_leaf_splits, const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index); - void FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int* smaller_leaf_index, - const int* larger_leaf_index, std::vector* leaf_best_split_feature, + void FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int smaller_leaf_index, + const int larger_leaf_index, std::vector* leaf_best_split_feature, std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index); const int* cuda_best_leaf() const { return cuda_best_leaf_; } @@ -105,8 +105,8 @@ class CUDABestSplitFinder { const int cpu_smaller_leaf_index, const int cpu_larger_leaf_index); - void LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, const int* cuda_smaller_leaf_index, - const int* cuda_larger_leaf_index, std::vector* leaf_best_split_feature, + void LaunchFindBestFromAllSplitsKernel(const 
int* cuda_cur_num_leaves, const int smaller_leaf_index, + const int larger_leaf_index, std::vector* leaf_best_split_feature, std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index); // Host memory diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index af844623f4fb..f1f1dadc96a1 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -64,7 +64,7 @@ void CUDALeafSplits::LaunchInitValuesKernal() { SynchronizeCUDADevice(); auto end = std::chrono::steady_clock::now(); auto duration = static_cast>(end - start); - Log::Warning("CUDAInitValuesKernel1 duration = %f", duration.count()); + //Log::Warning("CUDAInitValuesKernel1 duration = %f", duration.count()); start = std::chrono::steady_clock::now(); CUDAInitValuesKernel2<<>>( cuda_sum_of_gradients_, cuda_sum_of_hessians_); @@ -72,7 +72,7 @@ void CUDALeafSplits::LaunchInitValuesKernal() { end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); //Log::Warning("cuda_sum_of_gradients_ = %f, cuda_sum_of_hessians_ = %f", *cuda_sum_of_gradients_, *cuda_sum_of_hessians_); - Log::Warning("CUDAInitValuesKernel2 duration = %f", duration.count()); + //Log::Warning("CUDAInitValuesKernel2 duration = %f", duration.count()); } } // namespace LightGBM diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 2f128f5fe9bf..c0ee69b984b0 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -228,7 +228,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, start = std::chrono::steady_clock::now(); global_timer.Start("NewCUDATreeLearner::FindBestFromAllSplits"); cuda_best_split_finder_->FindBestFromAllSplits(cuda_data_partition_->cuda_cur_num_leaves(), - cuda_smaller_leaf_splits_->cuda_leaf_index(), cuda_larger_leaf_splits_->cuda_leaf_index(), + smaller_leaf_index_, larger_leaf_index_, &leaf_best_split_feature_, &leaf_best_split_threshold_, &leaf_best_split_default_left_, &best_leaf_index_); global_timer.Stop("NewCUDATreeLearner::FindBestFromAllSplits"); end = std::chrono::steady_clock::now(); From f7a76582bbf103551408a74dceef406bbb0147f9 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 2 Jun 2021 12:42:28 +0000 Subject: [PATCH 018/166] fix configuration error with small leaves in data split --- .../cuda/cuda_best_split_finder.cu | 26 ++-- src/treelearner/cuda/cuda_data_partition.cpp | 15 ++ src/treelearner/cuda/cuda_data_partition.cu | 139 +++++++++++++++++- src/treelearner/cuda/cuda_data_partition.hpp | 22 +++ .../cuda/new_cuda_tree_learner.cpp | 3 + 5 files changed, 193 insertions(+), 12 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 243af12411a6..a29e30ba015d 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -10,6 +10,10 @@ namespace LightGBM { +#define K_MIN_SCORE (-1000000.0) + +#define K_EPSILON (1e-15f) + #define CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(n) \ ((n) + ((n) >> LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER)) \ @@ -261,9 +265,9 @@ __device__ void FindBestSplitsForLeafKernelInner( } __syncthreads(); if (threadIdx_x == 0) { - local_hess_hist[conflict_free_threadIdx_x] += kEpsilon; + local_hess_hist[conflict_free_threadIdx_x] += K_EPSILON; } - local_gain[threadIdx_x] = kMinScore; + local_gain[threadIdx_x] = K_MIN_SCORE; 
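// NOTE: K_MIN_SCORE / K_EPSILON defined above are kernel-local macro stand-ins for
// kMinScore / kEpsilon (assumption: the host-side constants are not directly usable
// from this device code, hence the local macros); local_gain is seeded with the
// K_MIN_SCORE sentinel so threads whose bin never yields a valid split cannot win
// the later best-gain reduction.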
__syncthreads(); PrefixSumHist(local_grad_hist, MAX_NUM_BIN_IN_FEATURE); PrefixSumHist(local_hess_hist, MAX_NUM_BIN_IN_FEATURE); @@ -336,10 +340,10 @@ __device__ void FindBestSplitsForLeafKernelInner( *output_default_left = assume_out_default_left; if (reverse) { const double sum_right_gradient = local_grad_hist[1]; - const double sum_right_hessian = local_hess_hist[1] - kEpsilon; + const double sum_right_hessian = local_hess_hist[1] - K_EPSILON; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); const double sum_left_gradient = sum_gradients - sum_right_gradient; - const double sum_left_hessian = sum_hessians - sum_right_hessian - kEpsilon; + const double sum_left_hessian = sum_hessians - sum_right_hessian - K_EPSILON; const data_size_t left_count = num_data - right_count; const double left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian, lambda_l1, use_l1, lambda_l2); @@ -359,10 +363,10 @@ __device__ void FindBestSplitsForLeafKernelInner( sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); } else { const double sum_left_gradient = local_grad_hist[1]; - const double sum_left_hessian = local_hess_hist[1] - kEpsilon; + const double sum_left_hessian = local_hess_hist[1] - K_EPSILON; const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); const double sum_right_gradient = sum_gradients - sum_left_gradient; - const double sum_right_hessian = sum_hessians - sum_left_hessian - kEpsilon; + const double sum_right_hessian = sum_hessians - sum_left_hessian - K_EPSILON; const data_size_t right_count = num_data - left_count; const double left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian, lambda_l1, use_l1, lambda_l2); @@ -442,7 +446,7 @@ __global__ void FindBestSplitsForLeafKernel( const bool assume_out_default_left = task_out_default_left[task_index]; const double parent_gain = is_larger ? *larger_leaf_gain : *smaller_leaf_gain; const double sum_gradients = is_larger ? *larger_sum_gradients_in_leaf : *smaller_sum_gradients_in_leaf; - const double sum_hessians = (is_larger ? *larger_sum_hessians_in_leaf : *smaller_sum_hessians_in_leaf) + 2 * kEpsilon; + const double sum_hessians = (is_larger ? *larger_sum_hessians_in_leaf : *smaller_sum_hessians_in_leaf) + 2 * K_EPSILON; const double num_data = is_larger ? *larger_num_data_in_leaf : *smaller_num_data_in_leaf; const unsigned int output_offset = is_larger ? (task_index + num_tasks) : task_index; uint8_t* out_default_left = cuda_best_split_default_left + output_offset; @@ -460,6 +464,10 @@ __global__ void FindBestSplitsForLeafKernel( uint8_t* out_found = cuda_best_split_found + output_offset; double* out_gain = cuda_best_split_gain + output_offset; const hist_t* hist_ptr = (is_larger ? 
*larger_leaf_hist : *smaller_leaf_hist) + feature_hist_offsets[inner_feature_index] * 2; + if (num_data <= min_data_in_leaf) { + *out_found = 0; + return; + } FindBestSplitsForLeafKernelInner( // input feature information hist_ptr, @@ -646,7 +654,7 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i cuda_leaf_best_split_right_gain[leaf_index_ref] = cuda_best_split_right_gain[best_read_index]; cuda_leaf_best_split_right_output[leaf_index_ref] = cuda_best_split_right_output[best_read_index]; } else { - cuda_leaf_best_split_gain[leaf_index_ref] = kMinScore; + cuda_leaf_best_split_gain[leaf_index_ref] = K_MIN_SCORE; } } } @@ -757,7 +765,7 @@ __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, __shared__ double thread_best_gain[NUM_THREADS_FIND_BEST_LEAF]; __shared__ int thread_best_leaf[NUM_THREADS_FIND_BEST_LEAF]; const unsigned int threadIdx_x = threadIdx.x; - thread_best_gain[threadIdx_x] = kMinScore; + thread_best_gain[threadIdx_x] = K_MIN_SCORE; thread_best_leaf[threadIdx_x] = -1; const int num_leaves_per_thread = (cuda_cur_num_leaves_ref + NUM_THREADS_FIND_BEST_LEAF - 1) / NUM_THREADS_FIND_BEST_LEAF; const int cur_num_valid_threads = (cuda_cur_num_leaves_ref + num_leaves_per_thread - 1) / num_leaves_per_thread; diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 220dc95baec7..bbfd2b6f4f1d 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -129,6 +129,10 @@ void CUDADataPartition::Init() { CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[2])); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[3])); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[4])); + + const size_t max_num_blocks_in_debug = static_cast((num_data_ + 1023) / 1024); + AllocateCUDAMemory(max_num_blocks_in_debug, &cuda_gradients_sum_buffer_); + AllocateCUDAMemory(max_num_blocks_in_debug, &cuda_hessians_sum_buffer_); } void CUDADataPartition::CopyColWiseData() { @@ -289,6 +293,17 @@ void CUDADataPartition::UpdateTrainScore(const double learning_rate, double* tra }*/ } +void CUDADataPartition::CUDACheck( + const int smaller_leaf_index, + const int larger_leaf_index, + const std::vector& num_data_in_leaf, + const CUDALeafSplits* smaller_leaf_splits, + const CUDALeafSplits* larger_leaf_splits, + const score_t* gradients, + const score_t* hessians) { + LaunchCUDACheckKernel(smaller_leaf_index, larger_leaf_index, num_data_in_leaf, smaller_leaf_splits, larger_leaf_splits, gradients, hessians); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index b66a2c18ee0f..b53cce119a41 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -98,6 +98,16 @@ __device__ void ReduceSum(uint16_t* array, const size_t size) { } } +__device__ void ReduceSum(double* array, const size_t size) { + const unsigned int threadIdx_x = threadIdx.x; + for (int s = 1; s < size; s <<= 1) { + if (threadIdx_x % (2 * s) == 0 && (threadIdx_x + s) < size) { + array[threadIdx_x] += array[threadIdx_x + s]; + } + __syncthreads(); + } +} + __global__ void FillDataIndicesBeforeTrainKernel(const data_size_t* cuda_num_data, data_size_t* data_indices) { const data_size_t num_data_ref = *cuda_num_data; @@ -694,7 +704,8 @@ __global__ void GenDataToLeftBitVectorKernel(const int* leaf_index, const data_s void 
CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, const uint8_t split_default_left, const data_size_t leaf_data_start) { - const int num_blocks = std::max(80, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + const int min_num_blocks = num_data_in_leaf <= 100 ? 1 : 80; + const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; int split_indices_block_size_data_partition_aligned = 1; while (split_indices_block_size_data_partition > 0) { @@ -959,8 +970,9 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b } if (blockIdx.x == 0 && threadIdx.x == 0) { ++(*cuda_cur_num_leaves); - const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref]; const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; + /*printf("leaf_index_ref = %d, cuda_cur_num_leaves = %d, cur_max_leaf_index = %d\n", + leaf_index_ref, *cuda_cur_num_leaves, cur_max_leaf_index);*/ block_to_left_offset_buffer[0] = 0; const unsigned int to_left_total_cnt = block_to_left_offset_buffer[num_blocks]; block_to_right_offset_buffer[0] = to_left_total_cnt; @@ -1029,6 +1041,18 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo cuda_split_info_buffer[4] = cuda_leaf_num_data[cur_max_leaf_index]; cuda_split_info_buffer[5] = cuda_leaf_data_start[cur_max_leaf_index]; + /*if (cuda_leaf_num_data[leaf_index_ref] <= 0) { + printf("error !!! leaf %d has count %d\n", leaf_index_ref, cuda_leaf_num_data[leaf_index_ref]); + } + + if (cuda_leaf_num_data[cur_max_leaf_index] <= 0) { + printf("error !!! leaf %d has count %d\n", cur_max_leaf_index, cuda_leaf_num_data[cur_max_leaf_index]); + } + + printf("splitting %d into %d with num data %d and %d with num data %d\n", + leaf_index_ref, leaf_index_ref, cuda_leaf_num_data[leaf_index_ref], + cur_max_leaf_index, cuda_leaf_num_data[cur_max_leaf_index]);*/ + if (cuda_leaf_num_data[leaf_index_ref] < cuda_leaf_num_data[cur_max_leaf_index]) { *smaller_leaf_cuda_leaf_index_pointer = leaf_index_ref; *smaller_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; @@ -1185,7 +1209,8 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data hist_t** larger_leaf_cuda_hist_pointer_pointer, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, int* smaller_leaf_index, int* larger_leaf_index) { - const int num_blocks = std::max(80, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + const int min_num_blocks = num_data_in_leaf <= 100 ? 
1 : 80; + const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; int split_indices_block_size_data_partition_aligned = 1; while (split_indices_block_size_data_partition > 0) { @@ -1195,6 +1220,10 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; global_timer.Start("CUDADataPartition::AggregateBlockOffsetKernel"); auto start = std::chrono::steady_clock::now(); + /*int cpu_leaf_index = 0, cpu_cur_num_leaves = 0; + CopyFromCUDADeviceToHost(&cpu_leaf_index, leaf_index, 1); + CopyFromCUDADeviceToHost(&cpu_cur_num_leaves, cuda_cur_num_leaves_, 1); + Log::Warning("cpu_leaf_index = %d, cpu_cur_num_leaves = %d before aggregate", cpu_leaf_index, cpu_cur_num_leaves);*/ AggregateBlockOffsetKernel<<<1, split_indices_block_size_data_partition_aligned / 2>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, @@ -1224,6 +1253,10 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, data_partition_leaf_output_); SynchronizeCUDADevice(); + /*PrintLastCUDAError(); + CopyFromCUDADeviceToHost(&cpu_leaf_index, leaf_index, 1); + CopyFromCUDADeviceToHost(&cpu_cur_num_leaves, cuda_cur_num_leaves_, 1); + Log::Warning("cpu_leaf_index = %d, cpu_cur_num_leaves = %d after aggregate", cpu_leaf_index, cpu_cur_num_leaves);*/ auto end = std::chrono::steady_clock::now(); auto duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); @@ -1355,6 +1388,106 @@ void CUDADataPartition::LaunchCopyColWiseDataKernel() { CopyColWiseDataKernel<<>>(cuda_data_, num_data_, num_features_, cuda_data_col_wise_); } +__global__ void CUDACheckKernel(const data_size_t** data_indices_in_leaf_ptr, + const data_size_t num_data_in_leaf, + const score_t* gradients, + const score_t* hessians, + double* gradients_sum_buffer, + double* hessians_sum_buffer) { + const data_size_t* data_indices_in_leaf = *data_indices_in_leaf_ptr; + const data_size_t local_data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + __shared__ double local_gradients[1024]; + __shared__ double local_hessians[1024]; + if (local_data_index < num_data_in_leaf) { + const data_size_t global_data_index = data_indices_in_leaf[local_data_index]; + local_gradients[threadIdx.x] = gradients[global_data_index]; + local_hessians[threadIdx.x] = hessians[global_data_index]; + } else { + local_gradients[threadIdx.x] = 0.0f; + local_hessians[threadIdx.x] = 0.0f; + } + __syncthreads(); + ReduceSum(local_gradients, 1024); + __syncthreads(); + ReduceSum(local_hessians, 1024); + __syncthreads(); + if (threadIdx.x == 0) { + gradients_sum_buffer[blockIdx.x] = local_gradients[0]; + hessians_sum_buffer[blockIdx.x] = local_hessians[0]; + } +} + +__global__ void CUDACheckKernel2( + const int leaf_index, + const data_size_t* num_data_expected, + const double* sum_gradients_expected, + const double* sum_hessians_expected, + const double* gradients_sum_buffer, + const double* hessians_sum_buffer, + const int num_blocks) { + double 
sum_gradients = 0.0f; + double sum_hessians = 0.0f; + for (int i = 0; i < num_blocks; ++i) { + sum_gradients += gradients_sum_buffer[i]; + sum_hessians += hessians_sum_buffer[i]; + } + if (fabs(sum_gradients - *sum_gradients_expected) >= 1.0f) { + printf("error in leaf_index = %d\n", leaf_index); + printf("num data expected = %d\n", *num_data_expected); + printf("error sum_gradients: %f vs %f\n", sum_gradients, *sum_gradients_expected); + } + if (fabs(sum_hessians - *sum_hessians_expected) >= 1.0f) { + printf("error in leaf_index = %d\n", leaf_index); + printf("num data expected = %d\n", *num_data_expected); + printf("error sum_hessians: %f vs %f\n", sum_hessians, *sum_hessians_expected); + } +} + +void CUDADataPartition::LaunchCUDACheckKernel( + const int smaller_leaf_index, + const int larger_leaf_index, + const std::vector& num_data_in_leaf, + const CUDALeafSplits* smaller_leaf_splits, + const CUDALeafSplits* larger_leaf_splits, + const score_t* gradients, + const score_t* hessians) { + const data_size_t num_data_in_smaller_leaf = num_data_in_leaf[smaller_leaf_index]; + const int block_dim = 1024; + const int smaller_num_blocks = (num_data_in_smaller_leaf + block_dim - 1) / block_dim; + CUDACheckKernel<<>>(smaller_leaf_splits->cuda_data_indices_in_leaf(), + num_data_in_smaller_leaf, + gradients, + hessians, + cuda_gradients_sum_buffer_, + cuda_hessians_sum_buffer_); + CUDACheckKernel2<<<1, 1>>>( + smaller_leaf_index, + smaller_leaf_splits->cuda_num_data_in_leaf(), + smaller_leaf_splits->cuda_sum_of_gradients(), + smaller_leaf_splits->cuda_sum_of_hessians(), + cuda_gradients_sum_buffer_, + cuda_hessians_sum_buffer_, + smaller_num_blocks); + if (larger_leaf_index >= 0) { + const data_size_t num_data_in_larger_leaf = num_data_in_leaf[larger_leaf_index]; + const int larger_num_blocks = (num_data_in_larger_leaf + block_dim - 1) / block_dim; + CUDACheckKernel<<>>(larger_leaf_splits->cuda_data_indices_in_leaf(), + num_data_in_larger_leaf, + gradients, + hessians, + cuda_gradients_sum_buffer_, + cuda_hessians_sum_buffer_); + CUDACheckKernel2<<<1, 1>>>( + larger_leaf_index, + larger_leaf_splits->cuda_num_data_in_leaf(), + larger_leaf_splits->cuda_sum_of_gradients(), + larger_leaf_splits->cuda_sum_of_hessians(), + cuda_gradients_sum_buffer_, + cuda_hessians_sum_buffer_, + larger_num_blocks); + } +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index d078da4871cc..9ce054c8b4b7 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -12,6 +12,7 @@ #include #include #include "new_cuda_utils.hpp" +#include "cuda_leaf_splits.hpp" #define FILL_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) #define SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION (512) @@ -56,6 +57,15 @@ class CUDADataPartition { int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index); + void CUDACheck( + const int smaller_leaf_index, + const int larger_leaf_index, + const std::vector& num_data_in_leaf, + const CUDALeafSplits* smaller_leaf_splits, + const CUDALeafSplits* larger_leaf_splits, + const score_t* gradients, + const score_t* hessians); + Tree* GetCPUTree(); void Test() { @@ -245,6 +255,15 @@ class CUDADataPartition { void LaunchAddPredictionToScoreKernel(const double learning_rate, double* cuda_scores); + void LaunchCUDACheckKernel( + const int smaller_leaf_index, + const int larger_leaf_index, + const std::vector& num_data_in_leaf, + const CUDALeafSplits* 
smaller_leaf_splits, + const CUDALeafSplits* larger_leaf_splits, + const score_t* gradients, + const score_t* hessians); + // Host memory const data_size_t num_data_; const int num_features_; @@ -307,6 +326,9 @@ class CUDADataPartition { // for train data update double* train_data_score_tmp_; uint8_t* cuda_data_col_wise_; + // for debug + double* cuda_gradients_sum_buffer_; + double* cuda_hessians_sum_buffer_; // CUDA memory, held by other object const data_size_t* cuda_num_data_; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index c0ee69b984b0..a5c2c12b472e 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -277,6 +277,9 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, &smaller_leaf_index_, &larger_leaf_index_, best_leaf_index_); + /*cuda_data_partition_->CUDACheck(smaller_leaf_index_, larger_leaf_index_, + leaf_num_data_, cuda_smaller_leaf_splits_.get(), cuda_larger_leaf_splits_.get(), + cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians());*/ end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); global_timer.Stop("NewCUDATreeLearner::Split"); From b6efd100e3af136ffe7d42d8e21b40d60c656163 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 2 Jun 2021 12:57:03 +0000 Subject: [PATCH 019/166] skip histogram construction of too small leaf --- src/treelearner/cuda/cuda_histogram_constructor.cpp | 6 ++++-- src/treelearner/cuda/cuda_histogram_constructor.cu | 3 +++ src/treelearner/cuda/cuda_histogram_constructor.hpp | 4 +++- src/treelearner/cuda/new_cuda_tree_learner.cpp | 3 ++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index a4dc752247e7..40d3493ee466 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -13,10 +13,12 @@ namespace LightGBM { CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, const int num_leaves, const int num_threads, const score_t* cuda_gradients, const score_t* cuda_hessians, - const std::vector& feature_hist_offsets): num_data_(train_data->num_data()), + const std::vector& feature_hist_offsets, + const int min_data_in_leaf): num_data_(train_data->num_data()), num_features_(train_data->num_features()), num_leaves_(num_leaves), num_threads_(num_threads), num_feature_groups_(train_data->num_feature_groups()), - cuda_gradients_(cuda_gradients), cuda_hessians_(cuda_hessians) { + cuda_gradients_(cuda_gradients), cuda_hessians_(cuda_hessians), + min_data_in_leaf_(min_data_in_leaf) { int offset = 0; for (int group_id = 0; group_id < train_data->num_feature_groups(); ++group_id) { feature_group_bin_offsets_.emplace_back(offset); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index ae4c51b85893..985ebef06c88 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -114,6 +114,9 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( //Log::Warning("smaller_leaf_num_data = %d", smaller_leaf_num_data); //Log::Warning("block_dim_x = %d, block_dim_y = %d", block_dim_x, block_dim_y); //Log::Warning("gid_dim_x = %d, grid_dim_y = %d", grid_dim_x, grid_dim_y); + if (num_data_in_smaller_leaf <= min_data_in_leaf_) { + return; + } dim3 
grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); CUDAConstructHistogramKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 235c5ac18a20..4fd0173f618b 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -30,7 +30,8 @@ namespace LightGBM { class CUDAHistogramConstructor { public: CUDAHistogramConstructor(const Dataset* train_data, const int num_leaves, const int num_threads, - const score_t* cuda_gradients, const score_t* cuda_hessians, const std::vector& feature_hist_offsets); + const score_t* cuda_gradients, const score_t* cuda_hessians, const std::vector& feature_hist_offsets, + const int min_data_in_leaf); void Init(const Dataset* train_data); @@ -110,6 +111,7 @@ class CUDAHistogramConstructor { std::vector feature_num_bins_; std::vector feature_hist_offsets_; std::vector feature_most_freq_bins_; + const int min_data_in_leaf_; // CUDA memory, held by this object uint32_t* cuda_feature_group_bin_offsets_; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index a5c2c12b472e..2c70a8f5033a 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -35,7 +35,8 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia cuda_larger_leaf_splits_->Init(); cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, this->config_->num_leaves, num_threads_, - cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), share_state_->feature_hist_offsets())); + cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), share_state_->feature_hist_offsets(), + config_->min_data_in_leaf)); cuda_histogram_constructor_->Init(train_data_); //cuda_histogram_constructor_->TestAfterInit(); From 6f4e39d83f8b5bc3119a0373634c2df59e15603e Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 3 Jun 2021 06:42:41 +0000 Subject: [PATCH 020/166] skip split finding of invalid leaves stop when no leaf to split --- .../cuda/cuda_best_split_finder.cpp | 14 ++++-- .../cuda/cuda_best_split_finder.cu | 50 +++++++++++++------ .../cuda/cuda_best_split_finder.hpp | 14 ++++-- src/treelearner/cuda/cuda_data_partition.cpp | 15 +++--- src/treelearner/cuda/cuda_data_partition.cu | 20 ++++++-- src/treelearner/cuda/cuda_data_partition.hpp | 8 ++- .../cuda/cuda_histogram_constructor.cpp | 11 ++-- .../cuda/cuda_histogram_constructor.cu | 6 +-- .../cuda/cuda_histogram_constructor.hpp | 9 ++-- src/treelearner/cuda/cuda_leaf_splits.cpp | 14 ++++-- src/treelearner/cuda/cuda_leaf_splits.hpp | 4 +- .../cuda/new_cuda_tree_learner.cpp | 28 +++++++++-- .../cuda/new_cuda_tree_learner.hpp | 1 + src/treelearner/cuda/new_cuda_utils.hpp | 8 +++ 14 files changed, 148 insertions(+), 54 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index e0893b53ef4d..686607a487b9 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -58,6 +58,7 @@ void CUDABestSplitFinder::Init() { AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_count_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_gain_); AllocateCUDAMemory(static_cast(num_leaves_), 
&cuda_leaf_best_split_right_output_); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_found_); AllocateCUDAMemory(feature_hist_offsets_.size(), &cuda_feature_hist_offsets_); CopyFromHostToCUDADevice(cuda_feature_hist_offsets_, feature_hist_offsets_.data(), feature_hist_offsets_.size()); @@ -151,19 +152,24 @@ void CUDABestSplitFinder::Init() { } void CUDABestSplitFinder::BeforeTrain() { - const size_t feature_best_split_info_buffer_size = static_cast(num_features_) * 4; SetCUDAMemory(cuda_leaf_best_split_gain_, 0, static_cast(num_leaves_)); SetCUDAMemory(cuda_best_split_found_, 0, static_cast(num_tasks_)); SetCUDAMemory(cuda_best_split_gain_, 0, static_cast(num_tasks_)); + SetCUDAMemory(cuda_leaf_best_split_found_, 0, static_cast(num_leaves_)); } void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_leaf_splits, - const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index) { + const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, + const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, + const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf) { auto start = std::chrono::steady_clock::now(); - LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits, smaller_leaf_index, larger_leaf_index); + const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ && sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_); + const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ && sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_); + LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits, + smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); SynchronizeCUDADevice(); global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); - LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index); + LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); SynchronizeCUDADevice(); global_timer.Stop("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); auto end = std::chrono::steady_clock::now(); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index a29e30ba015d..ea9a593411bd 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -396,6 +396,7 @@ __global__ void FindBestSplitsForLeafKernel( const uint8_t* feature_missing_types, const uint32_t* feature_num_bins, // input task information + const bool larger_only, const int num_tasks, const int* task_feature_index, const uint8_t* task_reverse, @@ -438,7 +439,7 @@ __global__ void FindBestSplitsForLeafKernel( uint8_t* cuda_best_split_found) { const unsigned int task_index = blockIdx.x % num_tasks; - const bool is_larger = static_cast(blockIdx.x >= num_tasks); + const bool is_larger = static_cast(blockIdx.x >= num_tasks || larger_only); const int inner_feature_index = task_feature_index[task_index]; const bool reverse = static_cast(task_reverse[task_index]); const bool skip_default_bin = static_cast(task_skip_default_bin[task_index]); @@ -464,10 +465,6 @@ __global__ void FindBestSplitsForLeafKernel( uint8_t* out_found = cuda_best_split_found + output_offset; double* out_gain = cuda_best_split_gain + output_offset; const hist_t* hist_ptr = (is_larger ? 
*larger_leaf_hist : *smaller_leaf_hist) + feature_hist_offsets[inner_feature_index] * 2; - if (num_data <= min_data_in_leaf) { - *out_found = 0; - return; - } FindBestSplitsForLeafKernelInner( // input feature information hist_ptr, @@ -513,8 +510,17 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( const CUDALeafSplits* smaller_leaf_splits, const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, - const int larger_leaf_index) { - const int num_blocks = larger_leaf_index >= 0 ? num_tasks_ * 2 : num_tasks_; + const int larger_leaf_index, + const bool is_smaller_leaf_valid, + const bool is_larger_leaf_valid) { + if (!is_smaller_leaf_valid && !is_larger_leaf_valid) { + return; + } + bool larger_only = false; + if (!is_smaller_leaf_valid) { + larger_only = true; + } + const int num_blocks = (larger_leaf_index >= 0 && !larger_only) ? num_tasks_ * 2 : num_tasks_; FindBestSplitsForLeafKernel<<>>( // input feature information cuda_feature_hist_offsets_, @@ -523,6 +529,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_feature_missing_type_, cuda_feature_num_bins_, // input task information + larger_only, num_tasks_, cuda_task_feature_index_, cuda_task_reverse_, @@ -591,6 +598,7 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i double* cuda_leaf_best_split_right_sum_gradient, double* cuda_leaf_best_split_right_sum_hessian, data_size_t* cuda_leaf_best_split_right_count, double* cuda_leaf_best_split_right_gain, double* cuda_leaf_best_split_right_output, + uint8_t* cuda_leaf_best_split_found, // input parameters const int* cuda_task_feature_index, const uint8_t* cuda_best_split_default_left, @@ -610,7 +618,8 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i const uint32_t* cuda_feature_default_bins, const int num_tasks, const int num_tasks_aligned, - const int num_blocks_per_leaf) { + const int num_blocks_per_leaf, + const bool larger_only) { const uint32_t threadIdx_x = threadIdx.x; const uint32_t blockIdx_x = blockIdx.x; @@ -619,8 +628,8 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i __shared__ double best_gain[NUM_TASKS_PER_SYNC_BLOCK]; __shared__ uint32_t shared_read_index[NUM_TASKS_PER_SYNC_BLOCK]; - const bool is_smaller = (blockIdx_x < static_cast(num_blocks_per_leaf)); - const uint32_t leaf_block_index = is_smaller ? blockIdx_x : (blockIdx_x - static_cast(num_blocks_per_leaf)); + const bool is_smaller = (blockIdx_x < static_cast(num_blocks_per_leaf) && !larger_only); + const uint32_t leaf_block_index = (is_smaller || larger_only) ? blockIdx_x : (blockIdx_x - static_cast(num_blocks_per_leaf)); const int task_index = static_cast(leaf_block_index * num_blocks_per_leaf + threadIdx_x); const uint32_t read_index = is_smaller ? 
static_cast(task_index) : static_cast(task_index + num_tasks); if (task_index < num_tasks) { @@ -653,15 +662,19 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i cuda_leaf_best_split_right_count[leaf_index_ref] = cuda_best_split_right_count[best_read_index]; cuda_leaf_best_split_right_gain[leaf_index_ref] = cuda_best_split_right_gain[best_read_index]; cuda_leaf_best_split_right_output[leaf_index_ref] = cuda_best_split_right_output[best_read_index]; + cuda_leaf_best_split_found[leaf_index_ref] = 1; } else { cuda_leaf_best_split_gain[leaf_index_ref] = K_MIN_SCORE; + cuda_leaf_best_split_found[leaf_index_ref] = 0; } } } void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( const int cpu_smaller_leaf_index, - const int cpu_larger_leaf_index) { + const int cpu_larger_leaf_index, + const bool is_smaller_leaf_valid, + const bool is_larger_leaf_valid) { int num_tasks = num_tasks_; int num_tasks_aligned = 1; @@ -671,7 +684,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( num_tasks >>= 1; } const int num_blocks_per_leaf = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; - if (cpu_larger_leaf_index >= 0) { + if (cpu_larger_leaf_index >= 0 && is_smaller_leaf_valid && is_larger_leaf_valid) { SyncBestSplitForLeafKernel<<<2 * num_blocks_per_leaf, NUM_TASKS_PER_SYNC_BLOCK>>>( cpu_smaller_leaf_index, cpu_larger_leaf_index, @@ -690,6 +703,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( cuda_leaf_best_split_right_count_, cuda_leaf_best_split_right_gain_, cuda_leaf_best_split_right_output_, + cuda_leaf_best_split_found_, cuda_task_feature_index_, cuda_best_split_default_left_, cuda_best_split_threshold_, @@ -708,8 +722,10 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( cuda_feature_default_bins_, num_tasks_, num_tasks_aligned, - num_blocks_per_leaf); + num_blocks_per_leaf, + false); } else { + const bool larger_only = (!is_smaller_leaf_valid && is_larger_leaf_valid); SyncBestSplitForLeafKernel<<>>( cpu_smaller_leaf_index, cpu_larger_leaf_index, @@ -728,6 +744,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( cuda_leaf_best_split_right_count_, cuda_leaf_best_split_right_gain_, cuda_leaf_best_split_right_output_, + cuda_leaf_best_split_found_, cuda_task_feature_index_, cuda_best_split_default_left_, cuda_best_split_threshold_, @@ -746,7 +763,8 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( cuda_feature_default_bins_, num_tasks_, num_tasks_aligned, - num_blocks_per_leaf); + num_blocks_per_leaf, + larger_only); } } @@ -760,6 +778,7 @@ __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, const double* cuda_leaf_best_split_right_sum_hessian, const data_size_t* cuda_leaf_best_split_left_count, const data_size_t* cuda_leaf_best_split_right_count, + const uint8_t* cuda_leaf_best_split_found, int* cuda_best_split_info_buffer) { const int cuda_cur_num_leaves_ref = *cuda_cur_num_leaves; __shared__ double thread_best_gain[NUM_THREADS_FIND_BEST_LEAF]; @@ -774,7 +793,7 @@ __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, const int end = min(start + num_leaves_per_thread, cuda_cur_num_leaves_ref); for (int leaf_index = start; leaf_index < end; ++leaf_index) { const double leaf_best_gain = cuda_leaf_best_split_gain[leaf_index]; - if (leaf_best_gain > thread_best_gain[threadIdx_x]) { + if (cuda_leaf_best_split_found[leaf_index] && leaf_best_gain > thread_best_gain[threadIdx_x]) { thread_best_gain[threadIdx_x] = leaf_best_gain; 
thread_best_leaf[threadIdx_x] = leaf_index; } @@ -821,6 +840,7 @@ void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_ cuda_leaf_best_split_right_sum_hessian_, cuda_leaf_best_split_left_count_, cuda_leaf_best_split_right_count_, + cuda_leaf_best_split_found_, cuda_best_split_info_buffer_); PrepareLeafBestSplitInfo<<<1, 6, 0, cuda_streams_[0]>>>(smaller_leaf_index, larger_leaf_index, cuda_best_split_info_buffer_, cuda_leaf_best_split_feature_, diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index a748db5b2183..93e96ae91b78 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -38,7 +38,9 @@ class CUDABestSplitFinder { void BeforeTrain(); void FindBestSplitsForLeaf(const CUDALeafSplits* smaller_leaf_splits, const CUDALeafSplits* larger_leaf_splits, - const int smaller_leaf_index, const int larger_leaf_index); + const int smaller_leaf_index, const int larger_leaf_index, + const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, + const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf); void FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int smaller_leaf_index, const int larger_leaf_index, std::vector* leaf_best_split_feature, @@ -74,6 +76,8 @@ class CUDABestSplitFinder { const double* cuda_leaf_best_split_right_output() const { return cuda_leaf_best_split_right_output_; } + uint8_t* cuda_leaf_best_split_found() const { return cuda_leaf_best_split_found_; } + void TestAfterInit() { PrintLastCUDAError(); } @@ -99,11 +103,14 @@ class CUDABestSplitFinder { private: void LaunchFindBestSplitsForLeafKernel(const CUDALeafSplits* smaller_leaf_splits, - const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index); + const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, + const bool is_smaller_leaf_valid, const bool is_larger_leaf_valid); void LaunchSyncBestSplitForLeafKernel( const int cpu_smaller_leaf_index, - const int cpu_larger_leaf_index); + const int cpu_larger_leaf_index, + const bool is_smaller_leaf_valid, + const bool is_larger_leaf_valid); void LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, const int smaller_leaf_index, const int larger_leaf_index, std::vector* leaf_best_split_feature, @@ -151,6 +158,7 @@ class CUDABestSplitFinder { data_size_t* cuda_leaf_best_split_right_count_; double* cuda_leaf_best_split_right_gain_; double* cuda_leaf_best_split_right_output_; + uint8_t* cuda_leaf_best_split_found_; // for best split information when finding best split uint8_t* cuda_best_split_default_left_; uint32_t* cuda_best_split_threshold_; diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index bbfd2b6f4f1d..4239f3bd8cd1 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -98,7 +98,7 @@ void CUDADataPartition::Init() { InitCUDAMemoryFromHostMemory(&cuda_feature_missing_is_na_, feature_missing_is_na_.data(), static_cast(num_features_)); InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_zero_, feature_mfb_is_zero_.data(), static_cast(num_features_)); InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_na_, feature_mfb_is_na_.data(), static_cast(num_features_)); - AllocateCUDAMemory(8, &cuda_split_info_buffer_); + AllocateCUDAMemory(12, &cuda_split_info_buffer_); 
AllocateCUDAMemory(static_cast(num_leaves_), &tree_split_leaf_index_); AllocateCUDAMemory(static_cast(num_leaves_), &tree_inner_feature_index_); @@ -170,6 +170,7 @@ void CUDADataPartition::Split(const int* leaf_id, const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, const double* best_right_gain, const double* best_right_leaf_value, + uint8_t* best_split_found, // for leaf splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -183,6 +184,7 @@ void CUDADataPartition::Split(const int* leaf_id, hist_t** larger_leaf_cuda_hist_pointer_pointer, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + std::vector* cpu_leaf_sum_hessians, const std::vector& cpu_leaf_best_split_feature, const std::vector& cpu_leaf_best_split_threshold, const std::vector& cpu_leaf_best_split_default_left, @@ -211,7 +213,7 @@ void CUDADataPartition::Split(const int* leaf_id, best_left_sum_gradients, best_left_sum_hessians, best_left_count, best_left_gain, best_left_leaf_value, best_right_sum_gradients, best_right_sum_hessians, best_right_count, - best_right_gain, best_right_leaf_value, + best_right_gain, best_right_leaf_value, best_split_found, smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, @@ -221,7 +223,7 @@ void CUDADataPartition::Split(const int* leaf_id, larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_num_data, cpu_leaf_data_start, + larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_num_data, cpu_leaf_data_start, cpu_leaf_sum_hessians, smaller_leaf_index, larger_leaf_index); end = std::chrono::steady_clock::now(); duration = (static_cast>(end - start)).count(); @@ -249,7 +251,7 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, + const double* best_right_gain, const double* best_right_leaf_value, uint8_t* best_split_found, // for leaf splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -262,13 +264,14 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, hist_t** larger_leaf_cuda_hist_pointer_pointer, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index) { LaunchSplitInnerKernel(leaf_index, num_data_in_leaf, best_split_feature, 
best_split_threshold, best_split_default_left, best_split_gain, best_left_sum_gradients, best_left_sum_hessians, best_left_count, best_left_gain, best_left_leaf_value, best_right_sum_gradients, best_right_sum_hessians, best_right_count, - best_right_gain, best_right_leaf_value, + best_right_gain, best_right_leaf_value, best_split_found, smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, @@ -278,7 +281,7 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_num_data, cpu_leaf_data_start, + larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_num_data, cpu_leaf_data_start, cpu_leaf_sum_hessians, smaller_leaf_index, larger_leaf_index); ++cur_num_leaves_; } diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index b53cce119a41..9afbabfac143 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -994,7 +994,7 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, + const double* best_right_gain, const double* best_right_leaf_value, uint8_t* best_split_found, // for leaf splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -1019,6 +1019,7 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; const unsigned int to_left_total_cnt = cuda_leaf_num_data[leaf_index_ref]; const int cuda_num_total_bin_ref = *cuda_num_total_bin; + double* cuda_split_info_buffer_for_hessians = reinterpret_cast(cuda_split_info_buffer + 8); tree_split_leaf_index[cur_max_leaf_index - 1] = leaf_index_ref; tree_inner_feature_index[cur_max_leaf_index - 1] = best_split_feature[leaf_index_ref]; @@ -1040,6 +1041,11 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo cuda_split_info_buffer[3] = cur_max_leaf_index; cuda_split_info_buffer[4] = cuda_leaf_num_data[cur_max_leaf_index]; cuda_split_info_buffer[5] = cuda_leaf_data_start[cur_max_leaf_index]; + cuda_split_info_buffer_for_hessians[0] = best_left_sum_hessians[leaf_index_ref]; + cuda_split_info_buffer_for_hessians[1] = best_right_sum_hessians[leaf_index_ref]; + + best_split_found[leaf_index_ref] = 0; + best_split_found[cur_max_leaf_index] = 0; /*if (cuda_leaf_num_data[leaf_index_ref] <= 0) { printf("error !!! 
leaf %d has count %d\n", leaf_index_ref, cuda_leaf_num_data[leaf_index_ref]); @@ -1195,7 +1201,7 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, + const double* best_right_gain, const double* best_right_leaf_value, uint8_t* best_split_found, // for leaf splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -1208,6 +1214,7 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, hist_t** larger_leaf_cuda_hist_pointer_pointer, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index) { const int min_num_blocks = num_data_in_leaf <= 100 ? 1 : 80; const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); @@ -1287,7 +1294,7 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data best_left_sum_gradients, best_left_sum_hessians, best_left_count, best_left_gain, best_left_leaf_value, best_right_sum_gradients, best_right_sum_hessians, best_right_count, - best_right_gain, best_right_leaf_value, + best_right_gain, best_right_leaf_value, best_split_found, smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -1308,8 +1315,9 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, data_partition_leaf_output_, cuda_split_info_buffer_); global_timer.Stop("CUDADataPartition::SplitTreeStructureKernel"); - std::vector cpu_split_info_buffer(8); - CopyFromCUDADeviceToHostAsync(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 8, cuda_streams_[0]); + std::vector cpu_split_info_buffer(12); + const double* cpu_sum_hessians_info = reinterpret_cast(cpu_split_info_buffer.data() + 8); + CopyFromCUDADeviceToHostAsync(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 12, cuda_streams_[0]); SynchronizeCUDADevice(); const int left_leaf_index = cpu_split_info_buffer[0]; const data_size_t left_leaf_num_data = cpu_split_info_buffer[1]; @@ -1321,6 +1329,8 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data (*cpu_leaf_data_start)[left_leaf_index] = left_leaf_data_start; (*cpu_leaf_num_data)[right_leaf_index] = right_leaf_num_data; (*cpu_leaf_data_start)[right_leaf_index] = right_leaf_data_start; + (*cpu_leaf_sum_hessians)[left_leaf_index] = cpu_sum_hessians_info[0]; + (*cpu_leaf_sum_hessians)[right_leaf_index] = cpu_sum_hessians_info[1]; *smaller_leaf_index = cpu_split_info_buffer[6]; *larger_leaf_index = cpu_split_info_buffer[7]; } diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 9ce054c8b4b7..dbe334ac0797 100644 
--- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -38,6 +38,7 @@ class CUDADataPartition { const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, const double* best_right_gain, const double* best_right_leaf_value, + uint8_t* best_split_found, // for splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -51,6 +52,7 @@ class CUDADataPartition { hist_t** larger_leaf_cuda_hist_pointer_pointer, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + std::vector* cpu_leaf_sum_hessians, const std::vector& cpu_leaf_best_split_feature, const std::vector& cpu_leaf_best_split_threshold, const std::vector& cpu_leaf_best_split_default_left, @@ -205,7 +207,7 @@ class CUDADataPartition { const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, + const double* best_right_gain, const double* best_right_leaf_value, uint8_t* best_split_found, // for leaf splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -218,6 +220,7 @@ class CUDADataPartition { const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, hist_t** larger_leaf_cuda_hist_pointer_pointer, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index); // kernel launch functions @@ -229,7 +232,7 @@ class CUDADataPartition { const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, const double* best_left_gain, const double* best_left_leaf_value, const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, + const double* best_right_gain, const double* best_right_leaf_value, uint8_t* best_split_found, // for leaf splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -242,6 +245,7 @@ class CUDADataPartition { const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, hist_t** larger_leaf_cuda_hist_pointer_pointer, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index); void LaunchGenDataToLeftBitVectorKernel(const int* leaf_index, const data_size_t num_data_in_leaf, const int* best_split_feature, diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 40d3493ee466..864607be76b7 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ 
b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -14,11 +14,11 @@ CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, const int num_leaves, const int num_threads, const score_t* cuda_gradients, const score_t* cuda_hessians, const std::vector& feature_hist_offsets, - const int min_data_in_leaf): num_data_(train_data->num_data()), + const int min_data_in_leaf, const double min_sum_hessian_in_leaf): num_data_(train_data->num_data()), num_features_(train_data->num_features()), num_leaves_(num_leaves), num_threads_(num_threads), num_feature_groups_(train_data->num_feature_groups()), cuda_gradients_(cuda_gradients), cuda_hessians_(cuda_hessians), - min_data_in_leaf_(min_data_in_leaf) { + min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf) { int offset = 0; for (int group_id = 0; group_id < train_data->num_feature_groups(); ++group_id) { feature_group_bin_offsets_.emplace_back(offset); @@ -111,8 +111,13 @@ void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller const int* cuda_larger_leaf_index, const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** cuda_data_indices_in_larger_leaf, const double* cuda_smaller_leaf_sum_gradients, const double* cuda_smaller_leaf_sum_hessians, hist_t** cuda_smaller_leaf_hist, const double* cuda_larger_leaf_sum_gradients, const double* cuda_larger_leaf_sum_hessians, hist_t** cuda_larger_leaf_hist, - const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf) { + const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, + const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf) { auto start = std::chrono::steady_clock::now(); + if ((num_data_in_smaller_leaf <= min_data_in_leaf_ || sum_hessians_in_smaller_leaf <= min_sum_hessian_in_leaf_) && + (num_data_in_larger_leaf <= min_data_in_leaf_ || sum_hessians_in_larger_leaf <= min_sum_hessian_in_leaf_)) { + return; + } LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_num_data_in_smaller_leaf, cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data, cuda_smaller_leaf_hist, num_data_in_smaller_leaf); SynchronizeCUDADevice(); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 985ebef06c88..7b90681cb4f1 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -105,7 +105,8 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( const data_size_t* cuda_smaller_leaf_num_data, const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t* cuda_leaf_num_data, - hist_t** cuda_leaf_hist, const data_size_t num_data_in_smaller_leaf) { + hist_t** cuda_leaf_hist, + const data_size_t num_data_in_smaller_leaf) { const int block_dim_x = num_features_; // TODO(shiyu1994): only supports the case when the whole histogram can be loaded into shared memory const int block_dim_y = NUM_THRADS_PER_BLOCK / block_dim_x; const int min_grid_dim_y = 160; @@ -114,9 +115,6 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( //Log::Warning("smaller_leaf_num_data = %d", smaller_leaf_num_data); //Log::Warning("block_dim_x = %d, block_dim_y = %d", block_dim_x, block_dim_y); //Log::Warning("gid_dim_x = %d, grid_dim_y = %d", grid_dim_x, grid_dim_y); - if (num_data_in_smaller_leaf <= min_data_in_leaf_) { - return; - } dim3 grid_dim(grid_dim_x, 
grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); CUDAConstructHistogramKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 4fd0173f618b..e8196e91b60c 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -31,7 +31,7 @@ class CUDAHistogramConstructor { public: CUDAHistogramConstructor(const Dataset* train_data, const int num_leaves, const int num_threads, const score_t* cuda_gradients, const score_t* cuda_hessians, const std::vector& feature_hist_offsets, - const int min_data_in_leaf); + const int min_data_in_leaf, const double min_sum_hessian_in_leaf); void Init(const Dataset* train_data); @@ -39,7 +39,8 @@ class CUDAHistogramConstructor { const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** cuda_data_indices_in_larger_leaf, const double* cuda_smaller_leaf_sum_gradients, const double* cuda_smaller_leaf_sum_hessians, hist_t** cuda_smaller_leaf_hist, const double* cuda_larger_leaf_sum_gradients, const double* cuda_larger_leaf_sum_hessians, hist_t** cuda_larger_leaf_hist, - const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf); + const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, + const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf); void BeforeTrain(); @@ -86,7 +87,8 @@ class CUDAHistogramConstructor { const data_size_t* cuda_smaller_leaf_num_data, const data_size_t** cuda_data_indices_in_leaf, const data_size_t* cuda_leaf_num_data, - hist_t** cuda_leaf_hist, const data_size_t num_data_in_smaller_leaf); + hist_t** cuda_leaf_hist, + const data_size_t num_data_in_smaller_leaf); void LaunchSubtractHistogramKernel(const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, const double* smaller_leaf_sum_gradients, const double* smaller_leaf_sum_hessians, @@ -112,6 +114,7 @@ class CUDAHistogramConstructor { std::vector feature_hist_offsets_; std::vector feature_most_freq_bins_; const int min_data_in_leaf_; + const double min_sum_hessian_in_leaf_; // CUDA memory, held by this object uint32_t* cuda_feature_group_bin_offsets_; diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index a27fa834b766..f5e7edb89422 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -42,6 +42,10 @@ void CUDALeafSplits::Init() { AllocateCUDAMemory(1, &cuda_hist_in_leaf_); InitCUDAMemoryFromHostMemory(&cuda_leaf_index_, &leaf_index_, 1); + + cuda_streams_.resize(2); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); } void CUDALeafSplits::InitValues(const double* cuda_sum_of_gradients, const double* cuda_sum_of_hessians, @@ -67,14 +71,16 @@ void CUDALeafSplits::InitValues() { SynchronizeCUDADevice(); } -void CUDALeafSplits::InitValues(const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf) { +void CUDALeafSplits::InitValues(const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, + double* root_sum_hessians) { SetCUDAMemory(cuda_sum_of_gradients_, 0, num_blocks_init_from_gradients_); SetCUDAMemory(cuda_sum_of_hessians_, 0, num_blocks_init_from_gradients_); LaunchInitValuesKernal(); SetCUDAMemory(cuda_leaf_index_, 0, 1); - 
CopyFromHostToCUDADevice(cuda_data_indices_in_leaf_, &cuda_data_indices_in_leaf, 1); - CopyFromHostToCUDADevice(cuda_hist_in_leaf_, &cuda_hist_in_leaf, 1); - CopyFromHostToCUDADevice(cuda_num_data_in_leaf_, &num_data_, 1); + CopyFromHostToCUDADeviceAsync(cuda_data_indices_in_leaf_, &cuda_data_indices_in_leaf, 1, cuda_streams_[0]); + CopyFromHostToCUDADeviceAsync(cuda_hist_in_leaf_, &cuda_hist_in_leaf, 1, cuda_streams_[0]); + CopyFromHostToCUDADeviceAsync(cuda_num_data_in_leaf_, &num_data_, 1, cuda_streams_[0]); + CopyFromCUDADeviceToHostAsync(root_sum_hessians, cuda_sum_of_hessians_, 1, cuda_streams_[1]); SetCUDAMemory(cuda_gain_, 0, 1); SetCUDAMemory(cuda_leaf_value_, 0, 1); SynchronizeCUDADevice(); diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index 5af6c1ae5480..4aa26aab9af8 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -33,7 +33,8 @@ class CUDALeafSplits { const data_size_t* cuda_num_data_in_leaf, const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, const double* cuda_gain, const double* cuda_leaf_value); - void InitValues(const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf); + void InitValues(const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, + double* root_sum_hessians); void InitValues(); @@ -81,6 +82,7 @@ class CUDALeafSplits { const int num_data_; const int leaf_index_; int num_blocks_init_from_gradients_; + std::vector cuda_streams_; // CUDA memory, held by this object int* cuda_leaf_index_; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 2c70a8f5033a..030b67377e6e 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -36,7 +36,7 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, this->config_->num_leaves, num_threads_, cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), share_state_->feature_hist_offsets(), - config_->min_data_in_leaf)); + config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf)); cuda_histogram_constructor_->Init(train_data_); //cuda_histogram_constructor_->TestAfterInit(); @@ -69,6 +69,7 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia leaf_best_split_default_left_.resize(config_->num_leaves, 0); leaf_num_data_.resize(config_->num_leaves, 0); leaf_data_start_.resize(config_->num_leaves, 0); + leaf_sum_hessians_.resize(config_->num_leaves, 0.0f); } void NewCUDATreeLearner::BeforeTrain() { @@ -86,7 +87,9 @@ void NewCUDATreeLearner::BeforeTrain() { duration = static_cast>(end - start); global_timer.Stop("CUDACentralizedInfo::BeforeTrain"); //Log::Warning("cuda_centralized_info_->BeforeTrain duration = %f", duration.count()); - cuda_smaller_leaf_splits_->InitValues(cuda_data_partition_->cuda_data_indices(), cuda_histogram_constructor_->cuda_hist_pointer()); + cuda_smaller_leaf_splits_->InitValues(cuda_data_partition_->cuda_data_indices(), + cuda_histogram_constructor_->cuda_hist_pointer(), + &leaf_sum_hessians_[0]); cuda_larger_leaf_splits_->InitValues(); //cuda_smaller_leaf_splits_->Test(); start = std::chrono::steady_clock::now(); @@ -195,6 +198,10 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, //Log::Warning("Before ConstructHistogramForLeaf"); 
global_timer.Start("NewCUDATreeLearner::ConstructHistogramForLeaf"); auto start = std::chrono::steady_clock::now(); + const data_size_t num_data_in_smaller_leaf = leaf_num_data_[smaller_leaf_index_]; + const data_size_t num_data_in_larger_leaf = larger_leaf_index_ < 0 ? 0 : leaf_num_data_[larger_leaf_index_]; + const double sum_hessians_in_smaller_leaf = leaf_sum_hessians_[smaller_leaf_index_]; + const double sum_hessians_in_larger_leaf = larger_leaf_index_ < 0 ? 0 : leaf_sum_hessians_[larger_leaf_index_]; cuda_histogram_constructor_->ConstructHistogramForLeaf( cuda_smaller_leaf_splits_->cuda_leaf_index(), cuda_smaller_leaf_splits_->cuda_num_data_in_leaf(), @@ -208,7 +215,10 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, cuda_larger_leaf_splits_->cuda_sum_of_hessians_pointer(), cuda_larger_leaf_splits_->cuda_hist_in_leaf_pointer_pointer(), cuda_data_partition_->cuda_leaf_num_data(), - leaf_num_data_[smaller_leaf_index_]); + num_data_in_smaller_leaf, + num_data_in_larger_leaf, + sum_hessians_in_smaller_leaf, + sum_hessians_in_larger_leaf); /*if (i == 0) { cuda_histogram_constructor_->TestAfterConstructHistogram(); }*/ @@ -220,7 +230,9 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, global_timer.Start("NewCUDATreeLearner::FindBestSplitsForLeaf"); start = std::chrono::steady_clock::now(); cuda_best_split_finder_->FindBestSplitsForLeaf(cuda_smaller_leaf_splits_.get(), - cuda_larger_leaf_splits_.get(), smaller_leaf_index_, larger_leaf_index_); + cuda_larger_leaf_splits_.get(), smaller_leaf_index_, larger_leaf_index_, + num_data_in_smaller_leaf, num_data_in_larger_leaf, + sum_hessians_in_smaller_leaf, sum_hessians_in_larger_leaf); //Log::Warning("Before FindBestFromAllSplits"); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); @@ -235,6 +247,11 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); find_best_split_from_all_leaves_time += duration.count(); + + if (best_leaf_index_ == -1) { + Log::Warning("No further splits with positive gain, training stopped with %d leaves.", (i + 1)); + } + global_timer.Start("NewCUDATreeLearner::Split"); start = std::chrono::steady_clock::now(); cuda_data_partition_->Split(cuda_best_split_finder_->cuda_best_leaf(), @@ -254,6 +271,8 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, cuda_best_split_finder_->cuda_leaf_best_split_right_gain(), cuda_best_split_finder_->cuda_leaf_best_split_right_output(), + cuda_best_split_finder_->cuda_leaf_best_split_found(), + cuda_smaller_leaf_splits_->cuda_leaf_index_pointer(), cuda_smaller_leaf_splits_->cuda_sum_of_gradients_pointer(), cuda_smaller_leaf_splits_->cuda_sum_of_hessians_pointer(), @@ -272,6 +291,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, cuda_larger_leaf_splits_->cuda_hist_in_leaf_pointer_pointer(), &leaf_num_data_, &leaf_data_start_, + &leaf_sum_hessians_, leaf_best_split_feature_, leaf_best_split_threshold_, leaf_best_split_default_left_, diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index 31d34e83b114..fa9245a07d89 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -84,6 +84,7 @@ class NewCUDATreeLearner: public SerialTreeLearner { std::vector leaf_best_split_default_left_; std::vector leaf_num_data_; std::vector leaf_data_start_; + std::vector leaf_sum_hessians_; int smaller_leaf_index_; int larger_leaf_index_; int 
best_leaf_index_; diff --git a/src/treelearner/cuda/new_cuda_utils.hpp b/src/treelearner/cuda/new_cuda_utils.hpp index 0e7f041c80c8..74236c668bf7 100644 --- a/src/treelearner/cuda/new_cuda_utils.hpp +++ b/src/treelearner/cuda/new_cuda_utils.hpp @@ -34,6 +34,14 @@ void CopyFromHostToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size) { CUDASUCCESS_OR_FATAL(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyHostToDevice)); } +template +void CopyFromHostToCUDADeviceAsync(T* dst_ptr, const T* src_ptr, size_t size, cudaStream_t stream) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyHostToDevice, stream)); +} + template void InitCUDAMemoryFromHostMemory(T** dst_ptr, const T* src_ptr, size_t size) { AllocateCUDAMemory(size, dst_ptr); From 4072bb8ff9f22c666e6142723caf3fc9c3ce20ed Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 4 Jun 2021 08:00:43 +0000 Subject: [PATCH 021/166] support row wise with CUDA --- include/LightGBM/bin.h | 2 + include/LightGBM/train_share_states.h | 30 ++++ src/io/config.cpp | 4 +- src/io/multi_val_sparse_bin.hpp | 83 +++++++++ .../cuda/cuda_histogram_constructor.cpp | 166 +++++++++++++++++- .../cuda/cuda_histogram_constructor.cu | 96 +++++----- .../cuda/cuda_histogram_constructor.hpp | 31 +++- .../cuda/new_cuda_tree_learner.cpp | 11 +- .../cuda/new_cuda_tree_learner.hpp | 2 +- 9 files changed, 361 insertions(+), 64 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index e7ba45a83aa1..1c0ba699ef47 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -459,6 +459,8 @@ class MultiValBin { static constexpr double multi_val_bin_sparse_threshold = 0.25f; virtual MultiValBin* Clone() = 0; + + virtual const uint8_t* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const = 0; }; inline uint32_t BinMapper::ValueToBin(double value) const { diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h index b2a08ff413ff..15fbd7f86d52 100644 --- a/include/LightGBM/train_share_states.h +++ b/include/LightGBM/train_share_states.h @@ -125,6 +125,17 @@ class MultiValBinWrapper { is_subrow_copied_ = is_subrow_copied; } + const uint8_t* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { + if (multi_val_bin_ == nullptr) { + *bit_type = 0; + *total_size = 0; + *is_sparse = false; + return nullptr; + } else { + return multi_val_bin_->GetRowWiseData(bit_type, total_size, is_sparse); + } + } + private: bool is_use_subcol_ = false; bool is_use_subrow_ = false; @@ -164,7 +175,12 @@ struct TrainingShareStates { const std::vector& feature_hist_offsets() { return feature_hist_offsets_; } + const std::vector& column_hist_offsets() { return column_hist_offsets_; } + bool IsSparseRowwise() { + if (multi_val_bin_wrapper_ == nullptr) { + Log::Warning("in share states get row wise data, and multi_val_bin_wrapper_ == nullptr"); + } return (multi_val_bin_wrapper_ != nullptr && multi_val_bin_wrapper_->IsSparse()); } @@ -211,8 +227,22 @@ struct TrainingShareStates { } } + const uint8_t* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) { + if (multi_val_bin_wrapper_ != nullptr) { + Log::Warning("in share states get row wise data, and multi_val_bin_wrapper_ != nullptr"); + return multi_val_bin_wrapper_->GetRowWiseData(bit_type, total_size, is_sparse); + } else { + 
Log::Warning("in share states get row wise data, and multi_val_bin_wrapper_ == nullptr"); + *bit_type = 0; + *total_size = 0; + *is_sparse = false; + return nullptr; + } + } + private: std::vector feature_hist_offsets_; + std::vector column_hist_offsets_; int num_hist_total_bin_ = 0; std::unique_ptr multi_val_bin_wrapper_; std::vector> hist_buf_; diff --git a/src/io/config.cpp b/src/io/config.cpp index fbb9e339933f..86b67ec9c980 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -334,8 +334,8 @@ void Config::CheckParamConflict() { num_leaves = static_cast(full_num_leaves); } } - // force col-wise for gpu & CUDA - if (device_type == std::string("gpu") || device_type == std::string("cuda")) { + // force col-wise for gpu + if (device_type == std::string("gpu")/* || device_type == std::string("cuda")*/) { force_col_wise = true; force_row_wise = false; if (deterministic) { diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index 1699380732c6..58e063ba432b 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -290,6 +290,8 @@ class MultiValSparseBin : public MultiValBin { MultiValSparseBin* Clone() override; + const uint8_t* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const override; + private: data_size_t num_data_; int num_bin_; @@ -316,5 +318,86 @@ MultiValSparseBin* MultiValSparseBin::Clone() { return new MultiValSparseBin(*this); } +template <> +const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { + const uint8_t* to_return = data_.data(); + *bit_type = 8; + *total_size = data_.size(); + *is_sparse = true; + return to_return; +} + +template <> +const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 16; + *total_size = data_.size(); + *is_sparse = true; + return to_return; +} + +template <> +const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 32; + *total_size = data_.size(); + *is_sparse = true; + return to_return; +} + +template <> +const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { + const uint8_t* to_return = data_.data(); + *bit_type = 8; + *total_size = data_.size(); + *is_sparse = true; + return to_return; +} + +template <> +const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 16; + *total_size = data_.size(); + *is_sparse = true; + return to_return; +} + +template <> +const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 32; + *total_size = data_.size(); + *is_sparse = true; + return to_return; +} + +template <> +const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { + const uint8_t* to_return = data_.data(); + *bit_type = 8; + *total_size = data_.size(); + *is_sparse = true; + return to_return; +} + +template <> +const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + 
*bit_type = 16; + *total_size = data_.size(); + *is_sparse = true; + return to_return; +} + +template <> +const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 32; + *total_size = data_.size(); + *is_sparse = true; + return to_return; +} + } // namespace LightGBM #endif // LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 864607be76b7..39b76c1fe230 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -8,7 +8,7 @@ #include "cuda_histogram_constructor.hpp" -namespace LightGBM { +namespace LightGBM { CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, const int num_leaves, const int num_threads, @@ -47,11 +47,11 @@ void CUDAHistogramConstructor::BeforeTrain() { SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_); } -void CUDAHistogramConstructor::Init(const Dataset* train_data) { +void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStates* share_state) { // allocate CPU memory - data_.resize(num_data_ * num_feature_groups_, 0); + //data_.resize(num_data_ * num_feature_groups_, 0); // allocate GPU memory - AllocateCUDAMemory(num_feature_groups_ * num_data_, &cuda_data_); + //AllocateCUDAMemory(num_feature_groups_ * num_data_, &cuda_data_); AllocateCUDAMemory(num_total_bin_ * 2 * num_leaves_, &cuda_hist_); SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_); @@ -80,11 +80,13 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data) { InitCUDAValueFromConstant(&cuda_num_features_, num_features_); - InitCUDAData(train_data); + InitCUDAData(train_data, share_state); + + DivideCUDAFeatureGroups(train_data, share_state); } -void CUDAHistogramConstructor::InitCUDAData(const Dataset* train_data) { - std::vector> bin_iterators(num_feature_groups_); +void CUDAHistogramConstructor::InitCUDAData(const Dataset* train_data, TrainingShareStates* share_state) { + /*std::vector> bin_iterators(num_feature_groups_); #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int group_id = 0; group_id < num_feature_groups_; ++group_id) { bin_iterators[group_id].reset(train_data->FeatureGroupIterator(group_id)); @@ -93,8 +95,14 @@ void CUDAHistogramConstructor::InitCUDAData(const Dataset* train_data) { const uint32_t bin = static_cast(bin_iterators[group_id]->RawGet(data_index)); PushOneData(bin, group_id, data_index); } - } - CopyFromHostToCUDADevice(cuda_data_, data_.data(), data_.size()); + }*/ + uint8_t bit_type = 0; + size_t total_size = 0; + //CopyFromHostToCUDADevice(cuda_data_, data_.data(), data_.size()); + Log::Warning("share_state_->IsSparse() = %d", static_cast(share_state->IsSparseRowwise())); + const uint8_t* cpu_data_ptr = share_state->GetRowWiseData(&bit_type, &total_size, &is_sparse_); + CHECK_EQ(bit_type, 8); + InitCUDAMemoryFromHostMemory(&cuda_data_uint8_t_, cpu_data_ptr, total_size); SynchronizeCUDADevice(); } @@ -138,6 +146,146 @@ void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller CopyFromCUDADeviceToHost(cpu_hist.data(), cuda_hist_, 6143 * 2);*/ } +void CUDAHistogramConstructor::CalcConstructHistogramKernelDim( + int* grid_dim_x, int* grid_dim_y, int* block_dim_x, int* block_dim_y, + const data_size_t num_data_in_smaller_leaf) { + *block_dim_x = 
max_num_column_per_partition_; + *block_dim_y = NUM_THRADS_PER_BLOCK / max_num_column_per_partition_; + *grid_dim_x = num_feature_partitions_; + const int min_grid_dim_y = 160; + *grid_dim_y = std::max(min_grid_dim_y, + ((num_data_in_smaller_leaf + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + (*block_dim_y) - 1) / (*block_dim_y)); +} + +void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state) { + const uint32_t max_num_bin_per_partition = SHRAE_HIST_SIZE / 2; + const std::vector& feature_hist_offsets = share_state->feature_hist_offsets(); + const std::vector& column_hist_offsets = share_state->column_hist_offsets(); + std::vector feature_group_num_feature_offsets; + int offsets = 0; + int prev_group_index = -1; + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + const int feature_group_index = train_data->Feature2Group(feature_index); + if (prev_group_index == -1 || feature_group_index != prev_group_index) { + feature_group_num_feature_offsets.emplace_back(offsets); + } + ++offsets; + } + CHECK_EQ(offsets, num_features_); + feature_group_num_feature_offsets.emplace_back(offsets); + + uint32_t start_hist_offset = 0; + uint32_t column_start_hist_offset = 0; + feature_partition_hist_offsets_.clear(); + feature_partition_feature_index_offsets_.clear(); + feature_partition_column_index_offsets_.clear(); + column_hist_offsets_.clear(); + column_hist_offsets_full_.clear(); + feature_partition_hist_offsets_.emplace_back(0); + feature_partition_feature_index_offsets_.emplace_back(0); + feature_partition_column_index_offsets_.emplace_back(0); + column_hist_offsets_full_.emplace_back(0); + const int num_feature_groups = train_data->num_feature_groups(); + int column_index = 0; + num_feature_partitions_ = 0; + for (int feature_group_index = 0; feature_group_index < num_feature_groups; ++feature_group_index) { + const int group_feature_index_start = feature_group_num_feature_offsets[feature_group_index]; + const int group_feature_index_end = feature_group_num_feature_offsets[feature_group_index + 1]; + const int num_features_in_group = group_feature_index_end - group_feature_index_start; + if (!train_data->IsMultiGroup(feature_group_index)) { + const uint32_t group_feature_hist_start = feature_hist_offsets[group_feature_index_start]; + const uint32_t group_feature_hist_end = feature_hist_offsets[group_feature_index_end]; + const uint32_t num_bin_in_dense_group = group_feature_hist_end - group_feature_hist_start; + if (num_bin_in_dense_group > max_num_bin_per_partition) { + Log::Fatal("Too many bins in a dense feature group."); + } + const uint32_t cur_hist_num_bin = group_feature_hist_end - start_hist_offset; + if (cur_hist_num_bin > max_num_bin_per_partition) { + feature_partition_hist_offsets_.emplace_back(group_feature_hist_start); + feature_partition_feature_index_offsets_.emplace_back(group_feature_index_start); + feature_partition_column_index_offsets_.emplace_back(column_index); + start_hist_offset = group_feature_hist_start; + column_start_hist_offset = column_hist_offsets[column_index]; + column_hist_offsets_full_.emplace_back(column_start_hist_offset); + ++num_feature_partitions_; + } + column_hist_offsets_.emplace_back(column_hist_offsets[column_index] - column_start_hist_offset); + if (feature_group_index == num_feature_groups - 1) { + CHECK_EQ(group_feature_index_end, num_features_); + feature_partition_hist_offsets_.emplace_back(group_feature_hist_end); + 
feature_partition_feature_index_offsets_.emplace_back(group_feature_index_end); + feature_partition_column_index_offsets_.emplace_back(column_index + 1); + column_hist_offsets_full_.emplace_back(column_hist_offsets.back()); + ++num_feature_partitions_; + } + ++column_index; + } else { + for (int sub_feature_index = 0; sub_feature_index < num_features_in_group; ++sub_feature_index) { + const int feature_index = group_feature_index_start + sub_feature_index; + const uint32_t feature_hist_start = feature_hist_offsets[feature_index]; + const uint32_t feature_hist_end = feature_hist_offsets[feature_index + 1]; + const uint32_t cur_hist_num_bin = feature_hist_end - start_hist_offset; + if (cur_hist_num_bin > max_num_bin_per_partition) { + feature_partition_hist_offsets_.emplace_back(feature_hist_start); + feature_partition_feature_index_offsets_.emplace_back(feature_index); + feature_partition_column_index_offsets_.emplace_back(column_index); + start_hist_offset = feature_hist_start; + column_start_hist_offset = column_hist_offsets[column_index]; + column_hist_offsets_full_.emplace_back(column_start_hist_offset); + ++num_feature_partitions_; + } + column_hist_offsets_.emplace_back(column_hist_offsets[column_index] - column_start_hist_offset); + if (feature_group_index == num_feature_groups - 1 && sub_feature_index == num_features_in_group - 1) { + CHECK_EQ(feature_index, num_features_ - 1); + feature_partition_hist_offsets_.emplace_back(feature_hist_end); + feature_partition_feature_index_offsets_.emplace_back(feature_index + 1); + feature_partition_column_index_offsets_.emplace_back(column_index + 1); + column_hist_offsets_full_.emplace_back(column_hist_offsets.back()); + ++num_feature_partitions_; + } + ++column_index; + } + } + } + max_num_column_per_partition_ = 0; + for (size_t i = 0; i < feature_partition_column_index_offsets_.size() - 1; ++i) { + const int num_column = feature_partition_column_index_offsets_[i + 1] - feature_partition_column_index_offsets_[i]; + if (num_column > max_num_column_per_partition_) { + max_num_column_per_partition_ = num_column; + } + } + + for (size_t i = 0; i < column_hist_offsets_.size(); ++i) { + Log::Warning("column_hist_offsets[%d] = %d, feature_group_bin_offsets_[%d] = %d", + i, column_hist_offsets_[i], i, feature_group_bin_offsets_[i]); + } + + CHECK_EQ(feature_partition_column_index_offsets_.size(), feature_partition_hist_offsets_.size()); + CHECK_EQ(feature_partition_column_index_offsets_.size(), feature_partition_feature_index_offsets_.size()); + for (size_t i = 0; i < feature_partition_column_index_offsets_.size(); ++i) { + Log::Warning("%d column %d hist %d feature %d", i, + feature_partition_column_index_offsets_[i], + feature_partition_feature_index_offsets_[i], + feature_partition_hist_offsets_[i]); + } + + InitCUDAMemoryFromHostMemory(&cuda_feature_partition_hist_offsets_, + feature_partition_hist_offsets_.data(), + feature_partition_hist_offsets_.size()); + + InitCUDAMemoryFromHostMemory(&cuda_feature_partition_column_index_offsets_, + feature_partition_column_index_offsets_.data(), + feature_partition_column_index_offsets_.size()); + + InitCUDAMemoryFromHostMemory(&cuda_column_hist_offsets_, + column_hist_offsets_.data(), + column_hist_offsets_.size()); + + InitCUDAMemoryFromHostMemory(&cuda_column_hist_offsets_full_, + column_hist_offsets_full_.data(), + column_hist_offsets_full_.size()); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu 
b/src/treelearner/cuda/cuda_histogram_constructor.cu index 7b90681cb4f1..095fd23a36a3 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -47,54 +47,67 @@ __device__ void PrefixSum(hist_t* elements, unsigned int n) { } } -__global__ void CUDAConstructHistogramKernel(const int* leaf_index, - const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t** data_indices_ptr, hist_t** feature_histogram, const int* num_feature_groups, - const data_size_t* leaf_num_data, const uint8_t* data, const uint32_t* feature_group_offsets, - const int* /*cuda_num_total_bin*/) { - const unsigned int threadIdx_x = threadIdx.x; +__global__ void CUDAConstructHistogramKernel( + const int* leaf_index, + const score_t* cuda_gradients, + const score_t* cuda_hessians, + const data_size_t** data_indices_ptr, + hist_t** feature_histogram, + const int* num_feature_groups, + const data_size_t* leaf_num_data, + const uint8_t* data, + const uint32_t* column_hist_offsets, + const uint32_t* column_hist_offsets_full, + const int* feature_partition_column_index_offsets) { + const int num_feature_groups_ref = *num_feature_groups; const int leaf_index_ref = *leaf_index; - const unsigned int blockDim_y = blockDim.y; - const int dim_y = gridDim.y * blockDim_y; - hist_t* feature_histogram_ptr = *feature_histogram; + const int dim_y = static_cast(gridDim.y * blockDim.y); const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf_ref + dim_y - 1) / dim_y; const data_size_t* data_indices_ref = *data_indices_ptr; - __shared__ float shared_hist[SHRAE_HIST_SIZE]; // 256 * 24 * 2, can use 24 features - uint32_t num_bins_in_col_group = feature_group_offsets[blockDim.x]; - const uint32_t num_items_per_thread = (2 * num_bins_in_col_group + NUM_THRADS_PER_BLOCK - 1) / NUM_THRADS_PER_BLOCK; - const int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + __shared__ float shared_hist[SHRAE_HIST_SIZE]; + const unsigned int num_threads_per_block = blockDim.x * blockDim.y; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; + const uint32_t num_bins_in_partition = partition_hist_end - partition_hist_start; + const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; + const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1]; + const int num_columns_in_partition = partition_column_end - partition_column_start; + const uint32_t num_items_per_thread = (2 * num_bins_in_partition + num_threads_per_block - 1) / num_threads_per_block; + const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; const uint32_t thread_start = thread_idx * num_items_per_thread; - const uint32_t thread_end = thread_start + num_items_per_thread > num_bins_in_col_group * 2 ? - num_bins_in_col_group * 2 : thread_start + num_items_per_thread; - const uint32_t feature_group_offset = feature_group_offsets[threadIdx_x]; + const uint32_t thread_end = thread_start + num_items_per_thread > num_bins_in_partition * 2 ? 
+ num_bins_in_partition * 2 : thread_start + num_items_per_thread; for (uint32_t i = thread_start; i < thread_end; ++i) { shared_hist[i] = 0.0f; } - float* shared_hist_ptr = shared_hist + (feature_group_offset << 1); __syncthreads(); const unsigned int threadIdx_y = threadIdx.y; const unsigned int blockIdx_y = blockIdx.y; - const data_size_t block_start = (blockIdx_y * blockDim_y) * num_data_per_thread; + const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; - data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf_ref - block_start, num_data_per_thread * static_cast(blockDim_y))); - const data_size_t num_iteration_total = (block_num_data + blockDim_y - 1) / blockDim_y; - const data_size_t remainder = block_num_data % blockDim_y; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf_ref - block_start, num_data_per_thread * static_cast(blockDim.y))); + const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; + const data_size_t remainder = block_num_data % blockDim.y; const data_size_t num_iteration_this = remainder == 0 ? num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); data_size_t inner_data_index = static_cast(threadIdx_y); - for (data_size_t i = 0; i < num_iteration_this; ++i) { - const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; - const score_t grad = cuda_gradients[data_index]; - const score_t hess = cuda_hessians[data_index]; - const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + threadIdx_x]); - const uint32_t pos = bin << 1; - float* pos_ptr = shared_hist_ptr + pos; - atomicAdd_system(pos_ptr, grad); - atomicAdd_system(pos_ptr + 1, hess); - inner_data_index += blockDim_y; - } + float* shared_hist_ptr = shared_hist + (column_hist_offsets[threadIdx.x] << 1); + //if (threadIdx.x < static_cast(num_columns_in_partition)) { + for (data_size_t i = 0; i < num_iteration_this; ++i) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; + const score_t grad = cuda_gradients[data_index]; + const score_t hess = cuda_hessians[data_index]; + const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + threadIdx.x/* + partition_column_start*/]); + const uint32_t pos = bin << 1; + float* pos_ptr = shared_hist_ptr + pos; + atomicAdd_system(pos_ptr, grad); + atomicAdd_system(pos_ptr + 1, hess); + inner_data_index += blockDim.y; + } + //} __syncthreads(); + hist_t* feature_histogram_ptr = (*feature_histogram) + (partition_hist_start << 1); for (uint32_t i = thread_start; i < thread_end; ++i) { atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); } @@ -107,19 +120,18 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( const data_size_t* cuda_leaf_num_data, hist_t** cuda_leaf_hist, const data_size_t num_data_in_smaller_leaf) { - const int block_dim_x = num_features_; // TODO(shiyu1994): only supports the case when the whole histogram can be loaded into shared memory - const int block_dim_y = NUM_THRADS_PER_BLOCK / block_dim_x; - const int min_grid_dim_y = 160; - const int grid_dim_y = std::max(min_grid_dim_y, ((num_data_in_smaller_leaf + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + block_dim_y - 1) / block_dim_y); - const int grid_dim_x = (static_cast(num_feature_groups_ + NUM_FEATURE_PER_THREAD_GROUP - 1) / NUM_FEATURE_PER_THREAD_GROUP); - //Log::Warning("smaller_leaf_num_data = %d", 
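// [Editorial note, hedged; not part of the original patch] Read as pseudo-steps, the rewritten
// CUDAConstructHistogramKernel above has each block cover one column partition (blockIdx.x) and one
// slice of the leaf's rows (blockIdx.y):
//
//   1. cooperatively zero the block's shared float histogram
//      (2 * num_bins_in_partition entries, gradient and hessian per bin);
//   2. each thread walks its assigned rows, reads bin = data[row * row_width + threadIdx.x] for its
//      column, and atomicAdd's that row's gradient/hessian into the shared histogram starting at
//      column_hist_offsets[threadIdx.x] * 2;
//   3. after __syncthreads(), the shared histogram is flushed with atomicAdd into the global leaf
//      histogram at the partition's absolute offset, since several y-blocks write the same partition.
//
// This is the standard shared-memory histogram pattern; only the per-column and per-partition
// offsets (column_hist_offsets / column_hist_offsets_full) are specific to this patch.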
smaller_leaf_num_data); - //Log::Warning("block_dim_x = %d, block_dim_y = %d", block_dim_x, block_dim_y); - //Log::Warning("gid_dim_x = %d, grid_dim_y = %d", grid_dim_x, grid_dim_y); + int grid_dim_x = 0; + int grid_dim_y = 0; + int block_dim_x = 0; + int block_dim_y = 0; + CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_in_smaller_leaf); dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); CUDAConstructHistogramKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_, - cuda_feature_group_bin_offsets_, cuda_num_total_bin_); + cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, + cuda_column_hist_offsets_, + cuda_column_hist_offsets_full_, + cuda_feature_partition_column_index_offsets_); } __global__ void SubtractHistogramKernel(const int* /*cuda_smaller_leaf_index*/, diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index e8196e91b60c..f2c6edb46e6f 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -33,7 +33,7 @@ class CUDAHistogramConstructor { const score_t* cuda_gradients, const score_t* cuda_hessians, const std::vector& feature_hist_offsets, const int min_data_in_leaf, const double min_sum_hessian_in_leaf); - void Init(const Dataset* train_data); + void Init(const Dataset* train_data, TrainingShareStates* share_state); void ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const data_size_t* cuda_num_data_in_smaller_leaf, const int* cuda_larger_leaf_index, const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** cuda_data_indices_in_larger_leaf, @@ -50,14 +50,14 @@ class CUDAHistogramConstructor { hist_t* cuda_hist_pointer() { return cuda_hist_; } - const uint8_t* cuda_data() const { return cuda_data_; } + const uint8_t* cuda_data() const { return cuda_data_uint8_t_; } void TestAfterInit() { - std::vector test_data(data_.size(), 0); + /*std::vector test_data(data_.size(), 0); CopyFromCUDADeviceToHost(test_data.data(), cuda_data_, data_.size()); for (size_t i = 0; i < 100; ++i) { Log::Warning("CUDAHistogramConstructor::TestAfterInit test_data[%d] = %d", i, test_data[i]); - } + }*/ } void TestAfterConstructHistogram() { @@ -83,6 +83,9 @@ class CUDAHistogramConstructor { const data_size_t num_data_in_leaf, const data_size_t** cuda_data_indices_in_leaf); + void CalcConstructHistogramKernelDim(int* grid_dim_x, int* grid_dim_y, int* block_dim_x, int* block_dim_y, + const data_size_t num_data_in_smaller_leaf); + void LaunchConstructHistogramKernel(const int* cuda_leaf_index, const data_size_t* cuda_smaller_leaf_num_data, const data_size_t** cuda_data_indices_in_leaf, @@ -95,10 +98,12 @@ class CUDAHistogramConstructor { const double* larger_leaf_sum_gradients, const double* larger_leaf_sum_hessians, hist_t** cuda_smaller_leaf_hist, hist_t** cuda_larger_leaf_hist); - void InitCUDAData(const Dataset* train_data); + void InitCUDAData(const Dataset* train_data, TrainingShareStates* share_state); void PushOneData(const uint32_t feature_bin_value, const int feature_group_id, const data_size_t data_index); + void DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state); + // Host memory // data on CPU, stored in row-wise style const data_size_t 
num_data_; @@ -115,6 +120,14 @@ class CUDAHistogramConstructor { std::vector feature_most_freq_bins_; const int min_data_in_leaf_; const double min_sum_hessian_in_leaf_; + std::vector feature_partition_hist_offsets_; + std::vector feature_partition_feature_index_offsets_; + std::vector feature_partition_column_index_offsets_; + std::vector column_hist_offsets_; + std::vector column_hist_offsets_full_; + bool is_sparse_; + int num_feature_partitions_; + int max_num_column_per_partition_; // CUDA memory, held by this object uint32_t* cuda_feature_group_bin_offsets_; @@ -125,10 +138,16 @@ class CUDAHistogramConstructor { hist_t* cuda_hist_; int* cuda_num_total_bin_; int* cuda_num_feature_groups_; - uint8_t* cuda_data_; + uint8_t* cuda_data_uint8_t_; + uint16_t* cuda_data_uint16_t_; + uint32_t* cuda_data_uint32_t_; int* cuda_num_features_; score_t* cuda_ordered_gradients_; score_t* cuda_ordered_hessians_; + uint32_t* cuda_feature_partition_hist_offsets_; + int* cuda_feature_partition_column_index_offsets_; + uint32_t* cuda_column_hist_offsets_; + uint32_t* cuda_column_hist_offsets_full_; // CUDA memory, held by other objects const score_t* cuda_gradients_; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 030b67377e6e..26acf417328e 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -37,7 +37,7 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, this->config_->num_leaves, num_threads_, cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), share_state_->feature_hist_offsets(), config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf)); - cuda_histogram_constructor_->Init(train_data_); + cuda_histogram_constructor_->Init(train_data_, share_state_.get()); //cuda_histogram_constructor_->TestAfterInit(); cuda_data_partition_.reset(new CUDADataPartition(num_data_, num_features_, this->config_->num_leaves, num_threads_, @@ -134,7 +134,7 @@ void NewCUDATreeLearner::AddPredictionToScore(const Tree* /*tree*/, double* out_ Log::Warning("AddPredictionToScore time %f", duration); } -Tree* NewCUDATreeLearner::BuildTree() { +Tree* NewCUDATreeLearner::BuildTree(const int num_leaves) { std::unique_ptr tree(new Tree(config_->num_leaves, false, false)); std::vector leaf_index(config_->num_leaves); std::vector inner_feature_index(config_->num_leaves); @@ -158,7 +158,7 @@ Tree* NewCUDATreeLearner::BuildTree() { CopyFromCUDADeviceToHost(right_sum_hessian.data(), cuda_data_partition_->tree_right_sum_hessian(), config_->num_leaves); CopyFromCUDADeviceToHost(gain.data(), cuda_data_partition_->tree_gain(), config_->num_leaves); CopyFromCUDADeviceToHost(default_left.data(), cuda_data_partition_->tree_default_left(), config_->num_leaves); - for (int i = 0; i < config_->num_leaves - 1; ++i) { + for (int i = 0; i < num_leaves - 1; ++i) { tree->Split( leaf_index[i], inner_feature_index[i], @@ -194,6 +194,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, double split_data_indices_time = 0.0f; double split_tree_time = 0.0f; //std::unique_ptr tree(new Tree(config_->num_leaves, false, false)); + int num_leaves = 1; for (int i = 0; i < config_->num_leaves - 1; ++i) { //Log::Warning("Before ConstructHistogramForLeaf"); global_timer.Start("NewCUDATreeLearner::ConstructHistogramForLeaf"); @@ -250,6 +251,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* 
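// [Editorial note, hedged; not part of the original patch] The changes in this hunk work together
// with the new num_leaves counter in Train(): when no split with positive gain remains, the loop now
// breaks instead of continuing, and BuildTree(num_leaves) replays only the splits that were actually
// performed. For example, with config_->num_leaves == 31 but only 9 successful splits, num_leaves
// ends at 10 and the reconstruction loop runs i = 0..8 instead of i = 0..29 over partly
// uninitialized split records.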
gradients, if (best_leaf_index_ == -1) { Log::Warning("No further splits with positive gain, training stopped with %d leaves.", (i + 1)); + break; } global_timer.Start("NewCUDATreeLearner::Split"); @@ -305,12 +307,13 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, duration = static_cast>(end - start); global_timer.Stop("NewCUDATreeLearner::Split"); split_data_indices_time += duration.count(); + ++num_leaves; } const auto end = std::chrono::steady_clock::now(); const double duration = (static_cast>(end - start)).count(); const auto build_tree_start = std::chrono::steady_clock::now(); //Log::Warning("Before BuildTree"); - std::unique_ptr tree(BuildTree()); + std::unique_ptr tree(BuildTree(num_leaves)); const auto build_tree_end = std::chrono::steady_clock::now(); const auto build_tre_duration = (static_cast>(build_tree_end - build_tree_start)).count(); Log::Warning("Train time %f", duration); diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index fa9245a07d89..547e8a5f062e 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -55,7 +55,7 @@ class NewCUDATreeLearner: public SerialTreeLearner { void BeforeTrain() override; - Tree* BuildTree(); + Tree* BuildTree(const int num_leaves); // number of GPUs int num_gpus_; From 88ecde91328e8393877db32f80408e03cc79692c Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 4 Jun 2021 13:03:58 +0000 Subject: [PATCH 022/166] copy data for split by column --- include/LightGBM/bin.h | 2 + include/LightGBM/dataset.h | 7 +++ include/LightGBM/feature_group.h | 13 ++++ src/io/dataset.cpp | 9 +++ src/io/dense_bin.hpp | 39 ++++++++++++ src/io/multi_val_dense_bin.hpp | 34 +++++++++++ src/io/sparse_bin.hpp | 26 ++++++++ src/io/train_share_states.cpp | 4 ++ .../cuda/cuda_best_split_finder.cpp | 24 +++++++- src/treelearner/cuda/cuda_data_partition.cpp | 59 +++++++++++++++--- src/treelearner/cuda/cuda_data_partition.hpp | 9 +-- .../cuda/cuda_histogram_constructor.cpp | 61 +++++-------------- .../cuda/cuda_histogram_constructor.cu | 13 ++-- .../cuda/cuda_histogram_constructor.hpp | 3 - .../cuda/new_cuda_tree_learner.cpp | 15 ++--- 15 files changed, 244 insertions(+), 74 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 1c0ba699ef47..64d1edb18753 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -386,6 +386,8 @@ class Bin { * \brief Deep copy the bin */ virtual Bin* Clone() = 0; + + virtual const uint8_t* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const = 0; }; diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 61989e221bcc..6c421e096c2f 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -565,6 +565,13 @@ class Dataset { return feature_groups_[group]->FeatureGroupData(); } + const uint8_t* GetColWiseData( + const int feature_group_index, + const int sub_feature_index, + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const; + inline double RealThreshold(int i, uint32_t threshold) const { const int group = feature2group_[i]; const int sub_feature = feature2subfeature_[i]; diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index 285667c70518..699d6d138b93 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -478,6 +478,19 @@ class FeatureGroup { } } + const uint8_t* GetColWiseData(const int sub_feature_index, + uint8_t* bit_type, 
+ bool* is_sparse, + BinIterator** bin_iterator) const { + if (sub_feature_index >= 0) { + CHECK(is_multi_val_); + return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator); + } else { + CHECK(!is_multi_val_); + return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator); + } + } + private: void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { if (is_multi_val) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 51879351178e..109fb8b804c7 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1471,4 +1471,13 @@ void Dataset::AddFeaturesFrom(Dataset* other) { } } +const uint8_t* Dataset::GetColWiseData( + const int feature_group_index, + const int sub_feature_index, + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + return feature_groups_[feature_group_index]->GetColWiseData(sub_feature_index, bit_type, is_sparse, bin_iterator); +} + } // namespace LightGBM diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 20ffd724e34c..a6d1d1dbda83 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -461,6 +461,8 @@ class DenseBin : public Bin { DenseBin* Clone() override; + const uint8_t* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const override; + private: data_size_t num_data_; #ifdef USE_CUDA @@ -479,6 +481,43 @@ DenseBin* DenseBin::Clone() { return new DenseBin(*this); } +template +const uint8_t* DenseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { + *is_sparse = false; +} + +template <> +const uint8_t* DenseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { + *is_sparse = false; + *bit_type = 8; + *bin_iterator = nullptr; + return data_.data(); +} + +template <> +const uint8_t* DenseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { + *is_sparse = false; + *bit_type = 16; + *bin_iterator = nullptr; + return reinterpret_cast(data_.data()); +} + +template <> +const uint8_t* DenseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { + *is_sparse = false; + *bit_type = 32; + *bin_iterator = nullptr; + return reinterpret_cast(data_.data()); +} + +template <> +const uint8_t* DenseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { + *is_sparse = false; + *bit_type = 4; + *bin_iterator = nullptr; + return data_.data(); +} + template uint32_t DenseBinIterator::Get(data_size_t idx) { auto ret = bin_data_->data(idx); diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index 9559e38b7f72..20e9ed748a7e 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -210,6 +210,8 @@ class MultiValDenseBin : public MultiValBin { MultiValDenseBin* Clone() override; + const uint8_t* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const override; + private: data_size_t num_data_; int num_bin_; @@ -228,5 +230,37 @@ MultiValDenseBin* MultiValDenseBin::Clone() { return new MultiValDenseBin(*this); } +template <> +const uint8_t* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { + const uint8_t* to_return = data_.data(); + *bit_type = 8; + *total_size = static_cast(num_data_) * static_cast(num_feature_); + CHECK_EQ(*total_size, data_.size()); + *is_sparse = false; + return to_return; +} + +template <> +const uint8_t* 
MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { + const uint16_t* data_ptr = data_.data(); + const uint8_t* to_return = reinterpret_cast(data_ptr); + *bit_type = 16; + *total_size = static_cast(num_data_) * static_cast(num_feature_); + CHECK_EQ(*total_size, data_.size()); + *is_sparse = false; + return to_return; +} + +template <> +const uint8_t* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { + const uint32_t* data_ptr = data_.data(); + const uint8_t* to_return = reinterpret_cast(data_ptr); + *bit_type = 32; + *total_size = static_cast(num_data_) * static_cast(num_feature_); + CHECK_EQ(*total_size, data_.size()); + *is_sparse = false; + return to_return; +} + } // namespace LightGBM #endif // LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index e4259a48862b..bd285514e4fd 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -620,6 +620,8 @@ class SparseBin : public Bin { } } + const uint8_t* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const override; + private: data_size_t num_data_; std::vector> @@ -636,6 +638,30 @@ SparseBin* SparseBin::Clone() { return new SparseBin(*this); } +template <> +const uint8_t* SparseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { + *is_sparse = true; + *bit_type = 8; + *bin_iterator = new SparseBinIterator(this, 0); + return nullptr; +} + +template <> +const uint8_t* SparseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { + *is_sparse = true; + *bit_type = 8; + *bin_iterator = new SparseBinIterator(this, 0); + return nullptr; +} + +template <> +const uint8_t* SparseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { + *is_sparse = true; + *bit_type = 8; + *bin_iterator = new SparseBinIterator(this, 0); + return nullptr; +} + template inline uint32_t SparseBinIterator::RawGet(data_size_t idx) { return InnerRawGet(idx); diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index 478c520f1c68..73273a10095e 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -382,6 +382,10 @@ void TrainingShareStates::CalcBinOffsets(const std::vector(feature_hist_offsets_.back()); } + column_hist_offsets_ = *offsets; + for (size_t i = 0; i < column_hist_offsets_.size(); ++i) { + Log::Warning("column_hist_offsets_[%d] = %d", i, column_hist_offsets_[i]); + } } void TrainingShareStates::SetMultiValBin(MultiValBin* bin, data_size_t num_data, diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 686607a487b9..d019c2e8b293 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -60,10 +60,32 @@ void CUDABestSplitFinder::Init() { AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_output_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_found_); - AllocateCUDAMemory(feature_hist_offsets_.size(), &cuda_feature_hist_offsets_); + + for (size_t i = 0; i < feature_hist_offsets_.size(); ++i) { + Log::Warning("feature_hist_offsets_[%d] = %d", i, feature_hist_offsets_[i]); + } + + Log::Warning("before allocate feature hist offsets"); + Log::Warning("before allocate feature_hist_offsets_.size() = %d", feature_hist_offsets_.size()); + std::vector non_sense(1000); + 
uint32_t* cuda_non_sense = nullptr; + Log::Warning("step 0"); + AllocateCUDAMemory(non_sense.size(), &cuda_non_sense); + Log::Warning("step 1"); + CopyFromHostToCUDADevice(cuda_non_sense, non_sense.data(), non_sense.size()); + Log::Warning("step 2"); + AllocateCUDAMemory(feature_hist_offsets_.size() * 2, &cuda_feature_hist_offsets_); + Log::Warning("step 3"); + PrintLastCUDAError(); + Log::Warning("before copy feature hist offsets"); + Log::Warning("after allocate feature_hist_offsets_.size() = %d", feature_hist_offsets_.size()); CopyFromHostToCUDADevice(cuda_feature_hist_offsets_, feature_hist_offsets_.data(), feature_hist_offsets_.size()); + PrintLastCUDAError(); + //InitCUDAMemoryFromHostMemory(&cuda_feature_hist_offsets_, feature_hist_offsets_.data(), feature_hist_offsets_.size()); + Log::Warning("after copy feature hist offsets"); AllocateCUDAMemory(feature_mfb_offsets_.size(), &cuda_feature_mfb_offsets_); + PrintLastCUDAError(); CopyFromHostToCUDADevice(cuda_feature_mfb_offsets_, feature_mfb_offsets_.data(), feature_mfb_offsets_.size()); AllocateCUDAMemory(feature_default_bins_.size(), &cuda_feature_default_bins_); diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 4239f3bd8cd1..3b68ad446bf0 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -11,11 +11,11 @@ namespace LightGBM { CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_features, const int num_leaves, - const int num_threads, const data_size_t* cuda_num_data, const int* cuda_num_leaves, const uint8_t* cuda_data, + const int num_threads, const data_size_t* cuda_num_data, const int* cuda_num_leaves, const int* cuda_num_features, const std::vector& feature_hist_offsets, const Dataset* train_data, hist_t* cuda_hist): num_data_(num_data), num_features_(num_features), num_leaves_(num_leaves), num_threads_(num_threads), - num_total_bin_(feature_hist_offsets.back()), cuda_data_(cuda_data), cuda_num_features_(cuda_num_features), + num_total_bin_(feature_hist_offsets.back()), cuda_num_features_(cuda_num_features), cuda_hist_(cuda_hist) { cuda_num_data_ = cuda_num_data; cuda_num_leaves_ = cuda_num_leaves; @@ -41,8 +41,6 @@ CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_f const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); feature_default_bins_[feature_index] = bin_mapper->GetDefaultBin(); feature_most_freq_bins_[feature_index] = bin_mapper->GetMostFreqBin(); - /*Log::Warning("feature_index = %d, feature_hist_offsets[feature_index] = %d, prev_group_bins = %d", - feature_index, feature_hist_offsets[feature_index], prev_group_bins);*/ feature_min_bins_[feature_index] = feature_hist_offsets[feature_index] - prev_group_bins; feature_max_bins_[feature_index] = feature_hist_offsets[feature_index + 1] - prev_group_bins - 1; const MissingType missing_type = bin_mapper->missing_type(); @@ -75,7 +73,7 @@ CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_f num_data_in_leaf_[0] = num_data_; } -void CUDADataPartition::Init() { +void CUDADataPartition::Init(const Dataset* train_data) { // allocate CUDA memory AllocateCUDAMemory(static_cast(num_data_), &cuda_data_indices_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_data_start_); @@ -118,7 +116,7 @@ void CUDADataPartition::Init() { AllocateCUDAMemory(static_cast(num_data_) * static_cast(num_features_), &cuda_data_col_wise_); - CopyColWiseData(); + 
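// [Editorial sketch, hedged; not part of the original patch] CopyColWiseData(train_data) below pulls
// each dense feature group's column out of the Dataset through the new GetColWiseData() API and
// uploads it to the device at its native bit width; sparse columns instead hand back a BinIterator,
// and that branch is only stubbed out in this patch. The one non-trivial dense case is bit_type == 4,
// where two bin values share a byte and are unpacked to one uint8_t per row before the copy. The
// unpack expression used in the function below,
//
//     (column_data[i >> 1] >> ((i & 1) << 2)) & 0xf
//
// reads element i from the packed array: e.g. for column_data[0] == 0xBA, i = 0 yields 0xA
// (the low nibble) and i = 1 yields 0xB (the high nibble).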
CopyColWiseData(train_data); cpu_train_data_score_tmp_.resize(num_data_, 0.0f); cpu_split_info_buffer_.resize(6, 0); @@ -135,8 +133,53 @@ void CUDADataPartition::Init() { AllocateCUDAMemory(max_num_blocks_in_debug, &cuda_hessians_sum_buffer_); } -void CUDADataPartition::CopyColWiseData() { - LaunchCopyColWiseDataKernel(); +void CUDADataPartition::CopyColWiseData(const Dataset* train_data) { + const int num_feature_group = train_data->num_feature_groups(); + for (int feature_group_index = 0; feature_group_index < num_feature_group; ++feature_group_index) { + if (!train_data->IsMultiGroup(feature_group_index)) { + uint8_t bit_type = 0; + bool is_sparse = false; + BinIterator* bin_iterator = nullptr; + const uint8_t* column_data = train_data->GetColWiseData(feature_group_index, -1, &bit_type, &is_sparse, &bin_iterator); + void* cuda_column_data = nullptr; + if (column_data != nullptr) { + CHECK(!is_sparse); + if (bit_type == 4) { + std::vector true_column_data(num_data_, 0); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (data_size_t i = 0; i < num_data_; ++i) { + true_column_data[i] = static_cast((column_data[i >> 1] >> ((i & 1) << 2)) & 0xf); + } + uint8_t* cuda_true_column_data = reinterpret_cast(cuda_column_data); + InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), static_cast(num_data_)); + } else if (bit_type == 8) { + uint8_t* cuda_true_column_data = reinterpret_cast(cuda_column_data); + InitCUDAMemoryFromHostMemory(&cuda_true_column_data, column_data, static_cast(num_data_)); + } else if (bit_type == 16) { + uint16_t* cuda_true_column_data = reinterpret_cast(cuda_column_data); + const uint16_t* true_column_data = reinterpret_cast(column_data); + InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data, static_cast(num_data_)); + } else if (bit_type == 32) { + uint32_t* cuda_true_column_data = reinterpret_cast(cuda_column_data); + const uint32_t* true_column_data = reinterpret_cast(column_data); + InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data, static_cast(num_data_)); + } else { + Log::Fatal("Unknow bit type = %d", bit_type); + } + } else { + CHECK(is_sparse); + CHECK(bin_iterator != nullptr); + bin_iterator->Reset(0); + if (bit_type == 8) { + std::vector true_column_data(num_data_, 0); + uint8_t* cuda_true_column_data = reinterpret_cast(cuda_column_data); + } + } + column_bit_type_.emplace_back(bit_type); + cuda_data_by_column_.emplace_back(cuda_column_data); + } + } + //LaunchCopyColWiseDataKernel(); } void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index dbe334ac0797..e4d998abcc33 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -24,11 +24,11 @@ namespace LightGBM { class CUDADataPartition { public: CUDADataPartition(const data_size_t num_data, const int num_features, const int num_leaves, - const int num_threads, const data_size_t* cuda_num_data, const int* cuda_num_leaves, const uint8_t* cuda_data, + const int num_threads, const data_size_t* cuda_num_data, const int* cuda_num_leaves, const int* cuda_num_features, const std::vector& feature_hist_offsets, const Dataset* train_data, hist_t* cuda_hist); - void Init(); + void Init(const Dataset* train_data); void BeforeTrain(const data_size_t* data_indices); @@ -190,7 +190,7 @@ class CUDADataPartition { const double* train_data_score_tmp() const { return 
train_data_score_tmp_; } private: - void CopyColWiseData(); + void CopyColWiseData(const Dataset* train_data); void LaunchCopyColWiseDataKernel(); @@ -287,6 +287,7 @@ class CUDADataPartition { int cur_num_leaves_; std::vector cpu_train_data_score_tmp_; std::vector cpu_split_info_buffer_; + std::vector column_bit_type_; // CUDA streams std::vector cuda_streams_; @@ -337,7 +338,7 @@ class CUDADataPartition { // CUDA memory, held by other object const data_size_t* cuda_num_data_; const int* cuda_num_leaves_; - const uint8_t* cuda_data_; + std::vector cuda_data_by_column_; const int* cuda_num_features_; hist_t* cuda_hist_; }; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 39b76c1fe230..5c16f10acc5e 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -101,6 +101,7 @@ void CUDAHistogramConstructor::InitCUDAData(const Dataset* train_data, TrainingS //CopyFromHostToCUDADevice(cuda_data_, data_.data(), data_.size()); Log::Warning("share_state_->IsSparse() = %d", static_cast(share_state->IsSparseRowwise())); const uint8_t* cpu_data_ptr = share_state->GetRowWiseData(&bit_type, &total_size, &is_sparse_); + Log::Warning("bit_type = %d", bit_type); CHECK_EQ(bit_type, 8); InitCUDAMemoryFromHostMemory(&cuda_data_uint8_t_, cpu_data_ptr, total_size); SynchronizeCUDADevice(); @@ -175,70 +176,53 @@ void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data feature_group_num_feature_offsets.emplace_back(offsets); uint32_t start_hist_offset = 0; - uint32_t column_start_hist_offset = 0; - feature_partition_hist_offsets_.clear(); - feature_partition_feature_index_offsets_.clear(); feature_partition_column_index_offsets_.clear(); column_hist_offsets_.clear(); column_hist_offsets_full_.clear(); - feature_partition_hist_offsets_.emplace_back(0); - feature_partition_feature_index_offsets_.emplace_back(0); feature_partition_column_index_offsets_.emplace_back(0); column_hist_offsets_full_.emplace_back(0); const int num_feature_groups = train_data->num_feature_groups(); int column_index = 0; num_feature_partitions_ = 0; for (int feature_group_index = 0; feature_group_index < num_feature_groups; ++feature_group_index) { - const int group_feature_index_start = feature_group_num_feature_offsets[feature_group_index]; - const int group_feature_index_end = feature_group_num_feature_offsets[feature_group_index + 1]; - const int num_features_in_group = group_feature_index_end - group_feature_index_start; if (!train_data->IsMultiGroup(feature_group_index)) { - const uint32_t group_feature_hist_start = feature_hist_offsets[group_feature_index_start]; - const uint32_t group_feature_hist_end = feature_hist_offsets[group_feature_index_end]; - const uint32_t num_bin_in_dense_group = group_feature_hist_end - group_feature_hist_start; + const uint32_t column_feature_hist_start = column_hist_offsets[column_index]; + const uint32_t column_feature_hist_end = column_hist_offsets[column_index + 1]; + const uint32_t num_bin_in_dense_group = column_feature_hist_end - column_feature_hist_start; if (num_bin_in_dense_group > max_num_bin_per_partition) { Log::Fatal("Too many bins in a dense feature group."); } - const uint32_t cur_hist_num_bin = group_feature_hist_end - start_hist_offset; + const uint32_t cur_hist_num_bin = column_feature_hist_end - start_hist_offset; if (cur_hist_num_bin > max_num_bin_per_partition) { - 
feature_partition_hist_offsets_.emplace_back(group_feature_hist_start); - feature_partition_feature_index_offsets_.emplace_back(group_feature_index_start); feature_partition_column_index_offsets_.emplace_back(column_index); - start_hist_offset = group_feature_hist_start; - column_start_hist_offset = column_hist_offsets[column_index]; - column_hist_offsets_full_.emplace_back(column_start_hist_offset); + start_hist_offset = column_feature_hist_start; + column_hist_offsets_full_.emplace_back(start_hist_offset); ++num_feature_partitions_; } - column_hist_offsets_.emplace_back(column_hist_offsets[column_index] - column_start_hist_offset); + column_hist_offsets_.emplace_back(column_hist_offsets[column_index] - start_hist_offset); if (feature_group_index == num_feature_groups - 1) { - CHECK_EQ(group_feature_index_end, num_features_); - feature_partition_hist_offsets_.emplace_back(group_feature_hist_end); - feature_partition_feature_index_offsets_.emplace_back(group_feature_index_end); feature_partition_column_index_offsets_.emplace_back(column_index + 1); column_hist_offsets_full_.emplace_back(column_hist_offsets.back()); ++num_feature_partitions_; } ++column_index; } else { + const int group_feature_index_start = feature_group_num_feature_offsets[feature_group_index]; + const int num_features_in_group = feature_group_num_feature_offsets[feature_group_index + 1] - group_feature_index_start; for (int sub_feature_index = 0; sub_feature_index < num_features_in_group; ++sub_feature_index) { const int feature_index = group_feature_index_start + sub_feature_index; - const uint32_t feature_hist_start = feature_hist_offsets[feature_index]; - const uint32_t feature_hist_end = feature_hist_offsets[feature_index + 1]; - const uint32_t cur_hist_num_bin = feature_hist_end - start_hist_offset; + const uint32_t column_feature_hist_start = column_hist_offsets[column_index]; + const uint32_t column_feature_hist_end = column_hist_offsets[column_index + 1]; + const uint32_t cur_hist_num_bin = column_feature_hist_end - start_hist_offset; if (cur_hist_num_bin > max_num_bin_per_partition) { - feature_partition_hist_offsets_.emplace_back(feature_hist_start); - feature_partition_feature_index_offsets_.emplace_back(feature_index); feature_partition_column_index_offsets_.emplace_back(column_index); - start_hist_offset = feature_hist_start; - column_start_hist_offset = column_hist_offsets[column_index]; - column_hist_offsets_full_.emplace_back(column_start_hist_offset); + start_hist_offset = column_feature_hist_start; + column_hist_offsets_full_.emplace_back(start_hist_offset); ++num_feature_partitions_; } - column_hist_offsets_.emplace_back(column_hist_offsets[column_index] - column_start_hist_offset); + column_hist_offsets_.emplace_back(column_hist_offsets[column_index] - start_hist_offset); if (feature_group_index == num_feature_groups - 1 && sub_feature_index == num_features_in_group - 1) { CHECK_EQ(feature_index, num_features_ - 1); - feature_partition_hist_offsets_.emplace_back(feature_hist_end); - feature_partition_feature_index_offsets_.emplace_back(feature_index + 1); feature_partition_column_index_offsets_.emplace_back(column_index + 1); column_hist_offsets_full_.emplace_back(column_hist_offsets.back()); ++num_feature_partitions_; @@ -260,19 +244,6 @@ void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data i, column_hist_offsets_[i], i, feature_group_bin_offsets_[i]); } - CHECK_EQ(feature_partition_column_index_offsets_.size(), feature_partition_hist_offsets_.size()); - 
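// [Editorial note, hedged; not part of the original patch] After this revision the partition
// bookkeeping is driven entirely by share_state's column_hist_offsets(): the per-feature arrays
// (feature_partition_hist_offsets_, feature_partition_feature_index_offsets_) and their device
// copies are dropped. What remains is a rebasing per partition, roughly
//
//   column_hist_offsets_[col]      = column_hist_offsets[col] - start_hist_offset;  // partition-local bin 0
//   column_hist_offsets_full_[p]   = start_hist_offset;                             // partition's absolute start
//
// so inside the kernel each column indexes bins relative to its partition's shared-memory histogram,
// and partition_hist_start (taken from column_hist_offsets_full) shifts the final atomicAdd back to
// the right slot in the global leaf histogram.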
CHECK_EQ(feature_partition_column_index_offsets_.size(), feature_partition_feature_index_offsets_.size()); - for (size_t i = 0; i < feature_partition_column_index_offsets_.size(); ++i) { - Log::Warning("%d column %d hist %d feature %d", i, - feature_partition_column_index_offsets_[i], - feature_partition_feature_index_offsets_[i], - feature_partition_hist_offsets_[i]); - } - - InitCUDAMemoryFromHostMemory(&cuda_feature_partition_hist_offsets_, - feature_partition_hist_offsets_.data(), - feature_partition_hist_offsets_.size()); - InitCUDAMemoryFromHostMemory(&cuda_feature_partition_column_index_offsets_, feature_partition_column_index_offsets_.data(), feature_partition_column_index_offsets_.size()); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 095fd23a36a3..21061a95ed0f 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -68,12 +68,12 @@ __global__ void CUDAConstructHistogramKernel( const data_size_t* data_indices_ref = *data_indices_ptr; __shared__ float shared_hist[SHRAE_HIST_SIZE]; const unsigned int num_threads_per_block = blockDim.x * blockDim.y; - const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; - const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; - const uint32_t num_bins_in_partition = partition_hist_end - partition_hist_start; const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1]; const int num_columns_in_partition = partition_column_end - partition_column_start; + const uint32_t partition_hist_start = column_hist_offsets_full[partition_column_start + blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[partition_column_start + blockIdx.x + 1]; + const uint32_t num_bins_in_partition = partition_hist_end - partition_hist_start; const uint32_t num_items_per_thread = (2 * num_bins_in_partition + num_threads_per_block - 1) / num_threads_per_block; const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; const uint32_t thread_start = thread_idx * num_items_per_thread; @@ -93,19 +93,20 @@ __global__ void CUDAConstructHistogramKernel( const data_size_t num_iteration_this = remainder == 0 ? 
num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); data_size_t inner_data_index = static_cast(threadIdx_y); float* shared_hist_ptr = shared_hist + (column_hist_offsets[threadIdx.x] << 1); - //if (threadIdx.x < static_cast(num_columns_in_partition)) { + const int column_index = static_cast(threadIdx.x) + partition_column_start; + if (threadIdx.x < static_cast(num_columns_in_partition)) { for (data_size_t i = 0; i < num_iteration_this; ++i) { const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; const score_t grad = cuda_gradients[data_index]; const score_t hess = cuda_hessians[data_index]; - const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + threadIdx.x/* + partition_column_start*/]); + const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + column_index]); const uint32_t pos = bin << 1; float* pos_ptr = shared_hist_ptr + pos; atomicAdd_system(pos_ptr, grad); atomicAdd_system(pos_ptr + 1, hess); inner_data_index += blockDim.y; } - //} + } __syncthreads(); hist_t* feature_histogram_ptr = (*feature_histogram) + (partition_hist_start << 1); for (uint32_t i = thread_start; i < thread_end; ++i) { diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index f2c6edb46e6f..58d17fa44446 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -120,8 +120,6 @@ class CUDAHistogramConstructor { std::vector feature_most_freq_bins_; const int min_data_in_leaf_; const double min_sum_hessian_in_leaf_; - std::vector feature_partition_hist_offsets_; - std::vector feature_partition_feature_index_offsets_; std::vector feature_partition_column_index_offsets_; std::vector column_hist_offsets_; std::vector column_hist_offsets_full_; @@ -144,7 +142,6 @@ class CUDAHistogramConstructor { int* cuda_num_features_; score_t* cuda_ordered_gradients_; score_t* cuda_ordered_hessians_; - uint32_t* cuda_feature_partition_hist_offsets_; int* cuda_feature_partition_column_index_offsets_; uint32_t* cuda_column_hist_offsets_; uint32_t* cuda_column_hist_offsets_full_; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 26acf417328e..77dd49b49c5f 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -26,36 +26,37 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia cuda_centralized_info_.reset(new CUDACentralizedInfo(num_data_, this->config_->num_leaves, num_features_)); cuda_centralized_info_->Init(labels); //cuda_centralized_info_->Test(); - + PrintLastCUDAError(); cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_, 0, cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), cuda_centralized_info_->cuda_num_data())); cuda_smaller_leaf_splits_->Init(); cuda_larger_leaf_splits_.reset(new CUDALeafSplits(num_data_, -1, cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), cuda_centralized_info_->cuda_num_data())); cuda_larger_leaf_splits_->Init(); - + PrintLastCUDAError(); cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, this->config_->num_leaves, num_threads_, cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), share_state_->feature_hist_offsets(), config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf)); 
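// [Editorial note, hedged; not part of the original patch] The PrintLastCUDAError() calls inserted
// between the Init() steps below act as a bisection aid: whichever call site first reports a
// non-success status localizes the failing CUDA allocation or copy. A minimal sketch of the usual
// pattern with the raw CUDA runtime API (assuming nothing about this repo's wrappers):
//
//   cudaError_t err = cudaGetLastError();
//   if (err != cudaSuccess) {
//     fprintf(stderr, "CUDA error after step X: %s\n", cudaGetErrorString(err));
//   }
//
// For asynchronous kernel launches a cudaDeviceSynchronize() before the check is needed so the
// error is attributed to the right step rather than to a later call.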
cuda_histogram_constructor_->Init(train_data_, share_state_.get()); //cuda_histogram_constructor_->TestAfterInit(); - + PrintLastCUDAError(); cuda_data_partition_.reset(new CUDADataPartition(num_data_, num_features_, this->config_->num_leaves, num_threads_, cuda_centralized_info_->cuda_num_data(), cuda_centralized_info_->cuda_num_leaves(), cuda_histogram_constructor_->cuda_data(), cuda_centralized_info_->cuda_num_features(), share_state_->feature_hist_offsets(), train_data_, cuda_histogram_constructor_->cuda_hist_pointer())); - cuda_data_partition_->Init(); - + cuda_data_partition_->Init(train_data_.get()); + PrintLastCUDAError(); cuda_best_split_finder_.reset(new CUDABestSplitFinder(cuda_histogram_constructor_->cuda_hist(), train_data_, this->share_state_->feature_hist_offsets(), this->config_->num_leaves, this->config_->lambda_l1, this->config_->lambda_l2, this->config_->min_data_in_leaf, this->config_->min_sum_hessian_in_leaf, this->config_->min_gain_to_split, cuda_centralized_info_->cuda_num_features())); + PrintLastCUDAError(); cuda_best_split_finder_->Init(); - + PrintLastCUDAError(); cuda_score_updater_.reset(new CUDAScoreUpdater(num_data_)); cuda_score_updater_->Init(); - + PrintLastCUDAError(); cuda_binary_objective_.reset(new CUDABinaryObjective(num_data_, cuda_centralized_info_->cuda_labels(), config_->sigmoid)); cuda_binary_objective_->Init(); From dec750100437d713c96915cb931b813716fe206d Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 8 Jun 2021 04:51:46 +0000 Subject: [PATCH 023/166] copy data from host to CPU by column for data partition --- include/LightGBM/bin.h | 2 +- include/LightGBM/dataset.h | 3 +- include/LightGBM/feature_group.h | 7 +- include/LightGBM/train_share_states.h | 5 - src/io/dataset.cpp | 5 +- src/io/dense_bin.cpp | 59 ++ src/io/dense_bin.hpp | 39 +- src/io/sparse_bin.cpp | 53 ++ src/io/sparse_bin.hpp | 26 +- src/io/train_share_states.cpp | 3 - .../cuda/cuda_best_split_finder.cpp | 22 - .../cuda/cuda_best_split_finder.cu | 4 +- .../cuda/cuda_best_split_finder.hpp | 2 +- src/treelearner/cuda/cuda_data_partition.cpp | 196 +++++-- src/treelearner/cuda/cuda_data_partition.cu | 552 +++++++++++------- src/treelearner/cuda/cuda_data_partition.hpp | 19 +- .../cuda/cuda_histogram_constructor.cpp | 19 - .../cuda/new_cuda_tree_learner.cpp | 23 +- src/treelearner/cuda/new_cuda_utils.hpp | 3 - 19 files changed, 649 insertions(+), 393 deletions(-) create mode 100644 src/io/dense_bin.cpp create mode 100644 src/io/sparse_bin.cpp diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 64d1edb18753..e890a849c128 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -387,7 +387,7 @@ class Bin { */ virtual Bin* Clone() = 0; - virtual const uint8_t* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const = 0; + virtual const uint8_t* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const = 0; }; diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 6c421e096c2f..b9eb9a380f20 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -570,7 +570,8 @@ class Dataset { const int sub_feature_index, uint8_t* bit_type, bool* is_sparse, - BinIterator** bin_iterator) const; + std::vector* bin_iterator, + const int num_threads) const; inline double RealThreshold(int i, uint32_t threshold) const { const int group = feature2group_[i]; diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index 699d6d138b93..955848511efd 
100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -481,13 +481,14 @@ class FeatureGroup { const uint8_t* GetColWiseData(const int sub_feature_index, uint8_t* bit_type, bool* is_sparse, - BinIterator** bin_iterator) const { + std::vector* bin_iterator, + const int num_threads) const { if (sub_feature_index >= 0) { CHECK(is_multi_val_); - return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator); + return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator, num_threads); } else { CHECK(!is_multi_val_); - return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator); + return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator, num_threads); } } diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h index 15fbd7f86d52..f6535b3f326e 100644 --- a/include/LightGBM/train_share_states.h +++ b/include/LightGBM/train_share_states.h @@ -178,9 +178,6 @@ struct TrainingShareStates { const std::vector& column_hist_offsets() { return column_hist_offsets_; } bool IsSparseRowwise() { - if (multi_val_bin_wrapper_ == nullptr) { - Log::Warning("in share states get row wise data, and multi_val_bin_wrapper_ == nullptr"); - } return (multi_val_bin_wrapper_ != nullptr && multi_val_bin_wrapper_->IsSparse()); } @@ -229,10 +226,8 @@ struct TrainingShareStates { const uint8_t* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) { if (multi_val_bin_wrapper_ != nullptr) { - Log::Warning("in share states get row wise data, and multi_val_bin_wrapper_ != nullptr"); return multi_val_bin_wrapper_->GetRowWiseData(bit_type, total_size, is_sparse); } else { - Log::Warning("in share states get row wise data, and multi_val_bin_wrapper_ == nullptr"); *bit_type = 0; *total_size = 0; *is_sparse = false; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 109fb8b804c7..fb47961a8fd7 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1476,8 +1476,9 @@ const uint8_t* Dataset::GetColWiseData( const int sub_feature_index, uint8_t* bit_type, bool* is_sparse, - BinIterator** bin_iterator) const { - return feature_groups_[feature_group_index]->GetColWiseData(sub_feature_index, bit_type, is_sparse, bin_iterator); + std::vector* bin_iterator, + const int num_threads) const { + return feature_groups_[feature_group_index]->GetColWiseData(sub_feature_index, bit_type, is_sparse, bin_iterator, num_threads); } } // namespace LightGBM diff --git a/src/io/dense_bin.cpp b/src/io/dense_bin.cpp new file mode 100644 index 000000000000..7c8cb6247090 --- /dev/null +++ b/src/io/dense_bin.cpp @@ -0,0 +1,59 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#include "dense_bin.hpp" + +namespace LightGBM { + +template <> +const uint8_t* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int /*num_threads*/) const { + *is_sparse = false; + *bit_type = 8; + bin_iterator->clear(); + return data_.data(); +} + +template <> +const uint8_t* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int /*num_threads*/) const { + *is_sparse = false; + *bit_type = 16; + bin_iterator->clear(); + return reinterpret_cast(data_.data()); +} + +template <> +const uint8_t* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int /*num_threads*/) const { + *is_sparse = false; + *bit_type = 32; + bin_iterator->clear(); + return reinterpret_cast(data_.data()); +} + +template <> +const uint8_t* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int /*num_threads*/) const { + *is_sparse = false; + *bit_type = 4; + bin_iterator->clear(); + return data_.data(); +} + +} // namespace LightGBM diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index a6d1d1dbda83..b68c3763ac4a 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -461,7 +461,7 @@ class DenseBin : public Bin { DenseBin* Clone() override; - const uint8_t* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const override; + const uint8_t* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const override; private: data_size_t num_data_; @@ -481,43 +481,6 @@ DenseBin* DenseBin::Clone() { return new DenseBin(*this); } -template -const uint8_t* DenseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { - *is_sparse = false; -} - -template <> -const uint8_t* DenseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { - *is_sparse = false; - *bit_type = 8; - *bin_iterator = nullptr; - return data_.data(); -} - -template <> -const uint8_t* DenseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { - *is_sparse = false; - *bit_type = 16; - *bin_iterator = nullptr; - return reinterpret_cast(data_.data()); -} - -template <> -const uint8_t* DenseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { - *is_sparse = false; - *bit_type = 32; - *bin_iterator = nullptr; - return reinterpret_cast(data_.data()); -} - -template <> -const uint8_t* DenseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { - *is_sparse = false; - *bit_type = 4; - *bin_iterator = nullptr; - return data_.data(); -} - template uint32_t DenseBinIterator::Get(data_size_t idx) { auto ret = bin_data_->data(idx); diff --git a/src/io/sparse_bin.cpp b/src/io/sparse_bin.cpp new file mode 100644 index 000000000000..a22af28e1e04 --- /dev/null +++ b/src/io/sparse_bin.cpp @@ -0,0 +1,53 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#include "sparse_bin.hpp" + +namespace LightGBM { + +template <> +const uint8_t* SparseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int num_threads) const { + *is_sparse = true; + *bit_type = 8; + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + bin_iterator->emplace_back(new SparseBinIterator(this, 0)); + } + return nullptr; +} + +template <> +const uint8_t* SparseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int num_threads) const { + *is_sparse = true; + *bit_type = 16; + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + bin_iterator->emplace_back(new SparseBinIterator(this, 0)); + } + return nullptr; +} + +template <> +const uint8_t* SparseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + std::vector* bin_iterator, + const int num_threads) const { + *is_sparse = true; + *bit_type = 32; + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + bin_iterator->emplace_back(new SparseBinIterator(this, 0)); + } + return nullptr; +} + +} // namespace LightGBM diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index bd285514e4fd..f3c7b093c419 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -620,7 +620,7 @@ class SparseBin : public Bin { } } - const uint8_t* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const override; + const uint8_t* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const override; private: data_size_t num_data_; @@ -638,30 +638,6 @@ SparseBin* SparseBin::Clone() { return new SparseBin(*this); } -template <> -const uint8_t* SparseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { - *is_sparse = true; - *bit_type = 8; - *bin_iterator = new SparseBinIterator(this, 0); - return nullptr; -} - -template <> -const uint8_t* SparseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { - *is_sparse = true; - *bit_type = 8; - *bin_iterator = new SparseBinIterator(this, 0); - return nullptr; -} - -template <> -const uint8_t* SparseBin::GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const { - *is_sparse = true; - *bit_type = 8; - *bin_iterator = new SparseBinIterator(this, 0); - return nullptr; -} - template inline uint32_t SparseBinIterator::RawGet(data_size_t idx) { return InnerRawGet(idx); diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index 73273a10095e..8a3ff5b1e91e 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -383,9 +383,6 @@ void TrainingShareStates::CalcBinOffsets(const std::vector(feature_hist_offsets_.back()); } column_hist_offsets_ = *offsets; - for (size_t i = 0; i < column_hist_offsets_.size(); ++i) { - Log::Warning("column_hist_offsets_[%d] = %d", i, column_hist_offsets_[i]); - } } void TrainingShareStates::SetMultiValBin(MultiValBin* bin, data_size_t num_data, diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index d019c2e8b293..e3fbefb8556c 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -60,32 +60,10 @@ void CUDABestSplitFinder::Init() { AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_output_); AllocateCUDAMemory(static_cast(num_leaves_), 
&cuda_leaf_best_split_found_); - - for (size_t i = 0; i < feature_hist_offsets_.size(); ++i) { - Log::Warning("feature_hist_offsets_[%d] = %d", i, feature_hist_offsets_[i]); - } - - Log::Warning("before allocate feature hist offsets"); - Log::Warning("before allocate feature_hist_offsets_.size() = %d", feature_hist_offsets_.size()); - std::vector non_sense(1000); - uint32_t* cuda_non_sense = nullptr; - Log::Warning("step 0"); - AllocateCUDAMemory(non_sense.size(), &cuda_non_sense); - Log::Warning("step 1"); - CopyFromHostToCUDADevice(cuda_non_sense, non_sense.data(), non_sense.size()); - Log::Warning("step 2"); AllocateCUDAMemory(feature_hist_offsets_.size() * 2, &cuda_feature_hist_offsets_); - Log::Warning("step 3"); - PrintLastCUDAError(); - Log::Warning("before copy feature hist offsets"); - Log::Warning("after allocate feature_hist_offsets_.size() = %d", feature_hist_offsets_.size()); CopyFromHostToCUDADevice(cuda_feature_hist_offsets_, feature_hist_offsets_.data(), feature_hist_offsets_.size()); - PrintLastCUDAError(); - //InitCUDAMemoryFromHostMemory(&cuda_feature_hist_offsets_, feature_hist_offsets_.data(), feature_hist_offsets_.size()); - Log::Warning("after copy feature hist offsets"); AllocateCUDAMemory(feature_mfb_offsets_.size(), &cuda_feature_mfb_offsets_); - PrintLastCUDAError(); CopyFromHostToCUDADevice(cuda_feature_mfb_offsets_, feature_mfb_offsets_.data(), feature_mfb_offsets_.size()); AllocateCUDAMemory(feature_default_bins_.size(), &cuda_feature_default_bins_); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index ea9a593411bd..83f96576d2c4 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -641,7 +641,7 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i } __syncthreads(); - ReduceBestSplit(best_found, best_gain, shared_read_index, num_tasks_aligned, 0); + ReduceBestSplit(best_found, best_gain, shared_read_index, NUM_TASKS_PER_SYNC_BLOCK, 0); if (threadIdx.x == 0) { const int leaf_index_ref = is_smaller ? 
smaller_leaf_index : larger_leaf_index; @@ -670,6 +670,7 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i } } +// TODO(shiyu1994): synchronize best from different blocks void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( const int cpu_smaller_leaf_index, const int cpu_larger_leaf_index, @@ -684,6 +685,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( num_tasks >>= 1; } const int num_blocks_per_leaf = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; + //Log::Warning("num_blocks_per_leaf = %d", num_blocks_per_leaf); if (cpu_larger_leaf_index >= 0 && is_smaller_leaf_valid && is_larger_leaf_valid) { SyncBestSplitForLeafKernel<<<2 * num_blocks_per_leaf, NUM_TASKS_PER_SYNC_BLOCK>>>( cpu_smaller_leaf_index, diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 93e96ae91b78..80c5c5390b39 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -21,7 +21,7 @@ #define NUM_THREADS_FIND_BEST_LEAF (256) #define LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER (4) #define NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER (16) -#define NUM_TASKS_PER_SYNC_BLOCK (256) +#define NUM_TASKS_PER_SYNC_BLOCK (1024) namespace LightGBM { diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 3b68ad446bf0..69640345282f 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -114,8 +114,6 @@ void CUDADataPartition::Init(const Dataset* train_data) { AllocateCUDAMemory(static_cast(num_data_), &train_data_score_tmp_); - AllocateCUDAMemory(static_cast(num_data_) * static_cast(num_features_), &cuda_data_col_wise_); - CopyColWiseData(train_data); cpu_train_data_score_tmp_.resize(num_data_, 0.0f); @@ -135,13 +133,32 @@ void CUDADataPartition::Init(const Dataset* train_data) { void CUDADataPartition::CopyColWiseData(const Dataset* train_data) { const int num_feature_group = train_data->num_feature_groups(); + int column_index = 0; + std::vector> features_in_group(num_feature_group); + for (int feature_index = 0; feature_index < train_data->num_features(); ++feature_index) { + const int feature_group_index = train_data->Feature2Group(feature_index); + features_in_group[feature_group_index].emplace_back(feature_index); + } + + feature_index_to_column_index_.resize(num_features_, -1); for (int feature_group_index = 0; feature_group_index < num_feature_group; ++feature_group_index) { + if (!train_data->IsMultiGroup(feature_group_index)) { + for (const int feature_index : features_in_group[feature_group_index]) { + feature_index_to_column_index_[feature_index] = column_index; + } + ++column_index; + } else { + for (const int feature_index : features_in_group[feature_group_index]) { + feature_index_to_column_index_[feature_index] = column_index; + ++column_index; + } + } + if (!train_data->IsMultiGroup(feature_group_index)) { uint8_t bit_type = 0; bool is_sparse = false; - BinIterator* bin_iterator = nullptr; - const uint8_t* column_data = train_data->GetColWiseData(feature_group_index, -1, &bit_type, &is_sparse, &bin_iterator); - void* cuda_column_data = nullptr; + std::vector bin_iterator; + const uint8_t* column_data = train_data->GetColWiseData(feature_group_index, -1, &bit_type, &is_sparse, &bin_iterator, num_threads_); if (column_data != nullptr) { CHECK(!is_sparse); if (bit_type == 4) { @@ -150,33 +167,153 @@ void 
CUDADataPartition::CopyColWiseData(const Dataset* train_data) { for (data_size_t i = 0; i < num_data_; ++i) { true_column_data[i] = static_cast((column_data[i >> 1] >> ((i & 1) << 2)) & 0xf); } - uint8_t* cuda_true_column_data = reinterpret_cast(cuda_column_data); + bit_type = 8; + uint8_t* cuda_true_column_data = nullptr; InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), static_cast(num_data_)); + cuda_data_by_column_.emplace_back(cuda_true_column_data); } else if (bit_type == 8) { - uint8_t* cuda_true_column_data = reinterpret_cast(cuda_column_data); + uint8_t* cuda_true_column_data = nullptr; InitCUDAMemoryFromHostMemory(&cuda_true_column_data, column_data, static_cast(num_data_)); + cuda_data_by_column_.emplace_back(cuda_true_column_data); } else if (bit_type == 16) { - uint16_t* cuda_true_column_data = reinterpret_cast(cuda_column_data); + uint16_t* cuda_true_column_data = nullptr; const uint16_t* true_column_data = reinterpret_cast(column_data); InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data, static_cast(num_data_)); + cuda_data_by_column_.emplace_back(cuda_true_column_data); } else if (bit_type == 32) { - uint32_t* cuda_true_column_data = reinterpret_cast(cuda_column_data); + uint32_t* cuda_true_column_data = nullptr; const uint32_t* true_column_data = reinterpret_cast(column_data); InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data, static_cast(num_data_)); + cuda_data_by_column_.emplace_back(cuda_true_column_data); } else { Log::Fatal("Unknown bit type = %d", bit_type); } } else { CHECK(is_sparse); - CHECK(bin_iterator != nullptr); - bin_iterator->Reset(0); + CHECK_EQ(bin_iterator.size(), static_cast(num_threads_)); if (bit_type == 8) { std::vector true_column_data(num_data_, 0); - uint8_t* cuda_true_column_data = reinterpret_cast(cuda_column_data); + uint8_t* cuda_true_column_data = nullptr; + Threading::For(0, num_data_, 512, + [&bin_iterator, &true_column_data] (const int thread_index, data_size_t start, data_size_t end) { + bin_iterator[thread_index]->Reset(start); + BinIterator* thread_bin_iterator = bin_iterator[thread_index]; + for (data_size_t data_index = start; data_index < end; ++data_index) { + true_column_data[data_index] = static_cast(thread_bin_iterator->RawGet(data_index)); + } + }); + InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), true_column_data.size()); + cuda_data_by_column_.emplace_back(cuda_true_column_data); + } else if (bit_type == 16) { + std::vector true_column_data(num_data_, 0); + uint16_t* cuda_true_column_data = nullptr; + Threading::For(0, num_data_, 512, + [&bin_iterator, &true_column_data] (const int thread_index, data_size_t start, data_size_t end) { + bin_iterator[thread_index]->Reset(start); + BinIterator* thread_bin_iterator = bin_iterator[thread_index]; + for (data_size_t data_index = start; data_index < end; ++data_index) { + true_column_data[data_index] = static_cast(thread_bin_iterator->RawGet(data_index)); + } + }); + InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), true_column_data.size()); + cuda_data_by_column_.emplace_back(cuda_true_column_data); + } else if (bit_type == 32) { + std::vector true_column_data(num_data_, 0); + uint32_t* cuda_true_column_data = nullptr; + Threading::For(0, num_data_, 512, + [&bin_iterator, &true_column_data] (const int thread_index, data_size_t start, data_size_t end) { + bin_iterator[thread_index]->Reset(start); + BinIterator* thread_bin_iterator = bin_iterator[thread_index]; + 
for (data_size_t data_index = start; data_index < end; ++data_index) { + true_column_data[data_index] = thread_bin_iterator->RawGet(data_index); + } + }); + InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), true_column_data.size()); + cuda_data_by_column_.emplace_back(cuda_true_column_data); } } column_bit_type_.emplace_back(bit_type); - cuda_data_by_column_.emplace_back(cuda_column_data); + } else { + for (int sub_feature_index = 0; sub_feature_index < static_cast(features_in_group[feature_group_index].size()); ++sub_feature_index) { + uint8_t bit_type = 0; + bool is_sparse = false; + std::vector bin_iterator; + const uint8_t* column_data = train_data->GetColWiseData(feature_group_index, sub_feature_index, &bit_type, &is_sparse, &bin_iterator, num_threads_); + if (column_data != nullptr) { + CHECK(!is_sparse); + if (bit_type == 4) { + std::vector true_column_data(num_data_, 0); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (data_size_t i = 0; i < num_data_; ++i) { + true_column_data[i] = static_cast((column_data[i >> 1] >> ((i & 1) << 2)) & 0xf); + } + bit_type = 8; + uint8_t* cuda_true_column_data = nullptr; + InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), static_cast(num_data_)); + cuda_data_by_column_.emplace_back(reinterpret_cast(cuda_true_column_data)); + } else if (bit_type == 8) { + uint8_t* cuda_true_column_data = nullptr; + InitCUDAMemoryFromHostMemory(&cuda_true_column_data, column_data, static_cast(num_data_)); + cuda_data_by_column_.emplace_back(reinterpret_cast(cuda_true_column_data)); + } else if (bit_type == 16) { + uint16_t* cuda_true_column_data = nullptr; + const uint16_t* true_column_data = reinterpret_cast(column_data); + InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data, static_cast(num_data_)); + cuda_data_by_column_.emplace_back(reinterpret_cast(cuda_true_column_data)); + } else if (bit_type == 32) { + uint32_t* cuda_true_column_data = nullptr; + const uint32_t* true_column_data = reinterpret_cast(column_data); + InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data, static_cast(num_data_)); + cuda_data_by_column_.emplace_back(reinterpret_cast(cuda_true_column_data)); + } else { + Log::Fatal("Unknown bit type = %d", bit_type); + } + } else { + CHECK(is_sparse); + CHECK_EQ(bin_iterator.size(), static_cast(num_threads_)); + if (bit_type == 8) { + std::vector true_column_data(num_data_, 0); + uint8_t* cuda_true_column_data = nullptr; + Threading::For(0, num_data_, 512, + [&bin_iterator, &true_column_data] (const int thread_index, data_size_t start, data_size_t end) { + bin_iterator[thread_index]->Reset(start); + BinIterator* thread_bin_iterator = bin_iterator[thread_index]; + for (data_size_t data_index = start; data_index < end; ++data_index) { + true_column_data[data_index] = static_cast(thread_bin_iterator->RawGet(data_index)); + } + }); + InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), true_column_data.size()); + cuda_data_by_column_.emplace_back(reinterpret_cast(cuda_true_column_data)); + } else if (bit_type == 16) { + std::vector true_column_data(num_data_, 0); + uint16_t* cuda_true_column_data = nullptr; + Threading::For(0, num_data_, 512, + [&bin_iterator, &true_column_data] (const int thread_index, data_size_t start, data_size_t end) { + bin_iterator[thread_index]->Reset(start); + BinIterator* thread_bin_iterator = bin_iterator[thread_index]; + for (data_size_t data_index = start; data_index < end; 
++data_index) { + true_column_data[data_index] = static_cast(thread_bin_iterator->RawGet(data_index)); + } + }); + InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), true_column_data.size()); + cuda_data_by_column_.emplace_back(reinterpret_cast(cuda_true_column_data)); + } else if (bit_type == 32) { + std::vector true_column_data(num_data_, 0); + uint32_t* cuda_true_column_data = nullptr; + Threading::For(0, num_data_, 512, + [&bin_iterator, &true_column_data] (const int thread_index, data_size_t start, data_size_t end) { + bin_iterator[thread_index]->Reset(start); + BinIterator* thread_bin_iterator = bin_iterator[thread_index]; + for (data_size_t data_index = start; data_index < end; ++data_index) { + true_column_data[data_index] = thread_bin_iterator->RawGet(data_index); + } + }); + InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), true_column_data.size()); + cuda_data_by_column_.emplace_back(reinterpret_cast(cuda_true_column_data)); + } + } + column_bit_type_.emplace_back(bit_type); + } } } //LaunchCopyColWiseDataKernel(); @@ -241,16 +378,15 @@ void CUDADataPartition::Split(const int* leaf_id, const uint8_t split_default_left = cpu_leaf_best_split_default_left[cpu_leaf_index]; const data_size_t leaf_data_start = cpu_leaf_data_start->at(cpu_leaf_index); global_timer.Stop("SplitInner Copy CUDA To Host"); - auto start = std::chrono::steady_clock::now(); - //GenDataToLeftBitVector(leaf_id, cpu_num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left); - GenDataToLeftBitVector2(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start); - auto end = std::chrono::steady_clock::now(); - double duration = (static_cast>(end - start)).count(); + //auto start = std::chrono::steady_clock::now(); + GenDataToLeftBitVector(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start); + //auto end = std::chrono::steady_clock::now(); + //double duration = (static_cast>(end - start)).count(); global_timer.Stop("GenDataToLeftBitVector"); //Log::Warning("CUDADataPartition::GenDataToLeftBitVector time %f", duration); global_timer.Start("SplitInner"); - start = std::chrono::steady_clock::now(); + //start = std::chrono::steady_clock::now(); SplitInner(leaf_id, num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, best_left_sum_gradients, best_left_sum_hessians, best_left_count, @@ -268,24 +404,16 @@ void CUDADataPartition::Split(const int* leaf_id, larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_num_data, cpu_leaf_data_start, cpu_leaf_sum_hessians, smaller_leaf_index, larger_leaf_index); - end = std::chrono::steady_clock::now(); - duration = (static_cast>(end - start)).count(); + //end = std::chrono::steady_clock::now(); + //duration = (static_cast>(end - start)).count(); global_timer.Stop("SplitInner"); //Log::Warning("CUDADataPartition::SplitInner time %f", duration); } -void CUDADataPartition::GenDataToLeftBitVector(const int* leaf_id, - const data_size_t num_data_in_leaf, - const int* best_split_feature, - const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left) { - LaunchGenDataToLeftBitVectorKernel(leaf_id, num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left); -} - -void CUDADataPartition::GenDataToLeftBitVector2(const data_size_t num_data_in_leaf, +void CUDADataPartition::GenDataToLeftBitVector(const 
data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, const uint8_t split_default_left, const data_size_t leaf_data_start) { - LaunchGenDataToLeftBitVectorKernel2(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start); + LaunchGenDataToLeftBitVectorKernel(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start); } void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_data_in_leaf, @@ -331,12 +459,8 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ Tree* CUDADataPartition::GetCPUTree() {} -void CUDADataPartition::UpdateTrainScore(const double learning_rate, double* train_score, double* cuda_scores) { +void CUDADataPartition::UpdateTrainScore(const double learning_rate, double* cuda_scores) { LaunchAddPredictionToScoreKernel(learning_rate, cuda_scores); - /*#pragma omp parallel for schedule(static) num_threads(num_threads_) - for (data_size_t i = 0; i < num_data_; ++i) { - train_score[i] += cpu_train_data_score_tmp_[i]; - }*/ } void CUDADataPartition::CUDACheck( diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 9afbabfac143..e383f1302bc2 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -146,9 +146,10 @@ __device__ void PrepareOffset(const data_size_t num_data_in_leaf_ref, const uint } // missing_is_zero = 0, missing_is_na = 0, min_bin_ref < max_bin_ref +template __global__ void GenDataToLeftBitVectorKernel0_1_2_3(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t /*split_missing_default_to_left*/, @@ -161,7 +162,7 @@ __global__ void GenDataToLeftBitVectorKernel0_1_2_3(const int best_split_feature const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(cuda_data[global_data_index]); + const uint32_t bin = static_cast(column_data[global_data_index]); if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; @@ -181,9 +182,10 @@ __global__ void GenDataToLeftBitVectorKernel0_1_2_3(const int best_split_feature } // missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 0, min_bin_ref < max_bin_ref +template __global__ void GenDataToLeftBitVectorKernel4(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t 
split_missing_default_to_left, @@ -196,7 +198,7 @@ __global__ void GenDataToLeftBitVectorKernel4(const int best_split_feature_ref, const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(cuda_data[global_data_index]); + const uint32_t bin = static_cast(column_data[global_data_index]); if (bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; @@ -219,9 +221,10 @@ __global__ void GenDataToLeftBitVectorKernel4(const int best_split_feature_ref, } // missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 1, min_bin_ref < max_bin_ref +template __global__ void GenDataToLeftBitVectorKernel5(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, @@ -234,7 +237,7 @@ __global__ void GenDataToLeftBitVectorKernel5(const int best_split_feature_ref, const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(cuda_data[global_data_index]); + const uint32_t bin = static_cast(column_data[global_data_index]); if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; @@ -254,9 +257,10 @@ __global__ void GenDataToLeftBitVectorKernel5(const int best_split_feature_ref, } // missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 0, min_bin_ref < max_bin_ref +template __global__ void GenDataToLeftBitVectorKernel6(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, @@ -269,7 +273,7 @@ __global__ void GenDataToLeftBitVectorKernel6(const int best_split_feature_ref, const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(cuda_data[global_data_index]); + const uint32_t bin = static_cast(column_data[global_data_index]); if (bin == max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; @@ -292,9 +296,10 @@ __global__ void 
GenDataToLeftBitVectorKernel6(const int best_split_feature_ref, } // missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 1, min_bin_ref < max_bin_ref +template __global__ void GenDataToLeftBitVectorKernel7(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, @@ -307,7 +312,7 @@ __global__ void GenDataToLeftBitVectorKernel7(const int best_split_feature_ref, const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(cuda_data[global_data_index]); + const uint32_t bin = static_cast(column_data[global_data_index]); if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; @@ -327,9 +332,10 @@ __global__ void GenDataToLeftBitVectorKernel7(const int best_split_feature_ref, } // missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 0, mfb_is_na = 0, min_bin_ref < max_bin_ref +template __global__ void GenDataToLeftBitVectorKernel8(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, @@ -342,7 +348,7 @@ __global__ void GenDataToLeftBitVectorKernel8(const int best_split_feature_ref, const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(cuda_data[global_data_index]); + const uint32_t bin = static_cast(column_data[global_data_index]); if (bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; } else if (bin < min_bin_ref || bin > max_bin_ref) { @@ -364,9 +370,10 @@ __global__ void GenDataToLeftBitVectorKernel8(const int best_split_feature_ref, } // missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 0, mfb_is_na = 1, min_bin_ref < max_bin_ref +template __global__ void GenDataToLeftBitVectorKernel9(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, @@ -379,7 +386,7 
@@ __global__ void GenDataToLeftBitVectorKernel9(const int best_split_feature_ref, const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(cuda_data[global_data_index]); + const uint32_t bin = static_cast(column_data[global_data_index]); if (bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; @@ -402,9 +409,10 @@ __global__ void GenDataToLeftBitVectorKernel9(const int best_split_feature_ref, } // missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 1, mfb_is_na = 0, min_bin_ref < max_bin_ref +template __global__ void GenDataToLeftBitVectorKernel10(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, @@ -417,7 +425,7 @@ __global__ void GenDataToLeftBitVectorKernel10(const int best_split_feature_ref, const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(cuda_data[global_data_index]); + const uint32_t bin = static_cast(column_data[global_data_index]); if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; @@ -437,9 +445,10 @@ __global__ void GenDataToLeftBitVectorKernel10(const int best_split_feature_ref, } // missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 1, mfb_is_na = 1, min_bin_ref < max_bin_ref +template __global__ void GenDataToLeftBitVectorKernel11(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, @@ -452,7 +461,7 @@ __global__ void GenDataToLeftBitVectorKernel11(const int best_split_feature_ref, const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(cuda_data[global_data_index]); + const uint32_t bin = static_cast(column_data[global_data_index]); if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; @@ -472,9 +481,10 @@ __global__ void GenDataToLeftBitVectorKernel11(const int 
best_split_feature_ref, } // missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 0, min_bin_ref < max_bin_ref +template __global__ void GenDataToLeftBitVectorKernel12(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, @@ -487,7 +497,7 @@ __global__ void GenDataToLeftBitVectorKernel12(const int best_split_feature_ref, const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(cuda_data[global_data_index]); + const uint32_t bin = static_cast(column_data[global_data_index]); if (bin == t_zero_bin || bin == max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; @@ -510,9 +520,10 @@ __global__ void GenDataToLeftBitVectorKernel12(const int best_split_feature_ref, } // missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 1, min_bin_ref < max_bin_ref +template __global__ void GenDataToLeftBitVectorKernel13(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, @@ -525,7 +536,7 @@ __global__ void GenDataToLeftBitVectorKernel13(const int best_split_feature_ref, const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(cuda_data[global_data_index]); + const uint32_t bin = static_cast(column_data[global_data_index]); if (bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; @@ -548,9 +559,10 @@ __global__ void GenDataToLeftBitVectorKernel13(const int best_split_feature_ref, } // missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 0, min_bin_ref < max_bin_ref +template __global__ void GenDataToLeftBitVectorKernel14(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, @@ 
-563,7 +575,7 @@ __global__ void GenDataToLeftBitVectorKernel14(const int best_split_feature_ref, const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(cuda_data[global_data_index]); + const uint32_t bin = static_cast(column_data[global_data_index]); if (bin == max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; @@ -586,9 +598,10 @@ __global__ void GenDataToLeftBitVectorKernel14(const int best_split_feature_ref, } // missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 1, min_bin_ref < max_bin_ref +template __global__ void GenDataToLeftBitVectorKernel15(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const uint8_t* cuda_data, + const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, @@ -601,7 +614,7 @@ __global__ void GenDataToLeftBitVectorKernel15(const int best_split_feature_ref, const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(cuda_data[global_data_index]); + const uint32_t bin = static_cast(column_data[global_data_index]); if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; @@ -620,88 +633,65 @@ __global__ void GenDataToLeftBitVectorKernel15(const int best_split_feature_ref, split_indices_block_size_data_partition, thread_to_left_offset_cnt); } -__global__ void GenDataToLeftBitVectorKernel(const int* leaf_index, const data_size_t* cuda_leaf_data_start, - const data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, const int* best_split_feature, - const uint32_t* best_split_threshold, const int* cuda_num_features, const uint8_t* cuda_data, - const uint32_t* default_bin, const uint32_t* most_freq_bin, const uint8_t* default_left, - const uint32_t* min_bin, const uint32_t* max_bin, const uint8_t* missing_is_zero, const uint8_t* missing_is_na, - const uint8_t* mfb_is_zero, const uint8_t* mfb_is_na, - uint8_t* cuda_data_to_left) { - const int leaf_index_ref = *leaf_index; - const int best_split_feature_ref = best_split_feature[leaf_index_ref]; - const int num_features_ref = *cuda_num_features; - const uint32_t best_split_threshold_ref = best_split_threshold[leaf_index_ref]; - const uint8_t default_left_ref = default_left[leaf_index_ref]; - const data_size_t leaf_num_data_offset = cuda_leaf_data_start[leaf_index_ref]; - const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + leaf_num_data_offset; +// min_bin_ref == max_bin_ref +template +__global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, + 
const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, + uint8_t* cuda_data_to_left, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, + const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const unsigned int global_feature_value_index = global_data_index * num_features_ref + best_split_feature_ref; - const uint32_t default_bin_ref = default_bin[best_split_feature_ref]; - const uint32_t most_freq_bin_ref = most_freq_bin[best_split_feature_ref]; - const uint32_t max_bin_ref = max_bin[best_split_feature_ref]; - const uint32_t min_bin_ref = min_bin[best_split_feature_ref]; - const uint8_t missing_is_zero_ref = missing_is_zero[best_split_feature_ref]; - const uint8_t missing_is_na_ref = missing_is_na[best_split_feature_ref]; - const uint8_t mfb_is_zero_ref = mfb_is_zero[best_split_feature_ref]; - const uint8_t mfb_is_na_ref = mfb_is_na[best_split_feature_ref]; - uint32_t th = best_split_threshold_ref + min_bin_ref; - uint32_t t_zero_bin = min_bin_ref + default_bin_ref; - if (most_freq_bin_ref == 0) { - --th; - --t_zero_bin; - } - uint8_t split_default_to_left = 0; - uint8_t split_missing_default_to_left = 0; - if (most_freq_bin_ref <= best_split_threshold_ref) { - split_default_to_left = 1; - } - if (missing_is_zero_ref || missing_is_na_ref) { - if (default_left_ref) { - split_missing_default_to_left = 1; + const uint32_t bin = static_cast(column_data[global_data_index]); + if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + } else if (bin != max_bin_ref) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; } - } - if (local_data_index < static_cast(num_data_in_leaf)) { - const uint32_t bin = static_cast(cuda_data[global_feature_value_index]); - if (min_bin_ref < max_bin_ref) { - if ((missing_is_zero_ref && !mfb_is_zero_ref && bin == t_zero_bin) || - (missing_is_na_ref && !mfb_is_na_ref && bin == max_bin_ref)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - } else if (bin < min_bin_ref || bin > max_bin_ref) { - if ((missing_is_na_ref && mfb_is_na_ref) || (missing_is_zero_ref && mfb_is_zero_ref)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - } - } 
else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - } else { - cuda_data_to_left[local_data_index] = 1; - } + } else { + if (MISSING_IS_NA && !MFB_IS_NA) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; } else { - if (missing_is_zero_ref && !mfb_is_zero_ref && bin == t_zero_bin) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - } else if (bin != max_bin_ref) { - if ((missing_is_na_ref && mfb_is_na_ref) || (missing_is_zero_ref && mfb_is_zero_ref)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - } + if (MAX_TO_LEFT) { + cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; } else { - if (missing_is_na_ref && !mfb_is_na_ref) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - } + cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } } } + } else { + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } + __syncthreads(); + PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, + split_indices_block_size_data_partition, thread_to_left_offset_cnt); } -void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t num_data_in_leaf, +#define GenBitVector_ARGS \ + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, \ + th, num_features_, \ + column_data, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, \ + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, \ + split_indices_block_size_data_partition_aligned + +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, const uint8_t split_default_left, const data_size_t leaf_data_start) { const int min_num_blocks = num_data_in_leaf <= 100 ? 
1 : 80; @@ -738,129 +728,287 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t nu split_missing_default_to_left = 1; } } - const uint8_t* cuda_data_col_wise_ptr = cuda_data_col_wise_ + split_feature_index * num_data_; + const int column_index = feature_index_to_column_index_[split_feature_index]; + const uint8_t bit_type = column_bit_type_[column_index]; if (min_bin < max_bin) { if (!missing_is_zero && !missing_is_na) { - GenDataToLeftBitVectorKernel0_1_2_3<<>>( - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, - th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ - cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - split_indices_block_size_data_partition_aligned); + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_1_2_3<<>>(GenBitVector_ARGS); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_1_2_3<<>>(GenBitVector_ARGS); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_1_2_3<<>>(GenBitVector_ARGS); + } else { + Log::Fatal("Unknown bit type %d", bit_type); + } } else { if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - GenDataToLeftBitVectorKernel4<<>>( - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, - th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ - cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - split_indices_block_size_data_partition_aligned); + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel4<<>>(GenBitVector_ARGS); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel4<<>>(GenBitVector_ARGS); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel4<<>>(GenBitVector_ARGS); + } else { + Log::Fatal("Unknown bit type %d", bit_type); + } } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - GenDataToLeftBitVectorKernel5<<>>( - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, - th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ - cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - split_indices_block_size_data_partition_aligned); + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel5<<>>(GenBitVector_ARGS); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel5<<>>(GenBitVector_ARGS); + } else if (bit_type == 32) { + const 
uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel5<<>>(GenBitVector_ARGS); + } else { + Log::Fatal("Unknown bit type %d", bit_type); + } } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - GenDataToLeftBitVectorKernel6<<>>( - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, - th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ - cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - split_indices_block_size_data_partition_aligned); + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel6<<>>(GenBitVector_ARGS); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel6<<>>(GenBitVector_ARGS); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel6<<>>(GenBitVector_ARGS); + } else { + Log::Fatal("Unknown bit type %d", bit_type); + } } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - GenDataToLeftBitVectorKernel7<<>>( - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, - th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ - cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - split_indices_block_size_data_partition_aligned); + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel7<<>>(GenBitVector_ARGS); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel7<<>>(GenBitVector_ARGS); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel7<<>>(GenBitVector_ARGS); + } else { + Log::Fatal("Unknown bit type %d", bit_type); + } } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { - GenDataToLeftBitVectorKernel8<<>>( - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, - th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ - cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - split_indices_block_size_data_partition_aligned); + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel8<<>>(GenBitVector_ARGS); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel8<<>>(GenBitVector_ARGS); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel8<<>>(GenBitVector_ARGS); + } else { + Log::Fatal("Unknown bit type %d", bit_type); + } } else if 
(missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - GenDataToLeftBitVectorKernel9<<>>( - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, - th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ - cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - split_indices_block_size_data_partition_aligned); + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel9<<>>(GenBitVector_ARGS); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel9<<>>(GenBitVector_ARGS); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel9<<>>(GenBitVector_ARGS); + } else { + Log::Fatal("Unknown bit type %d", bit_type); + } } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - GenDataToLeftBitVectorKernel10<<>>( - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, - th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ - cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - split_indices_block_size_data_partition_aligned); + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel10<<>>(GenBitVector_ARGS); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel10<<>>(GenBitVector_ARGS); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel10<<>>(GenBitVector_ARGS); + } else { + Log::Fatal("Unknown bit type %d", bit_type); + } } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - GenDataToLeftBitVectorKernel11<<>>( - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, - th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ - cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - split_indices_block_size_data_partition_aligned); + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel11<<>>(GenBitVector_ARGS); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel11<<>>(GenBitVector_ARGS); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel11<<>>(GenBitVector_ARGS); + } else { + Log::Fatal("Unknown bit type %d", bit_type); + } } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - GenDataToLeftBitVectorKernel12<<>>( - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, - th, num_features_, 
/* TODO(shiyu1994): the case when num_features != num_groups*/ - cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - split_indices_block_size_data_partition_aligned); + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel12<<>>(GenBitVector_ARGS); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel12<<>>(GenBitVector_ARGS); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel12<<>>(GenBitVector_ARGS); + } else { + Log::Fatal("Unknown bit type %d", bit_type); + } } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - GenDataToLeftBitVectorKernel13<<>>( - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, - th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ - cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - split_indices_block_size_data_partition_aligned); + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel13<<>>(GenBitVector_ARGS); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel13<<>>(GenBitVector_ARGS); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel13<<>>(GenBitVector_ARGS); + } else { + Log::Fatal("Unknown bit type %d", bit_type); + } } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - GenDataToLeftBitVectorKernel14<<>>( - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, - th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ - cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - split_indices_block_size_data_partition_aligned); + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel14<<>>(GenBitVector_ARGS); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel14<<>>(GenBitVector_ARGS); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel14<<>>(GenBitVector_ARGS); + } else { + Log::Fatal("Unknown bit type %d", bit_type); + } } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - GenDataToLeftBitVectorKernel15<<>>( - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, - th, num_features_, /* TODO(shiyu1994): the case when num_features != num_groups*/ - cuda_data_col_wise_ptr, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, - split_missing_default_to_left, 
cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - split_indices_block_size_data_partition_aligned); + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel15<<>>(GenBitVector_ARGS); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel15<<>>(GenBitVector_ARGS); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel15<<>>(GenBitVector_ARGS); + } else { + Log::Fatal("Unknown bit type %d", bit_type); + } } } } else { - Log::Fatal("Unsupported for max_bin == min_bin"); + const bool max_bin_to_left = (max_bin <= th); + if (bit_type == 8) { + if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && 
!mfb_is_zero && mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); 
+ } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } + } } SynchronizeCUDADevice(); } -void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const int* leaf_index, const data_size_t num_data_in_leaf, const int* best_split_feature, - const uint32_t* best_split_threshold, const uint8_t* best_split_default_left) { - const int num_blocks = std::max(80, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); - int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; - int split_indices_block_size_data_partition_aligned = 1; - while (split_indices_block_size_data_partition > 0) { - split_indices_block_size_data_partition_aligned <<= 1; - split_indices_block_size_data_partition >>= 1; - } - GenDataToLeftBitVectorKernel<<>>( - leaf_index, cuda_leaf_data_start_, cuda_leaf_num_data_, - cuda_data_indices_, best_split_feature, best_split_threshold, - cuda_num_features_, cuda_data_, - cuda_feature_default_bins_, cuda_feature_most_freq_bins_, best_split_default_left, - cuda_feature_min_bins_, cuda_feature_max_bins_, cuda_feature_missing_is_zero_, cuda_feature_missing_is_na_, - cuda_feature_mfb_is_zero_, cuda_feature_mfb_is_na_, - cuda_data_to_left_); - SynchronizeCUDADevice(); -} - __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, @@ -1227,10 +1375,6 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; global_timer.Start("CUDADataPartition::AggregateBlockOffsetKernel"); auto start = std::chrono::steady_clock::now(); - /*int cpu_leaf_index = 0, cpu_cur_num_leaves = 0; - CopyFromCUDADeviceToHost(&cpu_leaf_index, leaf_index, 1); - CopyFromCUDADeviceToHost(&cpu_cur_num_leaves, cuda_cur_num_leaves_, 1); - Log::Warning("cpu_leaf_index 
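// A minimal sketch of the bit-width dispatch used in the hunk above: the per-column bin
// data sits behind a void* plus a bit_type tag, and each launch reinterprets it as the
// matching integer type before calling one kernel template. Everything below is
// illustrative (MarkDataToLeftKernel, LaunchTyped, LaunchMarkDataToLeft are assumed names,
// not the patch's kernels).
#include <cstdint>
#include <cuda_runtime.h>

template <typename BIN_T>
__global__ void MarkDataToLeftKernel(const BIN_T* column_bins, const int* data_indices,
                                     int num_data_in_leaf, uint32_t threshold_bin,
                                     uint8_t* data_to_left) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num_data_in_leaf) {
    const uint32_t bin = static_cast<uint32_t>(column_bins[data_indices[i]]);
    data_to_left[i] = bin <= threshold_bin ? 1 : 0;  // 1 means the row goes to the left child
  }
}

template <typename BIN_T>
static void LaunchTyped(const void* column_data, const int* data_indices, int num_data_in_leaf,
                        uint32_t threshold_bin, uint8_t* data_to_left) {
  const int block_size = 512;
  const int num_blocks = (num_data_in_leaf + block_size - 1) / block_size;
  MarkDataToLeftKernel<BIN_T><<<num_blocks, block_size>>>(
      static_cast<const BIN_T*>(column_data), data_indices, num_data_in_leaf,
      threshold_bin, data_to_left);
}

// One dispatch on the stored element width replaces the repeated casts at every call site.
void LaunchMarkDataToLeft(uint8_t bit_type, const void* column_data, const int* data_indices,
                          int num_data_in_leaf, uint32_t threshold_bin, uint8_t* data_to_left) {
  if (bit_type == 8) {
    LaunchTyped<uint8_t>(column_data, data_indices, num_data_in_leaf, threshold_bin, data_to_left);
  } else if (bit_type == 16) {
    LaunchTyped<uint16_t>(column_data, data_indices, num_data_in_leaf, threshold_bin, data_to_left);
  } else {
    LaunchTyped<uint32_t>(column_data, data_indices, num_data_in_leaf, threshold_bin, data_to_left);
  }
}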
= %d, cpu_cur_num_leaves = %d before aggregate", cpu_leaf_index, cpu_cur_num_leaves);*/ AggregateBlockOffsetKernel<<<1, split_indices_block_size_data_partition_aligned / 2>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, @@ -1260,10 +1404,6 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, data_partition_leaf_output_); SynchronizeCUDADevice(); - /*PrintLastCUDAError(); - CopyFromCUDADeviceToHost(&cpu_leaf_index, leaf_index, 1); - CopyFromCUDADeviceToHost(&cpu_cur_num_leaves, cuda_cur_num_leaves_, 1); - Log::Warning("cpu_leaf_index = %d, cpu_cur_num_leaves = %d after aggregate", cpu_leaf_index, cpu_cur_num_leaves);*/ auto end = std::chrono::steady_clock::now(); auto duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); @@ -1393,9 +1533,9 @@ __global__ void CopyColWiseDataKernel(const uint8_t* row_wise_data, } void CUDADataPartition::LaunchCopyColWiseDataKernel() { - const int block_size = 1024; + /*const int block_size = 1024; const int num_blocks = (num_data_ + block_size - 1) / block_size; - CopyColWiseDataKernel<<>>(cuda_data_, num_data_, num_features_, cuda_data_col_wise_); + CopyColWiseDataKernel<<>>(cuda_data_, num_data_, num_features_, cuda_data_col_wise_);*/ } __global__ void CUDACheckKernel(const data_size_t** data_indices_in_leaf_ptr, diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index e4d998abcc33..0714ead5479d 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -151,7 +151,7 @@ class CUDADataPartition { } } - void UpdateTrainScore(const double learning_rate, double* train_score, double* cuda_scores); + void UpdateTrainScore(const double learning_rate, double* cuda_scores); const data_size_t* cuda_leaf_data_start() const { return cuda_leaf_data_start_; } @@ -194,10 +194,7 @@ class CUDADataPartition { void LaunchCopyColWiseDataKernel(); - void GenDataToLeftBitVector(const int* leaf_id, const data_size_t num_data_in_leaf, const int* best_split_feature, - const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); - - void GenDataToLeftBitVector2(const data_size_t num_data_in_leaf, + void GenDataToLeftBitVector(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, const uint8_t split_default_left, const data_size_t leaf_data_start); @@ -248,10 +245,7 @@ class CUDADataPartition { std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index); - void LaunchGenDataToLeftBitVectorKernel(const int* leaf_index, const data_size_t num_data_in_leaf, const int* best_split_feature, - const uint32_t* best_split_threshold, const uint8_t* best_split_default_left); - - void LaunchGenDataToLeftBitVectorKernel2(const data_size_t num_data_in_leaf, + void LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, const uint8_t split_default_left, const data_size_t leaf_data_start); @@ -287,7 +281,8 @@ class CUDADataPartition { int cur_num_leaves_; std::vector cpu_train_data_score_tmp_; std::vector cpu_split_info_buffer_; - std::vector column_bit_type_; + std::vector column_bit_type_; + std::vector feature_index_to_column_index_; // CUDA 
streams std::vector cuda_streams_; @@ -330,15 +325,15 @@ class CUDADataPartition { double* data_partition_leaf_output_; // for train data update double* train_data_score_tmp_; - uint8_t* cuda_data_col_wise_; // for debug double* cuda_gradients_sum_buffer_; double* cuda_hessians_sum_buffer_; + // for train data split + std::vector cuda_data_by_column_; // CUDA memory, held by other object const data_size_t* cuda_num_data_; const int* cuda_num_leaves_; - std::vector cuda_data_by_column_; const int* cuda_num_features_; hist_t* cuda_hist_; }; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 5c16f10acc5e..a9fadab2ae8f 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -86,22 +86,9 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStat } void CUDAHistogramConstructor::InitCUDAData(const Dataset* train_data, TrainingShareStates* share_state) { - /*std::vector> bin_iterators(num_feature_groups_); - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int group_id = 0; group_id < num_feature_groups_; ++group_id) { - bin_iterators[group_id].reset(train_data->FeatureGroupIterator(group_id)); - bin_iterators[group_id]->Reset(0); - for (data_size_t data_index = 0; data_index < num_data_; ++data_index) { - const uint32_t bin = static_cast(bin_iterators[group_id]->RawGet(data_index)); - PushOneData(bin, group_id, data_index); - } - }*/ uint8_t bit_type = 0; size_t total_size = 0; - //CopyFromHostToCUDADevice(cuda_data_, data_.data(), data_.size()); - Log::Warning("share_state_->IsSparse() = %d", static_cast(share_state->IsSparseRowwise())); const uint8_t* cpu_data_ptr = share_state->GetRowWiseData(&bit_type, &total_size, &is_sparse_); - Log::Warning("bit_type = %d", bit_type); CHECK_EQ(bit_type, 8); InitCUDAMemoryFromHostMemory(&cuda_data_uint8_t_, cpu_data_ptr, total_size); SynchronizeCUDADevice(); @@ -160,7 +147,6 @@ void CUDAHistogramConstructor::CalcConstructHistogramKernelDim( void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state) { const uint32_t max_num_bin_per_partition = SHRAE_HIST_SIZE / 2; - const std::vector& feature_hist_offsets = share_state->feature_hist_offsets(); const std::vector& column_hist_offsets = share_state->column_hist_offsets(); std::vector feature_group_num_feature_offsets; int offsets = 0; @@ -239,11 +225,6 @@ void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data } } - for (size_t i = 0; i < column_hist_offsets_.size(); ++i) { - Log::Warning("column_hist_offsets[%d] = %d, feature_group_bin_offsets_[%d] = %d", - i, column_hist_offsets_[i], i, feature_group_bin_offsets_[i]); - } - InitCUDAMemoryFromHostMemory(&cuda_feature_partition_column_index_offsets_, feature_partition_column_index_offsets_.data(), feature_partition_column_index_offsets_.size()); diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 77dd49b49c5f..7565d833e6ed 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -26,37 +26,30 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia cuda_centralized_info_.reset(new CUDACentralizedInfo(num_data_, this->config_->num_leaves, num_features_)); cuda_centralized_info_->Init(labels); //cuda_centralized_info_->Test(); - 
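// A minimal stand-alone sketch of the copy performed by InitCUDAData above: the host
// row-wise bin buffer is moved to the device and remembered together with its element
// width, which is what the column_bit_type_ / cuda_data_by_column_ pair tracks per column.
// Plain CUDA calls are used here instead of the project's helpers; DeviceColumn and
// CopyBinDataToDevice are illustrative names.
#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>

struct DeviceColumn {
  void* data = nullptr;   // device pointer to the bin values
  uint8_t bit_type = 8;   // element width in bits: 8, 16 or 32
};

static DeviceColumn CopyBinDataToDevice(const void* host_data, size_t total_size_bytes,
                                        uint8_t bit_type) {
  DeviceColumn column;
  column.bit_type = bit_type;
  cudaMalloc(&column.data, total_size_bytes);
  cudaMemcpy(column.data, host_data, total_size_bytes, cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();
  return column;
}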
PrintLastCUDAError(); cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_, 0, cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), cuda_centralized_info_->cuda_num_data())); cuda_smaller_leaf_splits_->Init(); cuda_larger_leaf_splits_.reset(new CUDALeafSplits(num_data_, -1, cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), cuda_centralized_info_->cuda_num_data())); cuda_larger_leaf_splits_->Init(); - PrintLastCUDAError(); cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, this->config_->num_leaves, num_threads_, cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), share_state_->feature_hist_offsets(), config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf)); cuda_histogram_constructor_->Init(train_data_, share_state_.get()); //cuda_histogram_constructor_->TestAfterInit(); - PrintLastCUDAError(); cuda_data_partition_.reset(new CUDADataPartition(num_data_, num_features_, this->config_->num_leaves, num_threads_, cuda_centralized_info_->cuda_num_data(), cuda_centralized_info_->cuda_num_leaves(), - cuda_histogram_constructor_->cuda_data(), cuda_centralized_info_->cuda_num_features(), + cuda_centralized_info_->cuda_num_features(), share_state_->feature_hist_offsets(), train_data_, cuda_histogram_constructor_->cuda_hist_pointer())); - cuda_data_partition_->Init(train_data_.get()); - PrintLastCUDAError(); + cuda_data_partition_->Init(train_data_); cuda_best_split_finder_.reset(new CUDABestSplitFinder(cuda_histogram_constructor_->cuda_hist(), train_data_, this->share_state_->feature_hist_offsets(), this->config_->num_leaves, this->config_->lambda_l1, this->config_->lambda_l2, this->config_->min_data_in_leaf, this->config_->min_sum_hessian_in_leaf, this->config_->min_gain_to_split, cuda_centralized_info_->cuda_num_features())); - PrintLastCUDAError(); cuda_best_split_finder_->Init(); - PrintLastCUDAError(); cuda_score_updater_.reset(new CUDAScoreUpdater(num_data_)); cuda_score_updater_->Init(); - PrintLastCUDAError(); cuda_binary_objective_.reset(new CUDABinaryObjective(num_data_, cuda_centralized_info_->cuda_labels(), config_->sigmoid)); cuda_binary_objective_->Init(); @@ -127,12 +120,12 @@ void NewCUDATreeLearner::FindBestSplitsFromHistograms(const std::vector& void NewCUDATreeLearner::Split(Tree* /*tree*/, int /*best_leaf*/, int* /*left_leaf*/, int* /*right_leaf*/) {} -void NewCUDATreeLearner::AddPredictionToScore(const Tree* /*tree*/, double* out_score) const { - const auto start = std::chrono::steady_clock::now(); - cuda_data_partition_->UpdateTrainScore(config_->learning_rate, out_score, cuda_score_updater_->cuda_score_ref()); - const auto end = std::chrono::steady_clock::now(); - const auto duration = static_cast>(end - start).count(); - Log::Warning("AddPredictionToScore time %f", duration); +void NewCUDATreeLearner::AddPredictionToScore(const Tree* /*tree*/, double* /*out_score*/) const { + //const auto start = std::chrono::steady_clock::now(); + cuda_data_partition_->UpdateTrainScore(config_->learning_rate, cuda_score_updater_->cuda_score_ref()); + //const auto end = std::chrono::steady_clock::now(); + //const auto duration = static_cast>(end - start).count(); + //Log::Warning("AddPredictionToScore time %f", duration); } Tree* NewCUDATreeLearner::BuildTree(const int num_leaves) { diff --git a/src/treelearner/cuda/new_cuda_utils.hpp b/src/treelearner/cuda/new_cuda_utils.hpp index 74236c668bf7..dbbf77d9bf0a 100644 --- a/src/treelearner/cuda/new_cuda_utils.hpp +++ 
b/src/treelearner/cuda/new_cuda_utils.hpp @@ -95,9 +95,6 @@ void SetCUDAMemory(T* dst_ptr, int value, size_t size) { void PrintLastCUDAError(); -//template -//__device__ void PrefixSum(T* elements, unsigned int n); - } // namespace LightGBM #endif // USE_CUDA From 2dccb7fdcd8f25fcbecb710d3ab6fb2e0291bd90 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 8 Jun 2021 09:02:12 +0000 Subject: [PATCH 024/166] add synchronize best splits for one leaf from multiple blocks --- .../cuda/cuda_best_split_finder.cpp | 47 ++--- .../cuda/cuda_best_split_finder.cu | 161 +++++++++++++++--- src/treelearner/cuda/cuda_data_partition.cpp | 3 - src/treelearner/cuda/cuda_data_partition.cu | 22 +-- src/treelearner/cuda/cuda_data_partition.hpp | 5 - .../cuda/cuda_histogram_constructor.cpp | 29 ++-- .../cuda/cuda_histogram_constructor.cu | 9 +- src/treelearner/cuda/cuda_leaf_splits.cu | 10 +- .../cuda/new_cuda_tree_learner.cpp | 10 +- .../cuda/new_cuda_tree_learner.hpp | 29 ---- 10 files changed, 201 insertions(+), 124 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index e3fbefb8556c..787bdaea0ebf 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -44,21 +44,6 @@ CUDABestSplitFinder::CUDABestSplitFinder(const hist_t* cuda_hist, const Dataset* void CUDABestSplitFinder::Init() { AllocateCUDAMemory(1, &cuda_best_leaf_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_feature_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_default_left_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_threshold_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_gain_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_left_sum_gradient_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_left_sum_hessian_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_left_count_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_left_gain_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_left_output_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_sum_gradient_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_sum_hessian_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_count_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_gain_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_right_output_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_best_split_found_); AllocateCUDAMemory(feature_hist_offsets_.size() * 2, &cuda_feature_hist_offsets_); CopyFromHostToCUDADevice(cuda_feature_hist_offsets_, feature_hist_offsets_.data(), feature_hist_offsets_.size()); @@ -123,6 +108,26 @@ void CUDABestSplitFinder::Init() { ++num_tasks_; } } + + const int num_task_blocks = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; + const size_t cuda_best_leaf_split_info_buffer_size = static_cast(num_task_blocks) * static_cast(num_leaves_); + + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_feature_); + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_default_left_); + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_threshold_); + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, 
&cuda_leaf_best_split_gain_); + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_left_sum_gradient_); + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_left_sum_hessian_); + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_left_count_); + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_left_gain_); + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_left_output_); + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_right_sum_gradient_); + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_right_sum_hessian_); + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_right_count_); + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_right_gain_); + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_right_output_); + AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_found_); + InitCUDAMemoryFromHostMemory(&cuda_task_feature_index_, cpu_task_feature_index_.data(), cpu_task_feature_index_.size()); InitCUDAMemoryFromHostMemory(&cuda_task_reverse_, cpu_task_reverse_.data(), cpu_task_reverse_.size()); InitCUDAMemoryFromHostMemory(&cuda_task_skip_default_bin_, cpu_task_skip_default_bin_.data(), cpu_task_skip_default_bin_.size()); @@ -162,7 +167,7 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_le const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf) { - auto start = std::chrono::steady_clock::now(); + //auto start = std::chrono::steady_clock::now(); const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ && sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_); const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ && sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_); LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits, @@ -172,20 +177,20 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_le LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); SynchronizeCUDADevice(); global_timer.Stop("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); - auto end = std::chrono::steady_clock::now(); - double duration = (static_cast>(end - start)).count(); + //auto end = std::chrono::steady_clock::now(); + //double duration = (static_cast>(end - start)).count(); //Log::Warning("FindBestSplitsForLeaf time %f", duration); } void CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int smaller_leaf_index, const int larger_leaf_index, std::vector* leaf_best_split_feature, std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index) { - auto start = std::chrono::steady_clock::now(); + //auto start = std::chrono::steady_clock::now(); LaunchFindBestFromAllSplitsKernel(cuda_cur_num_leaves, smaller_leaf_index, larger_leaf_index, leaf_best_split_feature, leaf_best_split_threshold, leaf_best_split_default_left, best_leaf_index); SynchronizeCUDADevice(); - auto end = std::chrono::steady_clock::now(); - double 
duration = (static_cast>(end - start)).count(); + //auto end = std::chrono::steady_clock::now(); + //double duration = (static_cast>(end - start)).count(); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 83f96576d2c4..ac5f1d4eeb96 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -573,8 +573,8 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( } __device__ void ReduceBestSplit(uint8_t* found, double* gain, uint32_t* shared_read_index, - uint32_t num_features_aligned, uint32_t thread_offset) { - const uint32_t threadIdx_x = threadIdx.x - thread_offset; + uint32_t num_features_aligned) { + const uint32_t threadIdx_x = threadIdx.x; for (unsigned int s = 1; s < num_features_aligned; s <<= 1) { if (threadIdx_x % (2 * s) == 0 && (threadIdx_x + s) < num_features_aligned) { const uint32_t pos_to_compare = threadIdx_x + s; @@ -619,7 +619,8 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i const int num_tasks, const int num_tasks_aligned, const int num_blocks_per_leaf, - const bool larger_only) { + const bool larger_only, + const int num_leaves) { const uint32_t threadIdx_x = threadIdx.x; const uint32_t blockIdx_x = blockIdx.x; @@ -633,39 +634,102 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i const int task_index = static_cast(leaf_block_index * num_blocks_per_leaf + threadIdx_x); const uint32_t read_index = is_smaller ? static_cast(task_index) : static_cast(task_index + num_tasks); if (task_index < num_tasks) { - best_found[task_index] = cuda_best_split_found[read_index]; - best_gain[task_index] = cuda_best_split_gain[read_index]; - shared_read_index[task_index] = read_index; + best_found[threadIdx_x] = cuda_best_split_found[read_index]; + best_gain[threadIdx_x] = cuda_best_split_gain[read_index]; + shared_read_index[threadIdx_x] = read_index; } else { - best_found[task_index] = 0; + best_found[threadIdx_x] = 0; } __syncthreads(); - ReduceBestSplit(best_found, best_gain, shared_read_index, NUM_TASKS_PER_SYNC_BLOCK, 0); - + ReduceBestSplit(best_found, best_gain, shared_read_index, NUM_TASKS_PER_SYNC_BLOCK); if (threadIdx.x == 0) { const int leaf_index_ref = is_smaller ? smaller_leaf_index : larger_leaf_index; + const unsigned buffer_write_pos = static_cast(leaf_index_ref) + leaf_block_index * num_leaves; const uint32_t best_read_index = shared_read_index[0]; if (best_found[0]) { - cuda_leaf_best_split_gain[leaf_index_ref] = best_gain[0]; - cuda_leaf_best_split_feature[leaf_index_ref] = is_smaller ? cuda_task_feature_index[best_read_index] : + cuda_leaf_best_split_gain[buffer_write_pos] = best_gain[0]; + cuda_leaf_best_split_feature[buffer_write_pos] = is_smaller ? 
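// A small index helper, with illustrative names, for the buffer layout used above: the
// leaf split-info arrays are sized num_task_blocks * num_leaves, sync block b writes its
// candidate for leaf l at position l + b * num_leaves, and row 0 is later left holding the
// merged best split of each leaf.
#include <cstddef>

inline size_t BestSplitBufferPos(int leaf_index, int block_index, int num_leaves) {
  return static_cast<size_t>(leaf_index) +
         static_cast<size_t>(block_index) * static_cast<size_t>(num_leaves);
}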
cuda_task_feature_index[best_read_index] : cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; - cuda_leaf_best_split_default_left[leaf_index_ref] = cuda_best_split_default_left[best_read_index]; - cuda_leaf_best_split_threshold[leaf_index_ref] = cuda_best_split_threshold[best_read_index]; - cuda_leaf_best_split_left_sum_gradient[leaf_index_ref] = cuda_best_split_left_sum_gradient[best_read_index]; - cuda_leaf_best_split_left_sum_hessian[leaf_index_ref] = cuda_best_split_left_sum_hessian[best_read_index]; - cuda_leaf_best_split_left_count[leaf_index_ref] = cuda_best_split_left_count[best_read_index]; - cuda_leaf_best_split_left_gain[leaf_index_ref] = cuda_best_split_left_gain[best_read_index]; - cuda_leaf_best_split_left_output[leaf_index_ref] = cuda_best_split_left_output[best_read_index]; - cuda_leaf_best_split_right_sum_gradient[leaf_index_ref] = cuda_best_split_right_sum_gradient[best_read_index]; - cuda_leaf_best_split_right_sum_hessian[leaf_index_ref] = cuda_best_split_right_sum_hessian[best_read_index]; - cuda_leaf_best_split_right_count[leaf_index_ref] = cuda_best_split_right_count[best_read_index]; - cuda_leaf_best_split_right_gain[leaf_index_ref] = cuda_best_split_right_gain[best_read_index]; - cuda_leaf_best_split_right_output[leaf_index_ref] = cuda_best_split_right_output[best_read_index]; - cuda_leaf_best_split_found[leaf_index_ref] = 1; + cuda_leaf_best_split_default_left[buffer_write_pos] = cuda_best_split_default_left[best_read_index]; + cuda_leaf_best_split_threshold[buffer_write_pos] = cuda_best_split_threshold[best_read_index]; + cuda_leaf_best_split_left_sum_gradient[buffer_write_pos] = cuda_best_split_left_sum_gradient[best_read_index]; + cuda_leaf_best_split_left_sum_hessian[buffer_write_pos] = cuda_best_split_left_sum_hessian[best_read_index]; + cuda_leaf_best_split_left_count[buffer_write_pos] = cuda_best_split_left_count[best_read_index]; + cuda_leaf_best_split_left_gain[buffer_write_pos] = cuda_best_split_left_gain[best_read_index]; + cuda_leaf_best_split_left_output[buffer_write_pos] = cuda_best_split_left_output[best_read_index]; + cuda_leaf_best_split_right_sum_gradient[buffer_write_pos] = cuda_best_split_right_sum_gradient[best_read_index]; + cuda_leaf_best_split_right_sum_hessian[buffer_write_pos] = cuda_best_split_right_sum_hessian[best_read_index]; + cuda_leaf_best_split_right_count[buffer_write_pos] = cuda_best_split_right_count[best_read_index]; + cuda_leaf_best_split_right_gain[buffer_write_pos] = cuda_best_split_right_gain[best_read_index]; + cuda_leaf_best_split_right_output[buffer_write_pos] = cuda_best_split_right_output[best_read_index]; + cuda_leaf_best_split_found[buffer_write_pos] = 1; } else { - cuda_leaf_best_split_gain[leaf_index_ref] = K_MIN_SCORE; - cuda_leaf_best_split_found[leaf_index_ref] = 0; + cuda_leaf_best_split_gain[buffer_write_pos] = K_MIN_SCORE; + cuda_leaf_best_split_found[buffer_write_pos] = 0; + } + } +} + +__global__ void SyncBestSplitForLeafKernelAllBlocks( + const int smaller_leaf_index, + const int larger_leaf_index, + const unsigned int num_blocks_per_leaf, + const int num_leaves, + int* cuda_leaf_best_split_feature, uint8_t* cuda_leaf_best_split_default_left, + uint32_t* cuda_leaf_best_split_threshold, double* cuda_leaf_best_split_gain, + double* cuda_leaf_best_split_left_sum_gradient, double* cuda_leaf_best_split_left_sum_hessian, + data_size_t* cuda_leaf_best_split_left_count, double* cuda_leaf_best_split_left_gain, + double* cuda_leaf_best_split_left_output, + double* 
cuda_leaf_best_split_right_sum_gradient, double* cuda_leaf_best_split_right_sum_hessian, + data_size_t* cuda_leaf_best_split_right_count, double* cuda_leaf_best_split_right_gain, + double* cuda_leaf_best_split_right_output, + uint8_t* cuda_leaf_best_split_found, + const bool larger_only) { + if (!larger_only) { + for (unsigned int block_index = 1; block_index < num_blocks_per_leaf; ++block_index) { + const unsigned int leaf_read_pos = static_cast(smaller_leaf_index) + block_index * static_cast(num_leaves); + if ((cuda_leaf_best_split_found[leaf_read_pos] == 1 && cuda_leaf_best_split_found[smaller_leaf_index] == 1 && + cuda_leaf_best_split_gain[leaf_read_pos] > cuda_leaf_best_split_gain[smaller_leaf_index])) { + cuda_leaf_best_split_found[smaller_leaf_index] = cuda_leaf_best_split_found[leaf_read_pos]; + cuda_leaf_best_split_feature[smaller_leaf_index] = cuda_leaf_best_split_feature[leaf_read_pos]; + cuda_leaf_best_split_default_left[smaller_leaf_index] = cuda_leaf_best_split_default_left[leaf_read_pos]; + cuda_leaf_best_split_threshold[smaller_leaf_index] = cuda_leaf_best_split_threshold[leaf_read_pos]; + cuda_leaf_best_split_gain[smaller_leaf_index] = cuda_leaf_best_split_gain[leaf_read_pos]; + cuda_leaf_best_split_left_sum_gradient[smaller_leaf_index] = cuda_leaf_best_split_left_sum_gradient[leaf_read_pos]; + cuda_leaf_best_split_left_sum_hessian[smaller_leaf_index] = cuda_leaf_best_split_left_sum_hessian[leaf_read_pos]; + cuda_leaf_best_split_left_count[smaller_leaf_index] = cuda_leaf_best_split_left_count[leaf_read_pos]; + cuda_leaf_best_split_left_gain[smaller_leaf_index] = cuda_leaf_best_split_left_gain[leaf_read_pos]; + cuda_leaf_best_split_left_output[smaller_leaf_index] = cuda_leaf_best_split_left_output[leaf_read_pos]; + cuda_leaf_best_split_right_sum_gradient[smaller_leaf_index] = cuda_leaf_best_split_right_sum_gradient[leaf_read_pos]; + cuda_leaf_best_split_right_sum_hessian[smaller_leaf_index] = cuda_leaf_best_split_right_sum_hessian[leaf_read_pos]; + cuda_leaf_best_split_right_count[smaller_leaf_index] = cuda_leaf_best_split_right_count[leaf_read_pos]; + cuda_leaf_best_split_right_gain[smaller_leaf_index] = cuda_leaf_best_split_right_gain[leaf_read_pos]; + cuda_leaf_best_split_right_output[smaller_leaf_index] = cuda_leaf_best_split_right_output[leaf_read_pos]; + } + } + } + if (larger_leaf_index >= 0) { + for (unsigned int block_index = 1; block_index < num_blocks_per_leaf; ++block_index) { + const unsigned int leaf_read_pos = static_cast(larger_leaf_index) + block_index * static_cast(num_leaves); + if ((cuda_leaf_best_split_found[leaf_read_pos] == 1 && cuda_leaf_best_split_found[larger_leaf_index] == 1 && + cuda_leaf_best_split_gain[leaf_read_pos] > cuda_leaf_best_split_gain[larger_leaf_index])) { + cuda_leaf_best_split_found[larger_leaf_index] = cuda_leaf_best_split_found[leaf_read_pos]; + cuda_leaf_best_split_feature[larger_leaf_index] = cuda_leaf_best_split_feature[leaf_read_pos]; + cuda_leaf_best_split_default_left[larger_leaf_index] = cuda_leaf_best_split_default_left[leaf_read_pos]; + cuda_leaf_best_split_threshold[larger_leaf_index] = cuda_leaf_best_split_threshold[leaf_read_pos]; + cuda_leaf_best_split_gain[larger_leaf_index] = cuda_leaf_best_split_gain[leaf_read_pos]; + cuda_leaf_best_split_left_sum_gradient[larger_leaf_index] = cuda_leaf_best_split_left_sum_gradient[leaf_read_pos]; + cuda_leaf_best_split_left_sum_hessian[larger_leaf_index] = cuda_leaf_best_split_left_sum_hessian[leaf_read_pos]; + cuda_leaf_best_split_left_count[larger_leaf_index] = 
cuda_leaf_best_split_left_count[leaf_read_pos]; + cuda_leaf_best_split_left_gain[larger_leaf_index] = cuda_leaf_best_split_left_gain[leaf_read_pos]; + cuda_leaf_best_split_left_output[larger_leaf_index] = cuda_leaf_best_split_left_output[leaf_read_pos]; + cuda_leaf_best_split_right_sum_gradient[larger_leaf_index] = cuda_leaf_best_split_right_sum_gradient[leaf_read_pos]; + cuda_leaf_best_split_right_sum_hessian[larger_leaf_index] = cuda_leaf_best_split_right_sum_hessian[leaf_read_pos]; + cuda_leaf_best_split_right_count[larger_leaf_index] = cuda_leaf_best_split_right_count[leaf_read_pos]; + cuda_leaf_best_split_right_gain[larger_leaf_index] = cuda_leaf_best_split_right_gain[leaf_read_pos]; + cuda_leaf_best_split_right_output[larger_leaf_index] = cuda_leaf_best_split_right_output[leaf_read_pos]; + } } } } @@ -685,7 +749,6 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( num_tasks >>= 1; } const int num_blocks_per_leaf = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; - //Log::Warning("num_blocks_per_leaf = %d", num_blocks_per_leaf); if (cpu_larger_leaf_index >= 0 && is_smaller_leaf_valid && is_larger_leaf_valid) { SyncBestSplitForLeafKernel<<<2 * num_blocks_per_leaf, NUM_TASKS_PER_SYNC_BLOCK>>>( cpu_smaller_leaf_index, @@ -725,6 +788,28 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( num_tasks_, num_tasks_aligned, num_blocks_per_leaf, + false, + num_leaves_); + SyncBestSplitForLeafKernelAllBlocks<<<1, 1>>>( + cpu_smaller_leaf_index, + cpu_larger_leaf_index, + num_blocks_per_leaf, + num_leaves_, + cuda_leaf_best_split_feature_, + cuda_leaf_best_split_default_left_, + cuda_leaf_best_split_threshold_, + cuda_leaf_best_split_gain_, + cuda_leaf_best_split_left_sum_gradient_, + cuda_leaf_best_split_left_sum_hessian_, + cuda_leaf_best_split_left_count_, + cuda_leaf_best_split_left_gain_, + cuda_leaf_best_split_left_output_, + cuda_leaf_best_split_right_sum_gradient_, + cuda_leaf_best_split_right_sum_hessian_, + cuda_leaf_best_split_right_count_, + cuda_leaf_best_split_right_gain_, + cuda_leaf_best_split_right_output_, + cuda_leaf_best_split_found_, false); } else { const bool larger_only = (!is_smaller_leaf_valid && is_larger_leaf_valid); @@ -766,6 +851,28 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( num_tasks_, num_tasks_aligned, num_blocks_per_leaf, + larger_only, + num_leaves_); + SyncBestSplitForLeafKernelAllBlocks<<<1, 1>>>( + cpu_smaller_leaf_index, + cpu_larger_leaf_index, + num_blocks_per_leaf, + num_leaves_, + cuda_leaf_best_split_feature_, + cuda_leaf_best_split_default_left_, + cuda_leaf_best_split_threshold_, + cuda_leaf_best_split_gain_, + cuda_leaf_best_split_left_sum_gradient_, + cuda_leaf_best_split_left_sum_hessian_, + cuda_leaf_best_split_left_count_, + cuda_leaf_best_split_left_gain_, + cuda_leaf_best_split_left_output_, + cuda_leaf_best_split_right_sum_gradient_, + cuda_leaf_best_split_right_sum_hessian_, + cuda_leaf_best_split_right_count_, + cuda_leaf_best_split_right_gain_, + cuda_leaf_best_split_right_output_, + cuda_leaf_best_split_found_, larger_only); } } diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 69640345282f..d11b5e20ce1a 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -112,11 +112,8 @@ void CUDADataPartition::Init(const Dataset* train_data) { AllocateCUDAMemory(static_cast(num_leaves_), &data_partition_leaf_output_); - AllocateCUDAMemory(static_cast(num_data_), 
&train_data_score_tmp_); - CopyColWiseData(train_data); - cpu_train_data_score_tmp_.resize(num_data_, 0.0f); cpu_split_info_buffer_.resize(6, 0); cuda_streams_.resize(5); diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index e383f1302bc2..83750e42551a 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -1374,7 +1374,7 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data } const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; global_timer.Start("CUDADataPartition::AggregateBlockOffsetKernel"); - auto start = std::chrono::steady_clock::now(); + //auto start = std::chrono::steady_clock::now(); AggregateBlockOffsetKernel<<<1, split_indices_block_size_data_partition_aligned / 2>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, @@ -1404,8 +1404,8 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, data_partition_leaf_output_); SynchronizeCUDADevice(); - auto end = std::chrono::steady_clock::now(); - auto duration = (static_cast>(end - start)).count(); + //auto end = std::chrono::steady_clock::now(); + //auto duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); global_timer.Start("CUDADataPartition::SplitInnerKernel"); @@ -1413,18 +1413,18 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_out_data_indices_in_leaf_, split_indices_block_size_data_partition_aligned); - end = std::chrono::steady_clock::now(); - duration = (static_cast>(end - start)).count(); + //end = std::chrono::steady_clock::now(); + //duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDADataPartition::SplitInnerKernel"); global_timer.Start("CUDADataPartition::CopyDataIndicesKernel"); - start = std::chrono::steady_clock::now(); + //start = std::chrono::steady_clock::now(); CopyDataIndicesKernel<<>>( leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, cuda_data_indices_); - end = std::chrono::steady_clock::now(); - duration = (static_cast>(end - start)).count(); + //end = std::chrono::steady_clock::now(); + //duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDADataPartition::CopyDataIndicesKernel"); - start = std::chrono::steady_clock::now(); + //start = std::chrono::steady_clock::now(); global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); SplitTreeStructureKernel<<<1, 1, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, @@ -1495,7 +1495,7 @@ void CUDADataPartition::LaunchPrefixSumKernel(uint32_t* cuda_elements) { __global__ void AddPredictionToScoreKernel(const double* data_partition_leaf_output, const data_size_t* num_data_in_leaf, const data_size_t* data_indices_in_leaf, - const data_size_t* leaf_data_start, const double learning_rate, double* output_score, double* cuda_scores) { + 
const data_size_t* leaf_data_start, const double learning_rate, double* cuda_scores) { const unsigned int threadIdx_x = threadIdx.x; const unsigned int blockIdx_x = blockIdx.x; const unsigned int blockDim_x = blockDim.x; @@ -1514,7 +1514,7 @@ __global__ void AddPredictionToScoreKernel(const double* data_partition_leaf_out void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double learning_rate, double* cuda_scores) { global_timer.Start("CUDADataPartition::AddPredictionToScoreKernel"); AddPredictionToScoreKernel<<>>(data_partition_leaf_output_, - cuda_leaf_num_data_, cuda_data_indices_, cuda_leaf_data_start_, learning_rate, train_data_score_tmp_, cuda_scores); + cuda_leaf_num_data_, cuda_data_indices_, cuda_leaf_data_start_, learning_rate, cuda_scores); SynchronizeCUDADevice(); global_timer.Stop("CUDADataPartition::AddPredictionToScoreKernel"); } diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 0714ead5479d..3c936086e8b6 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -187,8 +187,6 @@ class CUDADataPartition { const uint8_t* tree_default_left() const { return tree_default_left_; } - const double* train_data_score_tmp() const { return train_data_score_tmp_; } - private: void CopyColWiseData(const Dataset* train_data); @@ -279,7 +277,6 @@ class CUDADataPartition { std::vector feature_mfb_is_na_; std::vector num_data_in_leaf_; int cur_num_leaves_; - std::vector cpu_train_data_score_tmp_; std::vector cpu_split_info_buffer_; std::vector column_bit_type_; std::vector feature_index_to_column_index_; @@ -323,8 +320,6 @@ class CUDADataPartition { double* tree_gain_; uint8_t* tree_default_left_; double* data_partition_leaf_output_; - // for train data update - double* train_data_score_tmp_; // for debug double* cuda_gradients_sum_buffer_; double* cuda_hessians_sum_buffer_; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index a9fadab2ae8f..01a8465e490b 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -48,11 +48,6 @@ void CUDAHistogramConstructor::BeforeTrain() { } void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStates* share_state) { - // allocate CPU memory - //data_.resize(num_data_ * num_feature_groups_, 0); - // allocate GPU memory - //AllocateCUDAMemory(num_feature_groups_ * num_data_, &cuda_data_); - AllocateCUDAMemory(num_total_bin_ * 2 * num_leaves_, &cuda_hist_); SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_); @@ -109,7 +104,7 @@ void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller const double* cuda_larger_leaf_sum_gradients, const double* cuda_larger_leaf_sum_hessians, hist_t** cuda_larger_leaf_hist, const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf) { - auto start = std::chrono::steady_clock::now(); + //auto start = std::chrono::steady_clock::now(); if ((num_data_in_smaller_leaf <= min_data_in_leaf_ || sum_hessians_in_smaller_leaf <= min_sum_hessian_in_leaf_) && (num_data_in_larger_leaf <= min_data_in_leaf_ || sum_hessians_in_larger_leaf <= min_sum_hessian_in_leaf_)) { return; @@ -117,16 +112,16 @@ void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller 
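// A minimal sketch of the score update above under the assumption of one thread block per
// leaf: every data index owned by the leaf receives learning_rate times the leaf's output,
// accumulated directly into the device-side score vector (names are illustrative).
#include <cuda_runtime.h>

__global__ void AddLeafOutputToScoreKernel(const double* leaf_output, const int* leaf_num_data,
                                           const int* leaf_data_start, const int* data_indices,
                                           double learning_rate, double* scores) {
  const int leaf = blockIdx.x;  // one block per leaf
  const double delta = learning_rate * leaf_output[leaf];
  const int start = leaf_data_start[leaf];
  const int count = leaf_num_data[leaf];
  for (int i = threadIdx.x; i < count; i += blockDim.x) {
    scores[data_indices[start + i]] += delta;  // each data index belongs to exactly one leaf
  }
}
// Launched as AddLeafOutputToScoreKernel<<<num_leaves, 1024>>>(...); no atomics are needed
// because the per-leaf index ranges do not overlap.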
LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_num_data_in_smaller_leaf, cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data, cuda_smaller_leaf_hist, num_data_in_smaller_leaf); SynchronizeCUDADevice(); - auto end = std::chrono::steady_clock::now(); - double duration = (static_cast>(end - start)).count(); + //auto end = std::chrono::steady_clock::now(); + //double duration = (static_cast>(end - start)).count(); //Log::Warning("LaunchConstructHistogramKernel time %f", duration); global_timer.Start("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); - start = std::chrono::steady_clock::now(); + //start = std::chrono::steady_clock::now(); LaunchSubtractHistogramKernel(cuda_smaller_leaf_index, cuda_larger_leaf_index, cuda_smaller_leaf_sum_gradients, cuda_smaller_leaf_sum_hessians, cuda_larger_leaf_sum_gradients, cuda_larger_leaf_sum_hessians, cuda_smaller_leaf_hist, cuda_larger_leaf_hist); - end = std::chrono::steady_clock::now(); - duration = (static_cast>(end - start)).count(); + //end = std::chrono::steady_clock::now(); + //duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); //Log::Warning("LaunchSubtractHistogramKernel time %f", duration); /*PrintLastCUDAError(); @@ -225,6 +220,18 @@ void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data } } + for (size_t i = 0; i < feature_partition_column_index_offsets_.size(); ++i) { + Log::Warning("feature_partition_column_index_offsets_[%d] = %d", i, feature_partition_column_index_offsets_[i]); + } + + for (size_t i = 0; i < column_hist_offsets_.size(); ++i) { + Log::Warning("column_hist_offsets_[%d] = %d", i, column_hist_offsets_[i]); + } + + for (size_t i = 0; i < column_hist_offsets_full_.size(); ++i) { + Log::Warning("column_hist_offsets_full_[%d] = %d", i, column_hist_offsets_full_[i]); + } + InitCUDAMemoryFromHostMemory(&cuda_feature_partition_column_index_offsets_, feature_partition_column_index_offsets_.data(), feature_partition_column_index_offsets_.size()); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 21061a95ed0f..8bc946575ca8 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -71,8 +71,8 @@ __global__ void CUDAConstructHistogramKernel( const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1]; const int num_columns_in_partition = partition_column_end - partition_column_start; - const uint32_t partition_hist_start = column_hist_offsets_full[partition_column_start + blockIdx.x]; - const uint32_t partition_hist_end = column_hist_offsets_full[partition_column_start + blockIdx.x + 1]; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; const uint32_t num_bins_in_partition = partition_hist_end - partition_hist_start; const uint32_t num_items_per_thread = (2 * num_bins_in_partition + num_threads_per_block - 1) / num_threads_per_block; const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; @@ -92,8 +92,8 @@ __global__ void CUDAConstructHistogramKernel( const data_size_t remainder = block_num_data % blockDim.y; const data_size_t num_iteration_this = remainder == 0 ? 
num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); data_size_t inner_data_index = static_cast(threadIdx_y); - float* shared_hist_ptr = shared_hist + (column_hist_offsets[threadIdx.x] << 1); const int column_index = static_cast(threadIdx.x) + partition_column_start; + float* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); if (threadIdx.x < static_cast(num_columns_in_partition)) { for (data_size_t i = 0; i < num_iteration_this; ++i) { const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; @@ -126,6 +126,9 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( int block_dim_x = 0; int block_dim_y = 0; CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_in_smaller_leaf); + /*Log::Warning("grid_dim_x = %d, grid_dim_y = %d", grid_dim_x, grid_dim_y); + Log::Warning("block_dim_x = %d, block_dim_y = %d", block_dim_x, block_dim_y); + Log::Warning("num_data_in_smaller_leaf = %d", num_data_in_smaller_leaf);*/ dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); CUDAConstructHistogramKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index f1f1dadc96a1..88aef303ed4e 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -62,15 +62,15 @@ void CUDALeafSplits::LaunchInitValuesKernal() { cuda_sum_of_hessians_); CopyFromCUDADeviceToCUDADevice(cuda_num_data_in_leaf_, cuda_num_data_, 1); SynchronizeCUDADevice(); - auto end = std::chrono::steady_clock::now(); - auto duration = static_cast>(end - start); + //auto end = std::chrono::steady_clock::now(); + //auto duration = static_cast>(end - start); //Log::Warning("CUDAInitValuesKernel1 duration = %f", duration.count()); - start = std::chrono::steady_clock::now(); + //start = std::chrono::steady_clock::now(); CUDAInitValuesKernel2<<>>( cuda_sum_of_gradients_, cuda_sum_of_hessians_); SynchronizeCUDADevice(); - end = std::chrono::steady_clock::now(); - duration = static_cast>(end - start); + //end = std::chrono::steady_clock::now(); + //duration = static_cast>(end - start); //Log::Warning("cuda_sum_of_gradients_ = %f, cuda_sum_of_hessians_ = %f", *cuda_sum_of_gradients_, *cuda_sum_of_hessians_); //Log::Warning("CUDAInitValuesKernel2 duration = %f", duration.count()); } diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 7565d833e6ed..c355e70f215d 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -103,13 +103,7 @@ void NewCUDATreeLearner::BeforeTrain() { larger_leaf_index_ = -1; } -void NewCUDATreeLearner::AllocateMemory(const bool is_constant_hessian) {} - -void NewCUDATreeLearner::CreateCUDAHistogramConstructors() {} - -void NewCUDATreeLearner::PushDataIntoDeviceHistogramConstructors() {} - -void NewCUDATreeLearner::FindBestSplits(const Tree* tree) {} +void NewCUDATreeLearner::FindBestSplits(const Tree* /*tree*/) {} void NewCUDATreeLearner::ConstructHistograms(const std::vector& /*is_feature_used*/, bool /*use_subtract*/) {} @@ -186,7 +180,6 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, double find_best_split_time = 0.0f; double find_best_split_from_all_leaves_time = 0.0f; double split_data_indices_time = 0.0f; - double split_tree_time = 0.0f; //std::unique_ptr tree(new Tree(config_->num_leaves, 
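// A minimal sketch of the shared-memory histogram idea in the kernel above, under
// simplified assumptions: one 8-bit feature column per block, float accumulation, and
// dynamic shared memory of 2 * num_bins floats. All names are illustrative.
#include <cstdint>
#include <cuda_runtime.h>

__global__ void ColumnHistogramKernel(const uint8_t* column_bins, const int* data_indices,
                                      int num_data_in_leaf, const float* gradients,
                                      const float* hessians, int num_bins,
                                      float* global_hist /* 2 * num_bins entries */) {
  extern __shared__ float shared_hist[];  // gradient/hessian pairs for this column's bins
  for (int i = threadIdx.x; i < 2 * num_bins; i += blockDim.x) {
    shared_hist[i] = 0.0f;
  }
  __syncthreads();
  const int stride = gridDim.x * blockDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_data_in_leaf; i += stride) {
    const int data_index = data_indices[i];
    const uint32_t bin = column_bins[data_index];
    atomicAdd(shared_hist + (bin << 1), gradients[data_index]);
    atomicAdd(shared_hist + (bin << 1) + 1, hessians[data_index]);
  }
  __syncthreads();
  // flush the block-local histogram into the leaf's global histogram
  for (int i = threadIdx.x; i < 2 * num_bins; i += blockDim.x) {
    atomicAdd(global_hist + i, shared_hist[i]);
  }
}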
false, false)); int num_leaves = 1; for (int i = 0; i < config_->num_leaves - 1; ++i) { @@ -316,7 +309,6 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, Log::Warning("find best split time %f", find_best_split_time); Log::Warning("find best split time from all leaves %f", find_best_split_from_all_leaves_time); Log::Warning("split data indices time %f", split_data_indices_time); - //Log::Warning("split tree time %f", split_tree_time); Log::Warning("build tree time %f", build_tre_duration); return tree.release(); } diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index 547e8a5f062e..ca297fba03f9 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -37,14 +37,6 @@ class NewCUDATreeLearner: public SerialTreeLearner { void AddPredictionToScore(const Tree* tree, double* out_score) const override; protected: - void AllocateFeatureTasks(); - - void AllocateMemory(const bool is_constant_hessian); - - void CreateCUDAHistogramConstructors(); - - void PushDataIntoDeviceHistogramConstructors(); - void FindBestSplits(const Tree* tree) override; void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; @@ -88,27 +80,6 @@ class NewCUDATreeLearner: public SerialTreeLearner { int smaller_leaf_index_; int larger_leaf_index_; int best_leaf_index_; - - /* - // full data indices on CUDA devices, as the data indices of data_partition_ in CPU version - std::vector device_data_indices_; - // gradient values on CUDA devices - std::vector device_gradients_; - // hessian values on CUDA devices - std::vector device_hessians_; - // gradient and hessian values in CUDA devices - std::vector device_gradients_and_hessians_; - // histogram storage on CUDA devices - std::vector device_histograms_; - - // device leaf splits initializer - std::vector> device_leaf_splits_initializers_; - // device histogram constructors - std::vector> device_histogram_constructors_; - // device best split finder - std::vector> device_best_split_finders_; - // device splitter - std::vector> device_splitters_;*/ }; } // namespace LightGBM From 0168d2c4b39a83967b314b664867d56a904d7ea2 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 9 Jun 2021 07:28:49 +0000 Subject: [PATCH 025/166] partition dense row data --- .../cuda/cuda_best_split_finder.cpp | 4 + .../cuda/cuda_best_split_finder.cu | 99 ++++++++++--------- .../cuda/cuda_histogram_constructor.cpp | 35 ++++++- .../cuda/cuda_histogram_constructor.cu | 11 ++- .../cuda/cuda_histogram_constructor.hpp | 3 + 5 files changed, 100 insertions(+), 52 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 787bdaea0ebf..3664d3e3f79a 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -189,6 +189,10 @@ void CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves, LaunchFindBestFromAllSplitsKernel(cuda_cur_num_leaves, smaller_leaf_index, larger_leaf_index, leaf_best_split_feature, leaf_best_split_threshold, leaf_best_split_default_left, best_leaf_index); SynchronizeCUDADevice(); + Log::Warning("smaller_leaf %d best split feature %d", smaller_leaf_index, leaf_best_split_feature->at(smaller_leaf_index)); + if (larger_leaf_index >= 0) { + Log::Warning("larger_leaf %d best split feature %d", larger_leaf_index, leaf_best_split_feature->at(larger_leaf_index)); + } //auto end = 
std::chrono::steady_clock::now(); //double duration = (static_cast>(end - start)).count(); } diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index ac5f1d4eeb96..b3f328c18da6 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -231,6 +231,10 @@ __device__ void FindBestSplitsForLeafKernelInner( __shared__ uint8_t threshold_found[MAX_NUM_BIN_IN_FEATURE]; __shared__ uint32_t threshold_value[MAX_NUM_BIN_IN_FEATURE]; + if (inner_feature_index >= 1000) { + printf("finding best split for feature %d\n", inner_feature_index); + } + const unsigned int threadIdx_x = threadIdx.x; const bool skip_sum = (skip_default_bin && (threadIdx_x + feature_mfb_offset) == static_cast(feature_default_bin)); const uint32_t feature_num_bin_minus_offset = feature_num_bin - feature_mfb_offset; @@ -690,7 +694,8 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( for (unsigned int block_index = 1; block_index < num_blocks_per_leaf; ++block_index) { const unsigned int leaf_read_pos = static_cast(smaller_leaf_index) + block_index * static_cast(num_leaves); if ((cuda_leaf_best_split_found[leaf_read_pos] == 1 && cuda_leaf_best_split_found[smaller_leaf_index] == 1 && - cuda_leaf_best_split_gain[leaf_read_pos] > cuda_leaf_best_split_gain[smaller_leaf_index])) { + cuda_leaf_best_split_gain[leaf_read_pos] > cuda_leaf_best_split_gain[smaller_leaf_index]) || + (cuda_leaf_best_split_found[leaf_read_pos] == 0 && cuda_leaf_best_split_found[leaf_read_pos] == 1)) { cuda_leaf_best_split_found[smaller_leaf_index] = cuda_leaf_best_split_found[leaf_read_pos]; cuda_leaf_best_split_feature[smaller_leaf_index] = cuda_leaf_best_split_feature[leaf_read_pos]; cuda_leaf_best_split_default_left[smaller_leaf_index] = cuda_leaf_best_split_default_left[leaf_read_pos]; @@ -713,7 +718,8 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( for (unsigned int block_index = 1; block_index < num_blocks_per_leaf; ++block_index) { const unsigned int leaf_read_pos = static_cast(larger_leaf_index) + block_index * static_cast(num_leaves); if ((cuda_leaf_best_split_found[leaf_read_pos] == 1 && cuda_leaf_best_split_found[larger_leaf_index] == 1 && - cuda_leaf_best_split_gain[leaf_read_pos] > cuda_leaf_best_split_gain[larger_leaf_index])) { + cuda_leaf_best_split_gain[leaf_read_pos] > cuda_leaf_best_split_gain[larger_leaf_index]) || + (cuda_leaf_best_split_found[leaf_read_pos] == 0 && cuda_leaf_best_split_found[leaf_read_pos] == 1)) { cuda_leaf_best_split_found[larger_leaf_index] = cuda_leaf_best_split_found[leaf_read_pos]; cuda_leaf_best_split_feature[larger_leaf_index] = cuda_leaf_best_split_feature[leaf_read_pos]; cuda_leaf_best_split_default_left[larger_leaf_index] = cuda_leaf_best_split_default_left[leaf_read_pos]; @@ -734,7 +740,6 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( } } -// TODO(shiyu1994): syncrhonize best from different blocks void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( const int cpu_smaller_leaf_index, const int cpu_larger_leaf_index, @@ -790,27 +795,29 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( num_blocks_per_leaf, false, num_leaves_); - SyncBestSplitForLeafKernelAllBlocks<<<1, 1>>>( - cpu_smaller_leaf_index, - cpu_larger_leaf_index, - num_blocks_per_leaf, - num_leaves_, - cuda_leaf_best_split_feature_, - cuda_leaf_best_split_default_left_, - cuda_leaf_best_split_threshold_, - cuda_leaf_best_split_gain_, - cuda_leaf_best_split_left_sum_gradient_, - 
cuda_leaf_best_split_left_sum_hessian_, - cuda_leaf_best_split_left_count_, - cuda_leaf_best_split_left_gain_, - cuda_leaf_best_split_left_output_, - cuda_leaf_best_split_right_sum_gradient_, - cuda_leaf_best_split_right_sum_hessian_, - cuda_leaf_best_split_right_count_, - cuda_leaf_best_split_right_gain_, - cuda_leaf_best_split_right_output_, - cuda_leaf_best_split_found_, - false); + if (num_blocks_per_leaf > 1) { + SyncBestSplitForLeafKernelAllBlocks<<<1, 1>>>( + cpu_smaller_leaf_index, + cpu_larger_leaf_index, + num_blocks_per_leaf, + num_leaves_, + cuda_leaf_best_split_feature_, + cuda_leaf_best_split_default_left_, + cuda_leaf_best_split_threshold_, + cuda_leaf_best_split_gain_, + cuda_leaf_best_split_left_sum_gradient_, + cuda_leaf_best_split_left_sum_hessian_, + cuda_leaf_best_split_left_count_, + cuda_leaf_best_split_left_gain_, + cuda_leaf_best_split_left_output_, + cuda_leaf_best_split_right_sum_gradient_, + cuda_leaf_best_split_right_sum_hessian_, + cuda_leaf_best_split_right_count_, + cuda_leaf_best_split_right_gain_, + cuda_leaf_best_split_right_output_, + cuda_leaf_best_split_found_, + false); + } } else { const bool larger_only = (!is_smaller_leaf_valid && is_larger_leaf_valid); SyncBestSplitForLeafKernel<<>>( @@ -853,27 +860,29 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( num_blocks_per_leaf, larger_only, num_leaves_); - SyncBestSplitForLeafKernelAllBlocks<<<1, 1>>>( - cpu_smaller_leaf_index, - cpu_larger_leaf_index, - num_blocks_per_leaf, - num_leaves_, - cuda_leaf_best_split_feature_, - cuda_leaf_best_split_default_left_, - cuda_leaf_best_split_threshold_, - cuda_leaf_best_split_gain_, - cuda_leaf_best_split_left_sum_gradient_, - cuda_leaf_best_split_left_sum_hessian_, - cuda_leaf_best_split_left_count_, - cuda_leaf_best_split_left_gain_, - cuda_leaf_best_split_left_output_, - cuda_leaf_best_split_right_sum_gradient_, - cuda_leaf_best_split_right_sum_hessian_, - cuda_leaf_best_split_right_count_, - cuda_leaf_best_split_right_gain_, - cuda_leaf_best_split_right_output_, - cuda_leaf_best_split_found_, - larger_only); + if (num_blocks_per_leaf > 1) { + SyncBestSplitForLeafKernelAllBlocks<<<1, 1>>>( + cpu_smaller_leaf_index, + cpu_larger_leaf_index, + num_blocks_per_leaf, + num_leaves_, + cuda_leaf_best_split_feature_, + cuda_leaf_best_split_default_left_, + cuda_leaf_best_split_threshold_, + cuda_leaf_best_split_gain_, + cuda_leaf_best_split_left_sum_gradient_, + cuda_leaf_best_split_left_sum_hessian_, + cuda_leaf_best_split_left_count_, + cuda_leaf_best_split_left_gain_, + cuda_leaf_best_split_left_output_, + cuda_leaf_best_split_right_sum_gradient_, + cuda_leaf_best_split_right_sum_hessian_, + cuda_leaf_best_split_right_count_, + cuda_leaf_best_split_right_gain_, + cuda_leaf_best_split_right_output_, + cuda_leaf_best_split_found_, + larger_only); + } } } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 01a8465e490b..a216599c89e7 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -75,9 +75,9 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStat InitCUDAValueFromConstant(&cuda_num_features_, num_features_); - InitCUDAData(train_data, share_state); - DivideCUDAFeatureGroups(train_data, share_state); + + InitCUDAData(train_data, share_state); } void CUDAHistogramConstructor::InitCUDAData(const Dataset* train_data, TrainingShareStates* share_state) { @@ -85,7 +85,9 @@ void 
CUDAHistogramConstructor::InitCUDAData(const Dataset* train_data, TrainingS size_t total_size = 0; const uint8_t* cpu_data_ptr = share_state->GetRowWiseData(&bit_type, &total_size, &is_sparse_); CHECK_EQ(bit_type, 8); - InitCUDAMemoryFromHostMemory(&cuda_data_uint8_t_, cpu_data_ptr, total_size); + std::vector partitioned_data; + GetDenseDataPartitioned(cpu_data_ptr, &partitioned_data); + InitCUDAMemoryFromHostMemory(&cuda_data_uint8_t_, partitioned_data.data(), total_size); SynchronizeCUDADevice(); } @@ -245,6 +247,33 @@ void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data column_hist_offsets_full_.size()); } +template +void CUDAHistogramConstructor::GetDenseDataPartitioned(const BIN_TYPE* row_wise_data, std::vector* partitioned_data) { + Log::Warning("feature_partition_column_index_offsets_.size() = %d", feature_partition_column_index_offsets_.size()); + const int num_total_columns = feature_partition_column_index_offsets_.back(); + partitioned_data->resize(static_cast(num_total_columns) * static_cast(num_data_), 0); + BIN_TYPE* out_data = partitioned_data->data(); + Threading::For(0, num_data_, 512, + [this, num_total_columns, row_wise_data, out_data] (int thread_index, data_size_t start, data_size_t end) { + for (size_t i = 0; i < feature_partition_column_index_offsets_.size() - 1; ++i) { + const int num_prev_columns = static_cast(feature_partition_column_index_offsets_[i]); + const data_size_t offset = num_data_ * num_prev_columns; + const int partition_column_start = feature_partition_column_index_offsets_[i]; + const int partition_column_end = feature_partition_column_index_offsets_[i + 1]; + const int num_columns_in_cur_partition = partition_column_end - partition_column_start; + for (data_size_t data_index = start; data_index < end; ++data_index) { + const data_size_t data_offset = offset + data_index * num_columns_in_cur_partition; + const data_size_t read_data_offset = data_index * num_total_columns; + for (int column_index = 0; column_index < num_columns_in_cur_partition; ++column_index) { + const int true_column_index = read_data_offset + column_index + partition_column_start; + const BIN_TYPE bin = row_wise_data[true_column_index]; + out_data[data_offset + column_index] = bin; + } + } + } + }); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 8bc946575ca8..a0829d9d9501 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -58,7 +58,8 @@ __global__ void CUDAConstructHistogramKernel( const uint8_t* data, const uint32_t* column_hist_offsets, const uint32_t* column_hist_offsets_full, - const int* feature_partition_column_index_offsets) { + const int* feature_partition_column_index_offsets, + const data_size_t num_data) { const int num_feature_groups_ref = *num_feature_groups; const int leaf_index_ref = *leaf_index; @@ -70,6 +71,7 @@ __global__ void CUDAConstructHistogramKernel( const unsigned int num_threads_per_block = blockDim.x * blockDim.y; const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1]; + const uint8_t* data_ptr = data + partition_column_start * num_data; const int num_columns_in_partition = partition_column_end - partition_column_start; const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; const uint32_t 
partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; @@ -93,13 +95,13 @@ __global__ void CUDAConstructHistogramKernel( const data_size_t num_iteration_this = remainder == 0 ? num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); data_size_t inner_data_index = static_cast(threadIdx_y); const int column_index = static_cast(threadIdx.x) + partition_column_start; - float* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); if (threadIdx.x < static_cast(num_columns_in_partition)) { + float* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); for (data_size_t i = 0; i < num_iteration_this; ++i) { const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; const score_t grad = cuda_gradients[data_index]; const score_t hess = cuda_hessians[data_index]; - const uint32_t bin = static_cast(data[data_index * num_feature_groups_ref + column_index]); + const uint32_t bin = static_cast(data_ptr[data_index * num_columns_in_partition + threadIdx.x]); const uint32_t pos = bin << 1; float* pos_ptr = shared_hist_ptr + pos; atomicAdd_system(pos_ptr, grad); @@ -135,7 +137,8 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, cuda_column_hist_offsets_, cuda_column_hist_offsets_full_, - cuda_feature_partition_column_index_offsets_); + cuda_feature_partition_column_index_offsets_, + num_data_); } __global__ void SubtractHistogramKernel(const int* /*cuda_smaller_leaf_index*/, diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 58d17fa44446..c581d9032470 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -104,6 +104,9 @@ class CUDAHistogramConstructor { void DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state); + template + void GetDenseDataPartitioned(const BIN_TYPE* row_wise_data, std::vector* partitioned_data); + // Host memory // data on CPU, stored in row-wise style const data_size_t num_data_; From 0570fe039cfce1d1ebe29cd38a9a2a4c90bef868 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 9 Jun 2021 08:51:25 +0000 Subject: [PATCH 026/166] fix sync best split from task blocks --- src/treelearner/cuda/cuda_best_split_finder.cpp | 4 ---- src/treelearner/cuda/cuda_best_split_finder.cu | 14 ++++++-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 3664d3e3f79a..787bdaea0ebf 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -189,10 +189,6 @@ void CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves, LaunchFindBestFromAllSplitsKernel(cuda_cur_num_leaves, smaller_leaf_index, larger_leaf_index, leaf_best_split_feature, leaf_best_split_threshold, leaf_best_split_default_left, best_leaf_index); SynchronizeCUDADevice(); - Log::Warning("smaller_leaf %d best split feature %d", smaller_leaf_index, leaf_best_split_feature->at(smaller_leaf_index)); - if (larger_leaf_index >= 0) { - Log::Warning("larger_leaf %d best split feature %d", larger_leaf_index, leaf_best_split_feature->at(larger_leaf_index)); - } //auto end = std::chrono::steady_clock::now(); //double duration = (static_cast>(end - start)).count(); } diff 
--git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index b3f328c18da6..cb81bad2805d 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -231,10 +231,6 @@ __device__ void FindBestSplitsForLeafKernelInner( __shared__ uint8_t threshold_found[MAX_NUM_BIN_IN_FEATURE]; __shared__ uint32_t threshold_value[MAX_NUM_BIN_IN_FEATURE]; - if (inner_feature_index >= 1000) { - printf("finding best split for feature %d\n", inner_feature_index); - } - const unsigned int threadIdx_x = threadIdx.x; const bool skip_sum = (skip_default_bin && (threadIdx_x + feature_mfb_offset) == static_cast(feature_default_bin)); const uint32_t feature_num_bin_minus_offset = feature_num_bin - feature_mfb_offset; @@ -634,8 +630,8 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i __shared__ uint32_t shared_read_index[NUM_TASKS_PER_SYNC_BLOCK]; const bool is_smaller = (blockIdx_x < static_cast(num_blocks_per_leaf) && !larger_only); - const uint32_t leaf_block_index = (is_smaller || larger_only) ? blockIdx_x : (blockIdx_x - static_cast(num_blocks_per_leaf)); - const int task_index = static_cast(leaf_block_index * num_blocks_per_leaf + threadIdx_x); + const uint32_t leaf_block_index = (is_smaller || larger_only) ? blockIdx_x : (blockIdx_x - static_cast(num_blocks_per_leaf)); + const int task_index = static_cast(leaf_block_index * blockDim.x + threadIdx_x); const uint32_t read_index = is_smaller ? static_cast(task_index) : static_cast(task_index + num_tasks); if (task_index < num_tasks) { best_found[threadIdx_x] = cuda_best_split_found[read_index]; @@ -695,7 +691,7 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( const unsigned int leaf_read_pos = static_cast(smaller_leaf_index) + block_index * static_cast(num_leaves); if ((cuda_leaf_best_split_found[leaf_read_pos] == 1 && cuda_leaf_best_split_found[smaller_leaf_index] == 1 && cuda_leaf_best_split_gain[leaf_read_pos] > cuda_leaf_best_split_gain[smaller_leaf_index]) || - (cuda_leaf_best_split_found[leaf_read_pos] == 0 && cuda_leaf_best_split_found[leaf_read_pos] == 1)) { + (cuda_leaf_best_split_found[smaller_leaf_index] == 0 && cuda_leaf_best_split_found[leaf_read_pos] == 1)) { cuda_leaf_best_split_found[smaller_leaf_index] = cuda_leaf_best_split_found[leaf_read_pos]; cuda_leaf_best_split_feature[smaller_leaf_index] = cuda_leaf_best_split_feature[leaf_read_pos]; cuda_leaf_best_split_default_left[smaller_leaf_index] = cuda_leaf_best_split_default_left[leaf_read_pos]; @@ -719,7 +715,7 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( const unsigned int leaf_read_pos = static_cast(larger_leaf_index) + block_index * static_cast(num_leaves); if ((cuda_leaf_best_split_found[leaf_read_pos] == 1 && cuda_leaf_best_split_found[larger_leaf_index] == 1 && cuda_leaf_best_split_gain[leaf_read_pos] > cuda_leaf_best_split_gain[larger_leaf_index]) || - (cuda_leaf_best_split_found[leaf_read_pos] == 0 && cuda_leaf_best_split_found[leaf_read_pos] == 1)) { + (cuda_leaf_best_split_found[larger_leaf_index] == 0 && cuda_leaf_best_split_found[leaf_read_pos] == 1)) { cuda_leaf_best_split_found[larger_leaf_index] = cuda_leaf_best_split_found[leaf_read_pos]; cuda_leaf_best_split_feature[larger_leaf_index] = cuda_leaf_best_split_feature[leaf_read_pos]; cuda_leaf_best_split_default_left[larger_leaf_index] = cuda_leaf_best_split_default_left[leaf_read_pos]; @@ -796,6 +792,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( false, 
num_leaves_); if (num_blocks_per_leaf > 1) { + SynchronizeCUDADevice(); SyncBestSplitForLeafKernelAllBlocks<<<1, 1>>>( cpu_smaller_leaf_index, cpu_larger_leaf_index, @@ -861,6 +858,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( larger_only, num_leaves_); if (num_blocks_per_leaf > 1) { + SynchronizeCUDADevice(); SyncBestSplitForLeafKernelAllBlocks<<<1, 1>>>( cpu_smaller_leaf_index, cpu_larger_leaf_index, From 374018cb0ab7b341aab799b51e3503067797a399 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 9 Jun 2021 13:57:45 +0000 Subject: [PATCH 027/166] add support for sparse row wise for CUDA --- include/LightGBM/bin.h | 6 +- include/LightGBM/train_share_states.h | 17 +- src/io/multi_val_dense_bin.hpp | 30 ++- src/io/multi_val_sparse_bin.hpp | 87 ++++++- .../cuda/cuda_histogram_constructor.cpp | 236 ++++++++++++++++-- .../cuda/cuda_histogram_constructor.cu | 186 +++++++++++++- .../cuda/cuda_histogram_constructor.hpp | 17 +- 7 files changed, 535 insertions(+), 44 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index e890a849c128..dbb942373f97 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -462,7 +462,11 @@ class MultiValBin { virtual MultiValBin* Clone() = 0; - virtual const uint8_t* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const = 0; + virtual const uint8_t* GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const = 0; }; inline uint32_t BinMapper::ValueToBin(double value) const { diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h index f6535b3f326e..4521bb474de6 100644 --- a/include/LightGBM/train_share_states.h +++ b/include/LightGBM/train_share_states.h @@ -125,14 +125,19 @@ class MultiValBinWrapper { is_subrow_copied_ = is_subrow_copied; } - const uint8_t* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { + const uint8_t* GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const { if (multi_val_bin_ == nullptr) { *bit_type = 0; *total_size = 0; *is_sparse = false; return nullptr; } else { - return multi_val_bin_->GetRowWiseData(bit_type, total_size, is_sparse); + return multi_val_bin_->GetRowWiseData(bit_type, total_size, is_sparse, out_data_ptr, data_ptr_bit_type); } } @@ -224,9 +229,13 @@ struct TrainingShareStates { } } - const uint8_t* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) { + const uint8_t* GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) { if (multi_val_bin_wrapper_ != nullptr) { - return multi_val_bin_wrapper_->GetRowWiseData(bit_type, total_size, is_sparse); + return multi_val_bin_wrapper_->GetRowWiseData(bit_type, total_size, is_sparse, out_data_ptr, data_ptr_bit_type); } else { *bit_type = 0; *total_size = 0; diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index 20e9ed748a7e..13339cce0b23 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -210,7 +210,11 @@ class MultiValDenseBin : public MultiValBin { MultiValDenseBin* Clone() override; - const uint8_t* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const override; + const uint8_t* GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* 
data_ptr_bit_type) const override; private: data_size_t num_data_; @@ -231,34 +235,52 @@ MultiValDenseBin* MultiValDenseBin::Clone() { } template <> -const uint8_t* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { +const uint8_t* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = data_.data(); *bit_type = 8; *total_size = static_cast(num_data_) * static_cast(num_feature_); CHECK_EQ(*total_size, data_.size()); *is_sparse = false; + *out_data_ptr = nullptr; + *data_ptr_bit_type = 0; return to_return; } template <> -const uint8_t* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { +const uint8_t* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const { const uint16_t* data_ptr = data_.data(); const uint8_t* to_return = reinterpret_cast(data_ptr); *bit_type = 16; *total_size = static_cast(num_data_) * static_cast(num_feature_); CHECK_EQ(*total_size, data_.size()); *is_sparse = false; + *out_data_ptr = nullptr; + *data_ptr_bit_type = 0; return to_return; } template <> -const uint8_t* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { +const uint8_t* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const { const uint32_t* data_ptr = data_.data(); const uint8_t* to_return = reinterpret_cast(data_ptr); *bit_type = 32; *total_size = static_cast(num_data_) * static_cast(num_feature_); CHECK_EQ(*total_size, data_.size()); *is_sparse = false; + *out_data_ptr = nullptr; + *data_ptr_bit_type = 0; return to_return; } diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index 58e063ba432b..a4e151a5830f 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -290,7 +290,11 @@ class MultiValSparseBin : public MultiValBin { MultiValSparseBin* Clone() override; - const uint8_t* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const override; + const uint8_t* GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const override; private: data_size_t num_data_; @@ -319,83 +323,146 @@ MultiValSparseBin* MultiValSparseBin::Clone() { } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { +const uint8_t* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = data_.data(); *bit_type = 8; *total_size = data_.size(); *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 16; return to_return; } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { +const uint8_t* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = reinterpret_cast(data_.data()); *bit_type = 16; *total_size = data_.size(); *is_sparse = true; + *out_data_ptr 
= reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 16; return to_return; } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { +const uint8_t* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = reinterpret_cast(data_.data()); *bit_type = 32; *total_size = data_.size(); *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 16; return to_return; } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { +const uint8_t* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = data_.data(); *bit_type = 8; *total_size = data_.size(); *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 32; return to_return; } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { +const uint8_t* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = reinterpret_cast(data_.data()); *bit_type = 16; *total_size = data_.size(); *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 32; return to_return; } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { +const uint8_t* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = reinterpret_cast(data_.data()); *bit_type = 32; *total_size = data_.size(); *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 32; return to_return; } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { +const uint8_t* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = data_.data(); *bit_type = 8; *total_size = data_.size(); *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 64; return to_return; } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { +const uint8_t* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = reinterpret_cast(data_.data()); *bit_type = 16; *total_size = data_.size(); *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 64; return to_return; } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse) const { +const uint8_t* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const uint8_t** out_data_ptr, + uint8_t* data_ptr_bit_type) const { const 
uint8_t* to_return = reinterpret_cast(data_.data()); *bit_type = 32; *total_size = data_.size(); *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 64; return to_return; } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index a216599c89e7..3ff5f1e78ac5 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -17,8 +17,8 @@ CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, const int min_data_in_leaf, const double min_sum_hessian_in_leaf): num_data_(train_data->num_data()), num_features_(train_data->num_features()), num_leaves_(num_leaves), num_threads_(num_threads), num_feature_groups_(train_data->num_feature_groups()), - cuda_gradients_(cuda_gradients), cuda_hessians_(cuda_hessians), - min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf) { + min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf), + cuda_gradients_(cuda_gradients), cuda_hessians_(cuda_hessians) { int offset = 0; for (int group_id = 0; group_id < train_data->num_feature_groups(); ++group_id) { feature_group_bin_offsets_.emplace_back(offset); @@ -77,17 +77,178 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStat DivideCUDAFeatureGroups(train_data, share_state); - InitCUDAData(train_data, share_state); + InitCUDAData(share_state); } -void CUDAHistogramConstructor::InitCUDAData(const Dataset* train_data, TrainingShareStates* share_state) { - uint8_t bit_type = 0; +void CUDAHistogramConstructor::InitCUDAData(TrainingShareStates* share_state) { + bit_type_ = 0; size_t total_size = 0; - const uint8_t* cpu_data_ptr = share_state->GetRowWiseData(&bit_type, &total_size, &is_sparse_); - CHECK_EQ(bit_type, 8); - std::vector partitioned_data; - GetDenseDataPartitioned(cpu_data_ptr, &partitioned_data); - InitCUDAMemoryFromHostMemory(&cuda_data_uint8_t_, partitioned_data.data(), total_size); + const uint8_t* data_ptr = nullptr; + data_ptr_bit_type_ = 0; + const uint8_t* cpu_data_ptr = share_state->GetRowWiseData(&bit_type_, &total_size, &is_sparse_, &data_ptr, &data_ptr_bit_type_); + CHECK_EQ(bit_type_, 8); + if (bit_type_ == 8) { + if (!is_sparse_) { + std::vector partitioned_data; + GetDenseDataPartitioned(cpu_data_ptr, &partitioned_data); + InitCUDAMemoryFromHostMemory(&cuda_data_uint8_t_, partitioned_data.data(), total_size); + } else { + std::vector> partitioned_data; + if (data_ptr_bit_type_ == 16) { + std::vector> partitioned_data_ptr; + std::vector partition_ptr; + const uint16_t* data_ptr_uint16_t = reinterpret_cast(data_ptr); + GetSparseDataPartitioned(cpu_data_ptr, data_ptr_uint16_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); + InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint16_t_, partition_ptr.data(), partition_ptr.size()); + AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint8_t_); + AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint16_t_); + for (size_t i = 0; i < partitioned_data.size(); ++i) { + const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; + const std::vector& data_for_this_partition = partitioned_data[i]; + CopyFromHostToCUDADevice(cuda_data_uint8_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); + CopyFromHostToCUDADevice(cuda_row_ptr_uint16_t_ + i * (num_data_ + 1), 
data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); + } + } else if (data_ptr_bit_type_ == 32) { + const uint32_t* data_ptr_uint32_t = reinterpret_cast(data_ptr); + std::vector> partitioned_data_ptr; + std::vector partition_ptr; + GetSparseDataPartitioned(cpu_data_ptr, data_ptr_uint32_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); + InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint32_t_, partition_ptr.data(), partition_ptr.size()); + AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint8_t_); + AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint32_t_); + for (size_t i = 0; i < partitioned_data.size(); ++i) { + const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; + const std::vector& data_for_this_partition = partitioned_data[i]; + CopyFromHostToCUDADevice(cuda_data_uint8_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); + CopyFromHostToCUDADevice(cuda_row_ptr_uint32_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); + } + } else if (data_ptr_bit_type_ == 64) { + const uint64_t* data_ptr_uint64_t = reinterpret_cast(data_ptr); + std::vector> partitioned_data_ptr; + std::vector partition_ptr; + GetSparseDataPartitioned(cpu_data_ptr, data_ptr_uint64_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); + InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint64_t_, partition_ptr.data(), partition_ptr.size()); + AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint8_t_); + AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint64_t_); + for (size_t i = 0; i < partitioned_data.size(); ++i) { + const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; + const std::vector& data_for_this_partition = partitioned_data[i]; + CopyFromHostToCUDADevice(cuda_data_uint8_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); + CopyFromHostToCUDADevice(cuda_row_ptr_uint64_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); + } + } else { + Log::Fatal("Unknow data ptr bit type %d", data_ptr_bit_type_); + } + } + } else if (bit_type_ == 16) { + if (!is_sparse_) { + std::vector partitioned_data; + GetDenseDataPartitioned(reinterpret_cast(cpu_data_ptr), &partitioned_data); + InitCUDAMemoryFromHostMemory(&cuda_data_uint16_t_, partitioned_data.data(), total_size); + } else { + std::vector> partitioned_data; + if (data_ptr_bit_type_ == 16) { + std::vector> partitioned_data_ptr; + std::vector partition_ptr; + const uint16_t* data_ptr_uint16_t = reinterpret_cast(data_ptr); + GetSparseDataPartitioned(reinterpret_cast(cpu_data_ptr), data_ptr_uint16_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); + InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint16_t_, partition_ptr.data(), partition_ptr.size()); + AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint16_t_); + AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint16_t_); + for (size_t i = 0; i < partitioned_data.size(); ++i) { + const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; + const std::vector& data_for_this_partition = partitioned_data[i]; + CopyFromHostToCUDADevice(cuda_data_uint16_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); + CopyFromHostToCUDADevice(cuda_row_ptr_uint16_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), 
data_ptr_for_this_partition.size()); + } + } else if (data_ptr_bit_type_ == 32) { + std::vector> partitioned_data_ptr; + std::vector partition_ptr; + const uint32_t* data_ptr_uint32_t = reinterpret_cast(data_ptr); + GetSparseDataPartitioned(reinterpret_cast(cpu_data_ptr), data_ptr_uint32_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); + InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint32_t_, partition_ptr.data(), partition_ptr.size()); + AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint16_t_); + AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint32_t_); + for (size_t i = 0; i < partitioned_data.size(); ++i) { + const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; + const std::vector& data_for_this_partition = partitioned_data[i]; + CopyFromHostToCUDADevice(cuda_data_uint16_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); + CopyFromHostToCUDADevice(cuda_row_ptr_uint32_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); + } + } else if (data_ptr_bit_type_ == 64) { + std::vector> partitioned_data_ptr; + std::vector partition_ptr; + const uint64_t* data_ptr_uint64_t = reinterpret_cast(data_ptr); + GetSparseDataPartitioned(reinterpret_cast(cpu_data_ptr), data_ptr_uint64_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); + InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint64_t_, partition_ptr.data(), partition_ptr.size()); + AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint16_t_); + AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint64_t_); + for (size_t i = 0; i < partitioned_data.size(); ++i) { + const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; + const std::vector& data_for_this_partition = partitioned_data[i]; + CopyFromHostToCUDADevice(cuda_data_uint16_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); + CopyFromHostToCUDADevice(cuda_row_ptr_uint64_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); + } + } else { + Log::Fatal("Unknow data ptr bit type %d", data_ptr_bit_type_); + } + } + } else if (bit_type_ == 32) { + if (!is_sparse_) { + std::vector partitioned_data; + GetDenseDataPartitioned(reinterpret_cast(cpu_data_ptr), &partitioned_data); + InitCUDAMemoryFromHostMemory(&cuda_data_uint32_t_, partitioned_data.data(), total_size); + } else { + std::vector> partitioned_data; + if (data_ptr_bit_type_ == 16) { + const uint16_t* data_ptr_uint16_t = reinterpret_cast(data_ptr); + std::vector> partitioned_data_ptr; + std::vector partition_ptr; + GetSparseDataPartitioned(reinterpret_cast(cpu_data_ptr), data_ptr_uint16_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); + InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint16_t_, partition_ptr.data(), partition_ptr.size()); + AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint32_t_); + AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint16_t_); + for (size_t i = 0; i < partitioned_data.size(); ++i) { + const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; + const std::vector& data_for_this_partition = partitioned_data[i]; + CopyFromHostToCUDADevice(cuda_data_uint32_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); + CopyFromHostToCUDADevice(cuda_row_ptr_uint16_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), 
data_ptr_for_this_partition.size()); + } + } else if (data_ptr_bit_type_ == 32) { + const uint32_t* data_ptr_uint32_t = reinterpret_cast(data_ptr); + std::vector> partitioned_data_ptr; + std::vector partition_ptr; + GetSparseDataPartitioned(reinterpret_cast(cpu_data_ptr), data_ptr_uint32_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); + InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint32_t_, partition_ptr.data(), partition_ptr.size()); + AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint32_t_); + AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint32_t_); + for (size_t i = 0; i < partitioned_data.size(); ++i) { + const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; + const std::vector& data_for_this_partition = partitioned_data[i]; + CopyFromHostToCUDADevice(cuda_data_uint32_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); + CopyFromHostToCUDADevice(cuda_row_ptr_uint32_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); + } + } else if (data_ptr_bit_type_ == 64) { + const uint64_t* data_ptr_uint64_t = reinterpret_cast(data_ptr); + std::vector> partitioned_data_ptr; + std::vector partition_ptr; + GetSparseDataPartitioned(reinterpret_cast(cpu_data_ptr), data_ptr_uint64_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); + InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint64_t_, partition_ptr.data(), partition_ptr.size()); + AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint32_t_); + AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint64_t_); + for (size_t i = 0; i < partitioned_data.size(); ++i) { + const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; + const std::vector& data_for_this_partition = partitioned_data[i]; + CopyFromHostToCUDADevice(cuda_data_uint32_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); + CopyFromHostToCUDADevice(cuda_row_ptr_uint64_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); + } + } else { + Log::Fatal("Unknow data ptr bit type %d", data_ptr_bit_type_); + } + } + } else { + Log::Fatal("Unknow bit type = %d", bit_type_); + } SynchronizeCUDADevice(); } @@ -101,7 +262,7 @@ void CUDAHistogramConstructor::PushOneData(const uint32_t feature_bin_value, } void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const data_size_t* cuda_num_data_in_smaller_leaf, - const int* cuda_larger_leaf_index, const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** cuda_data_indices_in_larger_leaf, + const int* cuda_larger_leaf_index, const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** /*cuda_data_indices_in_larger_leaf*/, const double* cuda_smaller_leaf_sum_gradients, const double* cuda_smaller_leaf_sum_hessians, hist_t** cuda_smaller_leaf_hist, const double* cuda_larger_leaf_sum_gradients, const double* cuda_larger_leaf_sum_hessians, hist_t** cuda_larger_leaf_hist, const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, @@ -222,7 +383,7 @@ void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data } } - for (size_t i = 0; i < feature_partition_column_index_offsets_.size(); ++i) { + /*for (size_t i = 0; i < feature_partition_column_index_offsets_.size(); ++i) { 
Log::Warning("feature_partition_column_index_offsets_[%d] = %d", i, feature_partition_column_index_offsets_[i]); } @@ -232,7 +393,7 @@ void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data for (size_t i = 0; i < column_hist_offsets_full_.size(); ++i) { Log::Warning("column_hist_offsets_full_[%d] = %d", i, column_hist_offsets_full_[i]); - } + }*/ InitCUDAMemoryFromHostMemory(&cuda_feature_partition_column_index_offsets_, feature_partition_column_index_offsets_.data(), @@ -249,12 +410,11 @@ void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data template void CUDAHistogramConstructor::GetDenseDataPartitioned(const BIN_TYPE* row_wise_data, std::vector* partitioned_data) { - Log::Warning("feature_partition_column_index_offsets_.size() = %d", feature_partition_column_index_offsets_.size()); const int num_total_columns = feature_partition_column_index_offsets_.back(); partitioned_data->resize(static_cast(num_total_columns) * static_cast(num_data_), 0); BIN_TYPE* out_data = partitioned_data->data(); Threading::For(0, num_data_, 512, - [this, num_total_columns, row_wise_data, out_data] (int thread_index, data_size_t start, data_size_t end) { + [this, num_total_columns, row_wise_data, out_data] (int /*thread_index*/, data_size_t start, data_size_t end) { for (size_t i = 0; i < feature_partition_column_index_offsets_.size() - 1; ++i) { const int num_prev_columns = static_cast(feature_partition_column_index_offsets_[i]); const data_size_t offset = num_data_ * num_prev_columns; @@ -274,6 +434,52 @@ void CUDAHistogramConstructor::GetDenseDataPartitioned(const BIN_TYPE* row_wise_ }); } +template +void CUDAHistogramConstructor::GetSparseDataPartitioned( + const BIN_TYPE* row_wise_data, + const DATA_PTR_TYPE* row_ptr, + std::vector>* partitioned_data, + std::vector>* partitioned_row_ptr, + std::vector* partition_ptr) { + const int num_partitions = static_cast(feature_partition_column_index_offsets_.size()) - 1; + partitioned_data->resize(num_partitions); + partitioned_row_ptr->resize(num_partitions); + Threading::For(0, num_partitions, 1, + [partitioned_data, partitioned_row_ptr, row_ptr, row_wise_data, this] (int /*thread_index*/, int start, int end) { + for (int partition_index = start; partition_index < end; ++partition_index) { + std::vector& data_for_this_partition = partitioned_data->at(partition_index); + std::vector& row_ptr_for_this_partition = partitioned_row_ptr->at(partition_index); + const int partition_hist_start = column_hist_offsets_full_[partition_index]; + const int partition_hist_end = column_hist_offsets_full_[partition_index + 1]; + DATA_PTR_TYPE offset = 0; + row_ptr_for_this_partition.emplace_back(offset); + for (data_size_t data_index = 0; data_index < num_data_; ++data_index) { + const DATA_PTR_TYPE row_start = row_ptr[data_index]; + const DATA_PTR_TYPE row_end = row_ptr[data_index + 1]; + const BIN_TYPE* row_data_start = row_wise_data + row_start; + const BIN_TYPE* row_data_end = row_wise_data + row_end; + const size_t partition_start_in_row = std::lower_bound(row_data_start, row_data_end, partition_hist_start) - row_data_start; + const size_t partition_end_in_row = std::lower_bound(row_data_start, row_data_end, partition_hist_end) - row_data_start; + for (size_t pos = partition_start_in_row; pos < partition_end_in_row; ++pos) { + const BIN_TYPE bin = row_data_start[pos]; + CHECK_GE(bin, static_cast(partition_hist_start)); + data_for_this_partition.emplace_back(bin - partition_hist_start); + } + CHECK_GE(partition_end_in_row, 
partition_start_in_row); + offset += static_cast(partition_end_in_row - partition_start_in_row); + row_ptr_for_this_partition.emplace_back(offset); + } + } + }); + partition_ptr->clear(); + DATA_PTR_TYPE offset = 0; + partition_ptr->emplace_back(offset); + for (size_t i = 0; i < partitioned_row_ptr->size(); ++i) { + offset += partitioned_row_ptr->at(i).back(); + partition_ptr->emplace_back(offset); + } +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index a0829d9d9501..ad4ab0309aef 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -47,7 +47,8 @@ __device__ void PrefixSum(hist_t* elements, unsigned int n) { } } -__global__ void CUDAConstructHistogramKernel( +template +__global__ void CUDAConstructHistogramDenseKernel( const int* leaf_index, const score_t* cuda_gradients, const score_t* cuda_hessians, @@ -55,7 +56,7 @@ __global__ void CUDAConstructHistogramKernel( hist_t** feature_histogram, const int* num_feature_groups, const data_size_t* leaf_num_data, - const uint8_t* data, + const BIN_TYPE* data, const uint32_t* column_hist_offsets, const uint32_t* column_hist_offsets_full, const int* feature_partition_column_index_offsets, @@ -71,7 +72,7 @@ __global__ void CUDAConstructHistogramKernel( const unsigned int num_threads_per_block = blockDim.x * blockDim.y; const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1]; - const uint8_t* data_ptr = data + partition_column_start * num_data; + const BIN_TYPE* data_ptr = data + partition_column_start * num_data; const int num_columns_in_partition = partition_column_end - partition_column_start; const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; @@ -116,6 +117,75 @@ __global__ void CUDAConstructHistogramKernel( } } +template +__global__ void CUDAConstructHistogramSparseKernel( + const int* leaf_index, + const score_t* cuda_gradients, + const score_t* cuda_hessians, + const data_size_t** data_indices_ptr, + hist_t** feature_histogram, + const int* num_feature_groups, + const data_size_t* leaf_num_data, + const BIN_TYPE* data, + const DATA_PTR_TYPE* row_ptr, + const DATA_PTR_TYPE* partition_ptr, + const uint32_t* column_hist_offsets_full, + const data_size_t num_data) { + + const int num_feature_groups_ref = *num_feature_groups; + const int leaf_index_ref = *leaf_index; + const int dim_y = static_cast(gridDim.y * blockDim.y); + const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf_ref + dim_y - 1) / dim_y; + const data_size_t* data_indices_ref = *data_indices_ptr; + __shared__ float shared_hist[SHRAE_HIST_SIZE]; + const unsigned int num_threads_per_block = blockDim.x * blockDim.y; + const DATA_PTR_TYPE* block_row_ptr = row_ptr + blockIdx.x * (num_data + 1); + const BIN_TYPE* data_ptr = data + partition_ptr[blockIdx.x]; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; + const uint32_t num_bins_in_partition = partition_hist_end - partition_hist_start; + const uint32_t num_items_per_thread = (2 * num_bins_in_partition + num_threads_per_block - 1) / 
num_threads_per_block; + const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + const uint32_t thread_start = thread_idx * num_items_per_thread; + const uint32_t thread_end = thread_start + num_items_per_thread > num_bins_in_partition * 2 ? + num_bins_in_partition * 2 : thread_start + num_items_per_thread; + for (uint32_t i = thread_start; i < thread_end; ++i) { + shared_hist[i] = 0.0f; + } + __syncthreads(); + const unsigned int threadIdx_y = threadIdx.y; + const unsigned int blockIdx_y = blockIdx.y; + const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; + const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf_ref - block_start, num_data_per_thread * static_cast(blockDim.y))); + const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; + const data_size_t remainder = block_num_data % blockDim.y; + const data_size_t num_iteration_this = remainder == 0 ? num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); + data_size_t inner_data_index = static_cast(threadIdx_y); + for (data_size_t i = 0; i < num_iteration_this; ++i) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; + const DATA_PTR_TYPE row_start = block_row_ptr[data_index]; + const DATA_PTR_TYPE row_end = block_row_ptr[data_index + 1]; + const DATA_PTR_TYPE row_size = row_end - row_start; + if (threadIdx.x < row_size) { + const score_t grad = cuda_gradients[data_index]; + const score_t hess = cuda_hessians[data_index]; + const uint32_t bin = static_cast(data_ptr[row_start + threadIdx.x]); + const uint32_t pos = bin << 1; + float* pos_ptr = shared_hist + pos; + atomicAdd_system(pos_ptr, grad); + atomicAdd_system(pos_ptr + 1, hess); + inner_data_index += blockDim.y; + } + } + __syncthreads(); + hist_t* feature_histogram_ptr = (*feature_histogram) + (partition_hist_start << 1); + for (uint32_t i = thread_start; i < thread_end; ++i) { + atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); + } +} + void CUDAHistogramConstructor::LaunchConstructHistogramKernel( const int* cuda_smaller_leaf_index, const data_size_t* cuda_smaller_leaf_num_data, @@ -133,12 +203,110 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( Log::Warning("num_data_in_smaller_leaf = %d", num_data_in_smaller_leaf);*/ dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); - CUDAConstructHistogramKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, - cuda_column_hist_offsets_, - cuda_column_hist_offsets_full_, - cuda_feature_partition_column_index_offsets_, - num_data_); + if (is_sparse_) { + if (bit_type_ == 8) { + if (data_ptr_bit_type_ == 16) { + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint8_t_, + cuda_row_ptr_uint16_t_, + cuda_partition_ptr_uint16_t_, + cuda_column_hist_offsets_full_, + num_data_); + } else if (data_ptr_bit_type_ == 32) { + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint8_t_, + cuda_row_ptr_uint32_t_, + 
cuda_partition_ptr_uint32_t_, + cuda_column_hist_offsets_full_, + num_data_); + } else if (data_ptr_bit_type_ == 64) { + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint8_t_, + cuda_row_ptr_uint64_t_, + cuda_partition_ptr_uint64_t_, + cuda_column_hist_offsets_full_, + num_data_); + } + } else if (bit_type_ == 16) { + if (data_ptr_bit_type_ == 16) { + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint16_t_, + cuda_row_ptr_uint16_t_, + cuda_partition_ptr_uint16_t_, + cuda_column_hist_offsets_full_, + num_data_); + } else if (data_ptr_bit_type_ == 32) { + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint16_t_, + cuda_row_ptr_uint32_t_, + cuda_partition_ptr_uint32_t_, + cuda_column_hist_offsets_full_, + num_data_); + } else if (data_ptr_bit_type_ == 64) { + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint32_t_, + cuda_row_ptr_uint64_t_, + cuda_partition_ptr_uint64_t_, + cuda_column_hist_offsets_full_, + num_data_); + } + } else if (bit_type_ == 32) { + if (data_ptr_bit_type_ == 16) { + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint32_t_, + cuda_row_ptr_uint16_t_, + cuda_partition_ptr_uint16_t_, + cuda_column_hist_offsets_full_, + num_data_); + } else if (data_ptr_bit_type_ == 32) { + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint32_t_, + cuda_row_ptr_uint32_t_, + cuda_partition_ptr_uint32_t_, + cuda_column_hist_offsets_full_, + num_data_); + } else if (data_ptr_bit_type_ == 64) { + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint32_t_, + cuda_row_ptr_uint64_t_, + cuda_partition_ptr_uint64_t_, + cuda_column_hist_offsets_full_, + num_data_); + } + } + } else { + if (bit_type_ == 8) { + CUDAConstructHistogramDenseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, + cuda_column_hist_offsets_, + cuda_column_hist_offsets_full_, + cuda_feature_partition_column_index_offsets_, + num_data_); + } else if (bit_type_ == 16) { + CUDAConstructHistogramDenseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint16_t_, + cuda_column_hist_offsets_, + cuda_column_hist_offsets_full_, + cuda_feature_partition_column_index_offsets_, + num_data_); + } else if (bit_type_ == 32) { + 
CUDAConstructHistogramDenseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint32_t_, + cuda_column_hist_offsets_, + cuda_column_hist_offsets_full_, + cuda_feature_partition_column_index_offsets_, + num_data_); + } + } } __global__ void SubtractHistogramKernel(const int* /*cuda_smaller_leaf_index*/, diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index c581d9032470..aff2543dcfff 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -98,7 +98,7 @@ class CUDAHistogramConstructor { const double* larger_leaf_sum_gradients, const double* larger_leaf_sum_hessians, hist_t** cuda_smaller_leaf_hist, hist_t** cuda_larger_leaf_hist); - void InitCUDAData(const Dataset* train_data, TrainingShareStates* share_state); + void InitCUDAData(TrainingShareStates* share_state); void PushOneData(const uint32_t feature_bin_value, const int feature_group_id, const data_size_t data_index); @@ -107,6 +107,13 @@ class CUDAHistogramConstructor { template void GetDenseDataPartitioned(const BIN_TYPE* row_wise_data, std::vector* partitioned_data); + template + void GetSparseDataPartitioned(const BIN_TYPE* row_wise_data, + const DATA_PTR_TYPE* row_ptr, + std::vector>* partitioned_data, + std::vector>* partitioned_row_ptr, + std::vector* partition_ptr); + // Host memory // data on CPU, stored in row-wise style const data_size_t num_data_; @@ -129,6 +136,8 @@ class CUDAHistogramConstructor { bool is_sparse_; int num_feature_partitions_; int max_num_column_per_partition_; + uint8_t data_ptr_bit_type_; + uint8_t bit_type_; // CUDA memory, held by this object uint32_t* cuda_feature_group_bin_offsets_; @@ -142,6 +151,12 @@ class CUDAHistogramConstructor { uint8_t* cuda_data_uint8_t_; uint16_t* cuda_data_uint16_t_; uint32_t* cuda_data_uint32_t_; + uint16_t* cuda_row_ptr_uint16_t_; + uint32_t* cuda_row_ptr_uint32_t_; + uint64_t* cuda_row_ptr_uint64_t_; + uint16_t* cuda_partition_ptr_uint16_t_; + uint32_t* cuda_partition_ptr_uint32_t_; + uint64_t* cuda_partition_ptr_uint64_t_; int* cuda_num_features_; score_t* cuda_ordered_gradients_; score_t* cuda_ordered_hessians_; From 40c49cc0abf0d2bf1d6d345c28f587692b20504b Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 9 Jun 2021 14:05:31 +0000 Subject: [PATCH 028/166] remove useless code --- .../cuda/cuda_best_split_finder.cpp | 7 ---- src/treelearner/cuda/cuda_data_partition.cpp | 9 ----- src/treelearner/cuda/cuda_data_partition.cu | 35 +++---------------- src/treelearner/cuda/cuda_data_partition.hpp | 2 -- .../cuda/cuda_histogram_constructor.cpp | 23 ------------ .../cuda/cuda_histogram_constructor.cu | 10 ++---- src/treelearner/cuda/cuda_leaf_splits.cu | 10 ------ .../cuda/new_cuda_tree_learner.cpp | 25 ------------- src/treelearner/cuda/new_cuda_utils.cu | 34 ------------------ 9 files changed, 6 insertions(+), 149 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 787bdaea0ebf..efcfc3ea3b12 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -167,7 +167,6 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_le const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, const data_size_t 
num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf) { - //auto start = std::chrono::steady_clock::now(); const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ && sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_); const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ && sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_); LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits, @@ -177,20 +176,14 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_le LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); SynchronizeCUDADevice(); global_timer.Stop("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); - //auto end = std::chrono::steady_clock::now(); - //double duration = (static_cast>(end - start)).count(); - //Log::Warning("FindBestSplitsForLeaf time %f", duration); } void CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int smaller_leaf_index, const int larger_leaf_index, std::vector* leaf_best_split_feature, std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index) { - //auto start = std::chrono::steady_clock::now(); LaunchFindBestFromAllSplitsKernel(cuda_cur_num_leaves, smaller_leaf_index, larger_leaf_index, leaf_best_split_feature, leaf_best_split_threshold, leaf_best_split_default_left, best_leaf_index); SynchronizeCUDADevice(); - //auto end = std::chrono::steady_clock::now(); - //double duration = (static_cast>(end - start)).count(); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index d11b5e20ce1a..6110e95a5a93 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -313,7 +313,6 @@ void CUDADataPartition::CopyColWiseData(const Dataset* train_data) { } } } - //LaunchCopyColWiseDataKernel(); } void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { @@ -375,15 +374,10 @@ void CUDADataPartition::Split(const int* leaf_id, const uint8_t split_default_left = cpu_leaf_best_split_default_left[cpu_leaf_index]; const data_size_t leaf_data_start = cpu_leaf_data_start->at(cpu_leaf_index); global_timer.Stop("SplitInner Copy CUDA To Host"); - //auto start = std::chrono::steady_clock::now(); GenDataToLeftBitVector(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start); - //auto end = std::chrono::steady_clock::now(); - //double duration = (static_cast>(end - start)).count(); global_timer.Stop("GenDataToLeftBitVector"); - //Log::Warning("CUDADataPartition::GenDataToLeftBitVector time %f", duration); global_timer.Start("SplitInner"); - //start = std::chrono::steady_clock::now(); SplitInner(leaf_id, num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, best_left_sum_gradients, best_left_sum_hessians, best_left_count, @@ -401,10 +395,7 @@ void CUDADataPartition::Split(const int* leaf_id, larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_num_data, cpu_leaf_data_start, cpu_leaf_sum_hessians, smaller_leaf_index, larger_leaf_index); - //end = std::chrono::steady_clock::now(); - //duration = (static_cast>(end - start)).count(); global_timer.Stop("SplitInner"); - 
//Log::Warning("CUDADataPartition::SplitInner time %f", duration); } void CUDADataPartition::GenDataToLeftBitVector(const data_size_t num_data_in_leaf, diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 83750e42551a..e5daf339937d 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -1195,23 +1195,11 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo best_split_found[leaf_index_ref] = 0; best_split_found[cur_max_leaf_index] = 0; - /*if (cuda_leaf_num_data[leaf_index_ref] <= 0) { - printf("error !!! leaf %d has count %d\n", leaf_index_ref, cuda_leaf_num_data[leaf_index_ref]); - } - - if (cuda_leaf_num_data[cur_max_leaf_index] <= 0) { - printf("error !!! leaf %d has count %d\n", cur_max_leaf_index, cuda_leaf_num_data[cur_max_leaf_index]); - } - - printf("splitting %d into %d with num data %d and %d with num data %d\n", - leaf_index_ref, leaf_index_ref, cuda_leaf_num_data[leaf_index_ref], - cur_max_leaf_index, cuda_leaf_num_data[cur_max_leaf_index]);*/ - if (cuda_leaf_num_data[leaf_index_ref] < cuda_leaf_num_data[cur_max_leaf_index]) { *smaller_leaf_cuda_leaf_index_pointer = leaf_index_ref; *smaller_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; *smaller_leaf_cuda_sum_of_hessians_pointer = best_left_sum_hessians[leaf_index_ref]; - *smaller_leaf_cuda_num_data_in_leaf_pointer = to_left_total_cnt;//best_left_count[leaf_index_ref]; + *smaller_leaf_cuda_num_data_in_leaf_pointer = to_left_total_cnt; *smaller_leaf_cuda_gain_pointer = best_left_gain[leaf_index_ref]; *smaller_leaf_cuda_leaf_value_pointer = best_left_leaf_value[leaf_index_ref]; *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[leaf_index_ref]; @@ -1219,7 +1207,7 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo *larger_leaf_cuda_leaf_index_pointer = cur_max_leaf_index; *larger_leaf_cuda_sum_of_gradients_pointer = best_right_sum_gradients[leaf_index_ref]; *larger_leaf_cuda_sum_of_hessians_pointer = best_right_sum_hessians[leaf_index_ref]; - *larger_leaf_cuda_num_data_in_leaf_pointer = cuda_leaf_num_data[cur_max_leaf_index];//best_right_count[leaf_index_ref]; + *larger_leaf_cuda_num_data_in_leaf_pointer = cuda_leaf_num_data[cur_max_leaf_index]; *larger_leaf_cuda_gain_pointer = best_right_gain[leaf_index_ref]; *larger_leaf_cuda_leaf_value_pointer = best_right_leaf_value[leaf_index_ref]; *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[cur_max_leaf_index]; @@ -1235,7 +1223,7 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo *larger_leaf_cuda_leaf_index_pointer = leaf_index_ref; *larger_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; *larger_leaf_cuda_sum_of_hessians_pointer = best_left_sum_hessians[leaf_index_ref]; - *larger_leaf_cuda_num_data_in_leaf_pointer = to_left_total_cnt;//best_left_count[leaf_index_ref]; + *larger_leaf_cuda_num_data_in_leaf_pointer = to_left_total_cnt; *larger_leaf_cuda_gain_pointer = best_left_gain[leaf_index_ref]; *larger_leaf_cuda_leaf_value_pointer = best_left_leaf_value[leaf_index_ref]; *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[leaf_index_ref]; @@ -1243,7 +1231,7 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo *smaller_leaf_cuda_leaf_index_pointer = 
cur_max_leaf_index; *smaller_leaf_cuda_sum_of_gradients_pointer = best_right_sum_gradients[leaf_index_ref]; *smaller_leaf_cuda_sum_of_hessians_pointer = best_right_sum_hessians[leaf_index_ref]; - *smaller_leaf_cuda_num_data_in_leaf_pointer = cuda_leaf_num_data[cur_max_leaf_index];//best_right_count[leaf_index_ref]; + *smaller_leaf_cuda_num_data_in_leaf_pointer = cuda_leaf_num_data[cur_max_leaf_index]; *smaller_leaf_cuda_gain_pointer = best_right_gain[leaf_index_ref]; *smaller_leaf_cuda_leaf_value_pointer = best_right_leaf_value[leaf_index_ref]; *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[cur_max_leaf_index]; @@ -1374,7 +1362,6 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data } const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; global_timer.Start("CUDADataPartition::AggregateBlockOffsetKernel"); - //auto start = std::chrono::steady_clock::now(); AggregateBlockOffsetKernel<<<1, split_indices_block_size_data_partition_aligned / 2>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, @@ -1404,8 +1391,6 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, data_partition_leaf_output_); SynchronizeCUDADevice(); - //auto end = std::chrono::steady_clock::now(); - //auto duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); global_timer.Start("CUDADataPartition::SplitInnerKernel"); @@ -1413,18 +1398,12 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_out_data_indices_in_leaf_, split_indices_block_size_data_partition_aligned); - //end = std::chrono::steady_clock::now(); - //duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDADataPartition::SplitInnerKernel"); global_timer.Start("CUDADataPartition::CopyDataIndicesKernel"); - //start = std::chrono::steady_clock::now(); CopyDataIndicesKernel<<>>( leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, cuda_data_indices_); - //end = std::chrono::steady_clock::now(); - //duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDADataPartition::CopyDataIndicesKernel"); - //start = std::chrono::steady_clock::now(); global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); SplitTreeStructureKernel<<<1, 1, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, @@ -1532,12 +1511,6 @@ __global__ void CopyColWiseDataKernel(const uint8_t* row_wise_data, } } -void CUDADataPartition::LaunchCopyColWiseDataKernel() { - /*const int block_size = 1024; - const int num_blocks = (num_data_ + block_size - 1) / block_size; - CopyColWiseDataKernel<<>>(cuda_data_, num_data_, num_features_, cuda_data_col_wise_);*/ -} - __global__ void CUDACheckKernel(const data_size_t** data_indices_in_leaf_ptr, const data_size_t num_data_in_leaf, const score_t* gradients, diff --git 
a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 3c936086e8b6..919d9da3a32e 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -190,8 +190,6 @@ class CUDADataPartition { private: void CopyColWiseData(const Dataset* train_data); - void LaunchCopyColWiseDataKernel(); - void GenDataToLeftBitVector(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, const uint8_t split_default_left, const data_size_t leaf_data_start); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 3ff5f1e78ac5..4079574daa3c 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -267,7 +267,6 @@ void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller const double* cuda_larger_leaf_sum_gradients, const double* cuda_larger_leaf_sum_hessians, hist_t** cuda_larger_leaf_hist, const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf) { - //auto start = std::chrono::steady_clock::now(); if ((num_data_in_smaller_leaf <= min_data_in_leaf_ || sum_hessians_in_smaller_leaf <= min_sum_hessian_in_leaf_) && (num_data_in_larger_leaf <= min_data_in_leaf_ || sum_hessians_in_larger_leaf <= min_sum_hessian_in_leaf_)) { return; @@ -275,21 +274,11 @@ void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_num_data_in_smaller_leaf, cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data, cuda_smaller_leaf_hist, num_data_in_smaller_leaf); SynchronizeCUDADevice(); - //auto end = std::chrono::steady_clock::now(); - //double duration = (static_cast>(end - start)).count(); - //Log::Warning("LaunchConstructHistogramKernel time %f", duration); global_timer.Start("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); - //start = std::chrono::steady_clock::now(); LaunchSubtractHistogramKernel(cuda_smaller_leaf_index, cuda_larger_leaf_index, cuda_smaller_leaf_sum_gradients, cuda_smaller_leaf_sum_hessians, cuda_larger_leaf_sum_gradients, cuda_larger_leaf_sum_hessians, cuda_smaller_leaf_hist, cuda_larger_leaf_hist); - //end = std::chrono::steady_clock::now(); - //duration = (static_cast>(end - start)).count(); global_timer.Stop("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); - //Log::Warning("LaunchSubtractHistogramKernel time %f", duration); - /*PrintLastCUDAError(); - std::vector cpu_hist(6143 * 2, 0.0f); - CopyFromCUDADeviceToHost(cpu_hist.data(), cuda_hist_, 6143 * 2);*/ } void CUDAHistogramConstructor::CalcConstructHistogramKernelDim( @@ -383,18 +372,6 @@ void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data } } - /*for (size_t i = 0; i < feature_partition_column_index_offsets_.size(); ++i) { - Log::Warning("feature_partition_column_index_offsets_[%d] = %d", i, feature_partition_column_index_offsets_[i]); - } - - for (size_t i = 0; i < column_hist_offsets_.size(); ++i) { - Log::Warning("column_hist_offsets_[%d] = %d", i, column_hist_offsets_[i]); - } - - for (size_t i = 0; i < column_hist_offsets_full_.size(); ++i) { - Log::Warning("column_hist_offsets_full_[%d] = %d", i, column_hist_offsets_full_[i]); 
- }*/ - InitCUDAMemoryFromHostMemory(&cuda_feature_partition_column_index_offsets_, feature_partition_column_index_offsets_.data(), feature_partition_column_index_offsets_.size()); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index ad4ab0309aef..f43c44e94337 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -198,9 +198,6 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( int block_dim_x = 0; int block_dim_y = 0; CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_in_smaller_leaf); - /*Log::Warning("grid_dim_x = %d, grid_dim_y = %d", grid_dim_x, grid_dim_y); - Log::Warning("block_dim_x = %d, block_dim_y = %d", block_dim_x, block_dim_y); - Log::Warning("num_data_in_smaller_leaf = %d", num_data_in_smaller_leaf);*/ dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); if (is_sparse_) { @@ -315,11 +312,10 @@ __global__ void SubtractHistogramKernel(const int* /*cuda_smaller_leaf_index*/, hist_t** cuda_smaller_leaf_hist, hist_t** cuda_larger_leaf_hist) { const int cuda_num_total_bin_ref = *cuda_num_total_bin; const unsigned int global_thread_index = threadIdx.x + blockIdx.x * blockDim.x; - //const int cuda_smaller_leaf_index_ref = *cuda_smaller_leaf_index; const int cuda_larger_leaf_index_ref = *cuda_larger_leaf_index; if (cuda_larger_leaf_index_ref >= 0) { - const hist_t* smaller_leaf_hist = *cuda_smaller_leaf_hist; //cuda_hist + (cuda_smaller_leaf_index_ref * cuda_num_total_bin_ref * 2); - hist_t* larger_leaf_hist = *cuda_larger_leaf_hist; //cuda_hist + (cuda_larger_leaf_index_ref * cuda_num_total_bin_ref * 2); + const hist_t* smaller_leaf_hist = *cuda_smaller_leaf_hist; + hist_t* larger_leaf_hist = *cuda_larger_leaf_hist; if (global_thread_index < 2 * cuda_num_total_bin_ref) { larger_leaf_hist[global_thread_index] -= smaller_leaf_hist[global_thread_index]; } @@ -342,7 +338,6 @@ __global__ void FixHistogramKernel(const int* cuda_smaller_leaf_index, __shared__ double hist_gradients[FIX_HISTOGRAM_SHARED_MEM_SIZE + 1]; __shared__ double hist_hessians[FIX_HISTOGRAM_SHARED_MEM_SIZE + 1]; if (leaf_index_ref >= 0) { - //const int cuda_num_total_bin_ref = *cuda_num_total_bin; const uint32_t feature_hist_offset = cuda_feature_hist_offsets[feature_index]; const uint32_t most_freq_bin = cuda_feature_most_freq_bins[feature_index]; if (most_freq_bin > 0) { @@ -350,7 +345,6 @@ __global__ void FixHistogramKernel(const int* cuda_smaller_leaf_index, const double leaf_sum_hessians = larger_or_smaller ? *larger_leaf_sum_hessians : *smaller_leaf_sum_hessians; hist_t* feature_hist = larger_or_smaller ? 
(*cuda_larger_leaf_hist) + feature_hist_offset * 2 : (*cuda_smaller_leaf_hist) + feature_hist_offset * 2; - //cuda_hist + cuda_num_total_bin_ref * 2 * leaf_index_ref + feature_hist_offset * 2; const unsigned int threadIdx_x = threadIdx.x; const uint32_t num_bin = cuda_feature_num_bins[feature_index]; if (threadIdx_x < num_bin) { diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index 88aef303ed4e..5ed07f1a8f59 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -48,8 +48,6 @@ __global__ void CUDAInitValuesKernel2(double* cuda_sum_of_gradients, double* cud sum_of_gradients += cuda_sum_of_gradients[i]; sum_of_hessians += cuda_sum_of_hessians[i]; } - //printf("sum_of_gradients = %f\n", sum_of_gradients); - //printf("sum_of_hessians = %f\n", sum_of_hessians); cuda_sum_of_gradients[0] += sum_of_gradients; cuda_sum_of_hessians[0] += sum_of_hessians; } @@ -62,17 +60,9 @@ void CUDALeafSplits::LaunchInitValuesKernal() { cuda_sum_of_hessians_); CopyFromCUDADeviceToCUDADevice(cuda_num_data_in_leaf_, cuda_num_data_, 1); SynchronizeCUDADevice(); - //auto end = std::chrono::steady_clock::now(); - //auto duration = static_cast>(end - start); - //Log::Warning("CUDAInitValuesKernel1 duration = %f", duration.count()); - //start = std::chrono::steady_clock::now(); CUDAInitValuesKernel2<<>>( cuda_sum_of_gradients_, cuda_sum_of_hessians_); SynchronizeCUDADevice(); - //end = std::chrono::steady_clock::now(); - //duration = static_cast>(end - start); - //Log::Warning("cuda_sum_of_gradients_ = %f, cuda_sum_of_hessians_ = %f", *cuda_sum_of_gradients_, *cuda_sum_of_hessians_); - //Log::Warning("CUDAInitValuesKernel2 duration = %f", duration.count()); } } // namespace LightGBM diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index c355e70f215d..b1612bce205f 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -25,7 +25,6 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia const label_t* labels = train_data->metadata().label(); cuda_centralized_info_.reset(new CUDACentralizedInfo(num_data_, this->config_->num_leaves, num_features_)); cuda_centralized_info_->Init(labels); - //cuda_centralized_info_->Test(); cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_, 0, cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), cuda_centralized_info_->cuda_num_data())); cuda_smaller_leaf_splits_->Init(); @@ -36,7 +35,6 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), share_state_->feature_hist_offsets(), config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf)); cuda_histogram_constructor_->Init(train_data_, share_state_.get()); - //cuda_histogram_constructor_->TestAfterInit(); cuda_data_partition_.reset(new CUDADataPartition(num_data_, num_features_, this->config_->num_leaves, num_threads_, cuda_centralized_info_->cuda_num_data(), cuda_centralized_info_->cuda_num_leaves(), cuda_centralized_info_->cuda_num_features(), @@ -56,7 +54,6 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia cuda_binary_objective_->CalcInitScore(); cuda_score_updater_->SetInitScore(cuda_binary_objective_->cuda_init_score()); - //cuda_best_split_finder_->TestAfterInit(); 
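Note: the objective object created above computes the init score and the per-iteration gradients directly on the GPU. Patch 029 later in this series factors this into a CUDAObjective interface and adds an L2 regression objective whose per-point gradient is simply score minus label with a constant hessian of 1 (see GetGradientsKernel_Regression below). A minimal, illustrative CUDA sketch of that gradient computation only — names and types here are assumptions for illustration, not part of the patch:

  __global__ void L2GradientsSketch(const double* scores, const float* labels,
                                    const int num_data,
                                    float* gradients, float* hessians) {
    // one thread per data point, following the launch pattern used in this patch series
    const int i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
    if (i < num_data) {
      // d/ds 0.5 * (s - y)^2 = s - y ; second derivative is 1
      gradients[i] = static_cast<float>(scores[i] - labels[i]);
      hessians[i] = 1.0f;
    }
  }
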
leaf_best_split_feature_.resize(config_->num_leaves, -1); leaf_best_split_threshold_.resize(config_->num_leaves, 0); @@ -71,32 +68,25 @@ void NewCUDATreeLearner::BeforeTrain() { cuda_data_partition_->BeforeTrain(nullptr); auto end = std::chrono::steady_clock::now(); auto duration = static_cast>(end - start); - //Log::Warning("cuda_data_partition_->BeforeTrain duration = %f", duration.count()); global_timer.Start("CUDACentralizedInfo::BeforeTrain"); start = std::chrono::steady_clock::now(); - //cuda_centralized_info_->BeforeTrain(gradients_, hessians_); cuda_binary_objective_->GetGradients(cuda_score_updater_->cuda_scores(), cuda_centralized_info_->cuda_gradients_ref(), cuda_centralized_info_->cuda_hessians_ref()); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); global_timer.Stop("CUDACentralizedInfo::BeforeTrain"); - //Log::Warning("cuda_centralized_info_->BeforeTrain duration = %f", duration.count()); cuda_smaller_leaf_splits_->InitValues(cuda_data_partition_->cuda_data_indices(), cuda_histogram_constructor_->cuda_hist_pointer(), &leaf_sum_hessians_[0]); cuda_larger_leaf_splits_->InitValues(); - //cuda_smaller_leaf_splits_->Test(); start = std::chrono::steady_clock::now(); cuda_histogram_constructor_->BeforeTrain(); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); - //Log::Warning("cuda_histogram_constructor_->BeforeTrain() duration = %f", duration.count()); start = std::chrono::steady_clock::now(); cuda_best_split_finder_->BeforeTrain(); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); - //Log::Warning("cuda_best_split_finder_->BeforeTrain() duration = %f", duration.count()); - //cuda_data_partition_->Test(); leaf_num_data_[0] = num_data_; leaf_data_start_[0] = 0; smaller_leaf_index_ = 0; @@ -115,11 +105,7 @@ void NewCUDATreeLearner::Split(Tree* /*tree*/, int /*best_leaf*/, int* /*left_leaf*/, int* /*right_leaf*/) {} void NewCUDATreeLearner::AddPredictionToScore(const Tree* /*tree*/, double* /*out_score*/) const { - //const auto start = std::chrono::steady_clock::now(); cuda_data_partition_->UpdateTrainScore(config_->learning_rate, cuda_score_updater_->cuda_score_ref()); - //const auto end = std::chrono::steady_clock::now(); - //const auto duration = static_cast>(end - start).count(); - //Log::Warning("AddPredictionToScore time %f", duration); } Tree* NewCUDATreeLearner::BuildTree(const int num_leaves) { @@ -180,10 +166,8 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, double find_best_split_time = 0.0f; double find_best_split_from_all_leaves_time = 0.0f; double split_data_indices_time = 0.0f; - //std::unique_ptr tree(new Tree(config_->num_leaves, false, false)); int num_leaves = 1; for (int i = 0; i < config_->num_leaves - 1; ++i) { - //Log::Warning("Before ConstructHistogramForLeaf"); global_timer.Start("NewCUDATreeLearner::ConstructHistogramForLeaf"); auto start = std::chrono::steady_clock::now(); const data_size_t num_data_in_smaller_leaf = leaf_num_data_[smaller_leaf_index_]; @@ -207,21 +191,16 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, num_data_in_larger_leaf, sum_hessians_in_smaller_leaf, sum_hessians_in_larger_leaf); - /*if (i == 0) { - cuda_histogram_constructor_->TestAfterConstructHistogram(); - }*/ auto end = std::chrono::steady_clock::now(); auto duration = static_cast>(end - start); global_timer.Stop("NewCUDATreeLearner::ConstructHistogramForLeaf"); construct_histogram_time += duration.count(); - //Log::Warning("Before FindBestSplitsForLeaf"); 
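Note: ConstructHistogramForLeaf above builds a histogram only for the child leaf with fewer data points; the sibling's histogram is then recovered by subtracting it from the parent's, which is exactly what SubtractHistogramKernel does (larger_leaf_hist[i] -= smaller_leaf_hist[i]). A minimal host-side sketch of this subtraction trick, with illustrative names only (hist_t assumed to be double, two slots — gradient and hessian — per bin); this is a reference illustration, not part of the patch:

  #include <vector>

  typedef double hist_t;

  // `larger` holds the parent histogram on entry and the larger child's
  // histogram on return, so only the smaller child needs a full
  // construction pass over its data indices.
  void SubtractHistogramSketch(const std::vector<hist_t>& smaller,
                               std::vector<hist_t>* larger,
                               const int num_total_bin) {
    for (int i = 0; i < 2 * num_total_bin; ++i) {
      (*larger)[i] -= smaller[i];
    }
  }
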
global_timer.Start("NewCUDATreeLearner::FindBestSplitsForLeaf"); start = std::chrono::steady_clock::now(); cuda_best_split_finder_->FindBestSplitsForLeaf(cuda_smaller_leaf_splits_.get(), cuda_larger_leaf_splits_.get(), smaller_leaf_index_, larger_leaf_index_, num_data_in_smaller_leaf, num_data_in_larger_leaf, sum_hessians_in_smaller_leaf, sum_hessians_in_larger_leaf); - //Log::Warning("Before FindBestFromAllSplits"); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); global_timer.Stop("NewCUDATreeLearner::FindBestSplitsForLeaf"); @@ -287,9 +266,6 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, &smaller_leaf_index_, &larger_leaf_index_, best_leaf_index_); - /*cuda_data_partition_->CUDACheck(smaller_leaf_index_, larger_leaf_index_, - leaf_num_data_, cuda_smaller_leaf_splits_.get(), cuda_larger_leaf_splits_.get(), - cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians());*/ end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); global_timer.Stop("NewCUDATreeLearner::Split"); @@ -299,7 +275,6 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, const auto end = std::chrono::steady_clock::now(); const double duration = (static_cast>(end - start)).count(); const auto build_tree_start = std::chrono::steady_clock::now(); - //Log::Warning("Before BuildTree"); std::unique_ptr tree(BuildTree(num_leaves)); const auto build_tree_end = std::chrono::steady_clock::now(); const auto build_tre_duration = (static_cast>(build_tree_end - build_tree_start)).count(); diff --git a/src/treelearner/cuda/new_cuda_utils.cu b/src/treelearner/cuda/new_cuda_utils.cu index 3be69ee4ad92..5b2b4a3ba9fa 100644 --- a/src/treelearner/cuda/new_cuda_utils.cu +++ b/src/treelearner/cuda/new_cuda_utils.cu @@ -8,38 +8,4 @@ namespace LightGBM { -/*template <> -__device__ void PrefixSum(uint32_t* elements, unsigned int n) { - unsigned int offset = 1; - unsigned int threadIdx_x = threadIdx.x; - for (int d = (n >> 1); d > 0; d >>= 1) { - if (threadIdx_x < d) { - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - elements[dst_pos] += elements[src_pos]; - } - offset <<= 1; - __syncthreads(); - } - const uint32_t last_element = elements[n - 1]; - if (threadIdx_x == 0) { - elements[n - 1] = 0; - } - __syncthreads(); - for (int d = 1; d < n; d <<= 1) { - if (threadIdx_x < d) { - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const uint32_t src_val = elements[src_pos]; - elements[src_pos] = elements[dst_pos]; - elements[dst_pos] += src_val; - } - offset >>= 1; - __syncthreads(); - } - if (threadIdx_x == 0) { - elements[n] = elements[n - 1] + last_element; - } -}*/ - } // namespace LightGBM From dc41a00d8a3fc079843d1a6780408598ead6f868 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 10 Jun 2021 03:28:25 +0000 Subject: [PATCH 029/166] add l2 regression objective --- src/treelearner/cuda/cuda_binary_objective.cu | 12 ++-- .../cuda/cuda_binary_objective.hpp | 6 +- .../cuda/cuda_histogram_constructor.cu | 4 +- src/treelearner/cuda/cuda_objective.hpp | 6 ++ .../cuda/cuda_regression_objective.cpp | 31 +++++++++ .../cuda/cuda_regression_objective.cu | 69 +++++++++++++++++++ .../cuda/cuda_regression_objective.hpp | 46 +++++++++++++ .../cuda/new_cuda_tree_learner.cpp | 25 +++++-- .../cuda/new_cuda_tree_learner.hpp | 5 +- 9 files changed, 185 insertions(+), 19 deletions(-) create mode 
100644 src/treelearner/cuda/cuda_regression_objective.cpp create mode 100644 src/treelearner/cuda/cuda_regression_objective.cu create mode 100644 src/treelearner/cuda/cuda_regression_objective.hpp diff --git a/src/treelearner/cuda/cuda_binary_objective.cu b/src/treelearner/cuda/cuda_binary_objective.cu index 2034614638e8..c07e2acff60c 100644 --- a/src/treelearner/cuda/cuda_binary_objective.cu +++ b/src/treelearner/cuda/cuda_binary_objective.cu @@ -10,7 +10,7 @@ namespace LightGBM { -__global__ void CalcInitScoreKernel_1(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_init_score) { +__global__ void CalcInitScoreKernel_1_Binary(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_init_score) { __shared__ label_t shared_label[CALC_INIT_SCORE_BLOCK_SIZE]; const unsigned int tid = threadIdx.x; const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE; @@ -33,7 +33,7 @@ __global__ void CalcInitScoreKernel_1(const label_t* cuda_labels, const data_siz } } -__global__ void CalcInitScoreKernel_2(double* out_cuda_init_score, const data_size_t num_data, const double sigmoid) { +__global__ void CalcInitScoreKernel_2_Binary(double* out_cuda_init_score, const data_size_t num_data, const double sigmoid) { const double suml = *out_cuda_init_score; const double sumw = static_cast(num_data); const double pavg = suml / sumw; @@ -44,13 +44,13 @@ __global__ void CalcInitScoreKernel_2(double* out_cuda_init_score, const data_si void CUDABinaryObjective::LaunchCalcInitScoreKernel() { const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE; const int num_blocks = (num_data_ + num_data_per_block - 1) / num_data_per_block; - CalcInitScoreKernel_1<<>>(cuda_labels_, num_data_, cuda_init_score_); + CalcInitScoreKernel_1_Binary<<>>(cuda_labels_, num_data_, cuda_init_score_); SynchronizeCUDADevice(); - CalcInitScoreKernel_2<<<1, 1>>>(cuda_init_score_, num_data_, sigmoid_); + CalcInitScoreKernel_2_Binary<<<1, 1>>>(cuda_init_score_, num_data_, sigmoid_); SynchronizeCUDADevice(); } -__global__ void GetGradientsKernel(const double* cuda_scores, const label_t* cuda_labels, +__global__ void GetGradientsKernel_Binary(const double* cuda_scores, const label_t* cuda_labels, const double sigmoid, const data_size_t num_data, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); @@ -66,7 +66,7 @@ __global__ void GetGradientsKernel(const double* cuda_scores, const label_t* cud void CUDABinaryObjective::LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE - 1) / GET_GRADIENTS_BLOCK_SIZE; - GetGradientsKernel<<>>(cuda_scores, cuda_labels_, sigmoid_, num_data_, + GetGradientsKernel_Binary<<>>(cuda_scores, cuda_labels_, sigmoid_, num_data_, cuda_out_gradients, cuda_out_hessians); } diff --git a/src/treelearner/cuda/cuda_binary_objective.hpp b/src/treelearner/cuda/cuda_binary_objective.hpp index 580ea67b6290..62327a960a02 100644 --- a/src/treelearner/cuda/cuda_binary_objective.hpp +++ b/src/treelearner/cuda/cuda_binary_objective.hpp @@ -21,11 +21,11 @@ class CUDABinaryObjective : public CUDAObjective { public: CUDABinaryObjective(const data_size_t num_data, const label_t* cuda_label, const double sigmoid); - void Init(); + void Init() override; - void CalcInitScore(); + void CalcInitScore() override; - const 
double* cuda_init_score() const { + const double* cuda_init_score() const override { return cuda_init_score_; } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index f43c44e94337..27920e009ccc 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -245,9 +245,9 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_column_hist_offsets_full_, num_data_); } else if (data_ptr_bit_type_ == 64) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, - cuda_data_uint32_t_, + cuda_data_uint16_t_, cuda_row_ptr_uint64_t_, cuda_partition_ptr_uint64_t_, cuda_column_hist_offsets_full_, diff --git a/src/treelearner/cuda/cuda_objective.hpp b/src/treelearner/cuda/cuda_objective.hpp index f15fa74e89b4..98cc9a630341 100644 --- a/src/treelearner/cuda/cuda_objective.hpp +++ b/src/treelearner/cuda/cuda_objective.hpp @@ -18,8 +18,14 @@ class CUDAObjective { public: CUDAObjective(const data_size_t num_data); + virtual void Init() = 0; + + virtual void CalcInitScore() = 0; + virtual void GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) = 0; + virtual const double* cuda_init_score() const = 0; + protected: const data_size_t num_data_; }; diff --git a/src/treelearner/cuda/cuda_regression_objective.cpp b/src/treelearner/cuda/cuda_regression_objective.cpp new file mode 100644 index 000000000000..8964e25a1328 --- /dev/null +++ b/src/treelearner/cuda/cuda_regression_objective.cpp @@ -0,0 +1,31 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA + +#include "cuda_regression_objective.hpp" + +namespace LightGBM { + +CUDARegressionObjective::CUDARegressionObjective(const data_size_t num_data, const label_t* cuda_labels): +CUDAObjective(num_data), cuda_labels_(cuda_labels) {} + +void CUDARegressionObjective::Init() { + AllocateCUDAMemory(1, &cuda_init_score_); + SetCUDAMemory(cuda_init_score_, 0, 1); +} + +void CUDARegressionObjective::GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + LaunchGetGradientsKernel(cuda_scores, cuda_out_gradients, cuda_out_hessians); +} + +void CUDARegressionObjective::CalcInitScore() { + LaunchCalcInitScoreKernel(); +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_regression_objective.cu b/src/treelearner/cuda/cuda_regression_objective.cu new file mode 100644 index 000000000000..a8769c744950 --- /dev/null +++ b/src/treelearner/cuda/cuda_regression_objective.cu @@ -0,0 +1,69 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifdef USE_CUDA + +#include "cuda_regression_objective.hpp" + +namespace LightGBM { + +__global__ void CalcInitScoreKernel_1_Regression(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_init_score) { + __shared__ label_t shared_label[CALC_INIT_SCORE_BLOCK_SIZE]; + const unsigned int tid = threadIdx.x; + const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE; + shared_label[tid] = 0.0f; + __syncthreads(); + for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_CALC_INIT_SCORE; ++j) { + if (i + j < num_data) { + shared_label[tid] += cuda_labels[i + j]; + } + } + __syncthreads(); + for (unsigned int s = 1; s < blockDim.x; s *= 2) { + if (tid % (2 * s) == 0 && (tid + s) < CALC_INIT_SCORE_BLOCK_SIZE) { + shared_label[tid] += shared_label[tid + s]; + } + __syncthreads(); + } + if (tid == 0) { + atomicAdd_system(out_cuda_init_score, shared_label[0]); + } +} + +__global__ void CalcInitScoreKernel_2_Regression(double* out_cuda_init_score, const data_size_t num_data) { + const double suml = *out_cuda_init_score; + const double sumw = static_cast(num_data); + const double init_score = suml / sumw; + *out_cuda_init_score = init_score; +} + +void CUDARegressionObjective::LaunchCalcInitScoreKernel() { + const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE; + const int num_blocks = (num_data_ + num_data_per_block - 1) / num_data_per_block; + CalcInitScoreKernel_1_Regression<<>>(cuda_labels_, num_data_, cuda_init_score_); + SynchronizeCUDADevice(); + CalcInitScoreKernel_2_Regression<<<1, 1>>>(cuda_init_score_, num_data_); + SynchronizeCUDADevice(); +} + +__global__ void GetGradientsKernel_Regression(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, + score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + if (data_index < num_data) { + cuda_out_gradients[data_index] = static_cast(cuda_scores[data_index] - cuda_labels[data_index]); + cuda_out_hessians[data_index] = 1.0f; + } +} + +void CUDARegressionObjective::LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE - 1) / GET_GRADIENTS_BLOCK_SIZE; + GetGradientsKernel_Regression<<>>(cuda_scores, cuda_labels_, num_data_, + cuda_out_gradients, cuda_out_hessians); +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_regression_objective.hpp b/src/treelearner/cuda/cuda_regression_objective.hpp new file mode 100644 index 000000000000..488b52ea3790 --- /dev/null +++ b/src/treelearner/cuda/cuda_regression_objective.hpp @@ -0,0 +1,46 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifndef LIGHTGBM_NEW_CUDA_REGRESSION_OBJECTIVE_HPP_ +#define LIGHTGBM_NEW_CUDA_REGRESSION_OBJECTIVE_HPP_ + +#ifdef USE_CUDA + +#define GET_GRADIENTS_BLOCK_SIZE (1024) +#define CALC_INIT_SCORE_BLOCK_SIZE (1024) +#define NUM_DATA_THREAD_ADD_CALC_INIT_SCORE (6) + +#include "cuda_objective.hpp" + +namespace LightGBM { + +class CUDARegressionObjective : public CUDAObjective { + public: + CUDARegressionObjective(const data_size_t num_data, const label_t* cuda_label); + + void Init() override; + + void CalcInitScore() override; + + const double* cuda_init_score() const override { + return cuda_init_score_; + } + + void GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) override; + + private: + void LaunchCalcInitScoreKernel(); + + void LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians); + + const label_t* cuda_labels_; + double* cuda_init_score_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_NEW_CUDA_REGRESSION_OBJECTIVE_HPP_ diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index b1612bce205f..a7be7c150a76 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -48,12 +48,7 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia cuda_best_split_finder_->Init(); cuda_score_updater_.reset(new CUDAScoreUpdater(num_data_)); cuda_score_updater_->Init(); - cuda_binary_objective_.reset(new CUDABinaryObjective(num_data_, - cuda_centralized_info_->cuda_labels(), config_->sigmoid)); - cuda_binary_objective_->Init(); - cuda_binary_objective_->CalcInitScore(); - - cuda_score_updater_->SetInitScore(cuda_binary_objective_->cuda_init_score()); + InitObjective(); leaf_best_split_feature_.resize(config_->num_leaves, -1); leaf_best_split_threshold_.resize(config_->num_leaves, 0); @@ -70,7 +65,7 @@ void NewCUDATreeLearner::BeforeTrain() { auto duration = static_cast>(end - start); global_timer.Start("CUDACentralizedInfo::BeforeTrain"); start = std::chrono::steady_clock::now(); - cuda_binary_objective_->GetGradients(cuda_score_updater_->cuda_scores(), + cuda_objective_->GetGradients(cuda_score_updater_->cuda_scores(), cuda_centralized_info_->cuda_gradients_ref(), cuda_centralized_info_->cuda_hessians_ref()); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); @@ -294,6 +289,22 @@ void NewCUDATreeLearner::ResetTrainingData(const Dataset* /*train_data*/, void NewCUDATreeLearner::SetBaggingData(const Dataset* /*subset*/, const data_size_t* /*used_indices*/, data_size_t /*num_data*/) {} +void NewCUDATreeLearner::InitObjective() { + if (config_->objective == std::string("binary")) { + cuda_objective_.reset(new CUDABinaryObjective(num_data_, + cuda_centralized_info_->cuda_labels(), config_->sigmoid)); + } else if (config_->objective == std::string("regression")) { + cuda_objective_.reset(new CUDARegressionObjective(num_data_, cuda_centralized_info_->cuda_labels())); + } else { + Log::Fatal("Unsupported objective %s for CUDA.", config_->objective.c_str()); + } + + cuda_objective_->Init(); + cuda_objective_->CalcInitScore(); + + cuda_score_updater_->SetInitScore(cuda_objective_->cuda_init_score()); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index ca297fba03f9..baa4bf56ca26 100644 --- 
a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -16,6 +16,7 @@ #include "cuda_centralized_info.hpp" #include "cuda_score_updater.hpp" #include "cuda_binary_objective.hpp" +#include "cuda_regression_objective.hpp" namespace LightGBM { @@ -49,6 +50,8 @@ class NewCUDATreeLearner: public SerialTreeLearner { Tree* BuildTree(const int num_leaves); + void InitObjective(); + // number of GPUs int num_gpus_; // number of threads on CPU @@ -69,7 +72,7 @@ class NewCUDATreeLearner: public SerialTreeLearner { std::unique_ptr cuda_score_updater_; - std::unique_ptr cuda_binary_objective_; + std::unique_ptr cuda_objective_; std::vector leaf_best_split_feature_; std::vector leaf_best_split_threshold_; From bd065b7162eef22427103284fd33934b42f97266 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 11 Jun 2021 09:31:11 +0000 Subject: [PATCH 030/166] sparse multi value bin enabled for CUDA --- CMakeLists.txt | 6 + include/LightGBM/dataset.h | 12 + include/LightGBM/feature_group.h | 17 ++ src/treelearner/cuda/cuda_binary_objective.cu | 16 +- .../cuda/cuda_binary_objective.hpp | 6 +- .../cuda/cuda_centralized_info.cpp | 7 +- .../cuda/cuda_centralized_info.hpp | 6 +- src/treelearner/cuda/cuda_data_partition.cpp | 13 +- src/treelearner/cuda/cuda_data_partition.cu | 2 + src/treelearner/cuda/cuda_data_partition.hpp | 1 + .../cuda/cuda_histogram_constructor.cpp | 33 ++- .../cuda/cuda_histogram_constructor.cu | 4 +- .../cuda/cuda_histogram_constructor.hpp | 1 + .../cuda/cuda_ranking_objective.cpp | 65 +++++ .../cuda/cuda_ranking_objective.cu | 252 ++++++++++++++++++ .../cuda/cuda_ranking_objective.hpp | 72 +++++ .../cuda/cuda_regression_objective.cu | 16 +- .../cuda/cuda_regression_objective.hpp | 6 +- .../cuda/new_cuda_tree_learner.cpp | 24 +- .../cuda/new_cuda_tree_learner.hpp | 1 + 20 files changed, 516 insertions(+), 44 deletions(-) create mode 100644 src/treelearner/cuda/cuda_ranking_objective.cpp create mode 100644 src/treelearner/cuda/cuda_ranking_objective.cu create mode 100644 src/treelearner/cuda/cuda_ranking_objective.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 65696c0f4b5d..141ce31d5ea1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -248,6 +248,12 @@ if(USE_CUDA) add_library(cuda_binary_objective OBJECT src/treelearner/cuda/cuda_binary_objective.cu) set_target_properties(cuda_binary_objective PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + add_library(cuda_regression_objective OBJECT src/treelearner/cuda/cuda_regression_objective.cu) + set_target_properties(cuda_regression_objective PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + + add_library(cuda_ranking_objective OBJECT src/treelearner/cuda/cuda_ranking_objective.cu) + set_target_properties(cuda_ranking_objective PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + add_library(cuda_score_updater OBJECT src/treelearner/cuda/cuda_score_updater.cu) set_target_properties(cuda_score_updater PROPERTIES CUDA_SEPARABLE_COMPILATION ON) endif(USE_CUDA) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index b9eb9a380f20..f9218b45b937 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -686,6 +686,18 @@ class Dataset { return raw_data_[numeric_feature_map_[feat_ind]].data(); } + inline uint32_t feature_max_bin(const int inner_feature_index) const { + const int feature_group_index = Feature2Group(inner_feature_index); + const int sub_feature_index = feature2subfeature_[inner_feature_index]; + return 
feature_groups_[feature_group_index]->feature_max_bin(sub_feature_index); + } + + inline uint32_t feature_min_bin(const int inner_feature_index) const { + const int feature_group_index = Feature2Group(inner_feature_index); + const int sub_feature_index = feature2subfeature_[inner_feature_index]; + return feature_groups_[feature_group_index]->feature_min_bin(sub_feature_index); + } + private: std::string data_filename_; /*! \brief Store used features */ diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index 955848511efd..a81e06fa64e6 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -492,6 +492,23 @@ class FeatureGroup { } } + uint32_t feature_max_bin(const int sub_feature_index) { + if (!is_multi_val_) { + return bin_offsets_[sub_feature_index + 1] - 1; + } else { + int addi = bin_mappers_[sub_feature_index]->GetMostFreqBin() == 0 ? 0 : 1; + return bin_mappers_[sub_feature_index]->num_bin() - 1 + addi; + } + } + + uint32_t feature_min_bin(const int sub_feature_index) { + if (!is_multi_val_) { + return bin_offsets_[sub_feature_index]; + } else { + return 1; + } + } + private: void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { if (is_multi_val) { diff --git a/src/treelearner/cuda/cuda_binary_objective.cu b/src/treelearner/cuda/cuda_binary_objective.cu index c07e2acff60c..ee42adb5884d 100644 --- a/src/treelearner/cuda/cuda_binary_objective.cu +++ b/src/treelearner/cuda/cuda_binary_objective.cu @@ -11,19 +11,19 @@ namespace LightGBM { __global__ void CalcInitScoreKernel_1_Binary(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_init_score) { - __shared__ label_t shared_label[CALC_INIT_SCORE_BLOCK_SIZE]; + __shared__ label_t shared_label[CALC_INIT_SCORE_BLOCK_SIZE_BINARY]; const unsigned int tid = threadIdx.x; - const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE; + const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY; shared_label[tid] = 0.0f; __syncthreads(); - for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_CALC_INIT_SCORE; ++j) { + for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY; ++j) { if (i + j < num_data) { shared_label[tid] += cuda_labels[i + j]; } } __syncthreads(); for (unsigned int s = 1; s < blockDim.x; s *= 2) { - if (tid % (2 * s) == 0 && (tid + s) < CALC_INIT_SCORE_BLOCK_SIZE) { + if (tid % (2 * s) == 0 && (tid + s) < CALC_INIT_SCORE_BLOCK_SIZE_BINARY) { shared_label[tid] += shared_label[tid + s]; } __syncthreads(); @@ -42,9 +42,9 @@ __global__ void CalcInitScoreKernel_2_Binary(double* out_cuda_init_score, const } void CUDABinaryObjective::LaunchCalcInitScoreKernel() { - const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE; + const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE_BINARY * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY; const int num_blocks = (num_data_ + num_data_per_block - 1) / num_data_per_block; - CalcInitScoreKernel_1_Binary<<>>(cuda_labels_, num_data_, cuda_init_score_); + CalcInitScoreKernel_1_Binary<<>>(cuda_labels_, num_data_, cuda_init_score_); SynchronizeCUDADevice(); CalcInitScoreKernel_2_Binary<<<1, 1>>>(cuda_init_score_, num_data_, sigmoid_); SynchronizeCUDADevice(); @@ -65,8 +65,8 @@ __global__ void GetGradientsKernel_Binary(const double* cuda_scores, const label } void CUDABinaryObjective::LaunchGetGradientsKernel(const double* cuda_scores, score_t* 
cuda_out_gradients, score_t* cuda_out_hessians) { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE - 1) / GET_GRADIENTS_BLOCK_SIZE; - GetGradientsKernel_Binary<<>>(cuda_scores, cuda_labels_, sigmoid_, num_data_, + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_BINARY - 1) / GET_GRADIENTS_BLOCK_SIZE_BINARY; + GetGradientsKernel_Binary<<>>(cuda_scores, cuda_labels_, sigmoid_, num_data_, cuda_out_gradients, cuda_out_hessians); } diff --git a/src/treelearner/cuda/cuda_binary_objective.hpp b/src/treelearner/cuda/cuda_binary_objective.hpp index 62327a960a02..eaab85492210 100644 --- a/src/treelearner/cuda/cuda_binary_objective.hpp +++ b/src/treelearner/cuda/cuda_binary_objective.hpp @@ -9,9 +9,9 @@ #ifdef USE_CUDA -#define GET_GRADIENTS_BLOCK_SIZE (1024) -#define CALC_INIT_SCORE_BLOCK_SIZE (1024) -#define NUM_DATA_THREAD_ADD_CALC_INIT_SCORE (6) +#define GET_GRADIENTS_BLOCK_SIZE_BINARY (1024) +#define CALC_INIT_SCORE_BLOCK_SIZE_BINARY (1024) +#define NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY (6) #include "cuda_objective.hpp" diff --git a/src/treelearner/cuda/cuda_centralized_info.cpp b/src/treelearner/cuda/cuda_centralized_info.cpp index 50553a7cc5c2..64874a3bd767 100644 --- a/src/treelearner/cuda/cuda_centralized_info.cpp +++ b/src/treelearner/cuda/cuda_centralized_info.cpp @@ -13,7 +13,7 @@ namespace LightGBM { CUDACentralizedInfo::CUDACentralizedInfo(const data_size_t num_data, const int num_leaves, const int num_features): num_data_(num_data), num_leaves_(num_leaves), num_features_(num_features) {} -void CUDACentralizedInfo::Init(const score_t* labels) { +void CUDACentralizedInfo::Init(const score_t* labels, const Dataset* train_data) { InitCUDAMemoryFromHostMemory(&cuda_num_data_, &num_data_, 1); InitCUDAMemoryFromHostMemory(&cuda_num_leaves_, &num_leaves_, 1); InitCUDAMemoryFromHostMemory(&cuda_num_features_, &num_features_, 1); @@ -22,6 +22,11 @@ void CUDACentralizedInfo::Init(const score_t* labels) { AllocateCUDAMemory(static_cast(num_data_), &cuda_hessians_); InitCUDAMemoryFromHostMemory(&cuda_labels_, labels, num_data_); + + InitCUDAMemoryFromHostMemory( + &cuda_query_boundaries_, + train_data->metadata().query_boundaries(), + static_cast(train_data->metadata().num_queries())); } void CUDACentralizedInfo::BeforeTrain(const score_t* gradients, const score_t* hessians) { diff --git a/src/treelearner/cuda/cuda_centralized_info.hpp b/src/treelearner/cuda/cuda_centralized_info.hpp index d005975cdccb..0105bb587e91 100644 --- a/src/treelearner/cuda/cuda_centralized_info.hpp +++ b/src/treelearner/cuda/cuda_centralized_info.hpp @@ -9,6 +9,7 @@ #ifdef USE_CUDA +#include #include #include #include "new_cuda_utils.hpp" @@ -21,7 +22,7 @@ class CUDACentralizedInfo { public: CUDACentralizedInfo(const data_size_t num_data, const int num_leaves, const int num_features); - void Init(const label_t* labels); + void Init(const label_t* labels, const Dataset* train_data); void BeforeTrain(const score_t* gradients, const score_t* hessians); @@ -41,6 +42,8 @@ class CUDACentralizedInfo { score_t* cuda_hessians_ref() { return cuda_hessians_; } + const data_size_t* cuda_query_boundaries() { return cuda_query_boundaries_; } + void Test() { data_size_t test_num_data = 0; int test_num_leaves = 0; @@ -67,6 +70,7 @@ class CUDACentralizedInfo { score_t* cuda_gradients_; score_t* cuda_hessians_; label_t* cuda_labels_; + data_size_t* cuda_query_boundaries_; }; } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 
6110e95a5a93..c14ecb1b2133 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -41,8 +41,8 @@ CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_f const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); feature_default_bins_[feature_index] = bin_mapper->GetDefaultBin(); feature_most_freq_bins_[feature_index] = bin_mapper->GetMostFreqBin(); - feature_min_bins_[feature_index] = feature_hist_offsets[feature_index] - prev_group_bins; - feature_max_bins_[feature_index] = feature_hist_offsets[feature_index + 1] - prev_group_bins - 1; + feature_min_bins_[feature_index] = train_data->feature_min_bin(feature_index); + feature_max_bins_[feature_index] = train_data->feature_max_bin(feature_index); const MissingType missing_type = bin_mapper->missing_type(); if (missing_type == MissingType::None) { feature_missing_is_zero_[feature_index] = 0; @@ -71,6 +71,12 @@ CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_f } num_data_in_leaf_.resize(num_leaves_, 0); num_data_in_leaf_[0] = num_data_; + + /*for (size_t i = 0; i < feature_max_bins_.size(); ++i) { + Log::Warning("feature_min_bins_[%d] = %d, feature_max_bins_[%d] = %d", i, feature_min_bins_[i], i, feature_max_bins_[i]); + }*/ + + train_data_ = train_data; } void CUDADataPartition::Init(const Dataset* train_data) { @@ -373,6 +379,9 @@ void CUDADataPartition::Split(const int* leaf_id, const uint32_t split_threshold = cpu_leaf_best_split_threshold[cpu_leaf_index]; const uint8_t split_default_left = cpu_leaf_best_split_default_left[cpu_leaf_index]; const data_size_t leaf_data_start = cpu_leaf_data_start->at(cpu_leaf_index); + //Log::Warning("real split feature index = %d", train_data_->RealFeatureIndex(split_feature_index)); + //Log::Warning("split threshold = %d", split_threshold); + //Log::Warning("split default left = %d", split_default_left); global_timer.Stop("SplitInner Copy CUDA To Host"); GenDataToLeftBitVector(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start); global_timer.Stop("GenDataToLeftBitVector"); diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index e5daf339937d..781c8f86d001 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -1444,6 +1444,8 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data const int right_leaf_index = cpu_split_info_buffer[3]; const data_size_t right_leaf_num_data = cpu_split_info_buffer[4]; const data_size_t right_leaf_data_start = cpu_split_info_buffer[5]; + //Log::Warning("################################# left_leaf_num_data = %d, right_leaf_num_data = %d #################################", + // left_leaf_num_data, right_leaf_num_data); (*cpu_leaf_num_data)[left_leaf_index] = left_leaf_num_data; (*cpu_leaf_data_start)[left_leaf_index] = left_leaf_data_start; (*cpu_leaf_num_data)[right_leaf_index] = right_leaf_num_data; diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 919d9da3a32e..7d24508f7a50 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -278,6 +278,7 @@ class CUDADataPartition { std::vector cpu_split_info_buffer_; std::vector column_bit_type_; std::vector feature_index_to_column_index_; + const Dataset* train_data_; // CUDA streams std::vector cuda_streams_; diff --git 
a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 4079574daa3c..74dfbceab2f7 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -19,6 +19,7 @@ CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, num_feature_groups_(train_data->num_feature_groups()), min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf), cuda_gradients_(cuda_gradients), cuda_hessians_(cuda_hessians) { + train_data_ = train_data; int offset = 0; for (int group_id = 0; group_id < train_data->num_feature_groups(); ++group_id) { feature_group_bin_offsets_.emplace_back(offset); @@ -86,7 +87,7 @@ void CUDAHistogramConstructor::InitCUDAData(TrainingShareStates* share_state) { const uint8_t* data_ptr = nullptr; data_ptr_bit_type_ = 0; const uint8_t* cpu_data_ptr = share_state->GetRowWiseData(&bit_type_, &total_size, &is_sparse_, &data_ptr, &data_ptr_bit_type_); - CHECK_EQ(bit_type_, 8); + Log::Warning("bit_type_ = %d, is_sparse_ = %d, data_ptr_bit_type_ = %d", bit_type_, static_cast(is_sparse_), data_ptr_bit_type_); if (bit_type_ == 8) { if (!is_sparse_) { std::vector partitioned_data; @@ -168,7 +169,7 @@ void CUDAHistogramConstructor::InitCUDAData(TrainingShareStates* share_state) { GetSparseDataPartitioned(reinterpret_cast(cpu_data_ptr), data_ptr_uint32_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint32_t_, partition_ptr.data(), partition_ptr.size()); AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint16_t_); - AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint32_t_); + AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint32_t_); for (size_t i = 0; i < partitioned_data.size(); ++i) { const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; const std::vector& data_for_this_partition = partitioned_data[i]; @@ -274,6 +275,20 @@ void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_num_data_in_smaller_leaf, cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data, cuda_smaller_leaf_hist, num_data_in_smaller_leaf); SynchronizeCUDADevice(); + /*std::vector root_hist(20000); + CopyFromCUDADeviceToHost(root_hist.data(), cuda_hist_, 20000); + for (int real_feature_index = 0; real_feature_index < train_data_->num_total_features(); ++real_feature_index) { + const int inner_feature_index = train_data_->InnerFeatureIndex(real_feature_index); + if (inner_feature_index >= 0) { + const uint32_t feature_hist_start = feature_hist_offsets_[inner_feature_index]; + const uint32_t feature_hist_end = feature_hist_offsets_[inner_feature_index + 1]; + Log::Warning("real_feature_index = %d, inner_feature_index = %d", real_feature_index, inner_feature_index); + for (uint32_t hist_position = feature_hist_start; hist_position < feature_hist_end; ++hist_position) { + Log::Warning("hist_position = %d, bin_in_feature = %d, grad = %f, hess = %f", + hist_position, hist_position - feature_hist_start, root_hist[hist_position * 2], root_hist[hist_position * 2 + 1]); + } + } + }*/ global_timer.Start("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); LaunchSubtractHistogramKernel(cuda_smaller_leaf_index, cuda_larger_leaf_index, cuda_smaller_leaf_sum_gradients, cuda_smaller_leaf_sum_hessians, @@ -290,6 
+305,7 @@ void CUDAHistogramConstructor::CalcConstructHistogramKernelDim( const int min_grid_dim_y = 160; *grid_dim_y = std::max(min_grid_dim_y, ((num_data_in_smaller_leaf + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + (*block_dim_y) - 1) / (*block_dim_y)); + //Log::Warning("block_dim_x = %d, block_dim_y = %d, grid_dim_x = %d, grid_dim_y = %d", *block_dim_x, *block_dim_y, *grid_dim_x, *grid_dim_y); } void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state) { @@ -372,6 +388,17 @@ void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data } } + Log::Warning("max_num_column_per_partition_ = %d", max_num_column_per_partition_); + for (size_t i = 0; i < feature_partition_column_index_offsets_.size(); ++i) { + Log::Warning("feature_partition_column_index_offsets_[%d] = %d", i, feature_partition_column_index_offsets_[i]); + } + for (size_t i = 0; i < column_hist_offsets_full_.size(); ++i) { + Log::Warning("column_hist_offsets_full_[%d] = %d", i, column_hist_offsets_full_[i]); + } + for (size_t i = 0; i < column_hist_offsets_.size(); ++i) { + Log::Warning("column_hist_offsets_[%d] = %d", i, column_hist_offsets_[i]); + } + InitCUDAMemoryFromHostMemory(&cuda_feature_partition_column_index_offsets_, feature_partition_column_index_offsets_.data(), feature_partition_column_index_offsets_.size()); @@ -429,6 +456,8 @@ void CUDAHistogramConstructor::GetSparseDataPartitioned( const int partition_hist_start = column_hist_offsets_full_[partition_index]; const int partition_hist_end = column_hist_offsets_full_[partition_index + 1]; DATA_PTR_TYPE offset = 0; + row_ptr_for_this_partition.clear(); + data_for_this_partition.clear(); row_ptr_for_this_partition.emplace_back(offset); for (data_size_t data_index = 0; data_index < num_data_; ++data_index) { const DATA_PTR_TYPE row_start = row_ptr[data_index]; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 27920e009ccc..f843b619232a 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -62,7 +62,6 @@ __global__ void CUDAConstructHistogramDenseKernel( const int* feature_partition_column_index_offsets, const data_size_t num_data) { - const int num_feature_groups_ref = *num_feature_groups; const int leaf_index_ref = *leaf_index; const int dim_y = static_cast(gridDim.y * blockDim.y); const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; @@ -132,7 +131,6 @@ __global__ void CUDAConstructHistogramSparseKernel( const uint32_t* column_hist_offsets_full, const data_size_t num_data) { - const int num_feature_groups_ref = *num_feature_groups; const int leaf_index_ref = *leaf_index; const int dim_y = static_cast(gridDim.y * blockDim.y); const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; @@ -176,8 +174,8 @@ __global__ void CUDAConstructHistogramSparseKernel( float* pos_ptr = shared_hist + pos; atomicAdd_system(pos_ptr, grad); atomicAdd_system(pos_ptr + 1, hess); - inner_data_index += blockDim.y; } + inner_data_index += blockDim.y; } __syncthreads(); hist_t* feature_histogram_ptr = (*feature_histogram) + (partition_hist_start << 1); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index aff2543dcfff..8e2348fd5fa6 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp 
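Side note on the histogram kernels in the hunks above: both the dense and the sparse variants accumulate gradient/hessian pairs into a per-block shared-memory histogram and flush to global memory once per block, and the fix moves `inner_data_index += blockDim.y;` out of the bin-update branch so the sparse kernel advances to the next row even when a thread has no element to add in the current one. A minimal standalone sketch of that shared-histogram pattern follows; NUM_BINS_SKETCH, bins, grads, hesses and global_hist are illustrative names, not identifiers from this patch.

  #include <cstdint>

  #define NUM_BINS_SKETCH 256

  __global__ void SharedHistSketch(const uint8_t* bins, const float* grads, const float* hesses,
                                   const int num_data, float* global_hist /* 2 * NUM_BINS_SKETCH */) {
    __shared__ float shared_hist[2 * NUM_BINS_SKETCH];
    // zero the block-local histogram
    for (int i = threadIdx.x; i < 2 * NUM_BINS_SKETCH; i += blockDim.x) shared_hist[i] = 0.0f;
    __syncthreads();
    // accumulate into shared memory; these atomics never leave the block
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_data; i += gridDim.x * blockDim.x) {
      const int pos = static_cast<int>(bins[i]) << 1;
      atomicAdd(shared_hist + pos, grads[i]);
      atomicAdd(shared_hist + pos + 1, hesses[i]);
    }
    __syncthreads();
    // flush once per block: one global atomic per histogram slot
    for (int i = threadIdx.x; i < 2 * NUM_BINS_SKETCH; i += blockDim.x) {
      atomicAdd(global_hist + i, shared_hist[i]);
    }
  }
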
@@ -138,6 +138,7 @@ class CUDAHistogramConstructor { int max_num_column_per_partition_; uint8_t data_ptr_bit_type_; uint8_t bit_type_; + const Dataset* train_data_; // CUDA memory, held by this object uint32_t* cuda_feature_group_bin_offsets_; diff --git a/src/treelearner/cuda/cuda_ranking_objective.cpp b/src/treelearner/cuda/cuda_ranking_objective.cpp new file mode 100644 index 000000000000..d39cbb61e938 --- /dev/null +++ b/src/treelearner/cuda/cuda_ranking_objective.cpp @@ -0,0 +1,65 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA + +#include "cuda_ranking_objective.hpp" + +namespace LightGBM { + +CUDARankingObjective::CUDARankingObjective( + const data_size_t num_data, + const label_t* cuda_labels, + const data_size_t* cuda_query_boundaries, + const int num_queries, + const bool norm, + const double sigmoid, + const int truncation_level, + const label_t* labels, + const int num_threads): +CUDAObjective(num_data), +cuda_labels_(cuda_labels), +cuda_query_boundaries_(cuda_query_boundaries), +num_queries_(num_queries), +norm_(norm), +sigmoid_(sigmoid), +truncation_level_(truncation_level), +num_threads_(num_threads) { + std::vector thread_max_label(num_threads, 0.0f); + Threading::For(0, num_data_, 512, + [labels, &thread_max_label, this] (int thread_index, data_size_t start, data_size_t end) { + if (start < num_data_) { + thread_max_label[thread_index] = labels[start]; + } + for (data_size_t data_index = start + 1; data_index < end; ++data_index) { + const label_t label = labels[data_index]; + if (label > thread_max_label[thread_index]) { + thread_max_label[thread_index] = label; + } + } + }); + max_label_ = thread_max_label[0]; + for (int thread_index = 1; thread_index < num_threads_; ++thread_index) { + max_label_ = std::max(max_label_, thread_max_label[thread_index]); + } +} + +void CUDARankingObjective::Init() { + AllocateCUDAMemory(1, &cuda_init_score_); + SetCUDAMemory(cuda_init_score_, 0, 1); + AllocateCUDAMemory(num_data_, &cuda_lambdas_); + AllocateCUDAMemory(num_queries_, &cuda_inverse_max_dcgs_); +} + +void CUDARankingObjective::GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + LaunchGetGradientsKernel(cuda_scores, cuda_out_gradients, cuda_out_hessians); +} + +void CUDARankingObjective::CalcInitScore() {} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_ranking_objective.cu b/src/treelearner/cuda/cuda_ranking_objective.cu new file mode 100644 index 000000000000..a50035d0f37e --- /dev/null +++ b/src/treelearner/cuda/cuda_ranking_objective.cu @@ -0,0 +1,252 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifdef USE_CUDA + +#include "cuda_ranking_objective.hpp" + +namespace LightGBM { + +__device__ void ArgSort(const double* scores, uint16_t* indices, const uint16_t num_items) { + uint16_t num_items_aligned = 1; + uint16_t num_items_ref = num_items - 1; + uint16_t depth = 1; + while (num_items_ref > 0) { + num_items_aligned <<= 1; + num_items_ref >>= 1; + ++depth; + } + for (uint16_t outer_depth = depth - 1; outer_depth >= 1; --outer_depth) { + const uint16_t outer_segment_length = 1 << (depth - outer_depth); + const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; + const bool ascending = (outer_segment_index % 2 == 0); + for (uint16_t inner_depth = outer_depth; inner_depth < depth; ++inner_depth) { + const uint16_t segment_length = 1 << (depth - inner_depth); + const uint16_t half_segment_length = segment_length >> 1; + const uint16_t half_segment_index = threadIdx.x / half_segment_length; + if (threadIdx.x < num_items_aligned) { + if (half_segment_index % 2 == 0) { + const uint16_t index_to_compare = threadIdx.x + half_segment_length; + if (ascending) { + if (scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) { + const uint16_t index = indices[threadIdx.x]; + indices[threadIdx.x] = indices[index_to_compare]; + indices[index_to_compare] = index; + } + } else { + if (scores[indices[threadIdx.x]] < scores[indices[index_to_compare]]) { + const uint16_t index = indices[threadIdx.x]; + indices[threadIdx.x] = indices[index_to_compare]; + indices[index_to_compare] = index; + } + } + } + } + __syncthreads(); + } + } +} + +__global__ void GetGradientsKernel_Ranking(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, + const data_size_t num_queries, const data_size_t* cuda_query_boundaries, const double* cuda_inverse_max_dcgs, + const bool norm, const double sigmoid, const int truncation_level, + score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + __shared__ double shared_scores[MAX_NUM_ITEM_IN_QUERY]; + __shared__ uint16_t shared_indices[MAX_NUM_ITEM_IN_QUERY]; + __shared__ double shared_lambdas[MAX_NUM_ITEM_IN_QUERY]; + __shared__ double shared_hessians[MAX_NUM_ITEM_IN_QUERY]; + const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; + const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); + const double min_score = -100000000000.0f; + for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { + const double inverse_max_dcg = cuda_inverse_max_dcgs[query_index]; + const data_size_t query_start = cuda_query_boundaries[query_index]; + const data_size_t query_end = cuda_query_boundaries[query_index + 1]; + const data_size_t query_item_count = query_end - query_start; + const double* cuda_scores_pointer = cuda_scores + query_start; + score_t* cuda_out_gradients_pointer = cuda_out_gradients + query_start; + score_t* cuda_out_hessians_pointer = cuda_out_hessians + query_start; + const label_t* cuda_label_pointer = cuda_labels + query_start; + if (threadIdx.x < query_item_count) { + shared_scores[threadIdx.x] = cuda_scores_pointer[threadIdx.x]; + shared_indices[threadIdx.x] = static_cast(threadIdx.x); + shared_lambdas[threadIdx.x] = 0.0f; + shared_hessians[threadIdx.x] = 0.0f; + } else { + shared_scores[threadIdx.x] = min_score; + shared_indices[threadIdx.x] = 0; + } + __syncthreads(); + ArgSort(shared_scores, shared_indices, static_cast(query_item_count)); + // get best and worst score + const double best_score = 
shared_scores[shared_indices[0]]; + data_size_t worst_idx = query_item_count - 1; + if (worst_idx > 0 && shared_scores[shared_indices[worst_idx]] == min_score) { + worst_idx -= 1; + } + const double worst_score = shared_scores[shared_indices[worst_idx]]; + double sum_lambdas = 0.0; + // start accmulate lambdas by pairs that contain at least one document above truncation level + for (data_size_t i = 0; i < query_item_count - 1 && i < truncation_level; ++i) { + if (shared_scores[shared_indices[i]] == min_score) { continue; } + if (threadIdx.x > static_cast(i) && threadIdx.x < static_cast(query_item_count)) { + const data_size_t j = static_cast(threadIdx.x); + // skip pairs with the same labels + if (cuda_label_pointer[shared_indices[i]] != cuda_label_pointer[shared_indices[j]] && shared_scores[shared_indices[j]] != min_score) { + data_size_t high_rank, low_rank; + if (cuda_label_pointer[shared_indices[i]] > cuda_label_pointer[shared_indices[j]]) { + high_rank = i; + low_rank = j; + } else { + high_rank = j; + low_rank = i; + } + const data_size_t high = shared_indices[high_rank]; + const int high_label = static_cast(cuda_label_pointer[high]); + const double high_score = shared_scores[high]; + const double high_label_gain = static_cast((1 << high_label) - 1); + const double high_discount = log2(2.0f + high_rank); + const data_size_t low = shared_indices[low_rank]; + const int low_label = static_cast(cuda_label_pointer[low]); + const double low_score = shared_scores[low]; + const double low_label_gain = static_cast((1 << low_label) - 1); + const double low_discount = log2(2.0f + low_rank); + + const double delta_score = high_score - low_score; + + // get dcg gap + const double dcg_gap = high_label_gain - low_label_gain; + // get discount of this pair + const double paired_discount = fabs(high_discount - low_discount); + // get delta NDCG + double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg; + // regular the delta_pair_NDCG by score distance + if (norm && best_score != worst_score) { + delta_pair_NDCG /= (0.01f + fabs(delta_score)); + } + // calculate lambda for this pair + double p_lambda = 1.0f / (1.0f + exp(sigmoid * delta_score)); + double p_hessian = p_lambda * (1.0f - p_lambda); + // update + p_lambda *= -sigmoid * delta_pair_NDCG; + p_hessian *= sigmoid * sigmoid * delta_pair_NDCG; + atomicAdd_system(shared_lambdas + low, -static_cast(p_lambda)); + atomicAdd_system(shared_hessians + low, static_cast(p_hessian)); + atomicAdd_system(shared_lambdas + high, static_cast(p_lambda)); + atomicAdd_system(shared_hessians + high, static_cast(p_hessian)); + // lambda is negative, so use minus to accumulate + atomicAdd_system(&sum_lambdas, -2 * p_lambda); + } + } + } + __syncthreads(); + if (norm && sum_lambdas > 0) { + double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas; + for (data_size_t i = 0; i < query_item_count; ++i) { + cuda_out_gradients_pointer[i] = static_cast(shared_lambdas[i] * norm_factor); + cuda_out_hessians_pointer[i] = static_cast(shared_hessians[i] * norm_factor); + } + } + } +} + +void CUDARankingObjective::LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; + GetGradientsKernel_Ranking<<>>(cuda_scores, cuda_labels_, num_data_, + num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, + norm_, sigmoid_, truncation_level_, + cuda_out_gradients, cuda_out_hessians); +} + +__device__ void 
PrefixSumBankConflict(uint16_t* elements, unsigned int n) { + unsigned int offset = 1; + unsigned int threadIdx_x = threadIdx.x; + __syncthreads(); + for (int d = (n >> 1); d > 0; d >>= 1) { + if (threadIdx_x < d) { + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + elements[dst_pos] += elements[src_pos]; + } + offset <<= 1; + __syncthreads(); + } + if (threadIdx_x == 0) { + elements[n - 1] = 0; + } + __syncthreads(); + for (int d = 1; d < n; d <<= 1) { + offset >>= 1; + if (threadIdx_x < d) { + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const uint32_t src_val = elements[src_pos]; + elements[src_pos] = elements[dst_pos]; + elements[dst_pos] += src_val; + } + __syncthreads(); + } +} + +__global__ void CalcInverseMaxDCGKernel( + const data_size_t* cuda_query_boundaries, + const label_t* cuda_labels, + const int truncation_level, + const data_size_t num_queries, + double* cuda_inverse_max_dcgs) { + __shared__ uint32_t label_sum[MAX_RANK_LABEL]; + __shared__ uint16_t label_pos[MAX_RANK_LABEL]; + const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; + const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); + for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { + const data_size_t query_start = cuda_query_boundaries[query_index]; + const data_size_t query_end = cuda_query_boundaries[query_index + 1]; + const data_size_t query_count = query_end - query_start; + if (threadIdx.x < MAX_RANK_LABEL) { + label_sum[threadIdx.x] = 0; + } + __syncthreads(); + const label_t* label_pointer = cuda_labels + query_start; + if (threadIdx.x < static_cast(query_count)) { + atomicAdd_system(label_sum + static_cast(label_pointer[threadIdx.x]), 1); + } + __syncthreads(); + if (threadIdx.x < MAX_RANK_LABEL) { + if (label_sum[threadIdx.x] > 0) { + label_pos[threadIdx.x] = 1; + } else { + label_pos[threadIdx.x] = 0; + } + } + __syncthreads(); + PrefixSumBankConflict(label_pos, MAX_RANK_LABEL); + double gain = 0.0f; + if (threadIdx.x < MAX_RANK_LABEL && label_sum[threadIdx.x] > 0) { + const double label_gain = (1 << threadIdx.x - 1) / log2(2.0f + label_pos[threadIdx.x]); + atomicAdd_system(&gain, label_gain); + } + __syncthreads(); + if (gain > 0.0f) { + cuda_inverse_max_dcgs[query_index] = 1.0f / gain; + } else { + cuda_inverse_max_dcgs[query_index] = 0.0f; + } + } +} + +void CUDARankingObjective::LaunchCalcInverseMaxDCGKernel() { + const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; + CalcInverseMaxDCGKernel<<>>( + cuda_query_boundaries_, + cuda_labels_, + truncation_level_, + num_queries_, + cuda_inverse_max_dcgs_); +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_ranking_objective.hpp b/src/treelearner/cuda/cuda_ranking_objective.hpp new file mode 100644 index 000000000000..c2d8981b6dca --- /dev/null +++ b/src/treelearner/cuda/cuda_ranking_objective.hpp @@ -0,0 +1,72 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
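Side note on the ranking objective above: the per-pair update scales the gap in gain, (2^high_label - 1) - (2^low_label - 1), by the query's inverse maximum DCG, and in the usual LambdaMART formulation that maximum DCG is the DCG of the ideal, label-descending ordering truncated at truncation_level, with discount 1/log2(2 + rank). A minimal host-side sketch of that reference quantity under those assumptions; the function and variable names are illustrative, not identifiers from the patch.

  #include <algorithm>
  #include <cmath>
  #include <functional>
  #include <vector>

  // 1 / maxDCG for one query: ideal (label-descending) ordering, gain 2^label - 1,
  // discount 1 / log2(2 + rank), summed over the top `truncation_level` positions.
  double InverseMaxDCG(std::vector<int> labels, const int truncation_level) {
    std::sort(labels.begin(), labels.end(), std::greater<int>());
    const int top = std::min<int>(truncation_level, static_cast<int>(labels.size()));
    double max_dcg = 0.0;
    for (int rank = 0; rank < top; ++rank) {
      max_dcg += ((1 << labels[rank]) - 1) / std::log2(2.0 + rank);
    }
    return max_dcg > 0.0 ? 1.0 / max_dcg : 0.0;
  }
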
+ */ + +#ifndef LIGHTGBM_NEW_CUDA_RANKING_OBJECTIVE_HPP_ +#define LIGHTGBM_NEW_CUDA_RANKING_OBJECTIVE_HPP_ + +#ifdef USE_CUDA + +#define GET_GRADIENTS_BLOCK_SIZE_RANKING_RANKING (128) +#define MAX_NUM_ITEM_IN_QUERY (1024) +#define NUM_QUERY_PER_BLOCK (100) +#define MAX_RANK_LABEL (32) + +#include "cuda_objective.hpp" +#include + +namespace LightGBM { + +class CUDARankingObjective : public CUDAObjective { + public: + CUDARankingObjective( + const data_size_t num_data, + const label_t* cuda_label, + const int* cpu_query_boundaries, + const int num_queries, + const bool norm, + const double sigmoid, + const int truncation_level, + const label_t* labels, + const int num_threads); + + void Init() override; + + void CalcInitScore() override; + + const double* cuda_init_score() const override { + return cuda_init_score_; + } + + void GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) override; + + private: + + void LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians); + + void LaunchCalcInverseMaxDCGKernel(); + + // CUDA memory, held by this object + double* cuda_init_score_; + double* cuda_lambdas_; + double* cuda_inverse_max_dcgs_; + + // CUDA memory, held by other objects + const label_t* cuda_labels_; + const data_size_t* cuda_query_boundaries_; + + // Host memory + const int num_queries_; + const bool norm_; + const double sigmoid_; + const int truncation_level_; + label_t max_label_; + const int num_threads_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_NEW_CUDA_RANKING_OBJECTIVE_HPP_ diff --git a/src/treelearner/cuda/cuda_regression_objective.cu b/src/treelearner/cuda/cuda_regression_objective.cu index a8769c744950..750188bb31a2 100644 --- a/src/treelearner/cuda/cuda_regression_objective.cu +++ b/src/treelearner/cuda/cuda_regression_objective.cu @@ -11,19 +11,19 @@ namespace LightGBM { __global__ void CalcInitScoreKernel_1_Regression(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_init_score) { - __shared__ label_t shared_label[CALC_INIT_SCORE_BLOCK_SIZE]; + __shared__ label_t shared_label[CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION]; const unsigned int tid = threadIdx.x; - const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE; + const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION; shared_label[tid] = 0.0f; __syncthreads(); - for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_CALC_INIT_SCORE; ++j) { + for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION; ++j) { if (i + j < num_data) { shared_label[tid] += cuda_labels[i + j]; } } __syncthreads(); for (unsigned int s = 1; s < blockDim.x; s *= 2) { - if (tid % (2 * s) == 0 && (tid + s) < CALC_INIT_SCORE_BLOCK_SIZE) { + if (tid % (2 * s) == 0 && (tid + s) < CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION) { shared_label[tid] += shared_label[tid + s]; } __syncthreads(); @@ -41,9 +41,9 @@ __global__ void CalcInitScoreKernel_2_Regression(double* out_cuda_init_score, co } void CUDARegressionObjective::LaunchCalcInitScoreKernel() { - const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE; + const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION; const int num_blocks = (num_data_ + num_data_per_block - 1) / num_data_per_block; - CalcInitScoreKernel_1_Regression<<>>(cuda_labels_, 
num_data_, cuda_init_score_); + CalcInitScoreKernel_1_Regression<<>>(cuda_labels_, num_data_, cuda_init_score_); SynchronizeCUDADevice(); CalcInitScoreKernel_2_Regression<<<1, 1>>>(cuda_init_score_, num_data_); SynchronizeCUDADevice(); @@ -59,8 +59,8 @@ __global__ void GetGradientsKernel_Regression(const double* cuda_scores, const l } void CUDARegressionObjective::LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE - 1) / GET_GRADIENTS_BLOCK_SIZE; - GetGradientsKernel_Regression<<>>(cuda_scores, cuda_labels_, num_data_, + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + GetGradientsKernel_Regression<<>>(cuda_scores, cuda_labels_, num_data_, cuda_out_gradients, cuda_out_hessians); } diff --git a/src/treelearner/cuda/cuda_regression_objective.hpp b/src/treelearner/cuda/cuda_regression_objective.hpp index 488b52ea3790..6fcb29ed50a1 100644 --- a/src/treelearner/cuda/cuda_regression_objective.hpp +++ b/src/treelearner/cuda/cuda_regression_objective.hpp @@ -9,9 +9,9 @@ #ifdef USE_CUDA -#define GET_GRADIENTS_BLOCK_SIZE (1024) -#define CALC_INIT_SCORE_BLOCK_SIZE (1024) -#define NUM_DATA_THREAD_ADD_CALC_INIT_SCORE (6) +#define GET_GRADIENTS_BLOCK_SIZE_REGRESSION (1024) +#define CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION (1024) +#define NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION (6) #include "cuda_objective.hpp" diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index a7be7c150a76..143f7e23ada9 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -24,7 +24,7 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); const label_t* labels = train_data->metadata().label(); cuda_centralized_info_.reset(new CUDACentralizedInfo(num_data_, this->config_->num_leaves, num_features_)); - cuda_centralized_info_->Init(labels); + cuda_centralized_info_->Init(labels, train_data_); cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_, 0, cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), cuda_centralized_info_->cuda_num_data())); cuda_smaller_leaf_splits_->Init(); @@ -59,29 +59,17 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia } void NewCUDATreeLearner::BeforeTrain() { - auto start = std::chrono::steady_clock::now(); cuda_data_partition_->BeforeTrain(nullptr); - auto end = std::chrono::steady_clock::now(); - auto duration = static_cast>(end - start); global_timer.Start("CUDACentralizedInfo::BeforeTrain"); - start = std::chrono::steady_clock::now(); cuda_objective_->GetGradients(cuda_score_updater_->cuda_scores(), cuda_centralized_info_->cuda_gradients_ref(), cuda_centralized_info_->cuda_hessians_ref()); - end = std::chrono::steady_clock::now(); - duration = static_cast>(end - start); global_timer.Stop("CUDACentralizedInfo::BeforeTrain"); cuda_smaller_leaf_splits_->InitValues(cuda_data_partition_->cuda_data_indices(), cuda_histogram_constructor_->cuda_hist_pointer(), &leaf_sum_hessians_[0]); cuda_larger_leaf_splits_->InitValues(); - start = std::chrono::steady_clock::now(); cuda_histogram_constructor_->BeforeTrain(); - end = std::chrono::steady_clock::now(); - duration = static_cast>(end - start); - start = std::chrono::steady_clock::now(); 
cuda_best_split_finder_->BeforeTrain(); - end = std::chrono::steady_clock::now(); - duration = static_cast>(end - start); leaf_num_data_[0] = num_data_; leaf_data_start_[0] = 0; smaller_leaf_index_ = 0; @@ -295,6 +283,16 @@ void NewCUDATreeLearner::InitObjective() { cuda_centralized_info_->cuda_labels(), config_->sigmoid)); } else if (config_->objective == std::string("regression")) { cuda_objective_.reset(new CUDARegressionObjective(num_data_, cuda_centralized_info_->cuda_labels())); + } else if (config_->objective == std::string("ranking")) { + cuda_objective_.reset(new CUDARankingObjective(num_data_, + cuda_centralized_info_->cuda_labels(), + cuda_centralized_info_->cuda_query_boundaries(), + train_data_->metadata().num_queries(), + config_->lambdarank_norm, + config_->sigmoid, + config_->lambdarank_truncation_level, + train_data_->metadata().label(), + config_->num_threads)); } else { Log::Fatal("Unsupported objective %s for CUDA.", config_->objective.c_str()); } diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index baa4bf56ca26..827ae02f3408 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -17,6 +17,7 @@ #include "cuda_score_updater.hpp" #include "cuda_binary_objective.hpp" #include "cuda_regression_objective.hpp" +#include "cuda_ranking_objective.hpp" namespace LightGBM { From a5fadfb63dd2c902109275c62099d82dc335d08f Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 16 Jun 2021 07:25:39 +0000 Subject: [PATCH 031/166] fix cuda ranking objective --- .../cuda/cuda_centralized_info.cpp | 10 +- .../cuda/cuda_histogram_constructor.cpp | 21 ++- src/treelearner/cuda/cuda_leaf_splits.cu | 1 - src/treelearner/cuda/cuda_objective.hpp | 2 + .../cuda/cuda_ranking_objective.cpp | 32 +++++ .../cuda/cuda_ranking_objective.cu | 126 +++++++++++++++--- .../cuda/cuda_ranking_objective.hpp | 8 +- .../cuda/new_cuda_tree_learner.cpp | 3 +- 8 files changed, 169 insertions(+), 34 deletions(-) diff --git a/src/treelearner/cuda/cuda_centralized_info.cpp b/src/treelearner/cuda/cuda_centralized_info.cpp index 64874a3bd767..b99e340e7fdf 100644 --- a/src/treelearner/cuda/cuda_centralized_info.cpp +++ b/src/treelearner/cuda/cuda_centralized_info.cpp @@ -23,10 +23,12 @@ void CUDACentralizedInfo::Init(const score_t* labels, const Dataset* train_data) InitCUDAMemoryFromHostMemory(&cuda_labels_, labels, num_data_); - InitCUDAMemoryFromHostMemory( - &cuda_query_boundaries_, - train_data->metadata().query_boundaries(), - static_cast(train_data->metadata().num_queries())); + if (train_data->metadata().query_boundaries() != nullptr) { + InitCUDAMemoryFromHostMemory( + &cuda_query_boundaries_, + train_data->metadata().query_boundaries(), + static_cast(train_data->metadata().num_queries() + 1)); + } } void CUDACentralizedInfo::BeforeTrain(const score_t* gradients, const score_t* hessians) { diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 74dfbceab2f7..339a080e4bc5 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -302,7 +302,7 @@ void CUDAHistogramConstructor::CalcConstructHistogramKernelDim( *block_dim_x = max_num_column_per_partition_; *block_dim_y = NUM_THRADS_PER_BLOCK / max_num_column_per_partition_; *grid_dim_x = num_feature_partitions_; - const int min_grid_dim_y = 160; + const int min_grid_dim_y = 10; *grid_dim_y = std::max(min_grid_dim_y, 
((num_data_in_smaller_leaf + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + (*block_dim_y) - 1) / (*block_dim_y)); //Log::Warning("block_dim_x = %d, block_dim_y = %d, grid_dim_x = %d, grid_dim_y = %d", *block_dim_x, *block_dim_y, *grid_dim_x, *grid_dim_y); @@ -388,7 +388,7 @@ void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data } } - Log::Warning("max_num_column_per_partition_ = %d", max_num_column_per_partition_); + /*Log::Warning("max_num_column_per_partition_ = %d", max_num_column_per_partition_); for (size_t i = 0; i < feature_partition_column_index_offsets_.size(); ++i) { Log::Warning("feature_partition_column_index_offsets_[%d] = %d", i, feature_partition_column_index_offsets_[i]); } @@ -397,7 +397,7 @@ void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* train_data } for (size_t i = 0; i < column_hist_offsets_.size(); ++i) { Log::Warning("column_hist_offsets_[%d] = %d", i, column_hist_offsets_[i]); - } + }*/ InitCUDAMemoryFromHostMemory(&cuda_feature_partition_column_index_offsets_, feature_partition_column_index_offsets_.data(), @@ -448,8 +448,9 @@ void CUDAHistogramConstructor::GetSparseDataPartitioned( const int num_partitions = static_cast(feature_partition_column_index_offsets_.size()) - 1; partitioned_data->resize(num_partitions); partitioned_row_ptr->resize(num_partitions); + std::vector thread_max_elements_per_row(num_threads_, 0); Threading::For(0, num_partitions, 1, - [partitioned_data, partitioned_row_ptr, row_ptr, row_wise_data, this] (int /*thread_index*/, int start, int end) { + [partitioned_data, partitioned_row_ptr, row_ptr, row_wise_data, &thread_max_elements_per_row, this] (int thread_index, int start, int end) { for (int partition_index = start; partition_index < end; ++partition_index) { std::vector& data_for_this_partition = partitioned_data->at(partition_index); std::vector& row_ptr_for_this_partition = partitioned_row_ptr->at(partition_index); @@ -472,8 +473,12 @@ void CUDAHistogramConstructor::GetSparseDataPartitioned( data_for_this_partition.emplace_back(bin - partition_hist_start); } CHECK_GE(partition_end_in_row, partition_start_in_row); - offset += static_cast(partition_end_in_row - partition_start_in_row); + const data_size_t num_elements_in_row = partition_end_in_row - partition_start_in_row; + offset += static_cast(num_elements_in_row); row_ptr_for_this_partition.emplace_back(offset); + if (num_elements_in_row > thread_max_elements_per_row[thread_index]) { + thread_max_elements_per_row[thread_index] = num_elements_in_row; + } } } }); @@ -484,6 +489,12 @@ void CUDAHistogramConstructor::GetSparseDataPartitioned( offset += partitioned_row_ptr->at(i).back(); partition_ptr->emplace_back(offset); } + max_num_column_per_partition_ = 0; + for (int thread_index = 0; thread_index < num_threads_; ++thread_index) { + if (thread_max_elements_per_row[thread_index] > max_num_column_per_partition_) { + max_num_column_per_partition_ = thread_max_elements_per_row[thread_index]; + } + } } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index 5ed07f1a8f59..bc1a8870dbd3 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -54,7 +54,6 @@ __global__ void CUDAInitValuesKernel2(double* cuda_sum_of_gradients, double* cud } void CUDALeafSplits::LaunchInitValuesKernal() { - auto start = std::chrono::steady_clock::now(); CUDAInitValuesKernel1<<>>( cuda_gradients_, cuda_hessians_, cuda_num_data_, 
cuda_sum_of_gradients_, cuda_sum_of_hessians_); diff --git a/src/treelearner/cuda/cuda_objective.hpp b/src/treelearner/cuda/cuda_objective.hpp index 98cc9a630341..abba984919a1 100644 --- a/src/treelearner/cuda/cuda_objective.hpp +++ b/src/treelearner/cuda/cuda_objective.hpp @@ -26,6 +26,8 @@ class CUDAObjective { virtual const double* cuda_init_score() const = 0; + virtual void TestGlobalArgSort() const {} + protected: const data_size_t num_data_; }; diff --git a/src/treelearner/cuda/cuda_ranking_objective.cpp b/src/treelearner/cuda/cuda_ranking_objective.cpp index d39cbb61e938..e8a88f929f98 100644 --- a/src/treelearner/cuda/cuda_ranking_objective.cpp +++ b/src/treelearner/cuda/cuda_ranking_objective.cpp @@ -14,6 +14,7 @@ CUDARankingObjective::CUDARankingObjective( const data_size_t num_data, const label_t* cuda_labels, const data_size_t* cuda_query_boundaries, + const data_size_t* cpu_query_boundaries, const int num_queries, const bool norm, const double sigmoid, @@ -45,6 +46,32 @@ num_threads_(num_threads) { for (int thread_index = 1; thread_index < num_threads_; ++thread_index) { max_label_ = std::max(max_label_, thread_max_label[thread_index]); } + + std::vector thread_max_num_items_in_query(num_threads_); + Threading::For(0, num_queries_, 1, + [cpu_query_boundaries, &thread_max_num_items_in_query] (int thread_index, data_size_t start, data_size_t end) { + for (data_size_t query_index = start; query_index < end; ++query_index) { + const data_size_t query_item_count = cpu_query_boundaries[query_index + 1] - cpu_query_boundaries[query_index]; + if (query_item_count > thread_max_num_items_in_query[thread_index]) { + thread_max_num_items_in_query[thread_index] = query_item_count; + } + } + }); + data_size_t max_items_in_query = 0; + for (int thread_index = 0; thread_index < num_threads_; ++thread_index) { + if (thread_max_num_items_in_query[thread_index] > max_items_in_query) { + max_items_in_query = thread_max_num_items_in_query[thread_index]; + } + } + max_items_in_query_aligned_ = 1; + --max_items_in_query; + while (max_items_in_query > 0) { + max_items_in_query >>= 1; + max_items_in_query_aligned_ <<= 1; + } + if (max_items_in_query_aligned_ > MAX_NUM_ITEM_IN_QUERY) { + Log::Warning("Too many items in a query."); + } } void CUDARankingObjective::Init() { @@ -52,6 +79,7 @@ void CUDARankingObjective::Init() { SetCUDAMemory(cuda_init_score_, 0, 1); AllocateCUDAMemory(num_data_, &cuda_lambdas_); AllocateCUDAMemory(num_queries_, &cuda_inverse_max_dcgs_); + LaunchCalcInverseMaxDCGKernel(); } void CUDARankingObjective::GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { @@ -60,6 +88,10 @@ void CUDARankingObjective::GetGradients(const double* cuda_scores, score_t* cuda void CUDARankingObjective::CalcInitScore() {} +void CUDARankingObjective::TestGlobalArgSort() const { + LaunchGlobalArgSort(); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_ranking_objective.cu b/src/treelearner/cuda/cuda_ranking_objective.cu index a50035d0f37e..a15b665b82a6 100644 --- a/src/treelearner/cuda/cuda_ranking_objective.cu +++ b/src/treelearner/cuda/cuda_ranking_objective.cu @@ -22,7 +22,7 @@ __device__ void ArgSort(const double* scores, uint16_t* indices, const uint16_t for (uint16_t outer_depth = depth - 1; outer_depth >= 1; --outer_depth) { const uint16_t outer_segment_length = 1 << (depth - outer_depth); const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; - const bool ascending = (outer_segment_index % 2 == 
0); + const bool ascending = (outer_segment_index % 2 > 0); for (uint16_t inner_depth = outer_depth; inner_depth < depth; ++inner_depth) { const uint16_t segment_length = 1 << (depth - inner_depth); const uint16_t half_segment_length = segment_length >> 1; @@ -50,6 +50,61 @@ __device__ void ArgSort(const double* scores, uint16_t* indices, const uint16_t } } +__global__ void ArgSortGlobal(const double* scores, uint16_t* indices, const uint16_t num_items) { + uint16_t num_items_aligned = 1; + uint16_t num_items_ref = num_items - 1; + uint16_t depth = 1; + while (num_items_ref > 0) { + num_items_aligned <<= 1; + num_items_ref >>= 1; + ++depth; + } + for (uint16_t outer_depth = depth - 1; outer_depth >= 1; --outer_depth) { + const uint16_t outer_segment_length = 1 << (depth - outer_depth); + const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; + const bool ascending = (outer_segment_index % 2 > 0); + for (uint16_t inner_depth = outer_depth; inner_depth < depth; ++inner_depth) { + const uint16_t segment_length = 1 << (depth - inner_depth); + const uint16_t half_segment_length = segment_length >> 1; + const uint16_t half_segment_index = threadIdx.x / half_segment_length; + if (threadIdx.x < num_items_aligned) { + if (half_segment_index % 2 == 0) { + const uint16_t index_to_compare = threadIdx.x + half_segment_length; + if (ascending) { + if (scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) { + const uint16_t index = indices[threadIdx.x]; + indices[threadIdx.x] = indices[index_to_compare]; + indices[index_to_compare] = index; + } + } else { + if (scores[indices[threadIdx.x]] < scores[indices[index_to_compare]]) { + const uint16_t index = indices[threadIdx.x]; + indices[threadIdx.x] = indices[index_to_compare]; + indices[index_to_compare] = index; + } + } + } + } + __syncthreads(); + } + } +} + +void CUDARankingObjective::LaunchGlobalArgSort() const { + std::vector scores{1.0f, -2.0f, 3.0f, 0.1f, -8.0f, 1.2f, -10000000.0f, -10000000.0f}; + std::vector indices{0, 1, 2, 3, 4, 5, 6, 7}; + double* cuda_scores = nullptr; + uint16_t* cuda_indices = nullptr; + InitCUDAMemoryFromHostMemory(&cuda_scores, scores.data(), scores.size()); + InitCUDAMemoryFromHostMemory(&cuda_indices, indices.data(), indices.size()); + ArgSortGlobal<<<1, 8>>>(cuda_scores, cuda_indices, indices.size()); + std::vector sorted_indices(indices.size()); + CopyFromCUDADeviceToHost(sorted_indices.data(), cuda_indices, sorted_indices.size()); + for (size_t i = 0; i < sorted_indices.size(); ++i) { + Log::Warning("sorted_indices[%d] = %d", i, sorted_indices[i]); + } +} + __global__ void GetGradientsKernel_Ranking(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, const data_size_t num_queries, const data_size_t* cuda_query_boundaries, const double* cuda_inverse_max_dcgs, const bool norm, const double sigmoid, const int truncation_level, @@ -77,10 +132,11 @@ __global__ void GetGradientsKernel_Ranking(const double* cuda_scores, const labe shared_hessians[threadIdx.x] = 0.0f; } else { shared_scores[threadIdx.x] = min_score; - shared_indices[threadIdx.x] = 0; + shared_indices[threadIdx.x] = static_cast(threadIdx.x); } __syncthreads(); ArgSort(shared_scores, shared_indices, static_cast(query_item_count)); + __syncthreads(); // get best and worst score const double best_score = shared_scores[shared_indices[0]]; data_size_t worst_idx = query_item_count - 1; @@ -88,8 +144,12 @@ __global__ void GetGradientsKernel_Ranking(const double* cuda_scores, const labe worst_idx -= 1; } 
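Side note on ArgSort / ArgSortGlobal above: as I read them, they are a bitonic sorting network over the power-of-two padded item count, producing indices ordered by descending score; padded slots carry the sentinel minimum score so they sink to the end (which is why worst_idx skips them). A rough host-side reference of that network, not code from the patch, assuming the index array length is already a power of two:

  #include <cstdint>
  #include <utility>
  #include <vector>

  void BitonicArgSortHostDesc(const std::vector<double>& scores, std::vector<uint16_t>* indices) {
    const size_t n = indices->size();  // assumed to be a power of two
    for (size_t segment = 2; segment <= n; segment <<= 1) {
      for (size_t stride = segment >> 1; stride >= 1; stride >>= 1) {
        for (size_t i = 0; i < n; ++i) {
          const size_t j = i ^ stride;
          if (j > i) {
            // blocks alternate direction; the final pass (segment == n) merges descending
            const bool descending = ((i & segment) == 0);
            const double a = scores[(*indices)[i]];
            const double b = scores[(*indices)[j]];
            if ((a < b) == descending) std::swap((*indices)[i], (*indices)[j]);
          }
        }
      }
    }
  }

For example, with scores {1.0, -2.0, 3.0, 0.1} and indices {0, 1, 2, 3}, this sketch leaves indices as {2, 0, 3, 1}, i.e. positions ordered from highest to lowest score.
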
const double worst_score = shared_scores[shared_indices[worst_idx]]; - double sum_lambdas = 0.0; - // start accmulate lambdas by pairs that contain at least one document above truncation level + __shared__ double sum_lambdas; + if (threadIdx.x == 0) { + sum_lambdas = 0.0f; + } + __syncthreads(); + // start accumulate lambdas by pairs that contain at least one document above truncation level for (data_size_t i = 0; i < query_item_count - 1 && i < truncation_level; ++i) { if (shared_scores[shared_indices[i]] == min_score) { continue; } if (threadIdx.x > static_cast(i) && threadIdx.x < static_cast(query_item_count)) { @@ -141,21 +201,28 @@ __global__ void GetGradientsKernel_Ranking(const double* cuda_scores, const labe atomicAdd_system(&sum_lambdas, -2 * p_lambda); } } + __syncthreads(); } __syncthreads(); if (norm && sum_lambdas > 0) { double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas; - for (data_size_t i = 0; i < query_item_count; ++i) { - cuda_out_gradients_pointer[i] = static_cast(shared_lambdas[i] * norm_factor); - cuda_out_hessians_pointer[i] = static_cast(shared_hessians[i] * norm_factor); + if (threadIdx.x < static_cast(query_item_count)) { + cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x] * norm_factor); + cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x] * norm_factor); + } + } else { + if (threadIdx.x < static_cast(query_item_count)) { + cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x]); + cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x]); } } + __syncthreads(); } } void CUDARankingObjective::LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; - GetGradientsKernel_Ranking<<>>(cuda_scores, cuda_labels_, num_data_, + GetGradientsKernel_Ranking<<>>(cuda_scores, cuda_labels_, num_data_, num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, norm_, sigmoid_, truncation_level_, cuda_out_gradients, cuda_out_hessians); @@ -164,6 +231,7 @@ void CUDARankingObjective::LaunchGetGradientsKernel(const double* cuda_scores, s __device__ void PrefixSumBankConflict(uint16_t* elements, unsigned int n) { unsigned int offset = 1; unsigned int threadIdx_x = threadIdx.x; + const uint16_t last_element = elements[n - 1]; __syncthreads(); for (int d = (n >> 1); d > 0; d >>= 1) { if (threadIdx_x < d) { @@ -189,6 +257,10 @@ __device__ void PrefixSumBankConflict(uint16_t* elements, unsigned int n) { } __syncthreads(); } + if (threadIdx.x == 0) { + elements[n] = elements[n - 1] + last_element; + } + __syncthreads(); } __global__ void CalcInverseMaxDCGKernel( @@ -198,7 +270,7 @@ __global__ void CalcInverseMaxDCGKernel( const data_size_t num_queries, double* cuda_inverse_max_dcgs) { __shared__ uint32_t label_sum[MAX_RANK_LABEL]; - __shared__ uint16_t label_pos[MAX_RANK_LABEL]; + __shared__ uint16_t label_pos[MAX_RANK_LABEL + 1]; const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { @@ -211,40 +283,50 @@ __global__ void CalcInverseMaxDCGKernel( __syncthreads(); const label_t* label_pointer = cuda_labels + query_start; if (threadIdx.x < static_cast(query_count)) { - atomicAdd_system(label_sum + 
static_cast(label_pointer[threadIdx.x]), 1); + atomicAdd_system(label_sum + (MAX_RANK_LABEL - 1 - static_cast(label_pointer[threadIdx.x])), 1); } __syncthreads(); if (threadIdx.x < MAX_RANK_LABEL) { - if (label_sum[threadIdx.x] > 0) { - label_pos[threadIdx.x] = 1; - } else { - label_pos[threadIdx.x] = 0; - } + label_pos[threadIdx.x] = label_sum[threadIdx.x]; } __syncthreads(); PrefixSumBankConflict(label_pos, MAX_RANK_LABEL); - double gain = 0.0f; + __syncthreads(); + __shared__ double gain; + if (threadIdx.x == 0) { + gain = 0.0f; + } + __syncthreads(); if (threadIdx.x < MAX_RANK_LABEL && label_sum[threadIdx.x] > 0) { - const double label_gain = (1 << threadIdx.x - 1) / log2(2.0f + label_pos[threadIdx.x]); + const uint16_t start_pos = label_pos[threadIdx.x]; + const uint16_t end_pos = min(label_pos[threadIdx.x + 1], truncation_level); + double label_gain = 0.0f; + for (uint16_t k = start_pos; k < end_pos; ++k) { + label_gain += ((1 << (MAX_RANK_LABEL - 1 - threadIdx.x)) - 1) / log(2.0f + k); + } atomicAdd_system(&gain, label_gain); } __syncthreads(); - if (gain > 0.0f) { - cuda_inverse_max_dcgs[query_index] = 1.0f / gain; - } else { - cuda_inverse_max_dcgs[query_index] = 0.0f; + if (threadIdx.x == 0) { + if (gain > 0.0f) { + cuda_inverse_max_dcgs[query_index] = 1.0f / gain; + } else { + cuda_inverse_max_dcgs[query_index] = 0.0f; + } } + __syncthreads(); } } void CUDARankingObjective::LaunchCalcInverseMaxDCGKernel() { const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; - CalcInverseMaxDCGKernel<<>>( + CalcInverseMaxDCGKernel<<>>( cuda_query_boundaries_, cuda_labels_, truncation_level_, num_queries_, cuda_inverse_max_dcgs_); + SynchronizeCUDADevice(); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_ranking_objective.hpp b/src/treelearner/cuda/cuda_ranking_objective.hpp index c2d8981b6dca..7f311b3c9222 100644 --- a/src/treelearner/cuda/cuda_ranking_objective.hpp +++ b/src/treelearner/cuda/cuda_ranking_objective.hpp @@ -24,7 +24,8 @@ class CUDARankingObjective : public CUDAObjective { CUDARankingObjective( const data_size_t num_data, const label_t* cuda_label, - const int* cpu_query_boundaries, + const data_size_t* cuda_query_boundaries, + const data_size_t* cpu_query_boundaries, const int num_queries, const bool norm, const double sigmoid, @@ -42,12 +43,16 @@ class CUDARankingObjective : public CUDAObjective { void GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) override; + void TestGlobalArgSort() const override; + private: void LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians); void LaunchCalcInverseMaxDCGKernel(); + void LaunchGlobalArgSort() const; + // CUDA memory, held by this object double* cuda_init_score_; double* cuda_lambdas_; @@ -64,6 +69,7 @@ class CUDARankingObjective : public CUDAObjective { const int truncation_level_; label_t max_label_; const int num_threads_; + int max_items_in_query_aligned_; }; } // namespace LightGBM diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 143f7e23ada9..bb152f4bdfa9 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -283,10 +283,11 @@ void NewCUDATreeLearner::InitObjective() { cuda_centralized_info_->cuda_labels(), config_->sigmoid)); } else if (config_->objective == std::string("regression")) { cuda_objective_.reset(new CUDARegressionObjective(num_data_, 
cuda_centralized_info_->cuda_labels())); - } else if (config_->objective == std::string("ranking")) { + } else if (config_->objective == std::string("lambdarank")) { cuda_objective_.reset(new CUDARankingObjective(num_data_, cuda_centralized_info_->cuda_labels(), cuda_centralized_info_->cuda_query_boundaries(), + train_data_->metadata().query_boundaries(), train_data_->metadata().num_queries(), config_->lambdarank_norm, config_->sigmoid, From 3202b79bb30981118d6c80ea60fdf084761565d8 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 16 Jun 2021 12:17:51 +0000 Subject: [PATCH 032/166] support for number of items <= 2048 per query --- .../cuda/cuda_histogram_constructor.cu | 8 +- .../cuda/cuda_ranking_objective.cpp | 2 +- .../cuda/cuda_ranking_objective.cu | 310 ++++++++++++++++-- .../cuda/cuda_ranking_objective.hpp | 5 +- 4 files changed, 286 insertions(+), 39 deletions(-) diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index f843b619232a..dab927e1e9e0 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -104,8 +104,8 @@ __global__ void CUDAConstructHistogramDenseKernel( const uint32_t bin = static_cast(data_ptr[data_index * num_columns_in_partition + threadIdx.x]); const uint32_t pos = bin << 1; float* pos_ptr = shared_hist_ptr + pos; - atomicAdd_system(pos_ptr, grad); - atomicAdd_system(pos_ptr + 1, hess); + atomicAdd_block(pos_ptr, grad); + atomicAdd_block(pos_ptr + 1, hess); inner_data_index += blockDim.y; } } @@ -172,8 +172,8 @@ __global__ void CUDAConstructHistogramSparseKernel( const uint32_t bin = static_cast(data_ptr[row_start + threadIdx.x]); const uint32_t pos = bin << 1; float* pos_ptr = shared_hist + pos; - atomicAdd_system(pos_ptr, grad); - atomicAdd_system(pos_ptr + 1, hess); + atomicAdd_block(pos_ptr, grad); + atomicAdd_block(pos_ptr + 1, hess); } inner_data_index += blockDim.y; } diff --git a/src/treelearner/cuda/cuda_ranking_objective.cpp b/src/treelearner/cuda/cuda_ranking_objective.cpp index e8a88f929f98..eb158118df15 100644 --- a/src/treelearner/cuda/cuda_ranking_objective.cpp +++ b/src/treelearner/cuda/cuda_ranking_objective.cpp @@ -70,7 +70,7 @@ num_threads_(num_threads) { max_items_in_query_aligned_ <<= 1; } if (max_items_in_query_aligned_ > MAX_NUM_ITEM_IN_QUERY) { - Log::Warning("Too many items in a query."); + Log::Warning("Too many items (%d) in a query.", max_items_in_query_aligned_); } } diff --git a/src/treelearner/cuda/cuda_ranking_objective.cu b/src/treelearner/cuda/cuda_ranking_objective.cu index a15b665b82a6..8f52916a8fec 100644 --- a/src/treelearner/cuda/cuda_ranking_objective.cu +++ b/src/treelearner/cuda/cuda_ranking_objective.cu @@ -10,7 +10,7 @@ namespace LightGBM { -__device__ void ArgSort(const double* scores, uint16_t* indices, const uint16_t num_items) { +__device__ void ArgSort(const score_t* scores, uint16_t* indices, const uint16_t num_items) { uint16_t num_items_aligned = 1; uint16_t num_items_ref = num_items - 1; uint16_t depth = 1; @@ -30,18 +30,42 @@ __device__ void ArgSort(const double* scores, uint16_t* indices, const uint16_t if (threadIdx.x < num_items_aligned) { if (half_segment_index % 2 == 0) { const uint16_t index_to_compare = threadIdx.x + half_segment_length; - if (ascending) { - if (scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) { - const uint16_t index = indices[threadIdx.x]; - indices[threadIdx.x] = indices[index_to_compare]; - indices[index_to_compare] = index; - } - } else 
{ - if (scores[indices[threadIdx.x]] < scores[indices[index_to_compare]]) { - const uint16_t index = indices[threadIdx.x]; - indices[threadIdx.x] = indices[index_to_compare]; - indices[index_to_compare] = index; - } + if ((scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) == ascending) { + const uint16_t index = indices[threadIdx.x]; + indices[threadIdx.x] = indices[index_to_compare]; + indices[index_to_compare] = index; + } + } + } + __syncthreads(); + } + } +} + +__device__ void ArgSort_Partial(const score_t* scores, uint16_t* indices, const uint16_t num_items, const bool outer_decending) { + uint16_t num_items_aligned = 1; + uint16_t num_items_ref = num_items - 1; + uint16_t depth = 1; + while (num_items_ref > 0) { + num_items_aligned <<= 1; + num_items_ref >>= 1; + ++depth; + } + for (uint16_t outer_depth = depth - 1; outer_depth >= 1; --outer_depth) { + const uint16_t outer_segment_length = 1 << (depth - outer_depth); + const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; + const bool ascending = outer_decending ? (outer_segment_index % 2 > 0) : (outer_segment_index % 2 == 0); + for (uint16_t inner_depth = outer_depth; inner_depth < depth; ++inner_depth) { + const uint16_t segment_length = 1 << (depth - inner_depth); + const uint16_t half_segment_length = segment_length >> 1; + const uint16_t half_segment_index = threadIdx.x / half_segment_length; + if (threadIdx.x < num_items_aligned) { + if (half_segment_index % 2 == 0) { + const uint16_t index_to_compare = threadIdx.x + half_segment_length; + if ((scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) == ascending) { + const uint16_t index = indices[threadIdx.x]; + indices[threadIdx.x] = indices[index_to_compare]; + indices[index_to_compare] = index; } } } @@ -50,6 +74,54 @@ __device__ void ArgSort(const double* scores, uint16_t* indices, const uint16_t } } +__device__ void ArgSort_2048(const score_t* scores, uint16_t* indices, const uint16_t num_items) { + const uint16_t depth = 11; + const uint16_t half_num_items_aligned = 1024; + ArgSort_Partial(scores, indices, half_num_items_aligned, true); + ArgSort_Partial(scores + half_num_items_aligned, indices + half_num_items_aligned, half_num_items_aligned, false); + const unsigned int index_to_compare = threadIdx.x + half_num_items_aligned; + if (scores[indices[index_to_compare]] > scores[indices[threadIdx.x]]) { + const uint16_t temp_index = indices[index_to_compare]; + indices[index_to_compare] = indices[threadIdx.x]; + indices[threadIdx.x] = temp_index; + } + __syncthreads(); + for (uint16_t inner_depth = 1; inner_depth < depth; ++inner_depth) { + const uint16_t segment_length = 1 << (depth - inner_depth); + const uint16_t half_segment_length = segment_length >> 1; + const uint16_t half_segment_index = threadIdx.x / half_segment_length; + if (threadIdx.x < half_num_items_aligned) { + if (half_segment_index % 2 == 0) { + const uint16_t index_to_compare = threadIdx.x + half_segment_length; + if (scores[indices[threadIdx.x]] < scores[indices[index_to_compare]]) { + const uint16_t index = indices[threadIdx.x]; + indices[threadIdx.x] = indices[index_to_compare]; + indices[index_to_compare] = index; + } + } + } + __syncthreads(); + } + const score_t* scores_ptr = scores + half_num_items_aligned; + uint16_t* indices_ptr = indices + half_num_items_aligned; + for (uint16_t inner_depth = 1; inner_depth < depth; ++inner_depth) { + const uint16_t segment_length = 1 << (depth - inner_depth); + const uint16_t half_segment_length = segment_length >> 
1; + const uint16_t half_segment_index = threadIdx.x / half_segment_length; + if (threadIdx.x < half_num_items_aligned) { + if (half_segment_index % 2 == 0) { + const uint16_t index_to_compare = threadIdx.x + half_segment_length; + if (scores_ptr[indices_ptr[threadIdx.x]] < scores_ptr[indices_ptr[index_to_compare]]) { + const uint16_t index = indices_ptr[threadIdx.x]; + indices_ptr[threadIdx.x] = indices_ptr[index_to_compare]; + indices_ptr[index_to_compare] = index; + } + } + } + __syncthreads(); + } +} + __global__ void ArgSortGlobal(const double* scores, uint16_t* indices, const uint16_t num_items) { uint16_t num_items_aligned = 1; uint16_t num_items_ref = num_items - 1; @@ -109,10 +181,10 @@ __global__ void GetGradientsKernel_Ranking(const double* cuda_scores, const labe const data_size_t num_queries, const data_size_t* cuda_query_boundaries, const double* cuda_inverse_max_dcgs, const bool norm, const double sigmoid, const int truncation_level, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - __shared__ double shared_scores[MAX_NUM_ITEM_IN_QUERY]; + __shared__ score_t shared_scores[MAX_NUM_ITEM_IN_QUERY]; __shared__ uint16_t shared_indices[MAX_NUM_ITEM_IN_QUERY]; - __shared__ double shared_lambdas[MAX_NUM_ITEM_IN_QUERY]; - __shared__ double shared_hessians[MAX_NUM_ITEM_IN_QUERY]; + __shared__ score_t shared_lambdas[MAX_NUM_ITEM_IN_QUERY]; + __shared__ score_t shared_hessians[MAX_NUM_ITEM_IN_QUERY]; const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); const double min_score = -100000000000.0f; @@ -150,10 +222,16 @@ __global__ void GetGradientsKernel_Ranking(const double* cuda_scores, const labe } __syncthreads(); // start accumulate lambdas by pairs that contain at least one document above truncation level - for (data_size_t i = 0; i < query_item_count - 1 && i < truncation_level; ++i) { - if (shared_scores[shared_indices[i]] == min_score) { continue; } - if (threadIdx.x > static_cast(i) && threadIdx.x < static_cast(query_item_count)) { - const data_size_t j = static_cast(threadIdx.x); + const data_size_t num_items_i = min(query_item_count - 1, truncation_level); + const data_size_t num_j_per_i = query_item_count - 1; + const data_size_t num_pairs = num_items_i * num_j_per_i; + const data_size_t num_pairs_per_thread = (num_pairs + blockDim.x - 1) / blockDim.x; + const data_size_t thread_start = static_cast(threadIdx.x) * num_pairs_per_thread; + const data_size_t thread_end = min(thread_start + num_pairs_per_thread, num_pairs); + for (data_size_t pair_index = thread_start; pair_index < thread_end; ++pair_index) { + const data_size_t i = pair_index / num_j_per_i; + const data_size_t j = pair_index % num_j_per_i + 1; + if (j > i) { // skip pairs with the same labels if (cuda_label_pointer[shared_indices[i]] != cuda_label_pointer[shared_indices[j]] && shared_scores[shared_indices[j]] != min_score) { data_size_t high_rank, low_rank; @@ -193,15 +271,14 @@ __global__ void GetGradientsKernel_Ranking(const double* cuda_scores, const labe // update p_lambda *= -sigmoid * delta_pair_NDCG; p_hessian *= sigmoid * sigmoid * delta_pair_NDCG; - atomicAdd_system(shared_lambdas + low, -static_cast(p_lambda)); - atomicAdd_system(shared_hessians + low, static_cast(p_hessian)); - atomicAdd_system(shared_lambdas + high, static_cast(p_lambda)); - atomicAdd_system(shared_hessians + high, static_cast(p_hessian)); + atomicAdd_block(shared_lambdas + low, 
-static_cast(p_lambda)); + atomicAdd_block(shared_hessians + low, static_cast(p_hessian)); + atomicAdd_block(shared_lambdas + high, static_cast(p_lambda)); + atomicAdd_block(shared_hessians + high, static_cast(p_hessian)); // lambda is negative, so use minus to accumulate - atomicAdd_system(&sum_lambdas, -2 * p_lambda); + atomicAdd_block(&sum_lambdas, -2 * p_lambda); } } - __syncthreads(); } __syncthreads(); if (norm && sum_lambdas > 0) { @@ -220,12 +297,183 @@ __global__ void GetGradientsKernel_Ranking(const double* cuda_scores, const labe } } +/*__device__ void ReduceSumRanking(double* array, const size_t size) { + //const unsigned int threadIdx_x = threadIdx.x; + for (int s = 1; s < size; s <<= 1) { + if (threadIdx.x; % (2 * s) == 0 && (threadIdx.x; + s) < size) { + array[threadIdx.x;] += array[threadIdx.x; + s]; + } + __syncthreads(); + } +}*/ + +__global__ void GetGradientsKernel_Ranking_2048(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, + const data_size_t num_queries, const data_size_t* cuda_query_boundaries, const double* cuda_inverse_max_dcgs, + const bool norm, const double sigmoid, const int truncation_level, + score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + __shared__ score_t shared_scores[MAX_NUM_ITEM_IN_QUERY]; + __shared__ uint16_t shared_indices[MAX_NUM_ITEM_IN_QUERY]; + __shared__ score_t shared_lambdas[MAX_NUM_ITEM_IN_QUERY]; + __shared__ score_t shared_hessians[MAX_NUM_ITEM_IN_QUERY]; + const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; + const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); + const double min_score = -100000000000.0f; + for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { + const double inverse_max_dcg = cuda_inverse_max_dcgs[query_index]; + const data_size_t query_start = cuda_query_boundaries[query_index]; + const data_size_t query_end = cuda_query_boundaries[query_index + 1]; + const data_size_t query_item_count = query_end - query_start; + const double* cuda_scores_pointer = cuda_scores + query_start; + score_t* cuda_out_gradients_pointer = cuda_out_gradients + query_start; + score_t* cuda_out_hessians_pointer = cuda_out_hessians + query_start; + const label_t* cuda_label_pointer = cuda_labels + query_start; + if (threadIdx.x < query_item_count) { + shared_scores[threadIdx.x] = cuda_scores_pointer[threadIdx.x]; + shared_indices[threadIdx.x] = static_cast(threadIdx.x); + shared_lambdas[threadIdx.x] = 0.0f; + shared_hessians[threadIdx.x] = 0.0f; + } else { + shared_scores[threadIdx.x] = min_score; + shared_indices[threadIdx.x] = static_cast(threadIdx.x); + } + if (query_item_count > 1024) { + const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; + if (threadIdx_x_plus_1024 < query_item_count) { + shared_scores[threadIdx_x_plus_1024] = cuda_scores_pointer[threadIdx_x_plus_1024]; + shared_indices[threadIdx_x_plus_1024] = static_cast(threadIdx_x_plus_1024); + shared_lambdas[threadIdx_x_plus_1024] = 0.0f; + shared_hessians[threadIdx_x_plus_1024] = 0.0f; + } else { + shared_scores[threadIdx_x_plus_1024] = min_score; + shared_indices[threadIdx_x_plus_1024] = static_cast(threadIdx_x_plus_1024); + } + } + __syncthreads(); + if (query_item_count > 1024) { + ArgSort_2048(shared_scores, shared_indices, static_cast(query_item_count)); + } else { + ArgSort(shared_scores, shared_indices, static_cast(query_item_count)); + } + __syncthreads(); + // get best and worst score + const double 
best_score = shared_scores[shared_indices[0]]; + data_size_t worst_idx = query_item_count - 1; + if (worst_idx > 0 && shared_scores[shared_indices[worst_idx]] == min_score) { + worst_idx -= 1; + } + const double worst_score = shared_scores[shared_indices[worst_idx]]; + __shared__ double sum_lambdas; + if (threadIdx.x == 0) { + sum_lambdas = 0.0f; + } + __syncthreads(); + // start accumulate lambdas by pairs that contain at least one document above truncation level + const data_size_t num_items_i = min(query_item_count - 1, truncation_level); + const data_size_t num_j_per_i = query_item_count - 1; + const data_size_t num_pairs = num_items_i * num_j_per_i; + const data_size_t num_pairs_per_thread = (num_pairs + blockDim.x - 1) / blockDim.x; + const data_size_t thread_start = static_cast(threadIdx.x) * num_pairs_per_thread; + const data_size_t thread_end = min(thread_start + num_pairs_per_thread, num_pairs); + double thread_sum_lambdas = 0.0f; + for (data_size_t pair_index = thread_start; pair_index < thread_end; ++pair_index) { + const data_size_t i = pair_index / num_j_per_i; + const data_size_t j = pair_index % num_j_per_i + 1; + if (j > i) { + // skip pairs with the same labels + if (cuda_label_pointer[shared_indices[i]] != cuda_label_pointer[shared_indices[j]] && shared_scores[shared_indices[j]] != min_score) { + data_size_t high_rank, low_rank; + if (cuda_label_pointer[shared_indices[i]] > cuda_label_pointer[shared_indices[j]]) { + high_rank = i; + low_rank = j; + } else { + high_rank = j; + low_rank = i; + } + const data_size_t high = shared_indices[high_rank]; + const int high_label = static_cast(cuda_label_pointer[high]); + const double high_score = shared_scores[high]; + const double high_label_gain = static_cast((1 << high_label) - 1); + const double high_discount = log2(2.0f + high_rank); + const data_size_t low = shared_indices[low_rank]; + const int low_label = static_cast(cuda_label_pointer[low]); + const double low_score = shared_scores[low]; + const double low_label_gain = static_cast((1 << low_label) - 1); + const double low_discount = log2(2.0f + low_rank); + + const double delta_score = high_score - low_score; + + // get dcg gap + const double dcg_gap = high_label_gain - low_label_gain; + // get discount of this pair + const double paired_discount = fabs(high_discount - low_discount); + // get delta NDCG + double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg; + // regular the delta_pair_NDCG by score distance + if (norm && best_score != worst_score) { + delta_pair_NDCG /= (0.01f + fabs(delta_score)); + } + // calculate lambda for this pair + double p_lambda = 1.0f / (1.0f + exp(sigmoid * delta_score)); + double p_hessian = p_lambda * (1.0f - p_lambda); + // update + p_lambda *= -sigmoid * delta_pair_NDCG; + p_hessian *= sigmoid * sigmoid * delta_pair_NDCG; + atomicAdd_block(shared_lambdas + low, -static_cast(p_lambda)); + atomicAdd_block(shared_hessians + low, static_cast(p_hessian)); + atomicAdd_block(shared_lambdas + high, static_cast(p_lambda)); + atomicAdd_block(shared_hessians + high, static_cast(p_hessian)); + // lambda is negative, so use minus to accumulate + thread_sum_lambdas -= 2 * p_lambda; + } + } + } + atomicAdd_block(&sum_lambdas, thread_sum_lambdas); + __syncthreads(); + if (norm && sum_lambdas > 0) { + double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas; + if (threadIdx.x < static_cast(query_item_count)) { + cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x] * norm_factor); + 
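The pair enumeration used just above flattens the (i, j) double loop over document pairs into a single pair_index range that is split evenly across the block's threads, instead of assigning one i per thread. The standalone sketch below (not part of the patch; the file name and kernel are illustrative, and the truncation level, equal-label skipping, and NDCG weighting of the real kernel are omitted) shows only that index mapping and verifies that the j > i filter recovers exactly the n * (n - 1) / 2 ordered pairs.

// pair_flatten_sketch.cu -- standalone illustration, hypothetical file name.
// Maps a flat pair_index onto an (i, j) document pair the same way the
// ranking kernel above does, and counts how many indices pass the j > i filter.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void CountValidPairs(const int num_items, int* valid_pair_count) {
  const int num_items_i = num_items - 1;   // candidate "i" positions (truncation omitted here)
  const int num_j_per_i = num_items - 1;   // candidate "j" positions per i
  const int num_pairs = num_items_i * num_j_per_i;
  const int pairs_per_thread = (num_pairs + blockDim.x - 1) / blockDim.x;
  const int start = threadIdx.x * pairs_per_thread;
  const int end = min(start + pairs_per_thread, num_pairs);
  int local_count = 0;
  for (int pair_index = start; pair_index < end; ++pair_index) {
    const int i = pair_index / num_j_per_i;
    const int j = pair_index % num_j_per_i + 1;
    if (j > i) {  // only ordered pairs with i < j are real document pairs
      ++local_count;
    }
  }
  atomicAdd(valid_pair_count, local_count);
}

int main() {
  const int num_items = 100;
  int* count = nullptr;
  cudaMallocManaged(&count, sizeof(int));
  *count = 0;
  CountValidPairs<<<1, 256>>>(num_items, count);
  cudaDeviceSynchronize();
  // Expect n * (n - 1) / 2 = 4950 ordered pairs for n = 100.
  printf("valid pairs: %d\n", *count);
  cudaFree(count);
  return 0;
}

The sketch compiles with nvcc as a separate translation unit and does not touch any LightGBM code.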
cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x] * norm_factor); + } + if (query_item_count > 1024) { + const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; + if (threadIdx_x_plus_1024 < static_cast(query_item_count)) { + cuda_out_gradients_pointer[threadIdx_x_plus_1024] = static_cast(shared_lambdas[threadIdx_x_plus_1024] * norm_factor); + cuda_out_hessians_pointer[threadIdx_x_plus_1024] = static_cast(shared_hessians[threadIdx_x_plus_1024] * norm_factor); + } + } + } else { + if (threadIdx.x < static_cast(query_item_count)) { + cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x]); + cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x]); + } + if (query_item_count > 1024) { + const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; + if (threadIdx_x_plus_1024 < static_cast(query_item_count)) { + cuda_out_gradients_pointer[threadIdx_x_plus_1024] = static_cast(shared_lambdas[threadIdx_x_plus_1024]); + cuda_out_hessians_pointer[threadIdx_x_plus_1024] = static_cast(shared_hessians[threadIdx_x_plus_1024]); + } + } + } + __syncthreads(); + } +} + void CUDARankingObjective::LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; - GetGradientsKernel_Ranking<<>>(cuda_scores, cuda_labels_, num_data_, - num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, - norm_, sigmoid_, truncation_level_, - cuda_out_gradients, cuda_out_hessians); + if (max_items_in_query_aligned_ <= 1024) { + GetGradientsKernel_Ranking<<>>(cuda_scores, cuda_labels_, num_data_, + num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, + norm_, sigmoid_, truncation_level_, + cuda_out_gradients, cuda_out_hessians); + } else if (max_items_in_query_aligned_ <= 2048) { + GetGradientsKernel_Ranking_2048<<>>(cuda_scores, cuda_labels_, num_data_, + num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, + norm_, sigmoid_, truncation_level_, + cuda_out_gradients, cuda_out_hessians); + } else { + Log::Fatal("Too large max_items_in_query_aligned_ = %d", max_items_in_query_aligned_); + } } __device__ void PrefixSumBankConflict(uint16_t* elements, unsigned int n) { @@ -320,7 +568,7 @@ __global__ void CalcInverseMaxDCGKernel( void CUDARankingObjective::LaunchCalcInverseMaxDCGKernel() { const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; - CalcInverseMaxDCGKernel<<>>( + CalcInverseMaxDCGKernel<<>>( cuda_query_boundaries_, cuda_labels_, truncation_level_, diff --git a/src/treelearner/cuda/cuda_ranking_objective.hpp b/src/treelearner/cuda/cuda_ranking_objective.hpp index 7f311b3c9222..88df461485a9 100644 --- a/src/treelearner/cuda/cuda_ranking_objective.hpp +++ b/src/treelearner/cuda/cuda_ranking_objective.hpp @@ -9,9 +9,8 @@ #ifdef USE_CUDA -#define GET_GRADIENTS_BLOCK_SIZE_RANKING_RANKING (128) -#define MAX_NUM_ITEM_IN_QUERY (1024) -#define NUM_QUERY_PER_BLOCK (100) +#define MAX_NUM_ITEM_IN_QUERY (2048) +#define NUM_QUERY_PER_BLOCK (10) #define MAX_RANK_LABEL (32) #include "cuda_objective.hpp" From cd687c96e610ae2774de3add8b664e0444126bf5 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 23 Jun 2021 05:17:40 +0000 Subject: [PATCH 033/166] speedup histogram construction by interleaving global memory access --- .../cuda/cuda_best_split_finder.cpp | 4 +- .../cuda/cuda_best_split_finder.cu | 264 +++- src/treelearner/cuda/cuda_data_partition.cpp | 22 
+- src/treelearner/cuda/cuda_data_partition.cu | 1149 ++++++++++++++--- src/treelearner/cuda/cuda_data_partition.hpp | 25 +- .../cuda/cuda_histogram_constructor.cpp | 43 +- .../cuda/cuda_histogram_constructor.cu | 469 +++++-- .../cuda/cuda_histogram_constructor.hpp | 16 + .../cuda/new_cuda_tree_learner.cpp | 4 +- 9 files changed, 1660 insertions(+), 336 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index efcfc3ea3b12..dc376cf1c410 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -168,10 +168,10 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_le const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf) { const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ && sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_); - const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ && sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_); + const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ && sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0); LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits, smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); - SynchronizeCUDADevice(); + //SynchronizeCUDADevice(); global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); SynchronizeCUDADevice(); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index cb81bad2805d..1aff354fbaa8 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -520,7 +520,110 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( if (!is_smaller_leaf_valid) { larger_only = true; } - const int num_blocks = (larger_leaf_index >= 0 && !larger_only) ? 
num_tasks_ * 2 : num_tasks_; + if (!larger_only) { + FindBestSplitsForLeafKernel<<>>( + // input feature information + cuda_feature_hist_offsets_, + cuda_feature_mfb_offsets_, + cuda_feature_default_bins_, + cuda_feature_missing_type_, + cuda_feature_num_bins_, + // input task information + larger_only, + num_tasks_, + cuda_task_feature_index_, + cuda_task_reverse_, + cuda_task_skip_default_bin_, + cuda_task_na_as_missing_, + cuda_task_out_default_left_, + // input leaf information + smaller_leaf_index, + smaller_leaf_splits->cuda_gain(), + smaller_leaf_splits->cuda_sum_of_gradients(), + smaller_leaf_splits->cuda_sum_of_hessians(), + smaller_leaf_splits->cuda_num_data_in_leaf(), + smaller_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), + larger_leaf_index, + larger_leaf_splits->cuda_gain(), + larger_leaf_splits->cuda_sum_of_gradients(), + larger_leaf_splits->cuda_sum_of_hessians(), + larger_leaf_splits->cuda_num_data_in_leaf(), + larger_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), + // configuration parameter values + min_data_in_leaf_, + min_sum_hessian_in_leaf_, + min_gain_to_split_, + lambda_l1_, + lambda_l2_, + // output parameters + cuda_best_split_threshold_, + cuda_best_split_default_left_, + cuda_best_split_gain_, + cuda_best_split_left_sum_gradient_, + cuda_best_split_left_sum_hessian_, + cuda_best_split_left_count_, + cuda_best_split_left_gain_, + cuda_best_split_left_output_, + cuda_best_split_right_sum_gradient_, + cuda_best_split_right_sum_hessian_, + cuda_best_split_right_count_, + cuda_best_split_right_gain_, + cuda_best_split_right_output_, + cuda_best_split_found_); + } + SynchronizeCUDADevice(); + if (larger_leaf_index >= 0) { + FindBestSplitsForLeafKernel<<>>( + // input feature information + cuda_feature_hist_offsets_, + cuda_feature_mfb_offsets_, + cuda_feature_default_bins_, + cuda_feature_missing_type_, + cuda_feature_num_bins_, + // input task information + true, + num_tasks_, + cuda_task_feature_index_, + cuda_task_reverse_, + cuda_task_skip_default_bin_, + cuda_task_na_as_missing_, + cuda_task_out_default_left_, + // input leaf information + smaller_leaf_index, + smaller_leaf_splits->cuda_gain(), + smaller_leaf_splits->cuda_sum_of_gradients(), + smaller_leaf_splits->cuda_sum_of_hessians(), + smaller_leaf_splits->cuda_num_data_in_leaf(), + smaller_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), + larger_leaf_index, + larger_leaf_splits->cuda_gain(), + larger_leaf_splits->cuda_sum_of_gradients(), + larger_leaf_splits->cuda_sum_of_hessians(), + larger_leaf_splits->cuda_num_data_in_leaf(), + larger_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), + // configuration parameter values + min_data_in_leaf_, + min_sum_hessian_in_leaf_, + min_gain_to_split_, + lambda_l1_, + lambda_l2_, + // output parameters + cuda_best_split_threshold_, + cuda_best_split_default_left_, + cuda_best_split_gain_, + cuda_best_split_left_sum_gradient_, + cuda_best_split_left_sum_hessian_, + cuda_best_split_left_count_, + cuda_best_split_left_gain_, + cuda_best_split_left_output_, + cuda_best_split_right_sum_gradient_, + cuda_best_split_right_sum_hessian_, + cuda_best_split_right_count_, + cuda_best_split_right_gain_, + cuda_best_split_right_output_, + cuda_best_split_found_); + } + /*const int num_blocks = (larger_leaf_index >= 0 && !larger_only) ? 
num_tasks_ * 2 : num_tasks_; FindBestSplitsForLeafKernel<<>>( // input feature information cuda_feature_hist_offsets_, @@ -569,7 +672,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_best_split_right_count_, cuda_best_split_right_gain_, cuda_best_split_right_output_, - cuda_best_split_found_); + cuda_best_split_found_);*/ } __device__ void ReduceBestSplit(uint8_t* found, double* gain, uint32_t* shared_read_index, @@ -687,50 +790,54 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( uint8_t* cuda_leaf_best_split_found, const bool larger_only) { if (!larger_only) { - for (unsigned int block_index = 1; block_index < num_blocks_per_leaf; ++block_index) { - const unsigned int leaf_read_pos = static_cast(smaller_leaf_index) + block_index * static_cast(num_leaves); - if ((cuda_leaf_best_split_found[leaf_read_pos] == 1 && cuda_leaf_best_split_found[smaller_leaf_index] == 1 && - cuda_leaf_best_split_gain[leaf_read_pos] > cuda_leaf_best_split_gain[smaller_leaf_index]) || - (cuda_leaf_best_split_found[smaller_leaf_index] == 0 && cuda_leaf_best_split_found[leaf_read_pos] == 1)) { - cuda_leaf_best_split_found[smaller_leaf_index] = cuda_leaf_best_split_found[leaf_read_pos]; - cuda_leaf_best_split_feature[smaller_leaf_index] = cuda_leaf_best_split_feature[leaf_read_pos]; - cuda_leaf_best_split_default_left[smaller_leaf_index] = cuda_leaf_best_split_default_left[leaf_read_pos]; - cuda_leaf_best_split_threshold[smaller_leaf_index] = cuda_leaf_best_split_threshold[leaf_read_pos]; - cuda_leaf_best_split_gain[smaller_leaf_index] = cuda_leaf_best_split_gain[leaf_read_pos]; - cuda_leaf_best_split_left_sum_gradient[smaller_leaf_index] = cuda_leaf_best_split_left_sum_gradient[leaf_read_pos]; - cuda_leaf_best_split_left_sum_hessian[smaller_leaf_index] = cuda_leaf_best_split_left_sum_hessian[leaf_read_pos]; - cuda_leaf_best_split_left_count[smaller_leaf_index] = cuda_leaf_best_split_left_count[leaf_read_pos]; - cuda_leaf_best_split_left_gain[smaller_leaf_index] = cuda_leaf_best_split_left_gain[leaf_read_pos]; - cuda_leaf_best_split_left_output[smaller_leaf_index] = cuda_leaf_best_split_left_output[leaf_read_pos]; - cuda_leaf_best_split_right_sum_gradient[smaller_leaf_index] = cuda_leaf_best_split_right_sum_gradient[leaf_read_pos]; - cuda_leaf_best_split_right_sum_hessian[smaller_leaf_index] = cuda_leaf_best_split_right_sum_hessian[leaf_read_pos]; - cuda_leaf_best_split_right_count[smaller_leaf_index] = cuda_leaf_best_split_right_count[leaf_read_pos]; - cuda_leaf_best_split_right_gain[smaller_leaf_index] = cuda_leaf_best_split_right_gain[leaf_read_pos]; - cuda_leaf_best_split_right_output[smaller_leaf_index] = cuda_leaf_best_split_right_output[leaf_read_pos]; + if (blockIdx.x == 0) { + for (unsigned int block_index = 1; block_index < num_blocks_per_leaf; ++block_index) { + const unsigned int leaf_read_pos = static_cast(smaller_leaf_index) + block_index * static_cast(num_leaves); + if ((cuda_leaf_best_split_found[leaf_read_pos] == 1 && cuda_leaf_best_split_found[smaller_leaf_index] == 1 && + cuda_leaf_best_split_gain[leaf_read_pos] > cuda_leaf_best_split_gain[smaller_leaf_index]) || + (cuda_leaf_best_split_found[smaller_leaf_index] == 0 && cuda_leaf_best_split_found[leaf_read_pos] == 1)) { + cuda_leaf_best_split_found[smaller_leaf_index] = cuda_leaf_best_split_found[leaf_read_pos]; + cuda_leaf_best_split_feature[smaller_leaf_index] = cuda_leaf_best_split_feature[leaf_read_pos]; + cuda_leaf_best_split_default_left[smaller_leaf_index] = cuda_leaf_best_split_default_left[leaf_read_pos]; + 
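The blockIdx.x guards above let a single launch run the smaller-leaf merge loop in block 0 and the larger-leaf merge loop in block 1, so the two serial reductions proceed concurrently rather than back to back. A minimal standalone sketch of that pattern, assuming two independent buffers to merge (MergePerBlock and the buffer names are placeholders, not patch code):

// block_split_sketch.cu -- standalone illustration, hypothetical file name.
// Block 0 scans buffer_a and block 1 scans buffer_b, mirroring how the
// kernel above dedicates block 0 to the smaller leaf and block 1 to the larger leaf.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void MergePerBlock(const double* buffer_a, const double* buffer_b,
                              const int n, double* best_a, double* best_b) {
  if (threadIdx.x != 0) return;  // the merge loop itself is serial, as in the kernel above
  if (blockIdx.x == 0) {
    double best = buffer_a[0];
    for (int i = 1; i < n; ++i) {
      if (buffer_a[i] > best) best = buffer_a[i];
    }
    *best_a = best;
  } else if (blockIdx.x == 1) {
    double best = buffer_b[0];
    for (int i = 1; i < n; ++i) {
      if (buffer_b[i] > best) best = buffer_b[i];
    }
    *best_b = best;
  }
}

int main() {
  const int n = 1024;
  double *a, *b, *best_a, *best_b;
  cudaMallocManaged(&a, n * sizeof(double));
  cudaMallocManaged(&b, n * sizeof(double));
  cudaMallocManaged(&best_a, sizeof(double));
  cudaMallocManaged(&best_b, sizeof(double));
  for (int i = 0; i < n; ++i) { a[i] = i * 0.5; b[i] = (n - i) * 0.25; }
  // Both serial merge loops run concurrently, one per block.
  MergePerBlock<<<2, 32>>>(a, b, n, best_a, best_b);
  cudaDeviceSynchronize();
  printf("best_a = %f, best_b = %f\n", *best_a, *best_b);
  cudaFree(a); cudaFree(b); cudaFree(best_a); cudaFree(best_b);
  return 0;
}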
cuda_leaf_best_split_threshold[smaller_leaf_index] = cuda_leaf_best_split_threshold[leaf_read_pos]; + cuda_leaf_best_split_gain[smaller_leaf_index] = cuda_leaf_best_split_gain[leaf_read_pos]; + cuda_leaf_best_split_left_sum_gradient[smaller_leaf_index] = cuda_leaf_best_split_left_sum_gradient[leaf_read_pos]; + cuda_leaf_best_split_left_sum_hessian[smaller_leaf_index] = cuda_leaf_best_split_left_sum_hessian[leaf_read_pos]; + cuda_leaf_best_split_left_count[smaller_leaf_index] = cuda_leaf_best_split_left_count[leaf_read_pos]; + cuda_leaf_best_split_left_gain[smaller_leaf_index] = cuda_leaf_best_split_left_gain[leaf_read_pos]; + cuda_leaf_best_split_left_output[smaller_leaf_index] = cuda_leaf_best_split_left_output[leaf_read_pos]; + cuda_leaf_best_split_right_sum_gradient[smaller_leaf_index] = cuda_leaf_best_split_right_sum_gradient[leaf_read_pos]; + cuda_leaf_best_split_right_sum_hessian[smaller_leaf_index] = cuda_leaf_best_split_right_sum_hessian[leaf_read_pos]; + cuda_leaf_best_split_right_count[smaller_leaf_index] = cuda_leaf_best_split_right_count[leaf_read_pos]; + cuda_leaf_best_split_right_gain[smaller_leaf_index] = cuda_leaf_best_split_right_gain[leaf_read_pos]; + cuda_leaf_best_split_right_output[smaller_leaf_index] = cuda_leaf_best_split_right_output[leaf_read_pos]; + } } } } if (larger_leaf_index >= 0) { - for (unsigned int block_index = 1; block_index < num_blocks_per_leaf; ++block_index) { - const unsigned int leaf_read_pos = static_cast(larger_leaf_index) + block_index * static_cast(num_leaves); - if ((cuda_leaf_best_split_found[leaf_read_pos] == 1 && cuda_leaf_best_split_found[larger_leaf_index] == 1 && - cuda_leaf_best_split_gain[leaf_read_pos] > cuda_leaf_best_split_gain[larger_leaf_index]) || - (cuda_leaf_best_split_found[larger_leaf_index] == 0 && cuda_leaf_best_split_found[leaf_read_pos] == 1)) { - cuda_leaf_best_split_found[larger_leaf_index] = cuda_leaf_best_split_found[leaf_read_pos]; - cuda_leaf_best_split_feature[larger_leaf_index] = cuda_leaf_best_split_feature[leaf_read_pos]; - cuda_leaf_best_split_default_left[larger_leaf_index] = cuda_leaf_best_split_default_left[leaf_read_pos]; - cuda_leaf_best_split_threshold[larger_leaf_index] = cuda_leaf_best_split_threshold[leaf_read_pos]; - cuda_leaf_best_split_gain[larger_leaf_index] = cuda_leaf_best_split_gain[leaf_read_pos]; - cuda_leaf_best_split_left_sum_gradient[larger_leaf_index] = cuda_leaf_best_split_left_sum_gradient[leaf_read_pos]; - cuda_leaf_best_split_left_sum_hessian[larger_leaf_index] = cuda_leaf_best_split_left_sum_hessian[leaf_read_pos]; - cuda_leaf_best_split_left_count[larger_leaf_index] = cuda_leaf_best_split_left_count[leaf_read_pos]; - cuda_leaf_best_split_left_gain[larger_leaf_index] = cuda_leaf_best_split_left_gain[leaf_read_pos]; - cuda_leaf_best_split_left_output[larger_leaf_index] = cuda_leaf_best_split_left_output[leaf_read_pos]; - cuda_leaf_best_split_right_sum_gradient[larger_leaf_index] = cuda_leaf_best_split_right_sum_gradient[leaf_read_pos]; - cuda_leaf_best_split_right_sum_hessian[larger_leaf_index] = cuda_leaf_best_split_right_sum_hessian[leaf_read_pos]; - cuda_leaf_best_split_right_count[larger_leaf_index] = cuda_leaf_best_split_right_count[leaf_read_pos]; - cuda_leaf_best_split_right_gain[larger_leaf_index] = cuda_leaf_best_split_right_gain[leaf_read_pos]; - cuda_leaf_best_split_right_output[larger_leaf_index] = cuda_leaf_best_split_right_output[leaf_read_pos]; + if (blockIdx.x == 1 || larger_only) { + for (unsigned int block_index = 1; block_index < num_blocks_per_leaf; ++block_index) { 
+ const unsigned int leaf_read_pos = static_cast(larger_leaf_index) + block_index * static_cast(num_leaves); + if ((cuda_leaf_best_split_found[leaf_read_pos] == 1 && cuda_leaf_best_split_found[larger_leaf_index] == 1 && + cuda_leaf_best_split_gain[leaf_read_pos] > cuda_leaf_best_split_gain[larger_leaf_index]) || + (cuda_leaf_best_split_found[larger_leaf_index] == 0 && cuda_leaf_best_split_found[leaf_read_pos] == 1)) { + cuda_leaf_best_split_found[larger_leaf_index] = cuda_leaf_best_split_found[leaf_read_pos]; + cuda_leaf_best_split_feature[larger_leaf_index] = cuda_leaf_best_split_feature[leaf_read_pos]; + cuda_leaf_best_split_default_left[larger_leaf_index] = cuda_leaf_best_split_default_left[leaf_read_pos]; + cuda_leaf_best_split_threshold[larger_leaf_index] = cuda_leaf_best_split_threshold[leaf_read_pos]; + cuda_leaf_best_split_gain[larger_leaf_index] = cuda_leaf_best_split_gain[leaf_read_pos]; + cuda_leaf_best_split_left_sum_gradient[larger_leaf_index] = cuda_leaf_best_split_left_sum_gradient[leaf_read_pos]; + cuda_leaf_best_split_left_sum_hessian[larger_leaf_index] = cuda_leaf_best_split_left_sum_hessian[leaf_read_pos]; + cuda_leaf_best_split_left_count[larger_leaf_index] = cuda_leaf_best_split_left_count[leaf_read_pos]; + cuda_leaf_best_split_left_gain[larger_leaf_index] = cuda_leaf_best_split_left_gain[leaf_read_pos]; + cuda_leaf_best_split_left_output[larger_leaf_index] = cuda_leaf_best_split_left_output[leaf_read_pos]; + cuda_leaf_best_split_right_sum_gradient[larger_leaf_index] = cuda_leaf_best_split_right_sum_gradient[leaf_read_pos]; + cuda_leaf_best_split_right_sum_hessian[larger_leaf_index] = cuda_leaf_best_split_right_sum_hessian[leaf_read_pos]; + cuda_leaf_best_split_right_count[larger_leaf_index] = cuda_leaf_best_split_right_count[leaf_read_pos]; + cuda_leaf_best_split_right_gain[larger_leaf_index] = cuda_leaf_best_split_right_gain[leaf_read_pos]; + cuda_leaf_best_split_right_output[larger_leaf_index] = cuda_leaf_best_split_right_output[leaf_read_pos]; + } } } } @@ -751,7 +858,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( } const int num_blocks_per_leaf = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; if (cpu_larger_leaf_index >= 0 && is_smaller_leaf_valid && is_larger_leaf_valid) { - SyncBestSplitForLeafKernel<<<2 * num_blocks_per_leaf, NUM_TASKS_PER_SYNC_BLOCK>>>( + SyncBestSplitForLeafKernel<<>>( cpu_smaller_leaf_index, cpu_larger_leaf_index, cuda_num_features_, @@ -792,8 +899,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( false, num_leaves_); if (num_blocks_per_leaf > 1) { - SynchronizeCUDADevice(); - SyncBestSplitForLeafKernelAllBlocks<<<1, 1>>>( + SyncBestSplitForLeafKernelAllBlocks<<<1, 1, 0, cuda_streams_[0]>>>( cpu_smaller_leaf_index, cpu_larger_leaf_index, num_blocks_per_leaf, @@ -815,6 +921,70 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( cuda_leaf_best_split_found_, false); } + SynchronizeCUDADevice(); + SyncBestSplitForLeafKernel<<>>( + cpu_smaller_leaf_index, + cpu_larger_leaf_index, + cuda_num_features_, + cuda_leaf_best_split_feature_, + cuda_leaf_best_split_default_left_, + cuda_leaf_best_split_threshold_, + cuda_leaf_best_split_gain_, + cuda_leaf_best_split_left_sum_gradient_, + cuda_leaf_best_split_left_sum_hessian_, + cuda_leaf_best_split_left_count_, + cuda_leaf_best_split_left_gain_, + cuda_leaf_best_split_left_output_, + cuda_leaf_best_split_right_sum_gradient_, + cuda_leaf_best_split_right_sum_hessian_, + cuda_leaf_best_split_right_count_, + 
cuda_leaf_best_split_right_gain_, + cuda_leaf_best_split_right_output_, + cuda_leaf_best_split_found_, + cuda_task_feature_index_, + cuda_best_split_default_left_, + cuda_best_split_threshold_, + cuda_best_split_gain_, + cuda_best_split_left_sum_gradient_, + cuda_best_split_left_sum_hessian_, + cuda_best_split_left_count_, + cuda_best_split_left_gain_, + cuda_best_split_left_output_, + cuda_best_split_right_sum_gradient_, + cuda_best_split_right_sum_hessian_, + cuda_best_split_right_count_, + cuda_best_split_right_gain_, + cuda_best_split_right_output_, + cuda_best_split_found_, + cuda_feature_default_bins_, + num_tasks_, + num_tasks_aligned, + num_blocks_per_leaf, + true, + num_leaves_); + if (num_blocks_per_leaf > 1) { + SyncBestSplitForLeafKernelAllBlocks<<<1, 1, 0, cuda_streams_[1]>>>( + cpu_smaller_leaf_index, + cpu_larger_leaf_index, + num_blocks_per_leaf, + num_leaves_, + cuda_leaf_best_split_feature_, + cuda_leaf_best_split_default_left_, + cuda_leaf_best_split_threshold_, + cuda_leaf_best_split_gain_, + cuda_leaf_best_split_left_sum_gradient_, + cuda_leaf_best_split_left_sum_hessian_, + cuda_leaf_best_split_left_count_, + cuda_leaf_best_split_left_gain_, + cuda_leaf_best_split_left_output_, + cuda_leaf_best_split_right_sum_gradient_, + cuda_leaf_best_split_right_sum_hessian_, + cuda_leaf_best_split_right_count_, + cuda_leaf_best_split_right_gain_, + cuda_leaf_best_split_right_output_, + cuda_leaf_best_split_found_, + true); + } } else { const bool larger_only = (!is_smaller_leaf_valid && is_larger_leaf_valid); SyncBestSplitForLeafKernel<<>>( @@ -926,7 +1096,7 @@ __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, __global__ void PrepareLeafBestSplitInfo(const int smaller_leaf_index, const int larger_leaf_index, int* cuda_best_split_info_buffer, const int* cuda_leaf_best_split_feature, const uint32_t* cuda_leaf_best_split_threshold, const uint8_t* cuda_leaf_best_split_default_left) { - const unsigned int threadIdx_x = threadIdx.x; + const unsigned int threadIdx_x = blockIdx.x; if (threadIdx_x == 0) { cuda_best_split_info_buffer[0] = cuda_leaf_best_split_feature[smaller_leaf_index]; } else if (threadIdx_x == 1) { @@ -958,7 +1128,7 @@ void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_ cuda_leaf_best_split_right_count_, cuda_leaf_best_split_found_, cuda_best_split_info_buffer_); - PrepareLeafBestSplitInfo<<<1, 6, 0, cuda_streams_[0]>>>(smaller_leaf_index, larger_leaf_index, + PrepareLeafBestSplitInfo<<<6, 1, 0, cuda_streams_[0]>>>(smaller_leaf_index, larger_leaf_index, cuda_best_split_info_buffer_, cuda_leaf_best_split_feature_, cuda_leaf_best_split_threshold_, cuda_leaf_best_split_default_left_); std::vector cpu_leaf_best_split_info_buffer(7); diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index c14ecb1b2133..324337bb0f1d 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -88,8 +88,11 @@ void CUDADataPartition::Init(const Dataset* train_data) { InitCUDAValueFromConstant(&cuda_num_total_bin_, num_total_bin_); InitCUDAValueFromConstant(&cuda_cur_num_leaves_, 1); AllocateCUDAMemory(static_cast(num_data_), &cuda_data_to_left_); - AllocateCUDAMemory(static_cast(max_num_split_indices_blocks_), &cuda_block_data_to_left_offset_); - AllocateCUDAMemory(static_cast(max_num_split_indices_blocks_), &cuda_block_data_to_right_offset_); + AllocateCUDAMemory(static_cast(num_data_), &cuda_data_index_to_leaf_index_); + 
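The kernel launches above are issued on cuda_streams_[0] and cuda_streams_[1] so the per-leaf split-sync work for the smaller and larger leaf can overlap, with a device synchronization only where a result is actually consumed. A minimal standalone sketch of that stream-overlap pattern, assuming two independent kernels (Scale and the file name are placeholders, not patch code; the patch keeps its streams in cuda_streams_ rather than creating them locally):

// stream_overlap_sketch.cu -- standalone illustration, hypothetical file name.
// Launches two independent kernels on separate streams so they may overlap,
// then synchronizes the device before the host reads the results.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void Scale(double* data, const int n, const double factor) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  const int n = 1 << 20;
  double *a, *b;
  cudaMallocManaged(&a, n * sizeof(double));
  cudaMallocManaged(&b, n * sizeof(double));
  for (int i = 0; i < n; ++i) { a[i] = 1.0; b[i] = 2.0; }
  cudaStream_t streams[2];
  cudaStreamCreate(&streams[0]);
  cudaStreamCreate(&streams[1]);
  const int block_size = 256;
  const int num_blocks = (n + block_size - 1) / block_size;
  // Independent work items: each kernel goes to its own stream.
  Scale<<<num_blocks, block_size, 0, streams[0]>>>(a, n, 3.0);
  Scale<<<num_blocks, block_size, 0, streams[1]>>>(b, n, 5.0);
  // Barrier before reading results on the host.
  cudaDeviceSynchronize();
  printf("a[0] = %f, b[0] = %f\n", a[0], b[0]);
  cudaStreamDestroy(streams[0]);
  cudaStreamDestroy(streams[1]);
  cudaFree(a); cudaFree(b);
  return 0;
}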
AllocateCUDAMemory(static_cast(max_num_split_indices_blocks_) + 1, &cuda_block_data_to_left_offset_); + AllocateCUDAMemory(static_cast(max_num_split_indices_blocks_) + 1, &cuda_block_data_to_right_offset_); + SetCUDAMemory(cuda_block_data_to_left_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1); + SetCUDAMemory(cuda_block_data_to_right_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1); AllocateCUDAMemory(static_cast(num_data_), &cuda_out_data_indices_in_leaf_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_hist_pool_); CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1); @@ -371,7 +374,7 @@ void CUDADataPartition::Split(const int* leaf_id, const std::vector& cpu_leaf_best_split_threshold, const std::vector& cpu_leaf_best_split_default_left, int* smaller_leaf_index, int* larger_leaf_index, - const int cpu_leaf_index) { + const int cpu_leaf_index, const int cur_max_leaf_index) { global_timer.Start("GenDataToLeftBitVector"); global_timer.Start("SplitInner Copy CUDA To Host"); const data_size_t num_data_in_leaf = cpu_leaf_num_data->at(cpu_leaf_index); @@ -383,7 +386,7 @@ void CUDADataPartition::Split(const int* leaf_id, //Log::Warning("split threshold = %d", split_threshold); //Log::Warning("split default left = %d", split_default_left); global_timer.Stop("SplitInner Copy CUDA To Host"); - GenDataToLeftBitVector(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start); + GenDataToLeftBitVector(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start, cpu_leaf_index, cur_max_leaf_index); global_timer.Stop("GenDataToLeftBitVector"); global_timer.Start("SplitInner"); @@ -403,14 +406,15 @@ void CUDADataPartition::Split(const int* leaf_id, larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_num_data, cpu_leaf_data_start, cpu_leaf_sum_hessians, - smaller_leaf_index, larger_leaf_index); + smaller_leaf_index, larger_leaf_index, cpu_leaf_index); global_timer.Stop("SplitInner"); } void CUDADataPartition::GenDataToLeftBitVector(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, - const uint8_t split_default_left, const data_size_t leaf_data_start) { - LaunchGenDataToLeftBitVectorKernel(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start); + const uint8_t split_default_left, const data_size_t leaf_data_start, + const int left_leaf_index, const int right_leaf_index) { + LaunchGenDataToLeftBitVectorKernel(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start, left_leaf_index, right_leaf_index); } void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_data_in_leaf, @@ -433,7 +437,7 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ hist_t** larger_leaf_cuda_hist_pointer_pointer, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, - int* smaller_leaf_index, int* larger_leaf_index) { + int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index) { LaunchSplitInnerKernel(leaf_index, num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, best_left_sum_gradients, best_left_sum_hessians, best_left_count, @@ -450,7 +454,7 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ 
larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_num_data, cpu_leaf_data_start, cpu_leaf_sum_hessians, - smaller_leaf_index, larger_leaf_index); + smaller_leaf_index, larger_leaf_index, cpu_leaf_index); ++cur_num_leaves_; } diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 781c8f86d001..b83bfdaf5ba7 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -51,6 +51,226 @@ __device__ void PrefixSum(uint32_t* elements, unsigned int n) { } } +__device__ void PrefixSum_1024(uint32_t* elements, unsigned int n) { + //unsigned int offset = 1; + unsigned int threadIdx_x = threadIdx.x; + const unsigned int conflict_free_n_minus_1 = CONFLICT_FREE_INDEX(n - 1); + const uint32_t last_element = elements[conflict_free_n_minus_1]; + __syncthreads(); + + if (threadIdx_x < 512) { + const unsigned int src_pos = (2 * threadIdx_x + 1) - 1; + const unsigned int dst_pos = (2 * threadIdx_x + 2) - 1; + elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; + } + __syncthreads(); + + if (threadIdx_x < 256) { + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 1) - 1; + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 1) - 1; + elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; + } + __syncthreads(); + + if (threadIdx_x < 128) { + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 2) - 1; + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 2) - 1; + elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; + } + __syncthreads(); + + if (threadIdx_x < 64) { + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 3) - 1; + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 3) - 1; + elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; + } + __syncthreads(); + + if (threadIdx_x < 32) { + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 4) - 1; + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 4) - 1; + elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; + } + __syncthreads(); + + if (threadIdx_x < 16) { + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 5) - 1; + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 5) - 1; + elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; + } + __syncthreads(); + + if (threadIdx_x < 8) { + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 6) - 1; + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 6) - 1; + elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; + } + __syncthreads(); + + if (threadIdx_x < 4) { + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 7) - 1; + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 7) - 1; + elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; + } + __syncthreads(); + + if (threadIdx_x < 2) { + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 8) - 1; + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 8) - 1; + elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; + } + __syncthreads(); + + if (threadIdx_x == 0) { + //const unsigned int src_pos = 511; + //const unsigned int dst_pos = 1023; + const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(1023); + const unsigned int 
conflict_free_src_pos = CONFLICT_FREE_INDEX(511); + //elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; + elements[conflict_free_dst_pos] += elements[conflict_free_src_pos]; + //} + //__syncthreads(); + + /*for (int d = (n >> 1); d > 0; d >>= 1) { + if (threadIdx_x < d) { + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; + } + offset <<= 1; + __syncthreads(); + }*/ + //if (threadIdx_x == 0) { + elements[conflict_free_n_minus_1] = 0; + //} + //__syncthreads(); + + //if (threadIdx_x == 0) { + //const unsigned int dst_pos = 1023; + //const unsigned int src_pos = 511; + const uint32_t src_val = elements[conflict_free_src_pos]; + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; + elements[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + + if (threadIdx_x < 2) { + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 8) - 1; + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 8) - 1; + const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); + const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); + const uint32_t src_val = elements[conflict_free_src_pos]; + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; + elements[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + + if (threadIdx_x < 4) { + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 7) - 1; + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 7) - 1; + const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); + const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); + const uint32_t src_val = elements[conflict_free_src_pos]; + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; + elements[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + + if (threadIdx_x < 8) { + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 6) - 1; + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 6) - 1; + const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); + const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); + const uint32_t src_val = elements[conflict_free_src_pos]; + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; + elements[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + + if (threadIdx_x < 16) { + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 5) - 1; + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 5) - 1; + const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); + const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); + const uint32_t src_val = elements[conflict_free_src_pos]; + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; + elements[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + + if (threadIdx_x < 32) { + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 4) - 1; + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 4) - 1; + const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); + const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); + const uint32_t src_val = elements[conflict_free_src_pos]; + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; + elements[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + + if (threadIdx_x < 64) { + const unsigned int dst_pos = 
((2 * threadIdx_x + 2) << 3) - 1; + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 3) - 1; + const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); + const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); + const uint32_t src_val = elements[conflict_free_src_pos]; + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; + elements[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + + if (threadIdx_x < 128) { + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 2) - 1; + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 2) - 1; + const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); + const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); + const uint32_t src_val = elements[conflict_free_src_pos]; + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; + elements[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + + if (threadIdx_x < 256) { + const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 1) - 1; + const unsigned int src_pos = ((2 * threadIdx_x + 1) << 1) - 1; + const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); + const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); + const uint32_t src_val = elements[conflict_free_src_pos]; + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; + elements[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + + if (threadIdx_x < 512) { + const unsigned int dst_pos = (2 * threadIdx_x + 2) - 1; + const unsigned int src_pos = (2 * threadIdx_x + 1) - 1; + const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); + const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); + const uint32_t src_val = elements[conflict_free_src_pos]; + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; + elements[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + + /*for (int d = 1; d < n; d <<= 1) { + offset >>= 1; + if (threadIdx_x < d) { + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); + const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); + const uint32_t src_val = elements[conflict_free_src_pos]; + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; + elements[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + }*/ + if (threadIdx_x == 0) { + elements[CONFLICT_FREE_INDEX(n)] = elements[conflict_free_n_minus_1] + last_element; + } +} + __device__ void PrefixSum(uint16_t* elements, unsigned int n) { unsigned int offset = 1; unsigned int threadIdx_x = threadIdx.x; @@ -109,17 +329,18 @@ __device__ void ReduceSum(double* array, const size_t size) { } __global__ void FillDataIndicesBeforeTrainKernel(const data_size_t* cuda_num_data, - data_size_t* data_indices) { + data_size_t* data_indices, int* cuda_data_index_to_leaf_index) { const data_size_t num_data_ref = *cuda_num_data; const unsigned int data_index = threadIdx.x + blockIdx.x * blockDim.x; if (data_index < num_data_ref) { data_indices[data_index] = data_index; + cuda_data_index_to_leaf_index[data_index] = 0; } } void CUDADataPartition::LaunchFillDataIndicesBeforeTrain() { const int num_blocks = (num_data_ + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; - FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_); + 
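PrefixSum_1024 above is a fully unrolled Blelloch (work-efficient) exclusive scan: an up-sweep builds partial sums in place, the last slot is zeroed, a down-sweep distributes the sums back down, and the block total is stored at index n. A compact loop-based sketch of the same up-sweep/down-sweep structure, without the CONFLICT_FREE_INDEX padding and restricted to power-of-two sizes, is given below for reference; it is an illustration, not the patch's implementation.

// scan_sketch.cu -- standalone illustration, hypothetical file name.
// Loop-based Blelloch exclusive scan over one block's shared memory, with the
// total written to elements[n] as in PrefixSum_1024 above
// (n must be a power of two and at most 2 * blockDim.x).
#include <cstdio>
#include <cuda_runtime.h>

__device__ void ExclusiveScan(unsigned int* elements, const unsigned int n) {
  const unsigned int last_element = elements[n - 1];
  unsigned int offset = 1;
  // Up-sweep: build partial sums in place.
  for (unsigned int d = n >> 1; d > 0; d >>= 1) {
    __syncthreads();
    if (threadIdx.x < d) {
      const unsigned int src = offset * (2 * threadIdx.x + 1) - 1;
      const unsigned int dst = offset * (2 * threadIdx.x + 2) - 1;
      elements[dst] += elements[src];
    }
    offset <<= 1;
  }
  if (threadIdx.x == 0) elements[n - 1] = 0;
  // Down-sweep: distribute the partial sums back down.
  for (unsigned int d = 1; d < n; d <<= 1) {
    offset >>= 1;
    __syncthreads();
    if (threadIdx.x < d) {
      const unsigned int src = offset * (2 * threadIdx.x + 1) - 1;
      const unsigned int dst = offset * (2 * threadIdx.x + 2) - 1;
      const unsigned int src_val = elements[src];
      elements[src] = elements[dst];
      elements[dst] += src_val;
    }
  }
  __syncthreads();
  if (threadIdx.x == 0) elements[n] = elements[n - 1] + last_element;
}

__global__ void ScanKernel(const unsigned int* in, unsigned int* out, const unsigned int n) {
  __shared__ unsigned int elements[1024 + 1];
  if (threadIdx.x * 2 < n) elements[threadIdx.x * 2] = in[threadIdx.x * 2];
  if (threadIdx.x * 2 + 1 < n) elements[threadIdx.x * 2 + 1] = in[threadIdx.x * 2 + 1];
  __syncthreads();
  ExclusiveScan(elements, n);
  __syncthreads();
  if (threadIdx.x * 2 < n) out[threadIdx.x * 2] = elements[threadIdx.x * 2];
  if (threadIdx.x * 2 + 1 < n) out[threadIdx.x * 2 + 1] = elements[threadIdx.x * 2 + 1];
  if (threadIdx.x == 0) out[n] = elements[n];
}

int main() {
  const unsigned int n = 1024;
  unsigned int *in, *out;
  cudaMallocManaged(&in, n * sizeof(unsigned int));
  cudaMallocManaged(&out, (n + 1) * sizeof(unsigned int));
  for (unsigned int i = 0; i < n; ++i) in[i] = 1;
  ScanKernel<<<1, n / 2>>>(in, out, n);
  cudaDeviceSynchronize();
  // Exclusive scan of all-ones: out[i] == i, and out[n] holds the total n.
  printf("out[0] = %u, out[n - 1] = %u, total = %u\n", out[0], out[n - 1], out[n]);
  cudaFree(in); cudaFree(out);
  return 0;
}

The data-partition kernels use this scan to turn each thread's left/right decision bit into per-block write offsets.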
FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_, cuda_data_index_to_leaf_index_); } __device__ void PrepareOffset(const data_size_t num_data_in_leaf_ref, const uint8_t* split_to_left_bit_vector, @@ -145,6 +366,208 @@ __device__ void PrepareOffset(const data_size_t num_data_in_leaf_ref, const uint } } +template +__global__ void UpdateDataIndexToLeafIndexKernel(const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const BIN_TYPE* column_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { + const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(column_data[global_data_index]); + if (!MIN_IS_MAX) { + if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || + (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { + cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; + } else if (bin < min_bin_ref || bin > max_bin_ref) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { + cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; + } else { + cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; + } + } else if (bin > th) { + cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + }/* else { + cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + }*/ + } else { + if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { + cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; + } else if (bin != max_bin_ref) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { + cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; + } else { + cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; + } + } else { + if (MISSING_IS_NA && !MFB_IS_NA) { + cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; + } else { + if (!MAX_TO_LEFT) { + /*cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + } else {*/ + cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + } + } + } + } + } +} + +#define UpdateDataIndexToLeafIndex_ARGS leaf_data_start, \ + num_data_in_leaf, cuda_data_indices, th, column_data, \ + t_zero_bin, max_bin_ref, min_bin_ref, cuda_data_index_to_leaf_index, left_leaf_index, right_leaf_index, \ + default_leaf_index, missing_default_leaf_index + +template +void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel(const data_size_t leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const BIN_TYPE* column_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index, + const bool missing_is_zero, const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, 
const bool max_to_left, + const int num_blocks, const int block_size) { + if (min_bin_ref < max_bin_ref) { + if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { 
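This if/else chain maps runtime feature metadata (missing type, most-frequent-bin flags, whether the maximum bin goes left) onto compile-time template parameters, so the per-row branches inside UpdateDataIndexToLeafIndexKernel are resolved at compile time. A reduced two-flag sketch of the same dispatch pattern follows; CountKernel, its flags, and the file name are hypothetical, not patch code.

// template_dispatch_sketch.cu -- standalone illustration, hypothetical file name.
// Two runtime booleans are mapped onto four compile-time kernel instantiations,
// the same pattern the launcher above applies to its missing-value flags.
#include <cstdio>
#include <cuda_runtime.h>

template <bool MISSING_IS_ZERO, bool MISSING_IS_NA>
__global__ void CountKernel(const unsigned char* bins, const int n,
                            const unsigned char zero_bin, const unsigned char na_bin,
                            int* missing_count) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // These conditions are compile-time constants inside each instantiation.
    if ((MISSING_IS_ZERO && bins[i] == zero_bin) ||
        (MISSING_IS_NA && bins[i] == na_bin)) {
      atomicAdd(missing_count, 1);
    }
  }
}

void LaunchCountKernel(const bool missing_is_zero, const bool missing_is_na,
                       const unsigned char* bins, const int n,
                       const unsigned char zero_bin, const unsigned char na_bin,
                       int* missing_count) {
  const int block_size = 256;
  const int num_blocks = (n + block_size - 1) / block_size;
  if (missing_is_zero && missing_is_na) {
    CountKernel<true, true><<<num_blocks, block_size>>>(bins, n, zero_bin, na_bin, missing_count);
  } else if (missing_is_zero && !missing_is_na) {
    CountKernel<true, false><<<num_blocks, block_size>>>(bins, n, zero_bin, na_bin, missing_count);
  } else if (!missing_is_zero && missing_is_na) {
    CountKernel<false, true><<<num_blocks, block_size>>>(bins, n, zero_bin, na_bin, missing_count);
  } else {
    CountKernel<false, false><<<num_blocks, block_size>>>(bins, n, zero_bin, na_bin, missing_count);
  }
}

int main() {
  const int n = 1000;
  unsigned char* bins;
  int* missing_count;
  cudaMallocManaged(&bins, n);
  cudaMallocManaged(&missing_count, sizeof(int));
  for (int i = 0; i < n; ++i) bins[i] = static_cast<unsigned char>(i % 4);
  *missing_count = 0;
  LaunchCountKernel(true, false, bins, n, /*zero_bin=*/0, /*na_bin=*/3, missing_count);
  cudaDeviceSynchronize();
  printf("missing (bin == 0): %d\n", *missing_count);  // expect 250
  cudaFree(bins); cudaFree(missing_count);
  return 0;
}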
+ UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } + } else { + if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { + 
UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { + 
UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + } + } +} + // missing_is_zero = 0, missing_is_na = 0, min_bin_ref < max_bin_ref template __global__ void GenDataToLeftBitVectorKernel0_1_2_3(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, @@ -155,7 +578,9 @@ __global__ void GenDataToLeftBitVectorKernel0_1_2_3(const int best_split_feature const uint8_t split_default_to_left, const uint8_t /*split_missing_default_to_left*/, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { + const int split_indices_block_size_data_partition, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; @@ -166,12 +591,15 @@ __global__ void GenDataToLeftBitVectorKernel0_1_2_3(const int best_split_feature if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -191,7 +619,9 @@ __global__ void GenDataToLeftBitVectorKernel4(const int best_split_feature_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { + const int split_indices_block_size_data_partition, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; @@ -202,15 +632,19 @@ __global__ void GenDataToLeftBitVectorKernel4(const int best_split_feature_ref, if (bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] 
= default_leaf_index; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -230,7 +664,9 @@ __global__ void GenDataToLeftBitVectorKernel5(const int best_split_feature_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { + const int split_indices_block_size_data_partition, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; @@ -241,12 +677,15 @@ __global__ void GenDataToLeftBitVectorKernel5(const int best_split_feature_ref, if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -266,7 +705,9 @@ __global__ void GenDataToLeftBitVectorKernel6(const int best_split_feature_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { + const int split_indices_block_size_data_partition, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; @@ -277,15 +718,19 @@ __global__ void GenDataToLeftBitVectorKernel6(const int best_split_feature_ref, if (bin == max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 
split_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -305,7 +750,9 @@ __global__ void GenDataToLeftBitVectorKernel7(const int best_split_feature_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { + const int split_indices_block_size_data_partition, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; @@ -316,12 +763,15 @@ __global__ void GenDataToLeftBitVectorKernel7(const int best_split_feature_ref, if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -341,7 +791,9 @@ __global__ void GenDataToLeftBitVectorKernel8(const int best_split_feature_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { + const int split_indices_block_size_data_partition, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; @@ -354,12 +806,15 @@ __global__ void GenDataToLeftBitVectorKernel8(const int best_split_feature_ref, } else if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; 
thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -379,7 +834,9 @@ __global__ void GenDataToLeftBitVectorKernel9(const int best_split_feature_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { + const int split_indices_block_size_data_partition, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; @@ -390,15 +847,19 @@ __global__ void GenDataToLeftBitVectorKernel9(const int best_split_feature_ref, if (bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -418,7 +879,9 @@ __global__ void GenDataToLeftBitVectorKernel10(const int best_split_feature_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { + const int split_indices_block_size_data_partition, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; @@ -429,12 +892,15 @@ __global__ void GenDataToLeftBitVectorKernel10(const int best_split_feature_ref, if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } 
else if (bin > th) { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -454,7 +920,9 @@ __global__ void GenDataToLeftBitVectorKernel11(const int best_split_feature_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { + const int split_indices_block_size_data_partition, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; @@ -465,12 +933,15 @@ __global__ void GenDataToLeftBitVectorKernel11(const int best_split_feature_ref, if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -490,7 +961,9 @@ __global__ void GenDataToLeftBitVectorKernel12(const int best_split_feature_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { + const int split_indices_block_size_data_partition, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; @@ -501,15 +974,19 @@ __global__ void GenDataToLeftBitVectorKernel12(const int best_split_feature_ref, if (bin == t_zero_bin || bin == max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 
split_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -529,7 +1006,9 @@ __global__ void GenDataToLeftBitVectorKernel13(const int best_split_feature_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { + const int split_indices_block_size_data_partition, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; @@ -540,15 +1019,19 @@ __global__ void GenDataToLeftBitVectorKernel13(const int best_split_feature_ref, if (bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -568,7 +1051,9 @@ __global__ void GenDataToLeftBitVectorKernel14(const int best_split_feature_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { + const int split_indices_block_size_data_partition, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; @@ -579,15 +1064,19 @@ __global__ void GenDataToLeftBitVectorKernel14(const int best_split_feature_ref, if (bin == max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; 
thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -607,7 +1096,9 @@ __global__ void GenDataToLeftBitVectorKernel15(const int best_split_feature_ref, const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { + const int split_indices_block_size_data_partition, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; @@ -618,12 +1109,15 @@ __global__ void GenDataToLeftBitVectorKernel15(const int best_split_feature_ref, if (bin < min_bin_ref || bin > max_bin_ref) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -643,7 +1137,9 @@ __global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition) { + const int split_indices_block_size_data_partition, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; @@ -654,25 +1150,31 @@ __global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, if 
(MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else if (bin != max_bin_ref) { if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else { cuda_data_to_left[local_data_index] = split_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; } } else { if (MISSING_IS_NA && !MFB_IS_NA) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else { if (MAX_TO_LEFT) { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } else { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } } } @@ -689,11 +1191,13 @@ __global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, th, num_features_, \ column_data, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, \ split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, \ - split_indices_block_size_data_partition_aligned + split_indices_block_size_data_partition_aligned, \ + cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, - const uint8_t split_default_left, const data_size_t leaf_data_start) { + const uint8_t split_default_left, const data_size_t leaf_data_start, + const int left_leaf_index, const int right_leaf_index) { const int min_num_blocks = num_data_in_leaf <= 100 ? 
1 : 80; const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; @@ -720,27 +1224,34 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num } uint8_t split_default_to_left = 0; uint8_t split_missing_default_to_left = 0; + int default_leaf_index = right_leaf_index; + int missing_default_leaf_index = right_leaf_index; if (most_freq_bin <= split_threshold) { split_default_to_left = 1; + default_leaf_index = left_leaf_index; } if (missing_is_zero || missing_is_na) { if (split_default_left) { split_missing_default_to_left = 1; + missing_default_leaf_index = left_leaf_index; } } const int column_index = feature_index_to_column_index_[split_feature_index]; const uint8_t bit_type = column_bit_type_[column_index]; + + const bool max_bin_to_left = (max_bin <= th); + if (min_bin < max_bin) { if (!missing_is_zero && !missing_is_na) { if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_1_2_3<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0_1_2_3<<>>(GenBitVector_ARGS); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_1_2_3<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0_1_2_3<<>>(GenBitVector_ARGS); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_1_2_3<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0_1_2_3<<>>(GenBitVector_ARGS); } else { Log::Fatal("Unknown bit type %d", bit_type); } @@ -748,265 +1259,302 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel4<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel4<<>>(GenBitVector_ARGS); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel4<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel4<<>>(GenBitVector_ARGS); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel4<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel4<<>>(GenBitVector_ARGS); } else { Log::Fatal("Unknown bit type %d", bit_type); } } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel5<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel5<<>>(GenBitVector_ARGS); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel5<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel5<<>>(GenBitVector_ARGS); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel5<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel5<<>>(GenBitVector_ARGS); } else { Log::Fatal("Unknown bit type %d", bit_type); } } else if (!missing_is_zero && 
missing_is_na && mfb_is_zero && !mfb_is_na) { if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel6<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel6<<>>(GenBitVector_ARGS); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel6<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel6<<>>(GenBitVector_ARGS); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel6<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel6<<>>(GenBitVector_ARGS); } else { Log::Fatal("Unknown bit type %d", bit_type); } } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel7<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel7<<>>(GenBitVector_ARGS); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel7<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel7<<>>(GenBitVector_ARGS); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel7<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel7<<>>(GenBitVector_ARGS); } else { Log::Fatal("Unknown bit type %d", bit_type); } } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel8<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel8<<>>(GenBitVector_ARGS); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel8<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel8<<>>(GenBitVector_ARGS); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel8<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel8<<>>(GenBitVector_ARGS); } else { Log::Fatal("Unknown bit type %d", bit_type); } } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel9<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel9<<>>(GenBitVector_ARGS); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel9<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel9<<>>(GenBitVector_ARGS); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel9<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel9<<>>(GenBitVector_ARGS); } else { Log::Fatal("Unknown bit type %d", bit_type); } } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel10<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel10<<>>(GenBitVector_ARGS); } else if (bit_type == 16) { const uint16_t* column_data = 
reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel10<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel10<<>>(GenBitVector_ARGS); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel10<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel10<<>>(GenBitVector_ARGS); } else { Log::Fatal("Unknown bit type %d", bit_type); } } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel11<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel11<<>>(GenBitVector_ARGS); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel11<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel11<<>>(GenBitVector_ARGS); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel11<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel11<<>>(GenBitVector_ARGS); } else { Log::Fatal("Unknown bit type %d", bit_type); } } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel12<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel12<<>>(GenBitVector_ARGS); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel12<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel12<<>>(GenBitVector_ARGS); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel12<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel12<<>>(GenBitVector_ARGS); } else { Log::Fatal("Unknown bit type %d", bit_type); } } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel13<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel13<<>>(GenBitVector_ARGS); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel13<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel13<<>>(GenBitVector_ARGS); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel13<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel13<<>>(GenBitVector_ARGS); } else { Log::Fatal("Unknown bit type %d", bit_type); } } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel14<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel14<<>>(GenBitVector_ARGS); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel14<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel14<<>>(GenBitVector_ARGS); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel14<<>>(GenBitVector_ARGS); 
+ GenDataToLeftBitVectorKernel14<<>>(GenBitVector_ARGS); } else { Log::Fatal("Unknown bit type %d", bit_type); } } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel15<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel15<<>>(GenBitVector_ARGS); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel15<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel15<<>>(GenBitVector_ARGS); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel15<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel15<<>>(GenBitVector_ARGS); } else { Log::Fatal("Unknown bit type %d", bit_type); } } } } else { - const bool max_bin_to_left = (max_bin <= th); if (bit_type == 8) { if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - 
GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - 
GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } } } - SynchronizeCUDADevice(); + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, + 
      cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_,
+      left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index,
+      static_cast<bool>(missing_is_zero),
+      static_cast<bool>(missing_is_na),
+      static_cast<bool>(mfb_is_zero),
+      static_cast<bool>(mfb_is_na),
+      max_bin_to_left,
+      num_blocks_final,
+      split_indices_block_size_data_partition_aligned);
+  } else if (bit_type == 16) {
+    const uint16_t* column_data = reinterpret_cast<const uint16_t*>(cuda_data_by_column_[column_index]);
+    LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf,
+      cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_,
+      left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index,
+      static_cast<bool>(missing_is_zero),
+      static_cast<bool>(missing_is_na),
+      static_cast<bool>(mfb_is_zero),
+      static_cast<bool>(mfb_is_na),
+      max_bin_to_left,
+      num_blocks_final,
+      split_indices_block_size_data_partition_aligned);
+  } else if (bit_type == 32) {
+    const uint32_t* column_data = reinterpret_cast<const uint32_t*>(cuda_data_by_column_[column_index]);
+    LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf,
+      cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_,
+      left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index,
+      static_cast<bool>(missing_is_zero),
+      static_cast<bool>(missing_is_na),
+      static_cast<bool>(mfb_is_zero),
+      static_cast<bool>(mfb_is_na),
+      max_bin_to_left,
+      num_blocks_final,
+      split_indices_block_size_data_partition_aligned);
+  }
+
+  //SynchronizeCUDADevice();
 }
 
 __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* block_to_left_offset_buffer,
@@ -1133,6 +1681,165 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b
   }
 }
 
+// Aggregates the per-block to-left/to-right counts produced by the bit vector kernels into global
+// prefix-sum offsets (each thread owns a contiguous range of blocks), then updates the data
+// start/end/count of the split leaf and of the newly created leaf.
+__global__ void AggregateBlockOffsetKernel2(const int* leaf_index, data_size_t* block_to_left_offset_buffer,
+  data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start,
+  data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices,
+  int* cuda_cur_num_leaves,
+  const int* best_split_feature, const uint32_t* best_split_threshold,
+  const uint8_t* best_split_default_left, const double* best_split_gain,
+  const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count,
+  const double* best_left_gain, const double* best_left_leaf_value,
+  const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count,
+  const double* best_right_gain, const double* best_right_leaf_value,
+  // for leaf splits information update
+  int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer,
+  double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer,
+  double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer,
+  const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer,
+  hist_t** smaller_leaf_cuda_hist_pointer_pointer,
+  int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer,
+  double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer,
+  double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer,
+  const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer,
+  hist_t** larger_leaf_cuda_hist_pointer_pointer,
+  const int* cuda_num_total_bin,
+  hist_t* cuda_hist, hist_t** cuda_hist_pool,
+
+  int* tree_split_leaf_index, int* tree_inner_feature_index, uint32_t* tree_threshold,
+  double* tree_left_output, double* tree_right_output, data_size_t* tree_left_count, data_size_t* tree_right_count,
+  double* tree_left_sum_hessian, double* tree_right_sum_hessian, double* tree_gain, uint8_t* tree_default_left,
+  double* data_partition_leaf_output, const data_size_t num_blocks) {
+  __shared__ uint32_t block_to_left_offset[AGGREGATE_BLOCK_SIZE + 2 +
+    (AGGREGATE_BLOCK_SIZE + 2) / NUM_BANKS_DATA_PARTITION];
+  __shared__ uint32_t block_to_right_offset[AGGREGATE_BLOCK_SIZE + 2 +
+    (AGGREGATE_BLOCK_SIZE + 2) / NUM_BANKS_DATA_PARTITION];
+  const int leaf_index_ref = *leaf_index;
+  const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref];
+  const unsigned int blockDim_x = blockDim.x;
+  const unsigned int threadIdx_x = threadIdx.x;
+  const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x);
+  const data_size_t num_blocks_plus_1 = num_blocks + 1;
+  const uint32_t num_blocks_per_thread = (num_blocks_plus_1 + blockDim_x - 1) / blockDim_x;
+  const uint32_t remain = num_blocks_plus_1 - ((num_blocks_per_thread - 1) * blockDim_x);
+  const uint32_t remain_offset = remain * num_blocks_per_thread;
+  uint32_t thread_start_block_index = 0;
+  uint32_t thread_end_block_index = 0;
+  if (threadIdx_x < remain) {
+    thread_start_block_index = threadIdx_x * num_blocks_per_thread;
+    thread_end_block_index = min(thread_start_block_index + num_blocks_per_thread, num_blocks_plus_1);
+  } else {
+    thread_start_block_index = remain_offset + (num_blocks_per_thread - 1) * (threadIdx_x - remain);
+    thread_end_block_index = min(thread_start_block_index + num_blocks_per_thread - 1, num_blocks_plus_1);
+  }
+  if (threadIdx.x == 0) {
+    block_to_right_offset_buffer[0] = 0;
+  }
+  __syncthreads();
+  for (uint32_t block_index = thread_start_block_index + 1; block_index < thread_end_block_index; ++block_index) {
+    block_to_left_offset_buffer[block_index] += block_to_left_offset_buffer[block_index - 1];
+    block_to_right_offset_buffer[block_index] += block_to_right_offset_buffer[block_index - 1];
+  }
+  __syncthreads();
+  if (thread_start_block_index < thread_end_block_index) {
+    block_to_left_offset[conflict_free_threadIdx_x] = block_to_left_offset_buffer[thread_end_block_index - 1];
+    block_to_right_offset[conflict_free_threadIdx_x] = block_to_right_offset_buffer[thread_end_block_index - 1];
+  } else {
+    block_to_left_offset[conflict_free_threadIdx_x] = 0;
+    block_to_right_offset[conflict_free_threadIdx_x] = 0;
+  }
+  __syncthreads();
+  PrefixSum_1024(block_to_left_offset, blockDim_x);
+  PrefixSum_1024(block_to_right_offset, blockDim_x);
+  __syncthreads();
+  const uint32_t to_left_total_count = block_to_left_offset[CONFLICT_FREE_INDEX(blockDim_x)];
+  const uint32_t to_left_thread_block_offset = block_to_left_offset[conflict_free_threadIdx_x];
+  const uint32_t to_right_thread_block_offset = block_to_right_offset[conflict_free_threadIdx_x] + to_left_total_count;
+  for (uint32_t block_index = thread_start_block_index; block_index < thread_end_block_index; ++block_index) {
+    block_to_left_offset_buffer[block_index] += to_left_thread_block_offset;
+    block_to_right_offset_buffer[block_index] += to_right_thread_block_offset;
+  }
+  __syncthreads();
+  if (blockIdx.x == 0 && threadIdx.x == 0) {
+    ++(*cuda_cur_num_leaves);
+    const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1;
+    const data_size_t old_leaf_data_end =
cuda_leaf_data_end[leaf_index_ref];
+    cuda_leaf_data_end[leaf_index_ref] = cuda_leaf_data_start[leaf_index_ref] + static_cast<data_size_t>(to_left_total_count);
+    cuda_leaf_num_data[leaf_index_ref] = static_cast<data_size_t>(to_left_total_count);
+    cuda_leaf_data_start[cur_max_leaf_index] = cuda_leaf_data_end[leaf_index_ref];
+    cuda_leaf_data_end[cur_max_leaf_index] = old_leaf_data_end;
+    cuda_leaf_num_data[cur_max_leaf_index] = num_data_in_leaf - static_cast<data_size_t>(to_left_total_count);
+  }
+}
+
+// Variant for the case where all block offsets fit into a single aggregation block: each thread
+// loads one per-block count into shared memory, a parallel prefix sum over num_blocks_aligned
+// entries produces the global offsets, and thread 0 of block 0 updates the leaf boundaries.
+__global__ void AggregateBlockOffsetKernel3(const int* leaf_index, data_size_t* block_to_left_offset_buffer,
+  data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start,
+  data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices,
+  int* cuda_cur_num_leaves,
+  const int* best_split_feature, const uint32_t* best_split_threshold,
+  const uint8_t* best_split_default_left, const double* best_split_gain,
+  const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count,
+  const double* best_left_gain, const double* best_left_leaf_value,
+  const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count,
+  const double* best_right_gain, const double* best_right_leaf_value,
+  // for leaf splits information update
+  int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer,
+  double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer,
+  double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer,
+  const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer,
+  hist_t** smaller_leaf_cuda_hist_pointer_pointer,
+  int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer,
+  double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer,
+  double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer,
+  const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer,
+  hist_t** larger_leaf_cuda_hist_pointer_pointer,
+  const int* cuda_num_total_bin,
+  hist_t* cuda_hist, hist_t** cuda_hist_pool,
+
+  int* tree_split_leaf_index, int* tree_inner_feature_index, uint32_t* tree_threshold,
+  double* tree_left_output, double* tree_right_output, data_size_t* tree_left_count, data_size_t* tree_right_count,
+  double* tree_left_sum_hessian, double* tree_right_sum_hessian, double* tree_gain, uint8_t* tree_default_left,
+  double* data_partition_leaf_output, const data_size_t num_blocks, const data_size_t num_blocks_aligned) {
+  __shared__ uint32_t block_to_left_offset[AGGREGATE_BLOCK_SIZE + 2 +
+    (AGGREGATE_BLOCK_SIZE + 2) / NUM_BANKS_DATA_PARTITION];
+  __shared__ uint32_t block_to_right_offset[AGGREGATE_BLOCK_SIZE + 2 +
+    (AGGREGATE_BLOCK_SIZE + 2) / NUM_BANKS_DATA_PARTITION];
+  const int leaf_index_ref = *leaf_index;
+  const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref];
+  const unsigned int threadIdx_x = threadIdx.x;
+  const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x);
+  const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1);
+  if (threadIdx_x < static_cast<unsigned int>(num_blocks)) {
+    block_to_left_offset[conflict_free_threadIdx_x] = block_to_left_offset_buffer[threadIdx_x + 1];
+    block_to_right_offset[conflict_free_threadIdx_x] = block_to_right_offset_buffer[threadIdx_x + 1];
+  } else {
+    block_to_left_offset[conflict_free_threadIdx_x] = 0;
+    block_to_right_offset[conflict_free_threadIdx_x] = 0;
+  }
+  __syncthreads();
+  PrefixSum(block_to_left_offset, num_blocks_aligned);
+  PrefixSum(block_to_right_offset, num_blocks_aligned);
+  __syncthreads();
+  const uint32_t to_left_total_count = block_to_left_offset[CONFLICT_FREE_INDEX(num_blocks_aligned)];
+  if (threadIdx_x < static_cast<unsigned int>(num_blocks)) {
+    block_to_left_offset_buffer[threadIdx_x + 1] = block_to_left_offset[conflict_free_threadIdx_x_plus_1];
+    block_to_right_offset_buffer[threadIdx_x + 1] = block_to_right_offset[conflict_free_threadIdx_x_plus_1] + to_left_total_count;
+  }
+  if (threadIdx_x == 0) {
+    block_to_right_offset_buffer[0] = to_left_total_count;
+  }
+  __syncthreads();
+  if (blockIdx.x == 0 && threadIdx.x == 0) {
+    ++(*cuda_cur_num_leaves);
+    const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1;
+    const data_size_t old_leaf_data_end = cuda_leaf_data_end[leaf_index_ref];
+    cuda_leaf_data_end[leaf_index_ref] = cuda_leaf_data_start[leaf_index_ref] + static_cast<data_size_t>(to_left_total_count);
+    cuda_leaf_num_data[leaf_index_ref] = static_cast<data_size_t>(to_left_total_count);
+    cuda_leaf_data_start[cur_max_leaf_index] = cuda_leaf_data_end[leaf_index_ref];
+    cuda_leaf_data_end[cur_max_leaf_index] = old_leaf_data_end;
+    cuda_leaf_num_data[cur_max_leaf_index] = num_data_in_leaf - static_cast<data_size_t>(to_left_total_count);
+  }
+}
+
 __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* block_to_left_offset_buffer,
   data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start,
   data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices,
@@ -1162,84 +1869,137 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo
   double* tree_left_sum_hessian, double* tree_right_sum_hessian, double* tree_gain, uint8_t* tree_default_left,
   double* data_partition_leaf_output, int* cuda_split_info_buffer) {
-  if (blockIdx.x == 0 && threadIdx.x == 0) {
-    const int leaf_index_ref = *leaf_index;
-    const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1;
-    const unsigned int to_left_total_cnt = cuda_leaf_num_data[leaf_index_ref];
-    const int cuda_num_total_bin_ref = *cuda_num_total_bin;
-    double* cuda_split_info_buffer_for_hessians = reinterpret_cast<double*>(cuda_split_info_buffer + 8);
-
+  const int leaf_index_ref = *leaf_index;
+  const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1;
+  const unsigned int to_left_total_cnt = cuda_leaf_num_data[leaf_index_ref];
+  const int cuda_num_total_bin_ref = *cuda_num_total_bin;
+  double* cuda_split_info_buffer_for_hessians = reinterpret_cast<double*>(cuda_split_info_buffer + 8);
+  const unsigned int global_thread_index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (global_thread_index == 0) {
     tree_split_leaf_index[cur_max_leaf_index - 1] = leaf_index_ref;
+  } else if (global_thread_index == 1) {
     tree_inner_feature_index[cur_max_leaf_index - 1] = best_split_feature[leaf_index_ref];
+  } else if (global_thread_index == 2) {
     tree_threshold[cur_max_leaf_index - 1] = best_split_threshold[leaf_index_ref];
+  } else if (global_thread_index == 3) {
     tree_left_output[cur_max_leaf_index - 1] = best_left_leaf_value[leaf_index_ref];
+  } else if (global_thread_index == 4) {
     tree_right_output[cur_max_leaf_index - 1] = best_right_leaf_value[leaf_index_ref];
+  } else if (global_thread_index == 5) {
     tree_left_count[cur_max_leaf_index - 1] = best_left_count[leaf_index_ref];
+  } else if (global_thread_index == 6) {
     tree_right_count[cur_max_leaf_index - 1] =
best_right_count[leaf_index_ref]; + } else if (global_thread_index == 7) { tree_left_sum_hessian[cur_max_leaf_index - 1] = best_left_sum_hessians[leaf_index_ref]; + } else if (global_thread_index == 8) { tree_right_sum_hessian[cur_max_leaf_index - 1] = best_right_sum_hessians[leaf_index_ref]; + } else if (global_thread_index == 9) { tree_gain[cur_max_leaf_index - 1] = best_split_gain[leaf_index_ref]; + } else if (global_thread_index == 10) { tree_default_left[cur_max_leaf_index - 1] = best_split_default_left[leaf_index_ref]; + } else if (global_thread_index == 11) { data_partition_leaf_output[leaf_index_ref] = best_left_leaf_value[leaf_index_ref]; + } else if (global_thread_index == 12) { data_partition_leaf_output[cur_max_leaf_index] = best_right_leaf_value[leaf_index_ref]; - + } else if (global_thread_index == 13) { cuda_split_info_buffer[0] = leaf_index_ref; + } else if (global_thread_index == 14) { cuda_split_info_buffer[1] = cuda_leaf_num_data[leaf_index_ref]; + } else if (global_thread_index == 15) { cuda_split_info_buffer[2] = cuda_leaf_data_start[leaf_index_ref]; + } else if (global_thread_index == 16) { cuda_split_info_buffer[3] = cur_max_leaf_index; + } else if (global_thread_index == 17) { cuda_split_info_buffer[4] = cuda_leaf_num_data[cur_max_leaf_index]; + } else if (global_thread_index == 18) { cuda_split_info_buffer[5] = cuda_leaf_data_start[cur_max_leaf_index]; + } else if (global_thread_index == 19) { cuda_split_info_buffer_for_hessians[0] = best_left_sum_hessians[leaf_index_ref]; + } else if (global_thread_index == 20) { cuda_split_info_buffer_for_hessians[1] = best_right_sum_hessians[leaf_index_ref]; - + } else if (global_thread_index == 21) { best_split_found[leaf_index_ref] = 0; + } else if (global_thread_index == 22) { best_split_found[cur_max_leaf_index] = 0; + } - if (cuda_leaf_num_data[leaf_index_ref] < cuda_leaf_num_data[cur_max_leaf_index]) { - *smaller_leaf_cuda_leaf_index_pointer = leaf_index_ref; + if (cuda_leaf_num_data[leaf_index_ref] < cuda_leaf_num_data[cur_max_leaf_index]) { + if (global_thread_index == 0) { + hist_t* parent_hist_ptr = cuda_hist_pool[leaf_index_ref]; + cuda_hist_pool[cur_max_leaf_index] = parent_hist_ptr; + cuda_hist_pool[leaf_index_ref] = cuda_hist + 2 * cur_max_leaf_index * cuda_num_total_bin_ref; + *smaller_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[leaf_index_ref]; + *larger_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[cur_max_leaf_index]; + } else if (global_thread_index == 1) { *smaller_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; + } else if (global_thread_index == 2) { *smaller_leaf_cuda_sum_of_hessians_pointer = best_left_sum_hessians[leaf_index_ref]; + } else if (global_thread_index == 3) { *smaller_leaf_cuda_num_data_in_leaf_pointer = to_left_total_cnt; + } else if (global_thread_index == 4) { *smaller_leaf_cuda_gain_pointer = best_left_gain[leaf_index_ref]; + } else if (global_thread_index == 5) { *smaller_leaf_cuda_leaf_value_pointer = best_left_leaf_value[leaf_index_ref]; - *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[leaf_index_ref]; - + } else if (global_thread_index == 6) { + *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices; + } else if (global_thread_index == 7) { *larger_leaf_cuda_leaf_index_pointer = cur_max_leaf_index; + } else if (global_thread_index == 8) { *larger_leaf_cuda_sum_of_gradients_pointer = best_right_sum_gradients[leaf_index_ref]; + } else if (global_thread_index == 9) { 
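+      // The left child is the smaller leaf in this branch, so the larger-leaf slots are filled from the right split's statistics.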
*larger_leaf_cuda_sum_of_hessians_pointer = best_right_sum_hessians[leaf_index_ref]; + } else if (global_thread_index == 10) { *larger_leaf_cuda_num_data_in_leaf_pointer = cuda_leaf_num_data[cur_max_leaf_index]; + } else if (global_thread_index == 11) { *larger_leaf_cuda_gain_pointer = best_right_gain[leaf_index_ref]; + } else if (global_thread_index == 12) { *larger_leaf_cuda_leaf_value_pointer = best_right_leaf_value[leaf_index_ref]; - *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[cur_max_leaf_index]; - - hist_t* parent_hist_ptr = cuda_hist_pool[leaf_index_ref]; - cuda_hist_pool[cur_max_leaf_index] = parent_hist_ptr; - cuda_hist_pool[leaf_index_ref] = cuda_hist + 2 * cur_max_leaf_index * cuda_num_total_bin_ref; - *smaller_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[leaf_index_ref]; - *larger_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[cur_max_leaf_index]; + } else if (global_thread_index == 13) { + *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_num_data[leaf_index_ref]; + } else if (global_thread_index == 14) { cuda_split_info_buffer[6] = leaf_index_ref; + } else if (global_thread_index == 15) { cuda_split_info_buffer[7] = cur_max_leaf_index; - } else { + } else if (global_thread_index == 16) { + *smaller_leaf_cuda_leaf_index_pointer = leaf_index_ref; + } + } else { + if (global_thread_index == 0) { *larger_leaf_cuda_leaf_index_pointer = leaf_index_ref; + } else if (global_thread_index == 1) { *larger_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; + } else if (global_thread_index == 2) { *larger_leaf_cuda_sum_of_hessians_pointer = best_left_sum_hessians[leaf_index_ref]; + } else if (global_thread_index == 3) { *larger_leaf_cuda_num_data_in_leaf_pointer = to_left_total_cnt; + } else if (global_thread_index == 4) { *larger_leaf_cuda_gain_pointer = best_left_gain[leaf_index_ref]; + } else if (global_thread_index == 5) { *larger_leaf_cuda_leaf_value_pointer = best_left_leaf_value[leaf_index_ref]; - *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[leaf_index_ref]; - + } else if (global_thread_index == 6) { + *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices; + } else if (global_thread_index == 7) { *smaller_leaf_cuda_leaf_index_pointer = cur_max_leaf_index; + } else if (global_thread_index == 8) { *smaller_leaf_cuda_sum_of_gradients_pointer = best_right_sum_gradients[leaf_index_ref]; + } else if (global_thread_index == 9) { *smaller_leaf_cuda_sum_of_hessians_pointer = best_right_sum_hessians[leaf_index_ref]; + } else if (global_thread_index == 10) { *smaller_leaf_cuda_num_data_in_leaf_pointer = cuda_leaf_num_data[cur_max_leaf_index]; + } else if (global_thread_index == 11) { *smaller_leaf_cuda_gain_pointer = best_right_gain[leaf_index_ref]; + } else if (global_thread_index == 12) { *smaller_leaf_cuda_leaf_value_pointer = best_right_leaf_value[leaf_index_ref]; - *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_data_start[cur_max_leaf_index]; - + } else if (global_thread_index == 13) { + *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_num_data[leaf_index_ref]; + } else if (global_thread_index == 14) { cuda_hist_pool[cur_max_leaf_index] = cuda_hist + 2 * cur_max_leaf_index * cuda_num_total_bin_ref; *smaller_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[cur_max_leaf_index]; + } else if (global_thread_index == 15) { 
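+      // The larger (left) child keeps reusing the parent's histogram buffer; only the smaller child is assigned a fresh one.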
*larger_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[leaf_index_ref]; + } else if (global_thread_index == 16) { cuda_split_info_buffer[6] = cur_max_leaf_index; + } else if (global_thread_index == 17) { cuda_split_info_buffer[7] = leaf_index_ref; } } @@ -1285,14 +2045,13 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ const uint32_t to_left_block_offset = block_to_left_offset_buffer[blockIdx.x]; if (threadIdx_x == 0) { thread_to_left_pos[0] = 0; + thread_to_right_pos[0] = 0; } __syncthreads(); PrefixSum(thread_to_left_pos, split_indices_block_size_data_partition); __syncthreads(); if (threadIdx_x > 0) { thread_to_right_pos[threadIdx_x] = (threadIdx_x - thread_to_left_pos[conflict_free_threadIdx_x_plus_1]); - } else { - thread_to_right_pos[threadIdx_x] = 0; } thread_to_right_pos[threadIdx_x + blockDim_x] = (threadIdx_x + blockDim_x - thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]); __syncthreads(); @@ -1314,20 +2073,14 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ } } -__global__ void CopyDataIndicesKernel(const int* leaf_index, - const int* cuda_cur_num_leaves, - const data_size_t* cuda_leaf_data_start, - const data_size_t* cuda_leaf_num_data, +__global__ void CopyDataIndicesKernel( + const data_size_t num_data_in_leaf, const data_size_t* out_data_indices_in_leaf, data_size_t* cuda_data_indices) { - const int leaf_index_ref = *leaf_index; - const data_size_t leaf_num_data_offset = cuda_leaf_data_start[leaf_index_ref]; - const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[leaf_index_ref] + cuda_leaf_num_data[(*cuda_cur_num_leaves) - 1]; const unsigned int threadIdx_x = threadIdx.x; const unsigned int global_thread_index = blockIdx.x * blockDim.x + threadIdx_x; - data_size_t* cuda_data_indices_in_leaf = cuda_data_indices + leaf_num_data_offset; - if (global_thread_index < num_data_in_leaf_ref) { - cuda_data_indices_in_leaf[global_thread_index] = out_data_indices_in_leaf[global_thread_index]; + if (global_thread_index < num_data_in_leaf) { + cuda_data_indices[global_thread_index] = out_data_indices_in_leaf[global_thread_index]; } } @@ -1351,7 +2104,7 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data hist_t** larger_leaf_cuda_hist_pointer_pointer, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, - int* smaller_leaf_index, int* larger_leaf_index) { + int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index) { const int min_num_blocks = num_data_in_leaf <= 100 ? 
1 : 80; const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; @@ -1361,35 +2114,73 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data split_indices_block_size_data_partition >>= 1; } const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; + int num_blocks_final_ref = num_blocks_final - 1; + int num_blocks_final_aligned = 1; + while (num_blocks_final_ref > 0) { + num_blocks_final_aligned <<= 1; + num_blocks_final_ref >>= 1; + } global_timer.Start("CUDADataPartition::AggregateBlockOffsetKernel"); - AggregateBlockOffsetKernel<<<1, split_indices_block_size_data_partition_aligned / 2>>>(leaf_index, cuda_block_data_to_left_offset_, - cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, - cuda_leaf_num_data_, cuda_data_indices_, - cuda_cur_num_leaves_, - best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, - best_left_sum_gradients, best_left_sum_hessians, best_left_count, - best_left_gain, best_left_leaf_value, - best_right_sum_gradients, best_right_sum_hessians, best_right_count, - best_right_gain, best_right_leaf_value, - smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, - smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, - smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, - smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - smaller_leaf_cuda_hist_pointer_pointer, - larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, - larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, - larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, - larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer, - cuda_num_total_bin_, - cuda_hist_, - cuda_hist_pool_, split_indices_block_size_data_partition_aligned, - - tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, - tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, - tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, - data_partition_leaf_output_); + if (num_blocks_final > AGGREGATE_BLOCK_SIZE) { + AggregateBlockOffsetKernel2<<<1, AGGREGATE_BLOCK_SIZE, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, + cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, + cuda_leaf_num_data_, cuda_data_indices_, + cuda_cur_num_leaves_, + best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, + best_left_sum_gradients, best_left_sum_hessians, best_left_count, + best_left_gain, best_left_leaf_value, + best_right_sum_gradients, best_right_sum_hessians, best_right_count, + best_right_gain, best_right_leaf_value, + + smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, + smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, + smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, + smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + smaller_leaf_cuda_hist_pointer_pointer, + larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, + 
larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, + larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, + larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + larger_leaf_cuda_hist_pointer_pointer, + cuda_num_total_bin_, + cuda_hist_, + cuda_hist_pool_, + + tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, + tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, + tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, + data_partition_leaf_output_, num_blocks_final); + } else { + AggregateBlockOffsetKernel3<<<1, num_blocks_final_aligned, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, + cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, + cuda_leaf_num_data_, cuda_data_indices_, + cuda_cur_num_leaves_, + best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, + best_left_sum_gradients, best_left_sum_hessians, best_left_count, + best_left_gain, best_left_leaf_value, + best_right_sum_gradients, best_right_sum_hessians, best_right_count, + best_right_gain, best_right_leaf_value, + + smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, + smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, + smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, + smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + smaller_leaf_cuda_hist_pointer_pointer, + larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, + larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, + larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, + larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + larger_leaf_cuda_hist_pointer_pointer, + cuda_num_total_bin_, + cuda_hist_, + cuda_hist_pool_, + + tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, + tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, + tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, + data_partition_leaf_output_, num_blocks_final, num_blocks_final_aligned); + } SynchronizeCUDADevice(); global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); global_timer.Start("CUDADataPartition::SplitInnerKernel"); @@ -1398,16 +2189,13 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_out_data_indices_in_leaf_, split_indices_block_size_data_partition_aligned); + //SynchronizeCUDADevice(); global_timer.Stop("CUDADataPartition::SplitInnerKernel"); - global_timer.Start("CUDADataPartition::CopyDataIndicesKernel"); - CopyDataIndicesKernel<<>>( - leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, cuda_data_indices_); - global_timer.Stop("CUDADataPartition::CopyDataIndicesKernel"); global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); - SplitTreeStructureKernel<<<1, 1, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, + SplitTreeStructureKernel<<<4, 6, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, - cuda_leaf_num_data_, cuda_data_indices_, + 
cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, cuda_cur_num_leaves_, best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, best_left_sum_gradients, best_left_sum_hessians, best_left_count, @@ -1428,24 +2216,29 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data cuda_num_total_bin_, cuda_hist_, cuda_hist_pool_, split_indices_block_size_data_partition_aligned, - + tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, data_partition_leaf_output_, cuda_split_info_buffer_); + //SynchronizeCUDADevice(); global_timer.Stop("CUDADataPartition::SplitTreeStructureKernel"); std::vector cpu_split_info_buffer(12); const double* cpu_sum_hessians_info = reinterpret_cast(cpu_split_info_buffer.data() + 8); + global_timer.Start("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); CopyFromCUDADeviceToHostAsync(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 12, cuda_streams_[0]); + global_timer.Stop("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); SynchronizeCUDADevice(); - const int left_leaf_index = cpu_split_info_buffer[0]; const data_size_t left_leaf_num_data = cpu_split_info_buffer[1]; const data_size_t left_leaf_data_start = cpu_split_info_buffer[2]; - const int right_leaf_index = cpu_split_info_buffer[3]; const data_size_t right_leaf_num_data = cpu_split_info_buffer[4]; + global_timer.Start("CUDADataPartition::CopyDataIndicesKernel"); + CopyDataIndicesKernel<<>>( + left_leaf_num_data + right_leaf_num_data, cuda_out_data_indices_in_leaf_, cuda_data_indices_ + left_leaf_data_start); + global_timer.Stop("CUDADataPartition::CopyDataIndicesKernel"); + const int left_leaf_index = cpu_split_info_buffer[0]; + const int right_leaf_index = cpu_split_info_buffer[3]; const data_size_t right_leaf_data_start = cpu_split_info_buffer[5]; - //Log::Warning("################################# left_leaf_num_data = %d, right_leaf_num_data = %d #################################", - // left_leaf_num_data, right_leaf_num_data); (*cpu_leaf_num_data)[left_leaf_index] = left_leaf_num_data; (*cpu_leaf_data_start)[left_leaf_index] = left_leaf_data_start; (*cpu_leaf_num_data)[right_leaf_index] = right_leaf_num_data; @@ -1476,26 +2269,34 @@ void CUDADataPartition::LaunchPrefixSumKernel(uint32_t* cuda_elements) { __global__ void AddPredictionToScoreKernel(const double* data_partition_leaf_output, const data_size_t* num_data_in_leaf, const data_size_t* data_indices_in_leaf, - const data_size_t* leaf_data_start, const double learning_rate, double* cuda_scores) { + const data_size_t* leaf_data_start, const double learning_rate, double* cuda_scores, + const int* cuda_data_index_to_leaf_index, const data_size_t num_data) { const unsigned int threadIdx_x = threadIdx.x; const unsigned int blockIdx_x = blockIdx.x; const unsigned int blockDim_x = blockDim.x; - const data_size_t num_data = num_data_in_leaf[blockIdx_x]; - const data_size_t* data_indices = data_indices_in_leaf + leaf_data_start[blockIdx_x]; - const double leaf_prediction_value = data_partition_leaf_output[blockIdx_x] * learning_rate; - for (unsigned int offset = 0; offset < static_cast(num_data); offset += blockDim_x) { + //const data_size_t num_data = num_data_in_leaf[blockIdx_x]; + //const data_size_t* data_indices = data_indices_in_leaf + leaf_data_start[blockIdx_x]; + const int data_index = static_cast(blockIdx_x * blockDim_x + 
threadIdx_x); + //const double leaf_prediction_value = data_partition_leaf_output[blockIdx_x] * learning_rate; + /*for (unsigned int offset = 0; offset < static_cast(num_data); offset += blockDim_x) { const data_size_t inner_data_index = static_cast(offset + threadIdx_x); if (inner_data_index < num_data) { const data_size_t data_index = data_indices[inner_data_index]; cuda_scores[data_index] += leaf_prediction_value; } + }*/ + if (data_index < num_data) { + const int leaf_index = cuda_data_index_to_leaf_index[data_index]; + const double leaf_prediction_value = data_partition_leaf_output[leaf_index] * learning_rate; + cuda_scores[data_index] += leaf_prediction_value; } } void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double learning_rate, double* cuda_scores) { global_timer.Start("CUDADataPartition::AddPredictionToScoreKernel"); - AddPredictionToScoreKernel<<>>(data_partition_leaf_output_, - cuda_leaf_num_data_, cuda_data_indices_, cuda_leaf_data_start_, learning_rate, cuda_scores); + const int num_blocks = (num_data_ + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; + AddPredictionToScoreKernel<<>>(data_partition_leaf_output_, + cuda_leaf_num_data_, cuda_data_indices_, cuda_leaf_data_start_, learning_rate, cuda_scores, cuda_data_index_to_leaf_index_, num_data_); SynchronizeCUDADevice(); global_timer.Stop("CUDADataPartition::AddPredictionToScoreKernel"); } diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 7d24508f7a50..b639dd8facc2 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -18,6 +18,7 @@ #define SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION (512) #define NUM_BANKS_DATA_PARTITION (32) #define LOG_NUM_BANKS_DATA_PARTITION (5) +#define AGGREGATE_BLOCK_SIZE (1024) namespace LightGBM { @@ -57,7 +58,7 @@ class CUDADataPartition { const std::vector& cpu_leaf_best_split_threshold, const std::vector& cpu_leaf_best_split_default_left, int* smaller_leaf_index, int* larger_leaf_index, - const int cpu_leaf_index); + const int cpu_leaf_index, const int cur_max_leaf_index); void CUDACheck( const int smaller_leaf_index, @@ -192,7 +193,8 @@ class CUDADataPartition { void GenDataToLeftBitVector(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, - const uint8_t split_default_left, const data_size_t leaf_data_start); + const uint8_t split_default_left, const data_size_t leaf_data_start, + const int left_leaf_index, const int right_leaf_index); void SplitInner(const int* leaf_index, const data_size_t num_data_in_leaf, const int* best_split_feature, const uint32_t* best_split_threshold, @@ -214,7 +216,7 @@ class CUDADataPartition { hist_t** larger_leaf_cuda_hist_pointer_pointer, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, - int* smaller_leaf_index, int* larger_leaf_index); + int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index); // kernel launch functions void LaunchFillDataIndicesBeforeTrain(); @@ -239,11 +241,23 @@ class CUDADataPartition { hist_t** larger_leaf_cuda_hist_pointer_pointer, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, - int* smaller_leaf_index, int* larger_leaf_index); + int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index); void LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, const int 
split_feature_index, const uint32_t split_threshold, - const uint8_t split_default_left, const data_size_t leaf_data_start); + const uint8_t split_default_left, const data_size_t leaf_data_start, + const int left_leaf_index, const int right_leaf_index); + + template + void LaunchUpdateDataIndexToLeafIndexKernel(const data_size_t cuda_leaf_data_start, + const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, + const uint32_t th, const BIN_TYPE* column_data, + // values from feature + const uint32_t t_zero_bin, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index, const int missing_default_leaf_index, + const bool missing_is_zero, const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, const bool max_to_left, + const int num_blocks, const int block_size); void LaunchPrefixSumKernel(uint32_t* cuda_elements); @@ -291,6 +305,7 @@ class CUDADataPartition { int* cuda_cur_num_leaves_; // for split uint8_t* cuda_data_to_left_; + int* cuda_data_index_to_leaf_index_; data_size_t* cuda_block_data_to_left_offset_; data_size_t* cuda_block_data_to_right_offset_; data_size_t* cuda_out_data_indices_in_leaf_; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 339a080e4bc5..cca5bc8be091 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -25,6 +25,8 @@ CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, feature_group_bin_offsets_.emplace_back(offset); offset += train_data->FeatureGroupNumBin(group_id); } + need_fix_histogram_features_.clear(); + need_fix_histogram_features_num_bin_aligend_.clear(); for (int feature_index = 0; feature_index < train_data->num_features(); ++feature_index) { const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); const uint32_t most_freq_bin = bin_mapper->GetMostFreqBin(); @@ -32,6 +34,14 @@ CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, feature_mfb_offsets_.emplace_back(1); } else { feature_mfb_offsets_.emplace_back(0); + need_fix_histogram_features_.emplace_back(feature_index); + uint32_t num_bin_ref = static_cast(bin_mapper->num_bin()) - 1; + uint32_t num_bin_aligned = 1; + while (num_bin_ref > 0) { + num_bin_aligned <<= 1; + num_bin_ref >>= 1; + } + need_fix_histogram_features_num_bin_aligend_.emplace_back(num_bin_aligned); } feature_num_bins_.emplace_back(static_cast(bin_mapper->num_bin())); feature_most_freq_bins_.emplace_back(most_freq_bin); @@ -79,6 +89,22 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStat DivideCUDAFeatureGroups(train_data, share_state); InitCUDAData(share_state); + + cuda_streams_.resize(5); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[2])); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[3])); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[4])); + + InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size()); + InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, need_fix_histogram_features_num_bin_aligend_.data(), + need_fix_histogram_features_num_bin_aligend_.size()); + + const int 
max_block_dim_y = NUM_THRADS_PER_BLOCK / max_num_column_per_partition_; + const int max_grid_dim_y = std::max(min_grid_dim_y_, + ((num_data_ + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + max_block_dim_y - 1) / max_block_dim_y); + AllocateCUDAMemory(num_total_bin_ * 2 * max_grid_dim_y, &block_cuda_hist_buffer_); } void CUDAHistogramConstructor::InitCUDAData(TrainingShareStates* share_state) { @@ -275,20 +301,6 @@ void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_num_data_in_smaller_leaf, cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data, cuda_smaller_leaf_hist, num_data_in_smaller_leaf); SynchronizeCUDADevice(); - /*std::vector root_hist(20000); - CopyFromCUDADeviceToHost(root_hist.data(), cuda_hist_, 20000); - for (int real_feature_index = 0; real_feature_index < train_data_->num_total_features(); ++real_feature_index) { - const int inner_feature_index = train_data_->InnerFeatureIndex(real_feature_index); - if (inner_feature_index >= 0) { - const uint32_t feature_hist_start = feature_hist_offsets_[inner_feature_index]; - const uint32_t feature_hist_end = feature_hist_offsets_[inner_feature_index + 1]; - Log::Warning("real_feature_index = %d, inner_feature_index = %d", real_feature_index, inner_feature_index); - for (uint32_t hist_position = feature_hist_start; hist_position < feature_hist_end; ++hist_position) { - Log::Warning("hist_position = %d, bin_in_feature = %d, grad = %f, hess = %f", - hist_position, hist_position - feature_hist_start, root_hist[hist_position * 2], root_hist[hist_position * 2 + 1]); - } - } - }*/ global_timer.Start("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); LaunchSubtractHistogramKernel(cuda_smaller_leaf_index, cuda_larger_leaf_index, cuda_smaller_leaf_sum_gradients, cuda_smaller_leaf_sum_hessians, @@ -302,8 +314,7 @@ void CUDAHistogramConstructor::CalcConstructHistogramKernelDim( *block_dim_x = max_num_column_per_partition_; *block_dim_y = NUM_THRADS_PER_BLOCK / max_num_column_per_partition_; *grid_dim_x = num_feature_partitions_; - const int min_grid_dim_y = 10; - *grid_dim_y = std::max(min_grid_dim_y, + *grid_dim_y = std::max(min_grid_dim_y_, ((num_data_in_smaller_leaf + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + (*block_dim_y) - 1) / (*block_dim_y)); //Log::Warning("block_dim_x = %d, block_dim_y = %d, grid_dim_x = %d, grid_dim_y = %d", *block_dim_x, *block_dim_y, *grid_dim_x, *grid_dim_y); } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index dab927e1e9e0..cfe83321d51d 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -47,6 +47,36 @@ __device__ void PrefixSum(hist_t* elements, unsigned int n) { } } +__device__ void ReduceSumHistogramConstructor(hist_t* array, const size_t size) { + const unsigned int threadIdx_x = threadIdx.x; + const size_t atomic_size = size / 4; + for (int s = 1; s < atomic_size; s <<= 1) { + if (threadIdx_x % (2 * s) == 0 && (threadIdx_x + s) < size) { + array[threadIdx_x] += array[threadIdx_x + s]; + } + __syncthreads(); + } + if (threadIdx_x > 0 && threadIdx_x % atomic_size == 0) { + atomicAdd_block(array, array[threadIdx_x]); + } + __syncthreads(); +} + +__device__ void ReduceSumHistogramConstructorMerge(hist_t* array, const size_t size) { + const unsigned int threadIdx_x = (threadIdx.x % USED_HISTOGRAM_BUFFER_NUM); + const size_t atomic_size = 
size / 4; + for (int s = 1; s < atomic_size; s <<= 1) { + if (threadIdx_x % (2 * s) == 0 && (threadIdx_x + s) < size) { + array[threadIdx_x] += array[threadIdx_x + s]; + } + __syncthreads(); + } + if (threadIdx_x > 0 && threadIdx_x % atomic_size == 0) { + atomicAdd_block(array, array[threadIdx_x]); + } + __syncthreads(); +} + template __global__ void CUDAConstructHistogramDenseKernel( const int* leaf_index, @@ -75,13 +105,9 @@ __global__ void CUDAConstructHistogramDenseKernel( const int num_columns_in_partition = partition_column_end - partition_column_start; const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; - const uint32_t num_bins_in_partition = partition_hist_end - partition_hist_start; - const uint32_t num_items_per_thread = (2 * num_bins_in_partition + num_threads_per_block - 1) / num_threads_per_block; + const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; - const uint32_t thread_start = thread_idx * num_items_per_thread; - const uint32_t thread_end = thread_start + num_items_per_thread > num_bins_in_partition * 2 ? - num_bins_in_partition * 2 : thread_start + num_items_per_thread; - for (uint32_t i = thread_start; i < thread_end; ++i) { + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { shared_hist[i] = 0.0f; } __syncthreads(); @@ -111,7 +137,7 @@ __global__ void CUDAConstructHistogramDenseKernel( } __syncthreads(); hist_t* feature_histogram_ptr = (*feature_histogram) + (partition_hist_start << 1); - for (uint32_t i = thread_start; i < thread_end; ++i) { + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); } } @@ -142,13 +168,9 @@ __global__ void CUDAConstructHistogramSparseKernel( const BIN_TYPE* data_ptr = data + partition_ptr[blockIdx.x]; const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; - const uint32_t num_bins_in_partition = partition_hist_end - partition_hist_start; - const uint32_t num_items_per_thread = (2 * num_bins_in_partition + num_threads_per_block - 1) / num_threads_per_block; + const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; - const uint32_t thread_start = thread_idx * num_items_per_thread; - const uint32_t thread_end = thread_start + num_items_per_thread > num_bins_in_partition * 2 ? 
- num_bins_in_partition * 2 : thread_start + num_items_per_thread; - for (uint32_t i = thread_start; i < thread_end; ++i) { + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { shared_hist[i] = 0.0f; } __syncthreads(); @@ -179,11 +201,171 @@ __global__ void CUDAConstructHistogramSparseKernel( } __syncthreads(); hist_t* feature_histogram_ptr = (*feature_histogram) + (partition_hist_start << 1); - for (uint32_t i = thread_start; i < thread_end; ++i) { + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); } } +template +__global__ void CUDAConstructHistogramDenseKernel2( + const int* leaf_index, + const score_t* cuda_gradients, + const score_t* cuda_hessians, + const data_size_t** data_indices_ptr, + const int* num_feature_groups, + const data_size_t* leaf_num_data, + const BIN_TYPE* data, + const uint32_t* column_hist_offsets, + const uint32_t* column_hist_offsets_full, + const int* feature_partition_column_index_offsets, + const data_size_t num_data, + hist_t* histogram_buffer, + const int total_num_bin) { + + const int leaf_index_ref = *leaf_index; + const int dim_y = static_cast(gridDim.y * blockDim.y); + const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf_ref + dim_y - 1) / dim_y; + const data_size_t* data_indices_ref = *data_indices_ptr; + __shared__ float shared_hist[SHRAE_HIST_SIZE]; + const unsigned int num_threads_per_block = blockDim.x * blockDim.y; + const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; + const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1]; + const BIN_TYPE* data_ptr = data + partition_column_start * num_data; + const int num_columns_in_partition = partition_column_end - partition_column_start; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; + const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; + const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + hist_t* feature_histogram_ptr = histogram_buffer + total_num_bin * (blockIdx.y % USED_HISTOGRAM_BUFFER_NUM) * 2 + (partition_hist_start << 1); + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + shared_hist[i] = 0.0f; + //feature_histogram_ptr[i] = 0.0f; + } + __syncthreads(); + const unsigned int threadIdx_y = threadIdx.y; + const unsigned int blockIdx_y = blockIdx.y; + const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; + const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf_ref - block_start, num_data_per_thread * static_cast(blockDim.y))); + const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; + const data_size_t remainder = block_num_data % blockDim.y; + const data_size_t num_iteration_this = remainder == 0 ? 
num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); + data_size_t inner_data_index = static_cast(threadIdx_y); + const int column_index = static_cast(threadIdx.x) + partition_column_start; + if (threadIdx.x < static_cast(num_columns_in_partition)) { + float* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); + for (data_size_t i = 0; i < num_iteration_this; ++i) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; + const score_t grad = cuda_gradients[data_index]; + const score_t hess = cuda_hessians[data_index]; + const uint32_t bin = static_cast(data_ptr[data_index * num_columns_in_partition + threadIdx.x]); + const uint32_t pos = bin << 1; + float* pos_ptr = shared_hist_ptr + pos; + atomicAdd_block(pos_ptr, grad); + atomicAdd_block(pos_ptr + 1, hess); + inner_data_index += blockDim.y; + } + } + __syncthreads(); + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); + } +} + +template +__global__ void CUDAConstructHistogramSparseKernel2( + const int* leaf_index, + const score_t* cuda_gradients, + const score_t* cuda_hessians, + const data_size_t** data_indices_ptr, + const int* num_feature_groups, + const data_size_t* leaf_num_data, + const BIN_TYPE* data, + const DATA_PTR_TYPE* row_ptr, + const DATA_PTR_TYPE* partition_ptr, + const uint32_t* column_hist_offsets_full, + const data_size_t num_data, + hist_t* histogram_buffer, + const int total_num_bin) { + + const int leaf_index_ref = *leaf_index; + const int dim_y = static_cast(gridDim.y * blockDim.y); + const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf_ref + dim_y - 1) / dim_y; + const data_size_t* data_indices_ref = *data_indices_ptr; + __shared__ float shared_hist[SHRAE_HIST_SIZE]; + const unsigned int num_threads_per_block = blockDim.x * blockDim.y; + const DATA_PTR_TYPE* block_row_ptr = row_ptr + blockIdx.x * (num_data + 1); + const BIN_TYPE* data_ptr = data + partition_ptr[blockIdx.x]; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; + const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; + const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + shared_hist[i] = 0.0f; + } + __syncthreads(); + const unsigned int threadIdx_y = threadIdx.y; + const unsigned int blockIdx_y = blockIdx.y; + const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; + const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf_ref - block_start, num_data_per_thread * static_cast(blockDim.y))); + const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; + const data_size_t remainder = block_num_data % blockDim.y; + const data_size_t num_iteration_this = remainder == 0 ? 
num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); + data_size_t inner_data_index = static_cast(threadIdx_y); + for (data_size_t i = 0; i < num_iteration_this; ++i) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; + const DATA_PTR_TYPE row_start = block_row_ptr[data_index]; + const DATA_PTR_TYPE row_end = block_row_ptr[data_index + 1]; + const DATA_PTR_TYPE row_size = row_end - row_start; + if (threadIdx.x < row_size) { + const score_t grad = cuda_gradients[data_index]; + const score_t hess = cuda_hessians[data_index]; + const uint32_t bin = static_cast(data_ptr[row_start + threadIdx.x]); + const uint32_t pos = bin << 1; + float* pos_ptr = shared_hist + pos; + atomicAdd_block(pos_ptr, grad); + atomicAdd_block(pos_ptr + 1, hess); + } + inner_data_index += blockDim.y; + } + __syncthreads(); + hist_t* feature_histogram_ptr = histogram_buffer + total_num_bin * blockIdx.y * 2 + (partition_hist_start << 1); + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); + } +} + +__global__ void MergeHistogramBufferKernel( + hist_t* histogram_buffer, + const int num_total_bin, + const int num_bin_per_block, + hist_t** output_histogram_ptr) { + hist_t* output_histogram = *output_histogram_ptr; + __shared__ hist_t gradient_buffer[1024]; + __shared__ hist_t hessian_buffer[1024]; + const uint32_t threadIdx_x = threadIdx.x; + const uint32_t blockIdx_x = blockIdx.x; + const uint32_t bin_index = threadIdx_x / USED_HISTOGRAM_BUFFER_NUM + num_bin_per_block * blockIdx_x; + const uint32_t histogram_position = (num_total_bin * (threadIdx_x % USED_HISTOGRAM_BUFFER_NUM) + bin_index) << 1; + if (bin_index < num_total_bin) { + gradient_buffer[threadIdx_x] = histogram_buffer[histogram_position]; + hessian_buffer[threadIdx_x] = histogram_buffer[histogram_position + 1]; + } + const uint32_t start = threadIdx_x / USED_HISTOGRAM_BUFFER_NUM * USED_HISTOGRAM_BUFFER_NUM; + __syncthreads(); + ReduceSumHistogramConstructorMerge(gradient_buffer + start, USED_HISTOGRAM_BUFFER_NUM); + ReduceSumHistogramConstructorMerge(hessian_buffer + start, USED_HISTOGRAM_BUFFER_NUM); + __syncthreads(); + const unsigned int global_histogram_position = bin_index << 1; + if (threadIdx_x % USED_HISTOGRAM_BUFFER_NUM == 0 && bin_index < num_total_bin) { + output_histogram[global_histogram_position] = gradient_buffer[threadIdx_x]; + output_histogram[global_histogram_position + 1] = hessian_buffer[threadIdx_x]; + } +} + void CUDAHistogramConstructor::LaunchConstructHistogramKernel( const int* cuda_smaller_leaf_index, const data_size_t* cuda_smaller_leaf_num_data, @@ -201,7 +383,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( if (is_sparse_) { if (bit_type_ == 8) { if (data_ptr_bit_type_ == 16) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, cuda_row_ptr_uint16_t_, @@ -209,7 +391,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_column_hist_offsets_full_, num_data_); } else if (data_ptr_bit_type_ == 32) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, 
cuda_hessians_, cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, cuda_row_ptr_uint32_t_, @@ -217,7 +399,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_column_hist_offsets_full_, num_data_); } else if (data_ptr_bit_type_ == 64) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, cuda_row_ptr_uint64_t_, @@ -227,7 +409,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( } } else if (bit_type_ == 16) { if (data_ptr_bit_type_ == 16) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint16_t_, cuda_row_ptr_uint16_t_, @@ -235,7 +417,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_column_hist_offsets_full_, num_data_); } else if (data_ptr_bit_type_ == 32) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint16_t_, cuda_row_ptr_uint32_t_, @@ -243,7 +425,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_column_hist_offsets_full_, num_data_); } else if (data_ptr_bit_type_ == 64) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint16_t_, cuda_row_ptr_uint64_t_, @@ -253,7 +435,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( } } else if (bit_type_ == 32) { if (data_ptr_bit_type_ == 16) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint32_t_, cuda_row_ptr_uint16_t_, @@ -261,7 +443,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_column_hist_offsets_full_, num_data_); } else if (data_ptr_bit_type_ == 32) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint32_t_, cuda_row_ptr_uint32_t_, @@ -269,7 +451,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_column_hist_offsets_full_, num_data_); } else if (data_ptr_bit_type_ == 64) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, cuda_data_indices_in_smaller_leaf, 
cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint32_t_, cuda_row_ptr_uint64_t_, @@ -280,21 +462,21 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( } } else { if (bit_type_ == 8) { - CUDAConstructHistogramDenseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + CUDAConstructHistogramDenseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, cuda_column_hist_offsets_, cuda_column_hist_offsets_full_, cuda_feature_partition_column_index_offsets_, num_data_); } else if (bit_type_ == 16) { - CUDAConstructHistogramDenseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + CUDAConstructHistogramDenseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint16_t_, cuda_column_hist_offsets_, cuda_column_hist_offsets_full_, cuda_feature_partition_column_index_offsets_, num_data_); } else if (bit_type_ == 32) { - CUDAConstructHistogramDenseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + CUDAConstructHistogramDenseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint32_t_, cuda_column_hist_offsets_, cuda_column_hist_offsets_full_, @@ -304,6 +486,138 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( } } +void CUDAHistogramConstructor::LaunchConstructHistogramKernel2( + const int* cuda_smaller_leaf_index, + const data_size_t* cuda_smaller_leaf_num_data, + const data_size_t** cuda_data_indices_in_smaller_leaf, + const data_size_t* cuda_leaf_num_data, + hist_t** cuda_leaf_hist, + const data_size_t num_data_in_smaller_leaf) { + int grid_dim_x = 0; + int grid_dim_y = 0; + int block_dim_x = 0; + int block_dim_y = 0; + CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_in_smaller_leaf); + dim3 grid_dim(grid_dim_x, grid_dim_y); + dim3 block_dim(block_dim_x, block_dim_y); + SetCUDAMemory(block_cuda_hist_buffer_, 0, 2 * num_total_bin_ * USED_HISTOGRAM_BUFFER_NUM); + global_timer.Start("CUDAConstructHistogramKernel2"); + if (is_sparse_) { + if (bit_type_ == 8) { + if (data_ptr_bit_type_ == 16) { + CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint8_t_, + cuda_row_ptr_uint16_t_, + cuda_partition_ptr_uint16_t_, + cuda_column_hist_offsets_full_, + num_data_, block_cuda_hist_buffer_, num_total_bin_); + } else if (data_ptr_bit_type_ == 32) { + CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint8_t_, + cuda_row_ptr_uint32_t_, + cuda_partition_ptr_uint32_t_, + cuda_column_hist_offsets_full_, + num_data_, block_cuda_hist_buffer_, num_total_bin_); + } else if (data_ptr_bit_type_ == 64) { + CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint8_t_, + cuda_row_ptr_uint64_t_, + cuda_partition_ptr_uint64_t_, + cuda_column_hist_offsets_full_, + num_data_, 
block_cuda_hist_buffer_, num_total_bin_); + } + } else if (bit_type_ == 16) { + if (data_ptr_bit_type_ == 16) { + CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint16_t_, + cuda_row_ptr_uint16_t_, + cuda_partition_ptr_uint16_t_, + cuda_column_hist_offsets_full_, + num_data_, block_cuda_hist_buffer_, num_total_bin_); + } else if (data_ptr_bit_type_ == 32) { + CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint16_t_, + cuda_row_ptr_uint32_t_, + cuda_partition_ptr_uint32_t_, + cuda_column_hist_offsets_full_, + num_data_, block_cuda_hist_buffer_, num_total_bin_); + } else if (data_ptr_bit_type_ == 64) { + CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint16_t_, + cuda_row_ptr_uint64_t_, + cuda_partition_ptr_uint64_t_, + cuda_column_hist_offsets_full_, + num_data_, block_cuda_hist_buffer_, num_total_bin_); + } + } else if (bit_type_ == 32) { + if (data_ptr_bit_type_ == 16) { + CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint32_t_, + cuda_row_ptr_uint16_t_, + cuda_partition_ptr_uint16_t_, + cuda_column_hist_offsets_full_, + num_data_, block_cuda_hist_buffer_, num_total_bin_); + } else if (data_ptr_bit_type_ == 32) { + CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint32_t_, + cuda_row_ptr_uint32_t_, + cuda_partition_ptr_uint32_t_, + cuda_column_hist_offsets_full_, + num_data_, block_cuda_hist_buffer_, num_total_bin_); + } else if (data_ptr_bit_type_ == 64) { + CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_data_uint32_t_, + cuda_row_ptr_uint64_t_, + cuda_partition_ptr_uint64_t_, + cuda_column_hist_offsets_full_, + num_data_, block_cuda_hist_buffer_, num_total_bin_); + } + } + } else { + if (bit_type_ == 8) { + CUDAConstructHistogramDenseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, + cuda_column_hist_offsets_, + cuda_column_hist_offsets_full_, + cuda_feature_partition_column_index_offsets_, + num_data_, block_cuda_hist_buffer_, num_total_bin_); + } else if (bit_type_ == 16) { + CUDAConstructHistogramDenseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint16_t_, + cuda_column_hist_offsets_, + cuda_column_hist_offsets_full_, + cuda_feature_partition_column_index_offsets_, + num_data_, block_cuda_hist_buffer_, num_total_bin_); + } else if (bit_type_ == 32) { + CUDAConstructHistogramDenseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, + cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint32_t_, + cuda_column_hist_offsets_, + 
cuda_column_hist_offsets_full_, + cuda_feature_partition_column_index_offsets_, + num_data_, block_cuda_hist_buffer_, num_total_bin_); + } + } + SynchronizeCUDADevice(); + global_timer.Stop("CUDAConstructHistogramKernel2"); + const int merge_block_dim = 1024; + const int num_bin_per_block = merge_block_dim / USED_HISTOGRAM_BUFFER_NUM; + const int num_blocks = (num_total_bin_ + num_bin_per_block - 1) / num_bin_per_block; + global_timer.Start("MergeHistogramBufferKernel"); + MergeHistogramBufferKernel<<>>( + block_cuda_hist_buffer_, num_total_bin_, num_bin_per_block, cuda_leaf_hist); + SynchronizeCUDADevice(); + global_timer.Stop("MergeHistogramBufferKernel"); +} + __global__ void SubtractHistogramKernel(const int* /*cuda_smaller_leaf_index*/, const int* cuda_larger_leaf_index, const uint8_t* cuda_feature_mfb_offsets, const uint32_t* cuda_feature_num_bins, const int* cuda_num_total_bin, @@ -320,58 +634,46 @@ __global__ void SubtractHistogramKernel(const int* /*cuda_smaller_leaf_index*/, } } -__global__ void FixHistogramKernel(const int* cuda_smaller_leaf_index, - const int* cuda_larger_leaf_index, - const uint32_t* cuda_feature_num_bins, const int* cuda_num_features, - const int* /*cuda_num_total_bin*/, const uint32_t* cuda_feature_hist_offsets, +__global__ void FixHistogramKernel( + const uint32_t* cuda_feature_num_bins, + const uint32_t* cuda_feature_hist_offsets, const uint32_t* cuda_feature_most_freq_bins, const double* smaller_leaf_sum_gradients, const double* smaller_leaf_sum_hessians, - const double* larger_leaf_sum_gradients, const double* larger_leaf_sum_hessians, - hist_t** cuda_smaller_leaf_hist, hist_t** cuda_larger_leaf_hist) { - const int cuda_num_features_ref = *cuda_num_features; + hist_t** cuda_smaller_leaf_hist, + const int* cuda_need_fix_histogram_features, + const uint32_t* cuda_need_fix_histogram_features_num_bin_aligned) { const unsigned int blockIdx_x = blockIdx.x; - const int feature_index = blockIdx_x % cuda_num_features_ref; - const bool larger_or_smaller = static_cast(blockIdx_x / cuda_num_features_ref); - const int leaf_index_ref = larger_or_smaller ? *cuda_larger_leaf_index : *cuda_smaller_leaf_index; + const int feature_index = cuda_need_fix_histogram_features[blockIdx_x]; __shared__ double hist_gradients[FIX_HISTOGRAM_SHARED_MEM_SIZE + 1]; __shared__ double hist_hessians[FIX_HISTOGRAM_SHARED_MEM_SIZE + 1]; - if (leaf_index_ref >= 0) { - const uint32_t feature_hist_offset = cuda_feature_hist_offsets[feature_index]; - const uint32_t most_freq_bin = cuda_feature_most_freq_bins[feature_index]; - if (most_freq_bin > 0) { - const double leaf_sum_gradients = larger_or_smaller ? *larger_leaf_sum_gradients : *smaller_leaf_sum_gradients; - const double leaf_sum_hessians = larger_or_smaller ? *larger_leaf_sum_hessians : *smaller_leaf_sum_hessians; - hist_t* feature_hist = larger_or_smaller ? 
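A note on the kernel-2 path above: LaunchConstructHistogramKernel2 zeroes block_cuda_hist_buffer_ (USED_HISTOGRAM_BUFFER_NUM copies of the 2 * num_total_bin_ histogram), has the construction kernels accumulate into those copies, and then sums the copies into the leaf histogram with MergeHistogramBufferKernel. The following is only a minimal sketch of that merge step under assumed names (merge_block_histograms, block_buffers); it is not the kernel used in the patch.

#include <cuda_runtime.h>

// Hypothetical merge: each of num_buffers per-block histograms stores
// num_total_bin (gradient, hessian) pairs; sum them into the output histogram.
__global__ void merge_block_histograms(const double* block_buffers,
                                        double* out_hist,
                                        int num_total_bin,
                                        int num_buffers) {
  const int bin = blockIdx.x * blockDim.x + threadIdx.x;
  if (bin < num_total_bin) {
    double grad = 0.0, hess = 0.0;
    for (int b = 0; b < num_buffers; ++b) {
      const double* buf = block_buffers + static_cast<size_t>(b) * 2 * num_total_bin;
      grad += buf[2 * bin];
      hess += buf[2 * bin + 1];
    }
    out_hist[2 * bin] = grad;
    out_hist[2 * bin + 1] = hess;
  }
}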
(*cuda_larger_leaf_hist) + feature_hist_offset * 2 : - (*cuda_smaller_leaf_hist) + feature_hist_offset * 2; - const unsigned int threadIdx_x = threadIdx.x; - const uint32_t num_bin = cuda_feature_num_bins[feature_index]; - if (threadIdx_x < num_bin) { - if (threadIdx_x == most_freq_bin) { - hist_gradients[threadIdx_x] = 0.0f; - hist_hessians[threadIdx_x] = 0.0f; - } else { - hist_gradients[threadIdx_x] = feature_hist[threadIdx_x << 1]; - hist_hessians[threadIdx_x] = feature_hist[(threadIdx_x << 1) + 1]; - } - } else { - hist_gradients[threadIdx_x] = 0.0f; - hist_hessians[threadIdx_x] = 0.0f; - } - uint32_t num_bin_aligned = 1; - uint32_t num_bin_to_shift = num_bin - 1; - while (num_bin_to_shift > 0) { - num_bin_to_shift >>= 1; - num_bin_aligned <<= 1; - } - __syncthreads(); - PrefixSum(hist_gradients, num_bin_aligned); - PrefixSum(hist_hessians, num_bin_aligned); - __syncthreads(); - if (threadIdx_x == most_freq_bin) { - feature_hist[most_freq_bin << 1] = leaf_sum_gradients - hist_gradients[num_bin_aligned]; - feature_hist[(most_freq_bin << 1) + 1] = leaf_sum_hessians - hist_hessians[num_bin_aligned]; - } + const uint32_t num_bin_aligned = cuda_need_fix_histogram_features_num_bin_aligned[blockIdx_x]; + const uint32_t feature_hist_offset = cuda_feature_hist_offsets[feature_index]; + const uint32_t most_freq_bin = cuda_feature_most_freq_bins[feature_index]; + const double leaf_sum_gradients = *smaller_leaf_sum_gradients; + const double leaf_sum_hessians = *smaller_leaf_sum_hessians; + hist_t* feature_hist = (*cuda_smaller_leaf_hist) + feature_hist_offset * 2; + const unsigned int threadIdx_x = threadIdx.x; + const uint32_t num_bin = cuda_feature_num_bins[feature_index]; + const uint32_t hist_pos = threadIdx_x << 1; + if (threadIdx_x < num_bin) { + if (threadIdx_x == most_freq_bin) { + hist_gradients[threadIdx_x] = 0.0f; + hist_hessians[threadIdx_x] = 0.0f; + } else { + hist_gradients[threadIdx_x] = feature_hist[hist_pos]; + hist_hessians[threadIdx_x] = feature_hist[hist_pos + 1]; } + } else { + hist_gradients[threadIdx_x] = 0.0f; + hist_hessians[threadIdx_x] = 0.0f; + } + __syncthreads(); + ReduceSumHistogramConstructor(hist_gradients, num_bin_aligned); + ReduceSumHistogramConstructor(hist_hessians, num_bin_aligned); + __syncthreads(); + if (threadIdx_x == most_freq_bin) { + feature_hist[hist_pos] = leaf_sum_gradients - hist_gradients[0]; + feature_hist[hist_pos + 1] = leaf_sum_hessians - hist_hessians[0]; } } @@ -381,18 +683,21 @@ void CUDAHistogramConstructor::LaunchSubtractHistogramKernel(const int* cuda_sma hist_t** cuda_smaller_leaf_hist, hist_t** cuda_larger_leaf_hist) { const int num_subtract_threads = 2 * num_total_bin_; const int num_subtract_blocks = (num_subtract_threads + SUBTRACT_BLOCK_SIZE - 1) / SUBTRACT_BLOCK_SIZE; - SubtractHistogramKernel<<>>( + global_timer.Start("CUDAHistogramConstructor::FixHistogramKernel"); + FixHistogramKernel<<>>( + cuda_feature_num_bins_, + cuda_feature_hist_offsets_, + cuda_feature_most_freq_bins_, smaller_leaf_sum_gradients, smaller_leaf_sum_hessians, + cuda_smaller_leaf_hist, cuda_need_fix_histogram_features_, + cuda_need_fix_histogram_features_num_bin_aligned_); + //SynchronizeCUDADevice(); + global_timer.Stop("CUDAHistogramConstructor::FixHistogramKernel"); + global_timer.Start("CUDAHistogramConstructor::SubtractHistogramKernel"); + SubtractHistogramKernel<<>>( cuda_smaller_leaf_index, cuda_larger_leaf_index, cuda_feature_mfb_offsets_, cuda_feature_num_bins_, cuda_num_total_bin_, cuda_smaller_leaf_hist, cuda_larger_leaf_hist); - 
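The rewritten FixHistogramKernel above reconstructs the entry of the most frequent bin, which is skipped while the histogram is built, as the leaf's gradient/hessian totals minus the reduced sum of every other bin. Below is a minimal sketch of that idea with hypothetical names, using a plain loop in place of the shared-memory ReduceSumHistogramConstructor.

// feature_hist holds num_bins (gradient, hessian) pairs for one feature of one leaf.
__global__ void fix_most_freq_bin(double* feature_hist,
                                  int num_bins,
                                  int most_freq_bin,
                                  double leaf_sum_gradients,
                                  double leaf_sum_hessians) {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    double grad = 0.0, hess = 0.0;
    for (int bin = 0; bin < num_bins; ++bin) {
      if (bin == most_freq_bin) continue;  // this slot was never accumulated
      grad += feature_hist[2 * bin];
      hess += feature_hist[2 * bin + 1];
    }
    feature_hist[2 * most_freq_bin] = leaf_sum_gradients - grad;
    feature_hist[2 * most_freq_bin + 1] = leaf_sum_hessians - hess;
  }
}

The patch also launches one block per entry of cuda_need_fix_histogram_features_ rather than two blocks per feature as before, so only features that actually need the fix are visited.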
SynchronizeCUDADevice(); - FixHistogramKernel<<<2 * num_features_, FIX_HISTOGRAM_BLOCK_SIZE>>>( - cuda_smaller_leaf_index, cuda_larger_leaf_index, - cuda_feature_num_bins_, cuda_num_features_, - cuda_num_total_bin_, cuda_feature_hist_offsets_, - cuda_feature_most_freq_bins_, smaller_leaf_sum_gradients, smaller_leaf_sum_hessians, - larger_leaf_sum_gradients, larger_leaf_sum_hessians, - cuda_smaller_leaf_hist, cuda_larger_leaf_hist); - SynchronizeCUDADevice(); + //SynchronizeCUDADevice(); + global_timer.Stop("CUDAHistogramConstructor::SubtractHistogramKernel"); } __global__ void GetOrderedGradientsKernel(const data_size_t num_data_in_leaf, const data_size_t** cuda_data_indices_in_leaf, diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 8e2348fd5fa6..45b8c14942ec 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -24,6 +24,7 @@ #define SUBTRACT_BLOCK_SIZE (1024) #define FIX_HISTOGRAM_SHARED_MEM_SIZE (1024) #define FIX_HISTOGRAM_BLOCK_SIZE (512) +#define USED_HISTOGRAM_BUFFER_NUM (8) namespace LightGBM { @@ -93,6 +94,13 @@ class CUDAHistogramConstructor { hist_t** cuda_leaf_hist, const data_size_t num_data_in_smaller_leaf); + void LaunchConstructHistogramKernel2(const int* cuda_leaf_index, + const data_size_t* cuda_smaller_leaf_num_data, + const data_size_t** cuda_data_indices_in_leaf, + const data_size_t* cuda_leaf_num_data, + hist_t** cuda_leaf_hist, + const data_size_t num_data_in_smaller_leaf); + void LaunchSubtractHistogramKernel(const int* cuda_smaller_leaf_index, const int* cuda_larger_leaf_index, const double* smaller_leaf_sum_gradients, const double* smaller_leaf_sum_hessians, const double* larger_leaf_sum_gradients, const double* larger_leaf_sum_hessians, @@ -139,6 +147,11 @@ class CUDAHistogramConstructor { uint8_t data_ptr_bit_type_; uint8_t bit_type_; const Dataset* train_data_; + std::vector cuda_streams_; + std::vector need_fix_histogram_features_; + std::vector need_fix_histogram_features_num_bin_aligend_; + + const int min_grid_dim_y_ = 10; // CUDA memory, held by this object uint32_t* cuda_feature_group_bin_offsets_; @@ -147,6 +160,7 @@ class CUDAHistogramConstructor { uint32_t* cuda_feature_hist_offsets_; uint32_t* cuda_feature_most_freq_bins_; hist_t* cuda_hist_; + hist_t* block_cuda_hist_buffer_; int* cuda_num_total_bin_; int* cuda_num_feature_groups_; uint8_t* cuda_data_uint8_t_; @@ -164,6 +178,8 @@ class CUDAHistogramConstructor { int* cuda_feature_partition_column_index_offsets_; uint32_t* cuda_column_hist_offsets_; uint32_t* cuda_column_hist_offsets_full_; + int* cuda_need_fix_histogram_features_; + uint32_t* cuda_need_fix_histogram_features_num_bin_aligned_; // CUDA memory, held by other objects const score_t* cuda_gradients_; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index bb152f4bdfa9..9dfbc00c63bd 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -248,13 +248,15 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, leaf_best_split_default_left_, &smaller_leaf_index_, &larger_leaf_index_, - best_leaf_index_); + best_leaf_index_, + num_leaves); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); global_timer.Stop("NewCUDATreeLearner::Split"); split_data_indices_time += duration.count(); ++num_leaves; } + SynchronizeCUDADevice(); const auto end = 
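For context on SubtractHistogramKernel: the larger child's histogram is never built directly; it is obtained by subtracting the smaller child's histogram from the parent's, bin by bin. In the patch this presumably happens in place on the buffer that already holds the parent histogram; the sketch below keeps three separate buffers and hypothetical names for clarity.

// Element-wise sibling subtraction over 2 * num_total_bin (gradient, hessian) values.
__global__ void subtract_sibling_histogram(const double* parent_hist,
                                           const double* smaller_leaf_hist,
                                           double* larger_leaf_hist,
                                           int num_entries) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num_entries) {
    larger_leaf_hist[i] = parent_hist[i] - smaller_leaf_hist[i];
  }
}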
std::chrono::steady_clock::now(); const double duration = (static_cast>(end - start)).count(); const auto build_tree_start = std::chrono::steady_clock::now(); From 320c449ddc35ba100b8b3526c0b950f61dd35ac3 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 28 Jun 2021 09:07:29 +0000 Subject: [PATCH 034/166] split optimization --- .../cuda/cuda_best_split_finder.cu | 52 +- src/treelearner/cuda/cuda_data_partition.cpp | 54 +- src/treelearner/cuda/cuda_data_partition.cu | 2322 ++++++++++++----- src/treelearner/cuda/cuda_data_partition.hpp | 133 + .../cuda/cuda_histogram_constructor.cu | 7 +- .../cuda/cuda_histogram_constructor.hpp | 2 +- 6 files changed, 1833 insertions(+), 737 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 1aff354fbaa8..3e462cae97ab 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -623,56 +623,6 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_best_split_right_output_, cuda_best_split_found_); } - /*const int num_blocks = (larger_leaf_index >= 0 && !larger_only) ? num_tasks_ * 2 : num_tasks_; - FindBestSplitsForLeafKernel<<>>( - // input feature information - cuda_feature_hist_offsets_, - cuda_feature_mfb_offsets_, - cuda_feature_default_bins_, - cuda_feature_missing_type_, - cuda_feature_num_bins_, - // input task information - larger_only, - num_tasks_, - cuda_task_feature_index_, - cuda_task_reverse_, - cuda_task_skip_default_bin_, - cuda_task_na_as_missing_, - cuda_task_out_default_left_, - // input leaf information - smaller_leaf_index, - smaller_leaf_splits->cuda_gain(), - smaller_leaf_splits->cuda_sum_of_gradients(), - smaller_leaf_splits->cuda_sum_of_hessians(), - smaller_leaf_splits->cuda_num_data_in_leaf(), - smaller_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), - larger_leaf_index, - larger_leaf_splits->cuda_gain(), - larger_leaf_splits->cuda_sum_of_gradients(), - larger_leaf_splits->cuda_sum_of_hessians(), - larger_leaf_splits->cuda_num_data_in_leaf(), - larger_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), - // configuration parameter values - min_data_in_leaf_, - min_sum_hessian_in_leaf_, - min_gain_to_split_, - lambda_l1_, - lambda_l2_, - // output parameters - cuda_best_split_threshold_, - cuda_best_split_default_left_, - cuda_best_split_gain_, - cuda_best_split_left_sum_gradient_, - cuda_best_split_left_sum_hessian_, - cuda_best_split_left_count_, - cuda_best_split_left_gain_, - cuda_best_split_left_output_, - cuda_best_split_right_sum_gradient_, - cuda_best_split_right_sum_hessian_, - cuda_best_split_right_count_, - cuda_best_split_right_gain_, - cuda_best_split_right_output_, - cuda_best_split_found_);*/ } __device__ void ReduceBestSplit(uint8_t* found, double* gain, uint32_t* shared_read_index, @@ -1077,7 +1027,7 @@ __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, if (threadIdx_x < static_cast(cur_num_valid_threads)) { const int start = num_leaves_per_thread * threadIdx_x; const int end = min(start + num_leaves_per_thread, cuda_cur_num_leaves_ref); - for (int leaf_index = start; leaf_index < end; ++leaf_index) { + for (int leaf_index = threadIdx_x; leaf_index < cuda_cur_num_leaves_ref; leaf_index += cur_num_valid_threads) { const double leaf_best_gain = cuda_leaf_best_split_gain[leaf_index]; if (cuda_leaf_best_split_found[leaf_index] && leaf_best_gain > thread_best_gain[threadIdx_x]) { thread_best_gain[threadIdx_x] = leaf_best_gain; diff --git 
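The FindBestFromAllSplitsKernel hunk above replaces the per-thread contiguous range of leaves (start .. end) with a loop that strides by cur_num_valid_threads, so the threads cover the leaves in an interleaved fashion. A reduced, single-block sketch of that access pattern with hypothetical names; it assumes blockDim.x <= 256, and the final serial pass stands in for whatever reduction the real kernel performs on thread_best_gain.

#include <cfloat>

__global__ void find_best_leaf(const double* leaf_best_gain,
                               const int* leaf_split_found,
                               int num_leaves,
                               int* out_best_leaf) {
  __shared__ double best_gain[256];
  __shared__ int best_leaf[256];
  best_gain[threadIdx.x] = -DBL_MAX;
  best_leaf[threadIdx.x] = -1;
  // thread-strided loop over leaves, as in the patched kernel
  for (int leaf = threadIdx.x; leaf < num_leaves; leaf += blockDim.x) {
    if (leaf_split_found[leaf] && leaf_best_gain[leaf] > best_gain[threadIdx.x]) {
      best_gain[threadIdx.x] = leaf_best_gain[leaf];
      best_leaf[threadIdx.x] = leaf;
    }
  }
  __syncthreads();
  if (threadIdx.x == 0) {  // simple serial pass over the per-thread candidates
    double gain = -DBL_MAX;
    int winner = -1;
    for (int t = 0; t < blockDim.x; ++t) {
      if (best_leaf[t] >= 0 && best_gain[t] > gain) {
        gain = best_gain[t];
        winner = best_leaf[t];
      }
    }
    *out_best_leaf = winner;
  }
}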
a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 324337bb0f1d..3efe449da60b 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -87,7 +87,8 @@ void CUDADataPartition::Init(const Dataset* train_data) { AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_num_data_); InitCUDAValueFromConstant(&cuda_num_total_bin_, num_total_bin_); InitCUDAValueFromConstant(&cuda_cur_num_leaves_, 1); - AllocateCUDAMemory(static_cast(num_data_), &cuda_data_to_left_); + // leave some space for alignment + AllocateCUDAMemory(static_cast(num_data_) + 1024 * 8, &cuda_data_to_left_); AllocateCUDAMemory(static_cast(num_data_), &cuda_data_index_to_leaf_index_); AllocateCUDAMemory(static_cast(max_num_split_indices_blocks_) + 1, &cuda_block_data_to_left_offset_); AllocateCUDAMemory(static_cast(max_num_split_indices_blocks_) + 1, &cuda_block_data_to_right_offset_); @@ -414,7 +415,17 @@ void CUDADataPartition::GenDataToLeftBitVector(const data_size_t num_data_in_lea const int split_feature_index, const uint32_t split_threshold, const uint8_t split_default_left, const data_size_t leaf_data_start, const int left_leaf_index, const int right_leaf_index) { - LaunchGenDataToLeftBitVectorKernel(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start, left_leaf_index, right_leaf_index); + LaunchGenDataToLeftBitVectorKernel2(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start, left_leaf_index, right_leaf_index); + /*if (num_data_in_leaf == 10500000) { + std::vector cpu_bit_vector(num_data_in_leaf, 0); + CopyFromCUDADeviceToHost(cpu_bit_vector.data(), cuda_data_to_left_, num_data_in_leaf); + for (size_t i = 0; i < 100; ++i) { + Log::Warning("cpu_bit_vector[%d] = %d", i, cpu_bit_vector[i]); + } + for (size_t i = 10500000 - 100; i < 10500000; ++i) { + Log::Warning("cpu_bit_vector[%d] = %d", i, cpu_bit_vector[i]); + } + }*/ } void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_data_in_leaf, @@ -438,7 +449,7 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index) { - LaunchSplitInnerKernel(leaf_index, num_data_in_leaf, + LaunchSplitInnerKernel2(leaf_index, num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, best_left_sum_gradients, best_left_sum_hessians, best_left_count, best_left_gain, best_left_leaf_value, @@ -475,6 +486,43 @@ void CUDADataPartition::CUDACheck( LaunchCUDACheckKernel(smaller_leaf_index, larger_leaf_index, num_data_in_leaf, smaller_leaf_splits, larger_leaf_splits, gradients, hessians); } +void CUDADataPartition::CalcBlockDim(const data_size_t num_data_in_leaf, + int* grid_dim, + int* block_dim) { + const int num_threads_per_block = SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION; + const int min_grid_dim = num_data_in_leaf <= 100 ? 
1 : 10; + const int num_data_per_block = (num_threads_per_block * 8); + const int num_blocks = std::max(min_grid_dim, (num_data_in_leaf + num_data_per_block - 1) / num_data_per_block); + const int num_threads_per_block_final = (num_data_in_leaf + (num_blocks * 8) - 1) / (num_blocks * 8); + int num_threads_per_block_final_ref = num_threads_per_block_final - 1; + CHECK_GT(num_threads_per_block_final_ref, 0); + int num_threads_per_block_final_aligned = 1; + while (num_threads_per_block_final_ref > 0) { + num_threads_per_block_final_aligned <<= 1; + num_threads_per_block_final_ref >>= 1; + } + const int num_blocks_final = (num_data_in_leaf + (num_threads_per_block_final_aligned * 8) - 1) / (num_threads_per_block_final_aligned * 8); + *grid_dim = num_blocks_final; + *block_dim = num_threads_per_block_final_aligned; +} + +void CUDADataPartition::CalcBlockDimInCopy(const data_size_t num_data_in_leaf, + int* grid_dim, + int* block_dim) { + const int min_num_blocks = num_data_in_leaf <= 100 ? 1 : 80; + const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; + CHECK_GT(split_indices_block_size_data_partition, 0); + int split_indices_block_size_data_partition_aligned = 1; + while (split_indices_block_size_data_partition > 0) { + split_indices_block_size_data_partition_aligned <<= 1; + split_indices_block_size_data_partition >>= 1; + } + const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; + *grid_dim = num_blocks_final; + *block_dim = split_indices_block_size_data_partition_aligned; +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index b83bfdaf5ba7..7b32a1eba269 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -348,13 +348,13 @@ __device__ void PrepareOffset(const data_size_t num_data_in_leaf_ref, const uint const int split_indices_block_size_data_partition, uint16_t* thread_to_left_offset_cnt) { const unsigned int threadIdx_x = threadIdx.x; - const unsigned int blockDim_x = blockDim.x / 2; + const unsigned int blockDim_x = blockDim.x; __syncthreads(); ReduceSum(thread_to_left_offset_cnt, split_indices_block_size_data_partition); __syncthreads(); if (threadIdx_x == 0) { - const data_size_t num_data_in_block = (blockIdx.x + 1) * blockDim_x * 2 <= num_data_in_leaf_ref ? static_cast(blockDim_x * 2) : - num_data_in_leaf_ref - static_cast(blockIdx.x * blockDim_x * 2); + const data_size_t num_data_in_block = (blockIdx.x + 1) * blockDim_x <= num_data_in_leaf_ref ? 
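Both CalcBlockDim and CalcBlockDimInCopy above end by rounding the per-block work size up to a power of two, which the block-wide reduction and prefix-sum routines here appear to require. The shift loop they use does the same job as this small host-side helper (hypothetical name):

#include <cassert>

// Round a positive value up to the next power of two: 1 -> 1, 5 -> 8, 512 -> 512.
static int NextPowerOfTwo(int value) {
  assert(value > 0);
  int result = 1;
  while (result < value) {
    result <<= 1;
  }
  return result;
}

With it, CalcBlockDimInCopy is roughly: block = NextPowerOfTwo(ceil(num_data_in_leaf / num_blocks)) and grid = ceil(num_data_in_leaf / block).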
static_cast(blockDim_x) : + num_data_in_leaf_ref - static_cast(blockIdx.x * blockDim_x); if (num_data_in_block > 0) { const data_size_t data_to_left = static_cast(thread_to_left_offset_cnt[0]); block_to_left_offset_buffer[blockIdx.x + 1] = data_to_left; @@ -568,14 +568,14 @@ void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel(const data_size_t } } -// missing_is_zero = 0, missing_is_na = 0, min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel0_1_2_3(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, +// min_bin_ref < max_bin_ref +template +__global__ void GenDataToLeftBitVectorKernel0(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t split_default_to_left, const uint8_t /*split_missing_default_to_left*/, + const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, const int split_indices_block_size_data_partition, @@ -588,18 +588,24 @@ __global__ void GenDataToLeftBitVectorKernel0_1_2_3(const int best_split_feature if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); - if (bin < min_bin_ref || bin > max_bin_ref) { - cuda_data_to_left[local_data_index] = split_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; + if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || + (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + } else if ((bin < min_bin_ref || bin > max_bin_ref)) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; + } } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -609,9 +615,9 @@ __global__ void GenDataToLeftBitVectorKernel0_1_2_3(const int best_split_feature split_indices_block_size_data_partition, thread_to_left_offset_cnt); } -// missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 0, min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel4(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, +// min_bin_ref 
< max_bin_ref +template +__global__ void GenDataToLeftBitVectorKernelPacked0(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature @@ -622,29 +628,30 @@ __global__ void GenDataToLeftBitVectorKernel4(const int best_split_feature_ref, const int split_indices_block_size_data_partition, int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; + __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION * 4]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); - if (bin == t_zero_bin) { + if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || + (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; - } else if (bin < min_bin_ref || bin > max_bin_ref) { - cuda_data_to_left[local_data_index] = split_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; + } else if ((bin < min_bin_ref || bin > max_bin_ref)) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; + } } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } else { cuda_data_to_left[local_data_index] = 1; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -654,14 +661,14 @@ __global__ void GenDataToLeftBitVectorKernel4(const int best_split_feature_ref, split_indices_block_size_data_partition, thread_to_left_offset_cnt); } -// missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 1, min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel5(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, +// min_bin_ref == max_bin_ref +template +__global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, const uint32_t th, const int num_features_ref, const 
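The hunks above and below collapse the sixteen hand-written GenDataToLeftBitVectorKernel variants into templates over four boolean flags (MISSING_IS_ZERO, MISSING_IS_NA, MFB_IS_ZERO, MFB_IS_NA), so every missing-value configuration is resolved at compile time instead of being a separate kernel. A reduced sketch of the per-row decision for the general min_bin < max_bin case, with hypothetical names; the actual kernels additionally write the bit vector and per-block counters, and they test (MISSING_IS_ZERO || MFB_IS_ZERO) where this sketch uses the && form.

#include <cstdint>

template <bool MISSING_IS_ZERO, bool MISSING_IS_NA, bool MFB_IS_ZERO, bool MFB_IS_NA>
__device__ __forceinline__ uint8_t BinToLeft(uint32_t bin, uint32_t t_zero_bin,
                                             uint32_t min_bin, uint32_t max_bin,
                                             uint32_t threshold,
                                             uint8_t default_left,
                                             uint8_t missing_default_left) {
  if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) ||
      (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin)) {
    // the bin explicitly encodes a missing value
    return missing_default_left;
  } else if (bin < min_bin || bin > max_bin) {
    // rows with the most frequent bin are not stored in this column
    return ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO))
               ? missing_default_left : default_left;
  } else {
    return bin > threshold ? 0 : 1;
  }
}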
BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, + const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, const int split_indices_block_size_data_partition, @@ -674,18 +681,30 @@ __global__ void GenDataToLeftBitVectorKernel5(const int best_split_feature_ref, if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); - if (bin < min_bin_ref || bin > max_bin_ref) { + if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; - } else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + } else if (bin != max_bin_ref) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; + } } else { - cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + if (MISSING_IS_NA && !MFB_IS_NA) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + } else { + if (MAX_TO_LEFT) { + cuda_data_to_left[local_data_index] = 1; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + } else { + cuda_data_to_left[local_data_index] = 0; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + } + } } } else { thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; @@ -695,9 +714,9 @@ __global__ void GenDataToLeftBitVectorKernel5(const int best_split_feature_ref, split_indices_block_size_data_partition, thread_to_left_offset_cnt); } -// missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 0, min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel6(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, +// min_bin_ref < max_bin_ref +template +__global__ void GenDataToLeftBitVectorKernel0_2(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature @@ -708,505 +727,1299 @@ __global__ void GenDataToLeftBitVectorKernel6(const int best_split_feature_ref, const int split_indices_block_size_data_partition, int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, const int default_leaf_index, 
const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; + __shared__ uint16_t thread_to_left_offset_cnt[(SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION << 1) + + ((SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION << 1) + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; - const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + uint8_t bit0 = 0; + uint8_t bit1 = 0; + uint8_t bit2 = 0; + uint8_t bit3 = 0; + uint8_t bit4 = 0; + uint8_t bit5 = 0; + uint8_t bit6 = 0; + uint8_t bit7 = 0; + unsigned int local_data_index = ((blockIdx.x * blockDim.x) << 3) + (threadIdx.x << 2); if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); - if (bin == max_bin_ref) { + if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || + (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; - } else if (bin < min_bin_ref || bin > max_bin_ref) { - cuda_data_to_left[local_data_index] = split_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; + bit0 = split_missing_default_to_left; + } else if ((bin < min_bin_ref || bin > max_bin_ref)) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit0 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit0 = split_default_to_left; + } } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + bit0 = 0; } else { cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + bit0 = 1; } } else { - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + cuda_data_to_left[local_data_index] = 0; } - __syncthreads(); - PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, - split_indices_block_size_data_partition, thread_to_left_offset_cnt); -} - -// missing_is_zero = 0, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 1, min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel7(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int 
split_indices_block_size_data_partition, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; - const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + ++local_data_index; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); - if (bin < min_bin_ref || bin > max_bin_ref) { + if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || + (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; + bit1 = split_missing_default_to_left; + } else if ((bin < min_bin_ref || bin > max_bin_ref)) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit1 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit1 = split_default_to_left; + } } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + bit1 = 0; } else { cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + bit1 = 1; } } else { - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + cuda_data_to_left[local_data_index] = 0; } - __syncthreads(); - PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, - split_indices_block_size_data_partition, thread_to_left_offset_cnt); -} - -// missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 0, mfb_is_na = 0, min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel8(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; - const unsigned int local_data_index = 
blockIdx.x * blockDim.x + threadIdx.x; + ++local_data_index; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); - if (bin == t_zero_bin) { + if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || + (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; - } else if (bin < min_bin_ref || bin > max_bin_ref) { - cuda_data_to_left[local_data_index] = split_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; + bit2 = split_missing_default_to_left; + } else if ((bin < min_bin_ref || bin > max_bin_ref)) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit2 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit2 = split_default_to_left; + } } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + bit2 = 0; } else { cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + bit2 = 1; } } else { - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + cuda_data_to_left[local_data_index] = 0; } - __syncthreads(); - PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, - split_indices_block_size_data_partition, thread_to_left_offset_cnt); -} - -// missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 0, mfb_is_na = 1, min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel9(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; - const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + ++local_data_index; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); - if (bin == t_zero_bin) { + if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || + (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { 
cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; - } else if (bin < min_bin_ref || bin > max_bin_ref) { - cuda_data_to_left[local_data_index] = split_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; + bit3 = split_missing_default_to_left; + } else if ((bin < min_bin_ref || bin > max_bin_ref)) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit3 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit3 = split_default_to_left; + } } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + bit3 = 0; } else { cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + bit3 = 1; } } else { - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + cuda_data_to_left[local_data_index] = 0; } - __syncthreads(); - PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, - split_indices_block_size_data_partition, thread_to_left_offset_cnt); -} - -// missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 1, mfb_is_na = 0, min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel10(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; - const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + local_data_index = ((blockIdx.x * blockDim.x) << 3) + ((threadIdx.x + blockDim.x) << 2); if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); - if (bin < min_bin_ref || bin > max_bin_ref) { + if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || + (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 
split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; + bit4 = split_missing_default_to_left; + } else if ((bin < min_bin_ref || bin > max_bin_ref)) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit4 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit4 = split_default_to_left; + } } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + bit4 = 0; } else { cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + bit4 = 1; } } else { - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + cuda_data_to_left[local_data_index] = 0; } - __syncthreads(); - PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, - split_indices_block_size_data_partition, thread_to_left_offset_cnt); -} - -// missing_is_zero = 1, missing_is_na = 0, mfb_is_zero = 1, mfb_is_na = 1, min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel11(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; - const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + ++local_data_index; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); - if (bin < min_bin_ref || bin > max_bin_ref) { + if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || + (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; + bit5 = split_missing_default_to_left; + } else if ((bin < min_bin_ref || bin > max_bin_ref)) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit5 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit5 = 
split_default_to_left; + } } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + bit5 = 0; } else { cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + bit5 = 1; } } else { - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + cuda_data_to_left[local_data_index] = 0; } - __syncthreads(); - PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, - split_indices_block_size_data_partition, thread_to_left_offset_cnt); -} - -// missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 0, min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel12(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; - const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + ++local_data_index; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); - if (bin == t_zero_bin || bin == max_bin_ref) { + if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || + (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; - } else if (bin < min_bin_ref || bin > max_bin_ref) { - cuda_data_to_left[local_data_index] = split_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; + bit6 = split_missing_default_to_left; + } else if ((bin < min_bin_ref || bin > max_bin_ref)) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit6 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit6 = split_default_to_left; + } } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - //cuda_data_index_to_leaf_index[global_data_index] = 
right_leaf_index; + bit6 = 0; } else { cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + bit6 = 1; } } else { - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + cuda_data_to_left[local_data_index] = 0; } - __syncthreads(); - PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, - split_indices_block_size_data_partition, thread_to_left_offset_cnt); -} - -// missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 0, mfb_is_na = 1, min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel13(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; - const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + ++local_data_index; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); - if (bin == t_zero_bin) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; - } else if (bin < min_bin_ref || bin > max_bin_ref) { + if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || + (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; + bit7 = split_missing_default_to_left; + } else if ((bin < min_bin_ref || bin > max_bin_ref)) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit7 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit7 = split_default_to_left; + } } else if (bin > th) { cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + bit7 = 0; } else { cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - //cuda_data_index_to_leaf_index[global_data_index] = 
left_leaf_index; + bit7 = 1; } } else { - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + cuda_data_to_left[local_data_index] = 0; } + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = bit0 + bit1 + bit2 + bit3; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x + blockDim.x)] = bit4 + bit5 + bit6 + bit7; __syncthreads(); - PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, - split_indices_block_size_data_partition, thread_to_left_offset_cnt); + ReduceSum(thread_to_left_offset_cnt, (split_indices_block_size_data_partition << 1)); + __syncthreads(); + if (threadIdx.x == 0) { + const data_size_t num_data_in_block = (((blockIdx.x + 1) * blockDim.x * 8) <= num_data_in_leaf) ? + static_cast(blockDim.x * 8) : + (num_data_in_leaf - static_cast(blockIdx.x * blockDim.x * 8)); + if (num_data_in_block > 0) { + const data_size_t data_to_left = static_cast(thread_to_left_offset_cnt[0]); + block_to_left_offset_buffer[blockIdx.x + 1] = data_to_left; + block_to_right_offset_buffer[blockIdx.x + 1] = num_data_in_block - data_to_left; + } else { + block_to_left_offset_buffer[blockIdx.x + 1] = 0; + block_to_right_offset_buffer[blockIdx.x + 1] = 0; + } + } } -// missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 0, min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel14(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, +// min_bin_ref == max_bin_ref +template +__global__ void GenDataToLeftBitVectorKernel16_2(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, + const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, uint8_t* cuda_data_to_left, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, const int split_indices_block_size_data_partition, int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; + if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("********************************************** calling GenDataToLeftBitVectorKernel16_2 **********************************************\n"); + } + __shared__ uint16_t thread_to_left_offset_cnt[(SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION << 1) + 1 + + ((SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION << 1) + 1) / NUM_BANKS_DATA_PARTITION]; const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; - const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + uint8_t bit0 = 0; + uint8_t bit1 = 0; + uint8_t bit2 = 0; + uint8_t bit3 = 0; + uint8_t bit4 = 0; + uint8_t bit5 = 0; + uint8_t bit6 = 0; + uint8_t bit7 = 0; + unsigned int local_data_index = ((blockIdx.x * blockDim.x) << 3) + (threadIdx.x << 2); if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = 
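Each of these bit-vector kernels produces the same two outputs: a 0/1 left mark per row (cuda_data_to_left) and, per thread block, the number of rows going left (block_to_left_offset_buffer), which a later prefix sum, not shown here, turns into write offsets for the actual data split. A stripped-down sketch of that contract with hypothetical names, using a shared counter and atomics instead of the unrolled bit0..bit7 accumulation and CONFLICT_FREE_INDEX reduction above:

#include <cstdint>

__global__ void mark_rows_to_left(const uint32_t* bin_values,
                                  const int* data_indices_in_leaf,
                                  int num_data_in_leaf,
                                  uint32_t threshold,
                                  uint8_t* to_left,
                                  int* block_to_left_count) {
  __shared__ int left_in_block;
  if (threadIdx.x == 0) left_in_block = 0;
  __syncthreads();
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num_data_in_leaf) {
    const uint8_t go_left = bin_values[data_indices_in_leaf[i]] <= threshold ? 1 : 0;
    to_left[i] = go_left;
    if (go_left) atomicAdd(&left_in_block, 1);
  }
  __syncthreads();
  if (threadIdx.x == 0) {
    block_to_left_count[blockIdx.x] = left_in_block;
  }
}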
static_cast(column_data[global_data_index]); - if (bin == max_bin_ref) { + if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; - } else if (bin < min_bin_ref || bin > max_bin_ref) { + bit0 = split_missing_default_to_left; + } else if (bin != max_bin_ref) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit0 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit0 = split_default_to_left; + } + } else { + if (MISSING_IS_NA && !MFB_IS_NA) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit0 = split_missing_default_to_left; + } else { + if (MAX_TO_LEFT) { + cuda_data_to_left[local_data_index] = 1; + bit0 = 1; + } else { + cuda_data_to_left[local_data_index] = 0; + bit0 = 0; + } + } + } + } else { + cuda_data_to_left[local_data_index] = 0; + } + ++local_data_index; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(column_data[global_data_index]); + if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; - } else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + bit1 = split_missing_default_to_left; + } else if (bin != max_bin_ref) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit1 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit1 = split_default_to_left; + } } else { - cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + if (MISSING_IS_NA && !MFB_IS_NA) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit1 = split_missing_default_to_left; + } else { + if (MAX_TO_LEFT) { + cuda_data_to_left[local_data_index] = 1; + bit1 = 1; + } else { + cuda_data_to_left[local_data_index] = 0; + bit1 = 0; + } + } } } else { - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + cuda_data_to_left[local_data_index] = 0; } - __syncthreads(); - PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, - split_indices_block_size_data_partition, thread_to_left_offset_cnt); -} - -// missing_is_zero = 1, missing_is_na = 1, mfb_is_zero = 1, mfb_is_na = 1, min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel15(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, - // values from feature - const uint32_t 
t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t /*split_default_to_left*/, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; - const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + ++local_data_index; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); - if (bin < min_bin_ref || bin > max_bin_ref) { + if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; - } else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + bit2 = split_missing_default_to_left; + } else if (bin != max_bin_ref) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit2 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit2 = split_default_to_left; + } } else { - cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + if (MISSING_IS_NA && !MFB_IS_NA) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit2 = split_missing_default_to_left; + } else { + if (MAX_TO_LEFT) { + cuda_data_to_left[local_data_index] = 1; + bit2 = 1; + } else { + cuda_data_to_left[local_data_index] = 0; + bit2 = 0; + } + } } } else { - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + cuda_data_to_left[local_data_index] = 0; } - __syncthreads(); - PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, - split_indices_block_size_data_partition, thread_to_left_offset_cnt); -} - -// min_bin_ref == max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition, - 
int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; - const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + ++local_data_index; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; + bit3 = split_missing_default_to_left; } else if (bin != max_bin_ref) { if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; + bit3 = split_missing_default_to_left; } else { cuda_data_to_left[local_data_index] = split_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; + bit3 = split_default_to_left; } } else { if (MISSING_IS_NA && !MFB_IS_NA) { cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - //cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; + bit3 = split_missing_default_to_left; } else { if (MAX_TO_LEFT) { cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - //cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + bit3 = 1; } else { cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - //cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + bit3 = 0; } } } } else { - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + cuda_data_to_left[local_data_index] = 0; + } + local_data_index = ((blockIdx.x * blockDim.x) << 3) + ((threadIdx.x + blockDim.x) << 2); + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(column_data[global_data_index]); + if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit4 = split_missing_default_to_left; + } else if (bin != max_bin_ref) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit4 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit4 = split_default_to_left; + } + } else { + if (MISSING_IS_NA && !MFB_IS_NA) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit4 = split_missing_default_to_left; + } else { 
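// [annotation] Final fallback of the per-row decision below: bin == max_bin_ref (the feature's
// single in-range bin, since this kernel handles the min_bin_ref == max_bin_ref case) and none
// of the missing-value cases above applied, so the row simply follows the precomputed
// MAX_TO_LEFT flag, i.e. whether max_bin_ref lies on the left side of the split threshold.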
+ if (MAX_TO_LEFT) { + cuda_data_to_left[local_data_index] = 1; + bit4 = 1; + } else { + cuda_data_to_left[local_data_index] = 0; + bit4 = 0; + } + } + } + } else { + cuda_data_to_left[local_data_index] = 0; + } + ++local_data_index; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(column_data[global_data_index]); + if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit5 = split_missing_default_to_left; + } else if (bin != max_bin_ref) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit5 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit5 = split_default_to_left; + } + } else { + if (MISSING_IS_NA && !MFB_IS_NA) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit5 = split_missing_default_to_left; + } else { + if (MAX_TO_LEFT) { + cuda_data_to_left[local_data_index] = 1; + bit5 = 1; + } else { + cuda_data_to_left[local_data_index] = 0; + bit5 = 0; + } + } + } + } else { + cuda_data_to_left[local_data_index] = 0; + } + ++local_data_index; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(column_data[global_data_index]); + if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit6 = split_missing_default_to_left; + } else if (bin != max_bin_ref) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit6 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit6 = split_default_to_left; + } + } else { + if (MISSING_IS_NA && !MFB_IS_NA) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit6 = split_missing_default_to_left; + } else { + if (MAX_TO_LEFT) { + cuda_data_to_left[local_data_index] = 1; + bit6 = 1; + } else { + cuda_data_to_left[local_data_index] = 0; + bit6 = 0; + } + } + } + } else { + cuda_data_to_left[local_data_index] = 0; + } + ++local_data_index; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(column_data[global_data_index]); + if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit7 = split_missing_default_to_left; + } else if (bin != max_bin_ref) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit7 = split_missing_default_to_left; + } else { + cuda_data_to_left[local_data_index] = split_default_to_left; + bit7 = split_default_to_left; + } + } else { + if (MISSING_IS_NA && !MFB_IS_NA) { + cuda_data_to_left[local_data_index] = split_missing_default_to_left; + bit7 = split_missing_default_to_left; + } else { + if (MAX_TO_LEFT) { + cuda_data_to_left[local_data_index] = 1; + bit7 = 1; + } else { + cuda_data_to_left[local_data_index] = 0; + bit7 = 0; + } + } + } + } else { + cuda_data_to_left[local_data_index] = 0; + } + __syncthreads(); + 
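// [annotation] Each thread above has classified up to 8 rows of the leaf: leaf-local indices
// 8*blockIdx.x*blockDim.x + 4*threadIdx.x .. +3 (bit0..bit3) and
// 8*blockIdx.x*blockDim.x + 4*(threadIdx.x + blockDim.x) .. +3 (bit4..bit7),
// so one block covers 8*blockDim.x consecutive rows. Below, the two 4-row partial counts are
// stored into the bank-conflict-padded shared array, ReduceSum accumulates them, and thread 0
// reads the block total from element 0 and writes the per-block left/right counts into
// block_to_left_offset_buffer / block_to_right_offset_buffer at index blockIdx.x + 1
// (slot 0 is apparently reserved for a later prefix sum over blocks).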
thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = bit0 + bit1 + bit2 + bit3; + thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x + blockDim.x)] = bit4 + bit5 + bit6 + bit7; + __syncthreads(); + ReduceSum(thread_to_left_offset_cnt, (split_indices_block_size_data_partition << 1)); + __syncthreads(); + if (threadIdx.x == 0) { + const data_size_t num_data_in_block = (((blockIdx.x + 1) * blockDim.x * 8) <= num_data_in_leaf) ? + static_cast(blockDim.x * 8) : + (num_data_in_leaf - static_cast(blockIdx.x * blockDim.x * 8)); + if (num_data_in_block > 0) { + const data_size_t data_to_left = static_cast(thread_to_left_offset_cnt[0]); + block_to_left_offset_buffer[blockIdx.x + 1] = data_to_left; + block_to_right_offset_buffer[blockIdx.x + 1] = num_data_in_block - data_to_left; + } else { + block_to_left_offset_buffer[blockIdx.x + 1] = 0; + block_to_right_offset_buffer[blockIdx.x + 1] = 0; + } + } +} + +#define GenBitVector_ARGS \ + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, \ + th, num_features_, \ + column_data, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, \ + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, \ + split_indices_block_size_data_partition_aligned, \ + cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index + +template +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( + const bool missing_is_zero, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_bin_to_left, + const int column_index, + const int num_blocks_final, + const int split_indices_block_size_data_partition_aligned, + const int split_feature_index, + const data_size_t leaf_data_start, + const data_size_t num_data_in_leaf, + const uint32_t th, + const uint32_t t_zero_bin, + const uint32_t most_freq_bin, + const uint32_t max_bin, + const uint32_t min_bin, + const uint8_t split_default_to_left, + const uint8_t split_missing_default_to_left, + const int left_leaf_index, + const int right_leaf_index, + const int default_leaf_index, + const int missing_default_leaf_index) { + if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const 
BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && 
!missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + } +} + +template +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsMinInner2( + const bool missing_is_zero, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_bin_to_left, + const int column_index, + const int num_blocks_final, + const int split_indices_block_size_data_partition_aligned, + const int split_feature_index, + const data_size_t leaf_data_start, + const data_size_t num_data_in_leaf, + const uint32_t th, + const uint32_t t_zero_bin, + const uint32_t most_freq_bin, + const uint32_t max_bin, + const uint32_t min_bin, + const uint8_t split_default_to_left, + const uint8_t split_missing_default_to_left, + const int 
left_leaf_index, + const int right_leaf_index, + const int default_leaf_index, + const int missing_default_leaf_index) { + int grid_dim = 0; + int block_dim = 0; + CalcBlockDim(num_data_in_leaf, &grid_dim, &block_dim); + CHECK_EQ(num_blocks_final, grid_dim); + CHECK_EQ(split_indices_block_size_data_partition_aligned, block_dim); + if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && 
mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + 
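// [annotation] These ladders exist only to turn the five runtime flags (missing_is_zero,
// missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left) into compile-time template
// arguments, so the compiler folds the missing-value branches out of each instantiated
// kernel; every extra flag doubles the branch count, hence the 2^5 = 32 cases enumerated
// here. A minimal sketch of the same dispatch idea (hypothetical names, not part of this
// patch), using the grid_dim / block_dim already computed in this launcher:
//
//   template <typename BIN_TYPE, bool MAX_TO_LEFT /*, other flags */>
//   __global__ void SomeBitVectorKernel(/* GenBitVector_ARGS parameter list */);
//
//   if (max_bin_to_left) {
//     SomeBitVectorKernel<BIN_TYPE, true><<<grid_dim, block_dim>>>(GenBitVector_ARGS);
//   } else {
//     SomeBitVectorKernel<BIN_TYPE, false><<<grid_dim, block_dim>>>(GenBitVector_ARGS);
//   }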
GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); + } +} + +template +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( + const bool missing_is_zero, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const int column_index, + const int num_blocks_final, + const int split_indices_block_size_data_partition_aligned, + const int split_feature_index, + const data_size_t leaf_data_start, + const data_size_t num_data_in_leaf, + const uint32_t th, + const uint32_t t_zero_bin, + const uint32_t most_freq_bin, + const uint32_t max_bin, + const uint32_t min_bin, + const uint8_t split_default_to_left, + const uint8_t split_missing_default_to_left, + const int left_leaf_index, + const int right_leaf_index, + const int default_leaf_index, + const int missing_default_leaf_index) { + if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + 
GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + 
GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } +} + +template +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner2( + const bool missing_is_zero, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const int column_index, + const int num_blocks_final, + const int split_indices_block_size_data_partition_aligned, + const int split_feature_index, + const data_size_t leaf_data_start, + const data_size_t num_data_in_leaf, + const uint32_t th, + const uint32_t t_zero_bin, + const uint32_t most_freq_bin, + const uint32_t max_bin, + const uint32_t min_bin, + const uint8_t split_default_to_left, + const uint8_t split_missing_default_to_left, + const int left_leaf_index, + const int right_leaf_index, + const int default_leaf_index, + const int missing_default_leaf_index) { + int grid_dim = 0; + int block_dim = 0; + CalcBlockDim(num_data_in_leaf, &grid_dim, &block_dim); + CHECK_EQ(num_blocks_final, grid_dim); + CHECK_EQ(split_indices_block_size_data_partition_aligned, block_dim); + if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { 
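// [annotation] In this "_2" launcher, CalcBlockDim (not shown in this hunk) derives grid_dim
// and block_dim from num_data_in_leaf, and the CHECK_EQ calls above assert that they equal the
// num_blocks_final / aligned block size supplied by the caller, i.e. caller and callee must
// agree on the launch shape. For reference, the explicit sizing kept in
// LaunchGenDataToLeftBitVectorKernel further below works like this (restated for readability):
//
//   const int min_num_blocks = num_data_in_leaf <= 100 ? 1 : 80;
//   const int num_blocks = std::max(min_num_blocks,
//       (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) /
//           SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION);
//   int per_block = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1;
//   int block_size_aligned = 1;
//   while (per_block > 0) { block_size_aligned <<= 1; per_block >>= 1; }  // next power of two
//   const int num_blocks_final =
//       (num_data_in_leaf + block_size_aligned - 1) / block_size_aligned;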
+ const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && 
!mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); + } +} + +#undef GenBitVector_ARGS + +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, + const int split_feature_index, const uint32_t split_threshold, + const uint8_t split_default_left, const data_size_t leaf_data_start, + const int left_leaf_index, const int right_leaf_index) { + const int min_num_blocks = num_data_in_leaf <= 100 ? 
1 : 80; + const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; + int split_indices_block_size_data_partition_aligned = 1; + while (split_indices_block_size_data_partition > 0) { + split_indices_block_size_data_partition_aligned <<= 1; + split_indices_block_size_data_partition >>= 1; + } + const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; + const uint8_t missing_is_zero = feature_missing_is_zero_[split_feature_index]; + const uint8_t missing_is_na = feature_missing_is_na_[split_feature_index]; + const uint8_t mfb_is_zero = feature_mfb_is_zero_[split_feature_index]; + const uint8_t mfb_is_na = feature_mfb_is_na_[split_feature_index]; + const uint32_t default_bin = feature_default_bins_[split_feature_index]; + const uint32_t most_freq_bin = feature_most_freq_bins_[split_feature_index]; + const uint32_t min_bin = feature_min_bins_[split_feature_index]; + const uint32_t max_bin = feature_max_bins_[split_feature_index]; + + uint32_t th = split_threshold + min_bin; + uint32_t t_zero_bin = min_bin + default_bin; + if (most_freq_bin == 0) { + --th; + --t_zero_bin; + } + uint8_t split_default_to_left = 0; + uint8_t split_missing_default_to_left = 0; + int default_leaf_index = right_leaf_index; + int missing_default_leaf_index = right_leaf_index; + if (most_freq_bin <= split_threshold) { + split_default_to_left = 1; + default_leaf_index = left_leaf_index; + } + if (missing_is_zero || missing_is_na) { + if (split_default_left) { + split_missing_default_to_left = 1; + missing_default_leaf_index = left_leaf_index; + } + } + const int column_index = feature_index_to_column_index_[split_feature_index]; + const uint8_t bit_type = column_bit_type_[column_index]; + + const bool max_bin_to_left = (max_bin <= th); + + if (min_bin < max_bin) { + if (bit_type == 8) { + LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + column_index, + num_blocks_final, + split_indices_block_size_data_partition_aligned, + split_feature_index, + leaf_data_start, + num_data_in_leaf, + th, + t_zero_bin, + most_freq_bin, + max_bin, + min_bin, + split_default_to_left, + split_missing_default_to_left, + left_leaf_index, + right_leaf_index, + default_leaf_index, + missing_default_leaf_index); + } else if (bit_type == 16) { + LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + column_index, + num_blocks_final, + split_indices_block_size_data_partition_aligned, + split_feature_index, + leaf_data_start, + num_data_in_leaf, + th, + t_zero_bin, + most_freq_bin, + max_bin, + min_bin, + split_default_to_left, + split_missing_default_to_left, + left_leaf_index, + right_leaf_index, + default_leaf_index, + missing_default_leaf_index); + } else if (bit_type == 32) { + LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + column_index, + num_blocks_final, + split_indices_block_size_data_partition_aligned, + split_feature_index, + leaf_data_start, + num_data_in_leaf, + th, + t_zero_bin, + most_freq_bin, + max_bin, + min_bin, + split_default_to_left, + split_missing_default_to_left, + left_leaf_index, + right_leaf_index, + default_leaf_index, + 
missing_default_leaf_index); + } + } else { + if (bit_type == 8) { + LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left, + column_index, + num_blocks_final, + split_indices_block_size_data_partition_aligned, + split_feature_index, + leaf_data_start, + num_data_in_leaf, + th, + t_zero_bin, + most_freq_bin, + max_bin, + min_bin, + split_default_to_left, + split_missing_default_to_left, + left_leaf_index, + right_leaf_index, + default_leaf_index, + missing_default_leaf_index); + } else if (bit_type == 16) { + LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left, + column_index, + num_blocks_final, + split_indices_block_size_data_partition_aligned, + split_feature_index, + leaf_data_start, + num_data_in_leaf, + th, + t_zero_bin, + most_freq_bin, + max_bin, + min_bin, + split_default_to_left, + split_missing_default_to_left, + left_leaf_index, + right_leaf_index, + default_leaf_index, + missing_default_leaf_index); + } else if (bit_type == 32) { + LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left, + column_index, + num_blocks_final, + split_indices_block_size_data_partition_aligned, + split_feature_index, + leaf_data_start, + num_data_in_leaf, + th, + t_zero_bin, + most_freq_bin, + max_bin, + min_bin, + split_default_to_left, + split_missing_default_to_left, + left_leaf_index, + right_leaf_index, + default_leaf_index, + missing_default_leaf_index); + } + } + + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, + cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, + left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index, + static_cast(missing_is_zero), + static_cast(missing_is_na), + static_cast(mfb_is_zero), + static_cast(mfb_is_na), + max_bin_to_left, + num_blocks_final, + split_indices_block_size_data_partition_aligned); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, + cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, + left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index, + static_cast(missing_is_zero), + static_cast(missing_is_na), + static_cast(mfb_is_zero), + static_cast(mfb_is_na), + max_bin_to_left, + num_blocks_final, + split_indices_block_size_data_partition_aligned); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, + cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, + left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index, + static_cast(missing_is_zero), + static_cast(missing_is_na), + static_cast(mfb_is_zero), + static_cast(mfb_is_na), + max_bin_to_left, + num_blocks_final, + split_indices_block_size_data_partition_aligned); } - __syncthreads(); - PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, - split_indices_block_size_data_partition, 
thread_to_left_offset_cnt); } -#define GenBitVector_ARGS \ - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, \ - th, num_features_, \ - column_data, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, \ - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, \ - split_indices_block_size_data_partition_aligned, \ - cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index - -void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, const uint8_t split_default_left, const data_size_t leaf_data_start, const int left_leaf_index, const int right_leaf_index) { - const int min_num_blocks = num_data_in_leaf <= 100 ? 1 : 80; - const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); - int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; - int split_indices_block_size_data_partition_aligned = 1; - while (split_indices_block_size_data_partition > 0) { - split_indices_block_size_data_partition_aligned <<= 1; - split_indices_block_size_data_partition >>= 1; - } - const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; + int grid_dim = 0; + int block_dim = 0; + CalcBlockDim(num_data_in_leaf, &grid_dim, &block_dim); const uint8_t missing_is_zero = feature_missing_is_zero_[split_feature_index]; const uint8_t missing_is_na = feature_missing_is_na_[split_feature_index]; const uint8_t mfb_is_zero = feature_mfb_is_zero_[split_feature_index]; @@ -1242,280 +2055,155 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num const bool max_bin_to_left = (max_bin <= th); if (min_bin < max_bin) { - if (!missing_is_zero && !missing_is_na) { - if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_1_2_3<<>>(GenBitVector_ARGS); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_1_2_3<<>>(GenBitVector_ARGS); - } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_1_2_3<<>>(GenBitVector_ARGS); - } else { - Log::Fatal("Unknown bit type %d", bit_type); - } - } else { - if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel4<<>>(GenBitVector_ARGS); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel4<<>>(GenBitVector_ARGS); - } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel4<<>>(GenBitVector_ARGS); - } else { - Log::Fatal("Unknown bit type %d", bit_type); - } - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - if (bit_type == 8) { - const uint8_t* column_data = 
reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel5<<>>(GenBitVector_ARGS); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel5<<>>(GenBitVector_ARGS); - } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel5<<>>(GenBitVector_ARGS); - } else { - Log::Fatal("Unknown bit type %d", bit_type); - } - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel6<<>>(GenBitVector_ARGS); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel6<<>>(GenBitVector_ARGS); - } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel6<<>>(GenBitVector_ARGS); - } else { - Log::Fatal("Unknown bit type %d", bit_type); - } - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel7<<>>(GenBitVector_ARGS); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel7<<>>(GenBitVector_ARGS); - } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel7<<>>(GenBitVector_ARGS); - } else { - Log::Fatal("Unknown bit type %d", bit_type); - } - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { - if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel8<<>>(GenBitVector_ARGS); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel8<<>>(GenBitVector_ARGS); - } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel8<<>>(GenBitVector_ARGS); - } else { - Log::Fatal("Unknown bit type %d", bit_type); - } - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel9<<>>(GenBitVector_ARGS); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel9<<>>(GenBitVector_ARGS); - } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel9<<>>(GenBitVector_ARGS); - } else { - Log::Fatal("Unknown bit type %d", bit_type); - } - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel10<<>>(GenBitVector_ARGS); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel10<<>>(GenBitVector_ARGS); - } else if 
(bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel10<<>>(GenBitVector_ARGS); - } else { - Log::Fatal("Unknown bit type %d", bit_type); - } - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel11<<>>(GenBitVector_ARGS); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel11<<>>(GenBitVector_ARGS); - } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel11<<>>(GenBitVector_ARGS); - } else { - Log::Fatal("Unknown bit type %d", bit_type); - } - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel12<<>>(GenBitVector_ARGS); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel12<<>>(GenBitVector_ARGS); - } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel12<<>>(GenBitVector_ARGS); - } else { - Log::Fatal("Unknown bit type %d", bit_type); - } - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel13<<>>(GenBitVector_ARGS); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel13<<>>(GenBitVector_ARGS); - } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel13<<>>(GenBitVector_ARGS); - } else { - Log::Fatal("Unknown bit type %d", bit_type); - } - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel14<<>>(GenBitVector_ARGS); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel14<<>>(GenBitVector_ARGS); - } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel14<<>>(GenBitVector_ARGS); - } else { - Log::Fatal("Unknown bit type %d", bit_type); - } - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel15<<>>(GenBitVector_ARGS); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel15<<>>(GenBitVector_ARGS); - } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel15<<>>(GenBitVector_ARGS); - } else { - Log::Fatal("Unknown bit type %d", bit_type); - } - } + if (bit_type == 8) { + 
LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner2( + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + column_index, + grid_dim, + block_dim, + split_feature_index, + leaf_data_start, + num_data_in_leaf, + th, + t_zero_bin, + most_freq_bin, + max_bin, + min_bin, + split_default_to_left, + split_missing_default_to_left, + left_leaf_index, + right_leaf_index, + default_leaf_index, + missing_default_leaf_index); + } else if (bit_type == 16) { + LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner2( + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + column_index, + grid_dim, + block_dim, + split_feature_index, + leaf_data_start, + num_data_in_leaf, + th, + t_zero_bin, + most_freq_bin, + max_bin, + min_bin, + split_default_to_left, + split_missing_default_to_left, + left_leaf_index, + right_leaf_index, + default_leaf_index, + missing_default_leaf_index); + } else if (bit_type == 32) { + LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner2( + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + column_index, + grid_dim, + block_dim, + split_feature_index, + leaf_data_start, + num_data_in_leaf, + th, + t_zero_bin, + most_freq_bin, + max_bin, + min_bin, + split_default_to_left, + split_missing_default_to_left, + left_leaf_index, + right_leaf_index, + default_leaf_index, + missing_default_leaf_index); } } else { if (bit_type == 8) { - if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - 
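// The switch-over visible here replaces the long chain of hand-written
// GenDataToLeftBitVectorKernel variants with templated inner launchers that are
// specialized by bin width and by the missing-value flags. The template
// parameter lists are not visible in this patch text, so the following is a
// hypothetical sketch of the usual flag-to-template dispatch pattern, not the
// actual signatures:
template <typename BIN_T, bool MISSING_IS_ZERO, bool MISSING_IS_NA>
void LaunchSpecializedGenBitVectorSketch() {
  // the <<<grid, block>>> launch of one compile-time specialization goes here
}

template <typename BIN_T>
void DispatchByMissingFlagsSketch(bool missing_is_zero, bool missing_is_na) {
  if (missing_is_zero && missing_is_na) {
    LaunchSpecializedGenBitVectorSketch<BIN_T, true, true>();
  } else if (missing_is_zero) {
    LaunchSpecializedGenBitVectorSketch<BIN_T, true, false>();
  } else if (missing_is_na) {
    LaunchSpecializedGenBitVectorSketch<BIN_T, false, true>();
  } else {
    LaunchSpecializedGenBitVectorSketch<BIN_T, false, false>();
  }
}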
} else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - 
GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); - } + LaunchGenDataToLeftBitVectorKernelMaxIsMinInner2( + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left, + column_index, + grid_dim, + block_dim, + split_feature_index, + leaf_data_start, + num_data_in_leaf, + th, + t_zero_bin, + most_freq_bin, + max_bin, + min_bin, + split_default_to_left, + split_missing_default_to_left, + left_leaf_index, + right_leaf_index, + default_leaf_index, + missing_default_leaf_index); + } else if (bit_type == 16) { + LaunchGenDataToLeftBitVectorKernelMaxIsMinInner2( + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left, + column_index, + grid_dim, + block_dim, + split_feature_index, + leaf_data_start, + num_data_in_leaf, + th, + t_zero_bin, + most_freq_bin, + max_bin, + min_bin, + split_default_to_left, + split_missing_default_to_left, + left_leaf_index, + right_leaf_index, + default_leaf_index, + missing_default_leaf_index); + } else if (bit_type == 32) { + LaunchGenDataToLeftBitVectorKernelMaxIsMinInner2( + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left, + column_index, + grid_dim, + block_dim, + split_feature_index, + leaf_data_start, + num_data_in_leaf, + th, + t_zero_bin, + most_freq_bin, + max_bin, + min_bin, + split_default_to_left, + split_missing_default_to_left, + left_leaf_index, + right_leaf_index, + default_leaf_index, + missing_default_leaf_index); } } + int grid_dim_copy = 0; + int block_dim_copy = 0; + CalcBlockDimInCopy(num_data_in_leaf, &grid_dim_copy, &block_dim_copy); if (bit_type == 8) { const uint8_t* column_data 
= reinterpret_cast(cuda_data_by_column_[column_index]); LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, @@ -1526,8 +2214,8 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num static_cast(mfb_is_zero), static_cast(mfb_is_na), max_bin_to_left, - num_blocks_final, - split_indices_block_size_data_partition_aligned); + grid_dim_copy, + block_dim_copy); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, @@ -1538,8 +2226,8 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num static_cast(mfb_is_zero), static_cast(mfb_is_na), max_bin_to_left, - num_blocks_final, - split_indices_block_size_data_partition_aligned); + grid_dim_copy, + block_dim_copy); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, @@ -1550,11 +2238,9 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num static_cast(mfb_is_zero), static_cast(mfb_is_na), max_bin_to_left, - num_blocks_final, - split_indices_block_size_data_partition_aligned); + grid_dim_copy, + block_dim_copy); } - - //SynchronizeCUDADevice(); } __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* block_to_left_offset_buffer, @@ -2010,10 +2696,12 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ const data_size_t* cuda_data_indices, const uint8_t* split_to_left_bit_vector, const data_size_t* block_to_left_offset_buffer, const data_size_t* block_to_right_offset_buffer, data_size_t* out_data_indices_in_leaf, const int split_indices_block_size_data_partition) { - __shared__ uint8_t thread_split_to_left_bit_vector[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; + //__shared__ uint8_t thread_split_to_left_bit_vector[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; __shared__ uint16_t thread_to_left_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; __shared__ uint16_t thread_to_right_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; + uint8_t first_to_left = 0; + uint8_t second_to_left = 0; const int leaf_index_ref = *leaf_index; const data_size_t leaf_num_data_offset = cuda_leaf_data_start[leaf_index_ref]; const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[leaf_index_ref] + cuda_leaf_num_data[(*cuda_cur_num_leaves) - 1]; @@ -2024,20 +2712,20 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ const data_size_t* cuda_data_indices_in_leaf = cuda_data_indices + leaf_num_data_offset; if (global_thread_index < num_data_in_leaf_ref) { const uint8_t bit = split_to_left_bit_vector[global_thread_index]; - thread_split_to_left_bit_vector[threadIdx_x] = bit; + first_to_left = bit; thread_to_left_pos[conflict_free_threadIdx_x_plus_1] = bit; } else { - thread_split_to_left_bit_vector[threadIdx_x] = 0; + first_to_left = 0; thread_to_left_pos[conflict_free_threadIdx_x_plus_1] = 0; } const unsigned int conflict_free_threadIdx_x_plus_blockDim_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + blockDim_x + 1); const unsigned int global_thread_index_plus_blockDim_x = global_thread_index + blockDim_x; if (global_thread_index_plus_blockDim_x < num_data_in_leaf_ref) { const uint8_t bit = split_to_left_bit_vector[global_thread_index_plus_blockDim_x]; - 
thread_split_to_left_bit_vector[threadIdx_x + blockDim_x] = bit; + second_to_left = bit; thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] = bit; } else { - thread_split_to_left_bit_vector[threadIdx_x + blockDim_x] = 0; + second_to_left = 0; thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] = 0; } __syncthreads(); @@ -2058,14 +2746,14 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ data_size_t* left_out_data_indices_in_leaf = out_data_indices_in_leaf + to_left_block_offset; data_size_t* right_out_data_indices_in_leaf = out_data_indices_in_leaf + to_right_block_offset; if (global_thread_index < num_data_in_leaf_ref) { - if (thread_split_to_left_bit_vector[threadIdx_x] == 1) { + if (first_to_left == 1) { left_out_data_indices_in_leaf[thread_to_left_pos[conflict_free_threadIdx_x_plus_1]] = cuda_data_indices_in_leaf[global_thread_index]; } else { right_out_data_indices_in_leaf[thread_to_right_pos[threadIdx_x]] = cuda_data_indices_in_leaf[global_thread_index]; } } if (global_thread_index_plus_blockDim_x < num_data_in_leaf_ref) { - if (thread_split_to_left_bit_vector[threadIdx_x + blockDim_x] == 1) { + if (second_to_left == 1) { left_out_data_indices_in_leaf[thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x]; } else { right_out_data_indices_in_leaf[thread_to_right_pos[threadIdx_x + blockDim_x]] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x]; @@ -2073,6 +2761,124 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ } } +__global__ void SplitInnerKernel2(const int* leaf_index, const int* cuda_cur_num_leaves, + const data_size_t* cuda_leaf_data_start, const data_size_t* cuda_leaf_num_data, + const data_size_t* cuda_data_indices, const uint8_t* split_to_left_bit_vector, + const data_size_t* block_to_left_offset_buffer, const data_size_t* block_to_right_offset_buffer, + data_size_t* out_data_indices_in_leaf, const int split_indices_block_size_data_partition) { + __shared__ uint16_t thread_to_left_pos[(SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION << 1) + 1 + + ((SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION << 1) + 2) / NUM_BANKS_DATA_PARTITION]; + __shared__ uint16_t thread_to_right_pos[(SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION << 1)]; + const int leaf_index_ref = *leaf_index; + const data_size_t leaf_num_data_offset = cuda_leaf_data_start[leaf_index_ref]; + const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[leaf_index_ref] + cuda_leaf_num_data[(*cuda_cur_num_leaves) - 1]; + const unsigned int threadIdx_x = threadIdx.x; + const unsigned int blockDim_x = blockDim.x; + const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); + const unsigned int global_thread_index = blockIdx.x * blockDim_x * 2 + threadIdx_x; + const data_size_t* cuda_data_indices_in_leaf = cuda_data_indices + leaf_num_data_offset; + const uint32_t* split_to_left_bit_vector_uint32 = reinterpret_cast(split_to_left_bit_vector); + const uint32_t bit32_0 = split_to_left_bit_vector_uint32[global_thread_index]; + const uint8_t bit_0 = static_cast(bit32_0 & 0xf); + uint8_t bit_1 = static_cast((bit32_0 >> 8) & 0xf); + uint8_t bit_2 = static_cast((bit32_0 >> 16) & 0xf); + uint8_t bit_3 = static_cast((bit32_0 >> 24) & 0xf); + const uint8_t bit_1_acc = bit_1 + bit_0; + const uint8_t bit_2_acc = bit_1_acc + bit_2; + const uint8_t bit_3_acc = bit_2_acc + bit_3; + 
thread_to_left_pos[conflict_free_threadIdx_x_plus_1] = bit_3_acc; + const unsigned int conflict_free_threadIdx_x_plus_blockDim_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + blockDim_x + 1); + const unsigned int global_thread_index_plus_blockDim_x = global_thread_index + blockDim_x; + const uint32_t bit32_1 = split_to_left_bit_vector_uint32[global_thread_index_plus_blockDim_x]; + const uint8_t bit_4 = static_cast(bit32_1 & 0xf); + uint8_t bit_5 = static_cast((bit32_1 >> 8) & 0xf); + uint8_t bit_6 = static_cast((bit32_1 >> 16) & 0xf); + uint8_t bit_7 = static_cast((bit32_1 >> 24) & 0xf); + const uint8_t bit_5_acc = bit_4 + bit_5; + const uint8_t bit_6_acc = bit_5_acc + bit_6; + const uint8_t bit_7_acc = bit_6_acc + bit_7; + thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] = bit_7_acc; + __syncthreads(); + const uint32_t to_right_block_offset = block_to_right_offset_buffer[blockIdx.x]; + const uint32_t to_left_block_offset = block_to_left_offset_buffer[blockIdx.x]; + if (threadIdx_x == 0) { + thread_to_left_pos[0] = 0; + thread_to_right_pos[0] = 0; + } + __syncthreads(); + PrefixSum(thread_to_left_pos, (split_indices_block_size_data_partition << 1)); + __syncthreads(); + if (threadIdx_x > 0) { + thread_to_right_pos[threadIdx_x] = ((threadIdx_x * 4) - thread_to_left_pos[conflict_free_threadIdx_x_plus_1]); + } + thread_to_right_pos[threadIdx_x + blockDim_x] = (((threadIdx_x + blockDim_x) * 4) - thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]); + __syncthreads(); + data_size_t* left_out_data_indices_in_leaf = out_data_indices_in_leaf + to_left_block_offset; + data_size_t* right_out_data_indices_in_leaf = out_data_indices_in_leaf + to_right_block_offset; + const data_size_t global_thread_index_base = global_thread_index * 4; + const data_size_t global_thread_index_plus_blockDim_x_base = global_thread_index_plus_blockDim_x * 4; + const uint16_t to_left_pos_offset_0 = thread_to_left_pos[conflict_free_threadIdx_x_plus_1]; + const uint16_t to_right_pos_offset_0 = thread_to_right_pos[threadIdx_x]; + const uint16_t to_left_pos_offset_1 = thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]; + const uint16_t to_right_pos_offset_1 = thread_to_right_pos[threadIdx_x + blockDim_x]; + if (global_thread_index_base < num_data_in_leaf_ref) { + if (bit_0 == 1) { + left_out_data_indices_in_leaf[to_left_pos_offset_0] = cuda_data_indices_in_leaf[global_thread_index_base]; + } else { + right_out_data_indices_in_leaf[to_right_pos_offset_0] = cuda_data_indices_in_leaf[global_thread_index_base]; + } + } + if (global_thread_index_base + 1 < num_data_in_leaf_ref) { + if (bit_1 == 1) { + left_out_data_indices_in_leaf[to_left_pos_offset_0 + bit_0] = cuda_data_indices_in_leaf[global_thread_index_base + 1]; + } else { + right_out_data_indices_in_leaf[to_right_pos_offset_0 + 1 - bit_0] = cuda_data_indices_in_leaf[global_thread_index_base + 1]; + } + } + if (global_thread_index_base + 2 < num_data_in_leaf_ref) { + if (bit_2 == 1) { + left_out_data_indices_in_leaf[to_left_pos_offset_0 + bit_1_acc] = cuda_data_indices_in_leaf[global_thread_index_base + 2]; + } else { + right_out_data_indices_in_leaf[to_right_pos_offset_0 + 2 - bit_1_acc] = cuda_data_indices_in_leaf[global_thread_index_base + 2]; + } + } + if (global_thread_index_base + 3 < num_data_in_leaf_ref) { + if (bit_3 == 1) { + left_out_data_indices_in_leaf[to_left_pos_offset_0 + bit_2_acc] = cuda_data_indices_in_leaf[global_thread_index_base + 3]; + } else { + right_out_data_indices_in_leaf[to_right_pos_offset_0 + 3 
- bit_2_acc] = cuda_data_indices_in_leaf[global_thread_index_base + 3]; + } + } + if (global_thread_index_plus_blockDim_x_base < num_data_in_leaf_ref) { + if (bit_4 == 1) { + left_out_data_indices_in_leaf[to_left_pos_offset_1] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base]; + } else { + right_out_data_indices_in_leaf[to_right_pos_offset_1] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base]; + } + } + if (global_thread_index_plus_blockDim_x_base + 1 < num_data_in_leaf_ref) { + if (bit_5 == 1) { + left_out_data_indices_in_leaf[to_left_pos_offset_1 + bit_4] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base + 1]; + } else { + right_out_data_indices_in_leaf[to_right_pos_offset_1 + 1 - bit_4] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base + 1]; + } + } + if (global_thread_index_plus_blockDim_x_base + 2 < num_data_in_leaf_ref) { + if (bit_6 == 1) { + left_out_data_indices_in_leaf[to_left_pos_offset_1 + bit_5_acc] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base + 2]; + } else { + right_out_data_indices_in_leaf[to_right_pos_offset_1 + 2 - bit_5_acc] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base + 2]; + } + } + if (global_thread_index_plus_blockDim_x_base + 3 < num_data_in_leaf_ref) { + if (bit_7 == 1) { + left_out_data_indices_in_leaf[to_left_pos_offset_1 + bit_6_acc] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base + 3]; + } else { + right_out_data_indices_in_leaf[to_right_pos_offset_1 + 3 - bit_6_acc] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base + 3]; + } + } +} + __global__ void CopyDataIndicesKernel( const data_size_t num_data_in_leaf, const data_size_t* out_data_indices_in_leaf, @@ -2146,7 +2952,7 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data cuda_num_total_bin_, cuda_hist_, cuda_hist_pool_, - + tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, @@ -2249,6 +3055,168 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data *larger_leaf_index = cpu_split_info_buffer[7]; } +void CUDADataPartition::LaunchSplitInnerKernel2(const int* leaf_index, const data_size_t num_data_in_leaf, + const int* best_split_feature, const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left, const double* best_split_gain, + const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, + const double* best_left_gain, const double* best_left_leaf_value, + const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, + const double* best_right_gain, const double* best_right_leaf_value, uint8_t* best_split_found, + // for leaf splits information update + int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, + double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, + double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, + const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** smaller_leaf_cuda_hist_pointer_pointer, + int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, + double* 
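// SplitInnerKernel2 above packs four consecutive 0/1 bit-vector bytes into one
// uint32_t load per thread and turns their running sums into local output
// offsets. A host-side illustration of that per-thread bookkeeping (the real
// kernel also runs a bank-conflict-padded shared-memory prefix sum across
// threads to obtain the left/right base positions, which are taken as given
// here):
#include <cstdint>
void ScatterFourRowsSketch(uint32_t packed_bits, int left_base, int right_base,
                           const int row_index[4], int* left_out, int* right_out) {
  const uint8_t bits[4] = {
      static_cast<uint8_t>(packed_bits & 0xff),
      static_cast<uint8_t>((packed_bits >> 8) & 0xff),
      static_cast<uint8_t>((packed_bits >> 16) & 0xff),
      static_cast<uint8_t>((packed_bits >> 24) & 0xff)};
  int left_seen = 0;  // rows among the four already sent to the left child
  for (int k = 0; k < 4; ++k) {
    if (bits[k] == 1) {
      left_out[left_base + left_seen] = row_index[k];
      ++left_seen;
    } else {
      right_out[right_base + (k - left_seen)] = row_index[k];
    }
  }
}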
larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, + double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** larger_leaf_cuda_hist_pointer_pointer, + std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + std::vector* cpu_leaf_sum_hessians, + int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index) { + int block_dim = 0; + int grid_dim = 0; + CalcBlockDim(num_data_in_leaf, &grid_dim, &block_dim); + int grid_dim_ref = grid_dim - 1; + int grid_dim_aligned = 1; + while (grid_dim_ref > 0) { + grid_dim_aligned <<= 1; + grid_dim_ref >>= 1; + } + global_timer.Start("CUDADataPartition::AggregateBlockOffsetKernel"); + + if (grid_dim > AGGREGATE_BLOCK_SIZE) { + AggregateBlockOffsetKernel2<<<1, AGGREGATE_BLOCK_SIZE, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, + cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, + cuda_leaf_num_data_, cuda_data_indices_, + cuda_cur_num_leaves_, + best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, + best_left_sum_gradients, best_left_sum_hessians, best_left_count, + best_left_gain, best_left_leaf_value, + best_right_sum_gradients, best_right_sum_hessians, best_right_count, + best_right_gain, best_right_leaf_value, + + smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, + smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, + smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, + smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + smaller_leaf_cuda_hist_pointer_pointer, + larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, + larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, + larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, + larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + larger_leaf_cuda_hist_pointer_pointer, + cuda_num_total_bin_, + cuda_hist_, + cuda_hist_pool_, + + tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, + tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, + tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, + data_partition_leaf_output_, grid_dim); + } else { + AggregateBlockOffsetKernel3<<<1, grid_dim_aligned, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, + cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, + cuda_leaf_num_data_, cuda_data_indices_, + cuda_cur_num_leaves_, + best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, + best_left_sum_gradients, best_left_sum_hessians, best_left_count, + best_left_gain, best_left_leaf_value, + best_right_sum_gradients, best_right_sum_hessians, best_right_count, + best_right_gain, best_right_leaf_value, + + smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, + smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, + smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, + smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + smaller_leaf_cuda_hist_pointer_pointer, + larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, + larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, + 
larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, + larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + larger_leaf_cuda_hist_pointer_pointer, + cuda_num_total_bin_, + cuda_hist_, + cuda_hist_pool_, + + tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, + tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, + tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, + data_partition_leaf_output_, grid_dim, grid_dim_aligned); + } + SynchronizeCUDADevice(); + global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); + global_timer.Start("CUDADataPartition::SplitInnerKernel"); + + SplitInnerKernel2<<>>( + leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, + cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, + cuda_out_data_indices_in_leaf_, block_dim); + //SynchronizeCUDADevice(); + global_timer.Stop("CUDADataPartition::SplitInnerKernel"); + + global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); + SplitTreeStructureKernel<<<4, 6, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, + cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, + cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, + cuda_cur_num_leaves_, + best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, + best_left_sum_gradients, best_left_sum_hessians, best_left_count, + best_left_gain, best_left_leaf_value, + best_right_sum_gradients, best_right_sum_hessians, best_right_count, + best_right_gain, best_right_leaf_value, best_split_found, + + smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, + smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, + smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, + smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + smaller_leaf_cuda_hist_pointer_pointer, + larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, + larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, + larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, + larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + larger_leaf_cuda_hist_pointer_pointer, + cuda_num_total_bin_, + cuda_hist_, + cuda_hist_pool_, block_dim, + + tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, + tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, + tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, + data_partition_leaf_output_, cuda_split_info_buffer_); + //SynchronizeCUDADevice(); + global_timer.Stop("CUDADataPartition::SplitTreeStructureKernel"); + std::vector cpu_split_info_buffer(12); + const double* cpu_sum_hessians_info = reinterpret_cast(cpu_split_info_buffer.data() + 8); + global_timer.Start("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); + CopyFromCUDADeviceToHostAsync(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 12, cuda_streams_[0]); + global_timer.Stop("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); + SynchronizeCUDADevice(); + const data_size_t left_leaf_num_data = cpu_split_info_buffer[1]; + const data_size_t left_leaf_data_start = cpu_split_info_buffer[2]; + const data_size_t right_leaf_num_data = cpu_split_info_buffer[4]; + global_timer.Start("CUDADataPartition::CopyDataIndicesKernel"); + int grid_dim_copy = 0; + int 
block_dim_copy = 0; + CalcBlockDimInCopy(num_data_in_leaf, &grid_dim_copy, &block_dim_copy); + CopyDataIndicesKernel<<>>( + left_leaf_num_data + right_leaf_num_data, cuda_out_data_indices_in_leaf_, cuda_data_indices_ + left_leaf_data_start); + global_timer.Stop("CUDADataPartition::CopyDataIndicesKernel"); + const int left_leaf_index = cpu_split_info_buffer[0]; + const int right_leaf_index = cpu_split_info_buffer[3]; + const data_size_t right_leaf_data_start = cpu_split_info_buffer[5]; + (*cpu_leaf_num_data)[left_leaf_index] = left_leaf_num_data; + (*cpu_leaf_data_start)[left_leaf_index] = left_leaf_data_start; + (*cpu_leaf_num_data)[right_leaf_index] = right_leaf_num_data; + (*cpu_leaf_data_start)[right_leaf_index] = right_leaf_data_start; + (*cpu_leaf_sum_hessians)[left_leaf_index] = cpu_sum_hessians_info[0]; + (*cpu_leaf_sum_hessians)[right_leaf_index] = cpu_sum_hessians_info[1]; + *smaller_leaf_index = cpu_split_info_buffer[6]; + *larger_leaf_index = cpu_split_info_buffer[7]; +} + __global__ void PrefixSumKernel(uint32_t* cuda_elements) { __shared__ uint32_t elements[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1]; const unsigned int threadIdx_x = threadIdx.x; diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index b639dd8facc2..322a5916446a 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -189,6 +189,14 @@ class CUDADataPartition { const uint8_t* tree_default_left() const { return tree_default_left_; } private: + void CalcBlockDim(const data_size_t num_data_in_leaf, + int* grid_dim, + int* block_dim); + + void CalcBlockDimInCopy(const data_size_t num_data_in_leaf, + int* grid_dim, + int* block_dim); + void CopyColWiseData(const Dataset* train_data); void GenDataToLeftBitVector(const data_size_t num_data_in_leaf, @@ -243,11 +251,136 @@ class CUDADataPartition { std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index); + void LaunchSplitInnerKernel2(const int* leaf_index, const data_size_t num_data_in_leaf, + const int* best_split_feature, const uint32_t* best_split_threshold, + const uint8_t* best_split_default_left, const double* best_split_gain, + const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, + const double* best_left_gain, const double* best_left_leaf_value, + const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, + const double* best_right_gain, const double* best_right_leaf_value, uint8_t* best_split_found, + // for leaf splits information update + int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, + double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, + double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, + const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** smaller_leaf_cuda_hist_pointer_pointer, + int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, + double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, + double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, + const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, + hist_t** larger_leaf_cuda_hist_pointer_pointer, + std::vector* 
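// The twelve-int cuda_split_info_buffer_ read back above is decoded with fixed
// offsets; the layout below is inferred from those reads (the writer side is
// SplitTreeStructureKernel) and assumes 4-byte int, so the last four slots
// alias two doubles holding the left/right hessian sums:
#include <cstring>
struct SplitInfoHostSketch {
  int left_leaf_index, left_num_data, left_data_start;
  int right_leaf_index, right_num_data, right_data_start;
  int smaller_leaf_index, larger_leaf_index;
  double left_sum_hessians, right_sum_hessians;
};

SplitInfoHostSketch DecodeSplitInfoSketch(const int* buf) {  // buf: 12 ints copied from device
  SplitInfoHostSketch info;
  info.left_leaf_index = buf[0];
  info.left_num_data = buf[1];
  info.left_data_start = buf[2];
  info.right_leaf_index = buf[3];
  info.right_num_data = buf[4];
  info.right_data_start = buf[5];
  info.smaller_leaf_index = buf[6];
  info.larger_leaf_index = buf[7];
  // memcpy instead of pointer casting to avoid strict-aliasing issues on the host
  std::memcpy(&info.left_sum_hessians, buf + 8, sizeof(double));
  std::memcpy(&info.right_sum_hessians, buf + 10, sizeof(double));
  return info;
}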
cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + std::vector* cpu_leaf_sum_hessians, + int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index); + void LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, const uint8_t split_default_left, const data_size_t leaf_data_start, const int left_leaf_index, const int right_leaf_index); + void LaunchGenDataToLeftBitVectorKernel2(const data_size_t num_data_in_leaf, + const int split_feature_index, const uint32_t split_threshold, + const uint8_t split_default_left, const data_size_t leaf_data_start, + const int left_leaf_index, const int right_leaf_index); + + template + void LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( + const bool missing_is_zero, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_bin_to_left, + const int column_index, + const int num_blocks_final, + const int split_indices_block_size_data_partition_aligned, + const int split_feature_index, + const data_size_t leaf_data_start, + const data_size_t num_data_in_leaf, + const uint32_t th, + const uint32_t t_zero_bin, + const uint32_t most_freq_bin, + const uint32_t max_bin, + const uint32_t min_bin, + const uint8_t split_default_to_left, + const uint8_t split_missing_default_to_left, + const int left_leaf_index, + const int right_leaf_index, + const int default_leaf_index, + const int missing_default_leaf_index); + + template + void LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( + const bool missing_is_zero, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const int column_index, + const int num_blocks_final, + const int split_indices_block_size_data_partition_aligned, + const int split_feature_index, + const data_size_t leaf_data_start, + const data_size_t num_data_in_leaf, + const uint32_t th, + const uint32_t t_zero_bin, + const uint32_t most_freq_bin, + const uint32_t max_bin, + const uint32_t min_bin, + const uint8_t split_default_to_left, + const uint8_t split_missing_default_to_left, + const int left_leaf_index, + const int right_leaf_index, + const int default_leaf_index, + const int missing_default_leaf_index); + + template + void LaunchGenDataToLeftBitVectorKernelMaxIsMinInner2( + const bool missing_is_zero, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_bin_to_left, + const int column_index, + const int num_blocks_final, + const int split_indices_block_size_data_partition_aligned, + const int split_feature_index, + const data_size_t leaf_data_start, + const data_size_t num_data_in_leaf, + const uint32_t th, + const uint32_t t_zero_bin, + const uint32_t most_freq_bin, + const uint32_t max_bin, + const uint32_t min_bin, + const uint8_t split_default_to_left, + const uint8_t split_missing_default_to_left, + const int left_leaf_index, + const int right_leaf_index, + const int default_leaf_index, + const int missing_default_leaf_index); + + template + void LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner2( + const bool missing_is_zero, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const int column_index, + const int num_blocks_final, + const int split_indices_block_size_data_partition_aligned, + const int split_feature_index, + const data_size_t leaf_data_start, + const data_size_t num_data_in_leaf, + const uint32_t th, + const uint32_t t_zero_bin, + const uint32_t most_freq_bin, + const uint32_t max_bin, + 
const uint32_t min_bin, + const uint8_t split_default_to_left, + const uint8_t split_missing_default_to_left, + const int left_leaf_index, + const int right_leaf_index, + const int default_leaf_index, + const int missing_default_leaf_index); + template void LaunchUpdateDataIndexToLeafIndexKernel(const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index cfe83321d51d..91e1db0a7725 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -237,10 +237,8 @@ __global__ void CUDAConstructHistogramDenseKernel2( const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; - hist_t* feature_histogram_ptr = histogram_buffer + total_num_bin * (blockIdx.y % USED_HISTOGRAM_BUFFER_NUM) * 2 + (partition_hist_start << 1); for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { shared_hist[i] = 0.0f; - //feature_histogram_ptr[i] = 0.0f; } __syncthreads(); const unsigned int threadIdx_y = threadIdx.y; @@ -268,6 +266,7 @@ __global__ void CUDAConstructHistogramDenseKernel2( } } __syncthreads(); + hist_t* feature_histogram_ptr = histogram_buffer + total_num_bin * (blockIdx.y % USED_HISTOGRAM_BUFFER_NUM) * 2 + (partition_hist_start << 1); for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); } @@ -332,7 +331,7 @@ __global__ void CUDAConstructHistogramSparseKernel2( inner_data_index += blockDim.y; } __syncthreads(); - hist_t* feature_histogram_ptr = histogram_buffer + total_num_bin * blockIdx.y * 2 + (partition_hist_start << 1); + hist_t* feature_histogram_ptr = histogram_buffer + total_num_bin * (blockIdx.y % USED_HISTOGRAM_BUFFER_NUM) * 2 + (partition_hist_start << 1); for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); } @@ -606,7 +605,6 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel2( num_data_, block_cuda_hist_buffer_, num_total_bin_); } } - SynchronizeCUDADevice(); global_timer.Stop("CUDAConstructHistogramKernel2"); const int merge_block_dim = 1024; const int num_bin_per_block = merge_block_dim / USED_HISTOGRAM_BUFFER_NUM; @@ -614,7 +612,6 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel2( global_timer.Start("MergeHistogramBufferKernel"); MergeHistogramBufferKernel<<>>( block_cuda_hist_buffer_, num_total_bin_, num_bin_per_block, cuda_leaf_hist); - SynchronizeCUDADevice(); global_timer.Stop("MergeHistogramBufferKernel"); } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 45b8c14942ec..7ea31661a2d9 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -151,7 +151,7 @@ class CUDAHistogramConstructor { std::vector need_fix_histogram_features_; std::vector need_fix_histogram_features_num_bin_aligend_; - const int min_grid_dim_y_ = 10; + const int min_grid_dim_y_ = 160; // CUDA memory, held by this object uint32_t* cuda_feature_group_bin_offsets_; From 
eb1d7fadfa60478667163074c0cae756bf7157e0 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 2 Jul 2021 06:54:58 +0000 Subject: [PATCH 035/166] add cuda tree predictor --- include/LightGBM/bin.h | 2 + src/treelearner/cuda/cuda_data_partition.cpp | 41 +++++++------ src/treelearner/cuda/cuda_data_partition.cu | 63 +++++++------------- src/treelearner/cuda/cuda_data_partition.hpp | 12 ++-- src/treelearner/cuda/cuda_tree_predictor.cpp | 36 +++++++++++ src/treelearner/cuda/cuda_tree_predictor.cu | 0 src/treelearner/cuda/cuda_tree_predictor.hpp | 62 +++++++++++++++++++ 7 files changed, 150 insertions(+), 66 deletions(-) create mode 100644 src/treelearner/cuda/cuda_tree_predictor.cpp create mode 100644 src/treelearner/cuda/cuda_tree_predictor.cu create mode 100644 src/treelearner/cuda/cuda_tree_predictor.hpp diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index dbb942373f97..06ed5f38fef3 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -198,6 +198,8 @@ class BinMapper { } } + inline const std::vector& bin_upper_bound() const { return bin_upper_bound_; } + private: /*! \brief Number of bins */ int num_bin_; diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 3efe449da60b..b56c53c37061 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -30,6 +30,8 @@ CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_f feature_missing_is_na_.resize(train_data->num_features()); feature_mfb_is_zero_.resize(train_data->num_features()); feature_mfb_is_na_.resize(train_data->num_features()); + bin_upper_bounds_.resize(train_data->num_features()); + feature_num_bins_.resize(train_data->num_features()); int cur_group = 0; uint32_t prev_group_bins = 0; for (int feature_index = 0; feature_index < num_features_; ++feature_index) { @@ -43,6 +45,8 @@ CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_f feature_most_freq_bins_[feature_index] = bin_mapper->GetMostFreqBin(); feature_min_bins_[feature_index] = train_data->feature_min_bin(feature_index); feature_max_bins_[feature_index] = train_data->feature_max_bin(feature_index); + bin_upper_bounds_[feature_index] = bin_mapper->bin_upper_bound(); + feature_num_bins_[feature_index] = bin_mapper->num_bin(); const MissingType missing_type = bin_mapper->missing_type(); if (missing_type == MissingType::None) { feature_missing_is_zero_[feature_index] = 0; @@ -72,10 +76,6 @@ CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_f num_data_in_leaf_.resize(num_leaves_, 0); num_data_in_leaf_[0] = num_data_; - /*for (size_t i = 0; i < feature_max_bins_.size(); ++i) { - Log::Warning("feature_min_bins_[%d] = %d, feature_max_bins_[%d] = %d", i, feature_min_bins_[i], i, feature_max_bins_[i]); - }*/ - train_data_ = train_data; } @@ -111,6 +111,7 @@ void CUDADataPartition::Init(const Dataset* train_data) { AllocateCUDAMemory(static_cast(num_leaves_), &tree_split_leaf_index_); AllocateCUDAMemory(static_cast(num_leaves_), &tree_inner_feature_index_); AllocateCUDAMemory(static_cast(num_leaves_), &tree_threshold_); + AllocateCUDAMemory(static_cast(num_leaves_), &tree_threshold_real_); AllocateCUDAMemory(static_cast(num_leaves_), &tree_left_output_); AllocateCUDAMemory(static_cast(num_leaves_), &tree_right_output_); AllocateCUDAMemory(static_cast(num_leaves_), &tree_left_count_); @@ -136,6 +137,21 @@ void CUDADataPartition::Init(const Dataset* train_data) { const size_t 
max_num_blocks_in_debug = static_cast((num_data_ + 1023) / 1024); AllocateCUDAMemory(max_num_blocks_in_debug, &cuda_gradients_sum_buffer_); AllocateCUDAMemory(max_num_blocks_in_debug, &cuda_hessians_sum_buffer_); + + std::vector flatten_bin_upper_bounds; + std::vector feature_num_bin_offsets; + int offset = 0; + feature_num_bin_offsets.emplace_back(offset); + for (size_t i = 0; i < bin_upper_bounds_.size(); ++i) { + CHECK_EQ(static_cast(feature_num_bins_[i]), bin_upper_bounds_[i].size()); + for (const auto value : bin_upper_bounds_[i]) { + flatten_bin_upper_bounds.emplace_back(value); + } + offset += feature_num_bins_[i]; + feature_num_bin_offsets.emplace_back(offset); + } + InitCUDAMemoryFromHostMemory(&cuda_feature_num_bin_offsets_, feature_num_bin_offsets.data(), feature_num_bin_offsets.size()); + InitCUDAMemoryFromHostMemory(&cuda_bin_upper_bounds_, flatten_bin_upper_bounds.data(), flatten_bin_upper_bounds.size()); } void CUDADataPartition::CopyColWiseData(const Dataset* train_data) { @@ -383,9 +399,6 @@ void CUDADataPartition::Split(const int* leaf_id, const uint32_t split_threshold = cpu_leaf_best_split_threshold[cpu_leaf_index]; const uint8_t split_default_left = cpu_leaf_best_split_default_left[cpu_leaf_index]; const data_size_t leaf_data_start = cpu_leaf_data_start->at(cpu_leaf_index); - //Log::Warning("real split feature index = %d", train_data_->RealFeatureIndex(split_feature_index)); - //Log::Warning("split threshold = %d", split_threshold); - //Log::Warning("split default left = %d", split_default_left); global_timer.Stop("SplitInner Copy CUDA To Host"); GenDataToLeftBitVector(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start, cpu_leaf_index, cur_max_leaf_index); global_timer.Stop("GenDataToLeftBitVector"); @@ -415,17 +428,7 @@ void CUDADataPartition::GenDataToLeftBitVector(const data_size_t num_data_in_lea const int split_feature_index, const uint32_t split_threshold, const uint8_t split_default_left, const data_size_t leaf_data_start, const int left_leaf_index, const int right_leaf_index) { - LaunchGenDataToLeftBitVectorKernel2(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start, left_leaf_index, right_leaf_index); - /*if (num_data_in_leaf == 10500000) { - std::vector cpu_bit_vector(num_data_in_leaf, 0); - CopyFromCUDADeviceToHost(cpu_bit_vector.data(), cuda_data_to_left_, num_data_in_leaf); - for (size_t i = 0; i < 100; ++i) { - Log::Warning("cpu_bit_vector[%d] = %d", i, cpu_bit_vector[i]); - } - for (size_t i = 10500000 - 100; i < 10500000; ++i) { - Log::Warning("cpu_bit_vector[%d] = %d", i, cpu_bit_vector[i]); - } - }*/ + LaunchGenDataToLeftBitVectorKernel(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start, left_leaf_index, right_leaf_index); } void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_data_in_leaf, @@ -449,7 +452,7 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index) { - LaunchSplitInnerKernel2(leaf_index, num_data_in_leaf, + LaunchSplitInnerKernel(leaf_index, num_data_in_leaf, best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, best_left_sum_gradients, best_left_sum_hessians, best_left_count, best_left_gain, best_left_leaf_value, diff --git 
a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 7b32a1eba269..ae4ac7695406 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -2265,12 +2265,7 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, hist_t** larger_leaf_cuda_hist_pointer_pointer, const int* cuda_num_total_bin, - hist_t* cuda_hist, hist_t** cuda_hist_pool, const int split_indices_block_size_data_partition, - - int* tree_split_leaf_index, int* tree_inner_feature_index, uint32_t* tree_threshold, - double* tree_left_output, double* tree_right_output, data_size_t* tree_left_count, data_size_t* tree_right_count, - double* tree_left_sum_hessian, double* tree_right_sum_hessian, double* tree_gain, uint8_t* tree_default_left, - double* data_partition_leaf_output) { + hist_t* cuda_hist, hist_t** cuda_hist_pool, const int split_indices_block_size_data_partition) { __shared__ uint32_t block_to_left_offset[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; __shared__ uint32_t block_to_right_offset[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2 + @@ -2353,8 +2348,6 @@ __global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* b if (blockIdx.x == 0 && threadIdx.x == 0) { ++(*cuda_cur_num_leaves); const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; - /*printf("leaf_index_ref = %d, cuda_cur_num_leaves = %d, cur_max_leaf_index = %d\n", - leaf_index_ref, *cuda_cur_num_leaves, cur_max_leaf_index);*/ block_to_left_offset_buffer[0] = 0; const unsigned int to_left_total_cnt = block_to_left_offset_buffer[num_blocks]; block_to_right_offset_buffer[0] = to_left_total_cnt; @@ -2390,11 +2383,7 @@ __global__ void AggregateBlockOffsetKernel2(const int* leaf_index, data_size_t* hist_t** larger_leaf_cuda_hist_pointer_pointer, const int* cuda_num_total_bin, hist_t* cuda_hist, hist_t** cuda_hist_pool, - - int* tree_split_leaf_index, int* tree_inner_feature_index, uint32_t* tree_threshold, - double* tree_left_output, double* tree_right_output, data_size_t* tree_left_count, data_size_t* tree_right_count, - double* tree_left_sum_hessian, double* tree_right_sum_hessian, double* tree_gain, uint8_t* tree_default_left, - double* data_partition_leaf_output, const data_size_t num_blocks) { + const data_size_t num_blocks) { __shared__ uint32_t block_to_left_offset[AGGREGATE_BLOCK_SIZE + 2 + (AGGREGATE_BLOCK_SIZE + 2) / NUM_BANKS_DATA_PARTITION]; __shared__ uint32_t block_to_right_offset[AGGREGATE_BLOCK_SIZE + 2 + @@ -2480,11 +2469,7 @@ __global__ void AggregateBlockOffsetKernel3(const int* leaf_index, data_size_t* hist_t** larger_leaf_cuda_hist_pointer_pointer, const int* cuda_num_total_bin, hist_t* cuda_hist, hist_t** cuda_hist_pool, - - int* tree_split_leaf_index, int* tree_inner_feature_index, uint32_t* tree_threshold, - double* tree_left_output, double* tree_right_output, data_size_t* tree_left_count, data_size_t* tree_right_count, - double* tree_left_sum_hessian, double* tree_right_sum_hessian, double* tree_gain, uint8_t* tree_default_left, - double* data_partition_leaf_output, const data_size_t num_blocks, const data_size_t num_blocks_aligned) { + const data_size_t num_blocks, const data_size_t num_blocks_aligned) { __shared__ uint32_t block_to_left_offset[AGGREGATE_BLOCK_SIZE + 2 + (AGGREGATE_BLOCK_SIZE + 2) / NUM_BANKS_DATA_PARTITION]; __shared__ uint32_t 
block_to_right_offset[AGGREGATE_BLOCK_SIZE + 2 + @@ -2550,7 +2535,8 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo const int* cuda_num_total_bin, hist_t* cuda_hist, hist_t** cuda_hist_pool, const int split_indices_block_size_data_partition, - int* tree_split_leaf_index, int* tree_inner_feature_index, uint32_t* tree_threshold, + const double* cuda_bin_upper_bounds, const int* cuda_feature_num_bin_offsets, + int* tree_split_leaf_index, int* tree_inner_feature_index, uint32_t* tree_threshold, double* tree_threshold_real, double* tree_left_output, double* tree_right_output, data_size_t* tree_left_count, data_size_t* tree_right_count, double* tree_left_sum_hessian, double* tree_right_sum_hessian, double* tree_gain, uint8_t* tree_default_left, double* data_partition_leaf_output, @@ -2607,7 +2593,12 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo best_split_found[leaf_index_ref] = 0; } else if (global_thread_index == 22) { best_split_found[cur_max_leaf_index] = 0; - } + } else if (global_thread_index == 23) { + const uint32_t threshold_int = best_split_threshold[leaf_index_ref]; + const int split_inner_feature_index = best_split_feature[leaf_index_ref]; + const double threshold_real = cuda_bin_upper_bounds[cuda_feature_num_bin_offsets[split_inner_feature_index] + threshold_int]; + tree_threshold_real[cur_max_leaf_index - 1] = threshold_real; + } if (cuda_leaf_num_data[leaf_index_ref] < cuda_leaf_num_data[cur_max_leaf_index]) { if (global_thread_index == 0) { @@ -2952,11 +2943,7 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data cuda_num_total_bin_, cuda_hist_, cuda_hist_pool_, - - tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, - tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, - tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, - data_partition_leaf_output_, num_blocks_final); + num_blocks_final); } else { AggregateBlockOffsetKernel3<<<1, num_blocks_final_aligned, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, @@ -2981,11 +2968,7 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data cuda_num_total_bin_, cuda_hist_, cuda_hist_pool_, - - tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, - tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, - tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, - data_partition_leaf_output_, num_blocks_final, num_blocks_final_aligned); + num_blocks_final, num_blocks_final_aligned); } SynchronizeCUDADevice(); global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); @@ -3023,7 +3006,9 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data cuda_hist_, cuda_hist_pool_, split_indices_block_size_data_partition_aligned, - tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, + cuda_bin_upper_bounds_, cuda_feature_num_bin_offsets_, + + tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, tree_threshold_real_, tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, data_partition_leaf_output_, cuda_split_info_buffer_); @@ -3111,11 +3096,7 @@ void CUDADataPartition::LaunchSplitInnerKernel2(const int* leaf_index, const dat 
cuda_num_total_bin_, cuda_hist_, cuda_hist_pool_, - - tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, - tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, - tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, - data_partition_leaf_output_, grid_dim); + grid_dim); } else { AggregateBlockOffsetKernel3<<<1, grid_dim_aligned, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, @@ -3140,11 +3121,7 @@ void CUDADataPartition::LaunchSplitInnerKernel2(const int* leaf_index, const dat cuda_num_total_bin_, cuda_hist_, cuda_hist_pool_, - - tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, - tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, - tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, - data_partition_leaf_output_, grid_dim, grid_dim_aligned); + grid_dim, grid_dim_aligned); } SynchronizeCUDADevice(); global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); @@ -3182,7 +3159,9 @@ void CUDADataPartition::LaunchSplitInnerKernel2(const int* leaf_index, const dat cuda_hist_, cuda_hist_pool_, block_dim, - tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, + cuda_bin_upper_bounds_, cuda_feature_num_bin_offsets_, + + tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, tree_threshold_real_, tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, data_partition_leaf_output_, cuda_split_info_buffer_); diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 322a5916446a..915b808892a2 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -112,11 +112,6 @@ class CUDADataPartition { CopyFromCUDADeviceToHost(test_leaf_num_data.data(), cuda_leaf_num_data_, static_cast(num_leaves_)); CopyFromCUDADeviceToHost(test_leaf_data_start.data(), cuda_leaf_data_start_, static_cast(num_leaves_)); CopyFromCUDADeviceToHost(test_leaf_data_end.data(), cuda_leaf_data_end_, static_cast(num_leaves_)); - /*for (int i = 0; i < num_leaves_; ++i) { - Log::Warning("test_leaf_num_data[%d] = %d", i, test_leaf_num_data[i]); - Log::Warning("test_leaf_data_start[%d] = %d", i, test_leaf_data_start[i]); - Log::Warning("test_leaf_data_end[%d] = %d", i, test_leaf_data_end[i]); - }*/ const data_size_t start_pos = test_leaf_data_start[2]; const int check_window_size = 10; for (data_size_t i = 0; i < check_window_size; ++i) { @@ -172,6 +167,8 @@ class CUDADataPartition { const uint32_t* tree_threshold() const { return tree_threshold_; } + const double* tree_threshold_real() const { return tree_threshold_real_; } + const double* tree_left_output() const { return tree_left_output_; } const double* tree_right_output() const { return tree_right_output_; } @@ -426,6 +423,8 @@ class CUDADataPartition { std::vector column_bit_type_; std::vector feature_index_to_column_index_; const Dataset* train_data_; + std::vector> bin_upper_bounds_; + std::vector feature_num_bins_; // CUDA streams std::vector cuda_streams_; @@ -458,6 +457,7 @@ class CUDADataPartition { int* tree_split_leaf_index_; int* tree_inner_feature_index_; uint32_t* tree_threshold_; + double* tree_threshold_real_; double* tree_left_output_; double* tree_right_output_; data_size_t* tree_left_count_; @@ 
-467,6 +467,8 @@ class CUDADataPartition { double* tree_gain_; uint8_t* tree_default_left_; double* data_partition_leaf_output_; + double* cuda_bin_upper_bounds_; + int* cuda_feature_num_bin_offsets_; // for debug double* cuda_gradients_sum_buffer_; double* cuda_hessians_sum_buffer_; diff --git a/src/treelearner/cuda/cuda_tree_predictor.cpp b/src/treelearner/cuda/cuda_tree_predictor.cpp new file mode 100644 index 000000000000..be95a70138c7 --- /dev/null +++ b/src/treelearner/cuda/cuda_tree_predictor.cpp @@ -0,0 +1,36 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA + +#include "cuda_tree_predictor.hpp" + +namespace LightGBM { + +CUDATreePredictor::CUDATreePredictor(const Config* config, + const int* tree_split_leaf_index, + const int* tree_inner_feature_index, + const uint32_t* tree_threshold, + const double* tree_threshold_real, + const double* tree_left_output, + const double* tree_right_output, + const data_size_t* tree_left_count, + const data_size_t* tree_right_count, + const double* tree_left_sum_hessian, + const double* tree_right_sum_hessian, + const double* tree_gain, + const uint8_t* tree_default_left, + const double* leaf_output): +tree_split_leaf_index_(tree_split_leaf_index), +tree_inner_feature_index_(tree_inner_feature_index), +tree_threshold_(tree_threshold), + { + +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_tree_predictor.cu b/src/treelearner/cuda/cuda_tree_predictor.cu new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/src/treelearner/cuda/cuda_tree_predictor.hpp b/src/treelearner/cuda/cuda_tree_predictor.hpp new file mode 100644 index 000000000000..cec5f6a4837a --- /dev/null +++ b/src/treelearner/cuda/cuda_tree_predictor.hpp @@ -0,0 +1,62 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
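CUDATreePredictor receives the learned tree as flat per-node arrays plus per-leaf outputs. For orientation, a simplified host-side sketch of scoring one row against a tree stored in flat arrays, using LightGBM's usual convention that a negative child index ~i denotes leaf i (the struct and function names here are illustrative, not the patch's):

#include <vector>

// Minimal flat tree layout: internal node i splits on split_feature[i] at
// threshold[i]; child indices >= 0 point to internal nodes, negative values
// encode leaves as ~leaf_index.
struct FlatTreeSketch {
  std::vector<int> split_feature;
  std::vector<double> threshold;
  std::vector<int> left_child;
  std::vector<int> right_child;
  std::vector<double> leaf_value;
};

double PredictRow(const FlatTreeSketch& tree, const std::vector<double>& row) {
  int node = 0;
  while (node >= 0) {
    const double feature_value = row[tree.split_feature[node]];
    node = feature_value <= tree.threshold[node] ? tree.left_child[node]
                                                 : tree.right_child[node];
  }
  return tree.leaf_value[~node];  // ~node turns -1 into leaf 0, -2 into leaf 1, ...
}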
+ */ +#ifndef LIGHTGBM_CUDA_TREE_PREDICTOR_HPP_ +#define LIGHTGBM_CUDA_TREE_PREDICTOR_HPP_ + +#ifdef USE_CUDA + +#include +#include +#include "new_cuda_utils.hpp" + +#include + +namespace LightGBM { + +class CUDATreePredictor { + public: + CUDATreePredictor(const Config* config, + const int* tree_split_leaf_index, + const int* tree_inner_feature_index, + const uint32_t* tree_threshold, + const double* tree_threshold_real, + const double* tree_left_output, + const double* tree_right_output, + const data_size_t* tree_left_count, + const data_size_t* tree_right_count, + const double* tree_left_sum_hessian, + const double* tree_right_sum_hessian, + const double* tree_gain, + const uint8_t* tree_default_left, + const double* leaf_output); + + void Predict(const double* data, double* out_score) const; + + private: + void BuildTree(); + + void LaunchPredictKernel(const double* data, double* out_score) const; + + // CUDA memory, held by other objects + const int* tree_split_leaf_index_; + const int* tree_inner_feature_index_; + const uint32_t* tree_threshold_; + const double* tree_threshold_real_; + const double* tree_left_output_; + const double* tree_right_output_; + const data_size_t* tree_left_count_; + const data_size_t* tree_right_count_; + const double* tree_left_sum_hessian_; + const double* tree_right_sum_hessian_; + const double* tree_gain_; + const uint8_t* tree_default_left_; + const double* leaf_output_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_CUDA_TREE_PREDICTOR_HPP_ From dd177f5d0161b8931ae71af09a10c5a61882fa90 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sun, 18 Jul 2021 15:55:35 +0000 Subject: [PATCH 036/166] remove comma --- src/treelearner/cuda/cuda_tree_predictor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/cuda/cuda_tree_predictor.cpp b/src/treelearner/cuda/cuda_tree_predictor.cpp index be95a70138c7..701dc640ff9f 100644 --- a/src/treelearner/cuda/cuda_tree_predictor.cpp +++ b/src/treelearner/cuda/cuda_tree_predictor.cpp @@ -26,7 +26,7 @@ CUDATreePredictor::CUDATreePredictor(const Config* config, const double* leaf_output): tree_split_leaf_index_(tree_split_leaf_index), tree_inner_feature_index_(tree_inner_feature_index), -tree_threshold_(tree_threshold), +tree_threshold_(tree_threshold) { } From ee836d63cecc08de8f87c21971a88ed820264ffd Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 19 Jul 2021 07:16:43 +0000 Subject: [PATCH 037/166] refactor objective and score updater --- CMakeLists.txt | 8 ++ include/LightGBM/cuda/cuda_column_data.hpp | 74 +++++++++++++++ include/LightGBM/cuda/cuda_metadata.hpp | 43 +++++++++ include/LightGBM/cuda/cuda_tree.hpp | 94 +++++++++++++++++++ include/LightGBM/cuda/cuda_utils.h | 90 ++++++++++++++++++ include/LightGBM/dataset.h | 7 ++ src/boosting/cuda/cuda_score_updater.cpp | 73 ++++++++++++++ src/boosting/cuda/cuda_score_updater.cu | 46 +++++++++ src/boosting/cuda/cuda_score_updater.hpp | 48 ++++++++++ src/boosting/gbdt.cpp | 79 +++++++++++----- src/boosting/gbdt.h | 7 +- src/boosting/score_updater.hpp | 14 +-- src/cuda/cuda_utils.cpp | 23 +++++ src/io/cuda/cuda_metadata.cpp | 30 ++++++ src/io/dataset.cpp | 5 + src/io/metadata.cpp | 6 ++ src/main.cpp | 10 +- src/objective/binary_objective.hpp | 2 +- src/objective/cuda/cuda_binary_objective.cpp | 55 +++++++++++ src/objective/cuda/cuda_binary_objective.cu | 83 ++++++++++++++++ src/objective/cuda/cuda_binary_objective.hpp | 52 ++++++++++ .../cuda/cuda_objective_function.cpp | 12 +++ .../cuda/cuda_objective_function.hpp | 25 
+++++ src/objective/objective_function.cpp | 76 ++++++++------- .../cuda/cuda_centralized_info.cpp | 7 +- .../cuda/cuda_centralized_info.hpp | 8 +- .../cuda/cuda_histogram_constructor.cpp | 8 +- .../cuda/cuda_histogram_constructor.hpp | 4 +- src/treelearner/cuda/cuda_leaf_splits.cpp | 14 +-- src/treelearner/cuda/cuda_leaf_splits.hpp | 8 +- .../cuda/new_cuda_tree_learner.cpp | 51 ++-------- .../cuda/new_cuda_tree_learner.hpp | 6 -- 32 files changed, 925 insertions(+), 143 deletions(-) create mode 100644 include/LightGBM/cuda/cuda_column_data.hpp create mode 100644 include/LightGBM/cuda/cuda_metadata.hpp create mode 100644 include/LightGBM/cuda/cuda_tree.hpp create mode 100644 src/boosting/cuda/cuda_score_updater.cpp create mode 100644 src/boosting/cuda/cuda_score_updater.cu create mode 100644 src/boosting/cuda/cuda_score_updater.hpp create mode 100644 src/cuda/cuda_utils.cpp create mode 100644 src/io/cuda/cuda_metadata.cpp create mode 100644 src/objective/cuda/cuda_binary_objective.cpp create mode 100644 src/objective/cuda/cuda_binary_objective.cu create mode 100644 src/objective/cuda/cuda_binary_objective.hpp create mode 100644 src/objective/cuda/cuda_objective_function.cpp create mode 100644 src/objective/cuda/cuda_objective_function.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 141ce31d5ea1..1ba4baac8d40 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -369,6 +369,14 @@ if(USE_CUDA) src/treelearner/*.cu src/treelearner/cuda/*.cpp src/treelearner/cuda/*.cu + src/io/cuda/*.cu + src/io/cuda/*.cpp + src/cuda/*.cpp + src/cuda/*.cu + src/objective/cuda/*.cpp + src/objective/cuda/*.cu + src/boosting/cuda/*.cpp + src/boosting/cuda/*.cu endif(USE_CUDA) ) diff --git a/include/LightGBM/cuda/cuda_column_data.hpp b/include/LightGBM/cuda/cuda_column_data.hpp new file mode 100644 index 000000000000..c8855d4ed2ad --- /dev/null +++ b/include/LightGBM/cuda/cuda_column_data.hpp @@ -0,0 +1,74 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#ifndef LIGHTGBM_CUDA_COLUMN_DATA_HPP_ +#define LIGHTGBM_CUDA_COLUMN_DATA_HPP_ + +#include +#include +#include +#include + +#include + +namespace LightGBM { + +class CUDAColumnData { + public: + CUDAColumnData(const data_size_t num_data); + + ~CUDAColumnData(); + + void Init(const int num_columns, + const std::vector& column_data, + const std::vector& column_bin_iterator, + const std::vector& column_bit_type, + const std::vector& feature_max_bin, + const std::vector& feature_min_bin, + const std::vector& feature_offset, + const std::vector& feature_most_freq_bin, + const std::vector& feature_default_bin, + const std::vector& feature_to_column); + + void* const* cuda_data_by_column() const { return cuda_data_by_column_; } + + const uint32_t* cuda_feature_min_bin() const { return cuda_feature_min_bin_; } + + const uint32_t* cuda_feature_max_bin() const { return cuda_feature_max_bin_; } + + const uint32_t* cuda_feature_offset() const { return cuda_feature_offset_; } + + const uint32_t* cuda_feature_most_freq_bin() const { return cuda_feature_most_freq_bin_; } + + const uint32_t* cuda_feature_default_bin() const { return cuda_feature_default_bin_; } + + const int* cuda_feature_to_column() const { return cuda_feature_to_column_; } + + const int8_t* cuda_column_bit_type() const { return cuda_column_bit_type_; } + + private: + template + void InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, void** out_column_data_pointer); + + int num_threads_; + data_size_t num_data_; + int num_columns_; + std::vector column_bit_type_; + void** cuda_data_by_column_; + std::vector feature_to_column_; + std::vector data_by_column_; + + int8_t* cuda_column_bit_type_; + uint32_t* cuda_feature_min_bin_; + uint32_t* cuda_feature_max_bin_; + uint32_t* cuda_feature_offset_; + uint32_t* cuda_feature_most_freq_bin_; + uint32_t* cuda_feature_default_bin_; + int* cuda_feature_to_column_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_CUDA_COLUMN_DATA_HPP_ diff --git a/include/LightGBM/cuda/cuda_metadata.hpp b/include/LightGBM/cuda/cuda_metadata.hpp new file mode 100644 index 000000000000..d4118e69856c --- /dev/null +++ b/include/LightGBM/cuda/cuda_metadata.hpp @@ -0,0 +1,43 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#ifndef LIGHTGBM_CUDA_META_DATA_HPP_ +#define LIGHTGBM_CUDA_META_DATA_HPP_ + +#include "../meta.h" + +#include + +namespace LightGBM { + +class CUDAMetadata { + public: + CUDAMetadata(); + + ~CUDAMetadata(); + + void Init(const std::vector& label, + const std::vector& weight, + const std::vector& query_boundaries, + const std::vector& query_weights, + const std::vector& init_score, + const std::vector& queries); + + const label_t* cuda_label() const { return cuda_label_; } + + const label_t* cuda_weights() const { return cuda_weights_; } + + private: + label_t* cuda_label_; + label_t* cuda_weights_; + data_size_t* cuda_query_boundaries_; + label_t* cuda_query_weights_; + double* cuda_init_score_; + data_size_t* cuda_queries_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_CUDA_META_DATA_HPP_ diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp new file mode 100644 index 000000000000..2ea852522a71 --- /dev/null +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -0,0 +1,94 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. 
See LICENSE file in the project root for license information. + */ + +#ifndef LIGHTGBM_IO_CUDA_CUDA_TREE_HPP_ +#define LIGHTGBM_IO_CUDA_CUDA_TREE_HPP_ + +#include +#include + +namespace LightGBM { + +class CUDATree : public Tree { + public: + /*! + * \brief Constructor + * \param max_leaves The number of max leaves + * \param track_branch_features Whether to keep track of ancestors of leaf nodes + * \param is_linear Whether the tree has linear models at each leaf + */ + explicit CUDATree(int max_leaves, bool track_branch_features, bool is_linear); + + explicit CUDATree(const Tree* host_tree); + + ~CUDATree() noexcept; + + /*! + * \brief Adding prediction value of this tree model to scores + * \param data The dataset + * \param num_data Number of total data + * \param score Will add prediction to score + */ + void AddPredictionToScore(const Dataset* data, + data_size_t num_data, + double* score) const override; + + /*! + * \brief Adding prediction value of this tree model to scores + * \param data The dataset + * \param used_data_indices Indices of used data + * \param num_data Number of total data + * \param score Will add prediction to score + */ + void AddPredictionToScore(const Dataset* data, + const data_size_t* used_data_indices, + data_size_t num_data, double* score) const override; + + const int* cuda_left_child() const { return cuda_left_child_; } + + const int* cuda_right_child() const { return cuda_right_child_; } + + const int* cuda_split_feature_inner() const { return cuda_split_feature_inner_; } + + const int* cuda_split_feature() const { return cuda_split_feature_; } + + const uint32_t* cuda_threshold_in_bin() const { return cuda_threshold_in_bin_; } + + const double* cuda_threshold() const { return cuda_threshold_; } + + const int8_t* cuda_decision_type() const { return cuda_decision_type_; } + + const double* cuda_leaf_value() const { return cuda_leaf_value_; } + + inline void Shrinkage(double rate) override; + + private: + void InitCUDA(); + + void LaunchAddPredictionToScoreKernel(const Dataset* data, + data_size_t num_data, + double* score) const; + + void LaunchAddPredictionToScoreKernel(const Dataset* data, + const data_size_t* used_data_indices, + data_size_t num_data, double* score) const; + + void LaunchShrinkageKernel(const double rate); + + int* cuda_left_child_; + int* cuda_right_child_; + int* cuda_split_feature_inner_; + int* cuda_split_feature_; + uint32_t* cuda_threshold_in_bin_; + double* cuda_threshold_; + int8_t* cuda_decision_type_; + double* cuda_leaf_value_; + + const int num_threads_per_block_add_prediction_to_score_; +}; + +} //namespace LightGBM + +#endif // LIGHTGBM_IO_CUDA_CUDA_TREE_HPP_ diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 1054e09daf18..7bb6a14d1df5 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -11,6 +11,10 @@ #include #include +#include + +namespace LightGBM { + #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { if (code != cudaSuccess) { @@ -19,6 +23,92 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = } } +#define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); } + +template +void AllocateCUDAMemoryOuter(T** out_ptr, size_t size, const char* file, const int line) { + void* tmp_ptr = nullptr; + CUDASUCCESS_OR_FATAL_OUTER(cudaMalloc(&tmp_ptr, size * sizeof(T))); + *out_ptr = 
reinterpret_cast(tmp_ptr); +} + +template +void CopyFromHostToCUDADeviceOuter(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyHostToDevice)); +} + +template +void CopyFromHostToCUDADeviceAsyncOuter(T* dst_ptr, const T* src_ptr, size_t size, cudaStream_t stream, const char* file, const int line) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyHostToDevice, stream)); +} + +template +void InitCUDAMemoryFromHostMemoryOuter(T** dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { + AllocateCUDAMemoryOuter(dst_ptr, size, file, line); + CopyFromHostToCUDADeviceOuter(*dst_ptr, src_ptr, size, file, line); +} + +template +void InitCUDAValueFromConstantOuter(T** dst_ptr, const T value, const char* file, const int line) { + AllocateCUDAMemoryOuter(1, dst_ptr, file, line); + CopyFromHostToCUDADeviceOuter(*dst_ptr, &value, 1, file, line); +} + +template +void CopyFromCUDADeviceToHostOuter(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost)); +} + +template +void CopyFromCUDADeviceToHostAsyncOuter(T* dst_ptr, const T* src_ptr, size_t size, cudaStream_t stream, const char* file, const int line) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost, stream)); +} + +template +void CopyFromCUDADeviceToCUDADeviceOuter(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice)); +} + +template +void CopyFromCUDADeviceToCUDADeviceAsyncOuter(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { + void* void_dst_ptr = reinterpret_cast(dst_ptr); + const void* void_src_ptr = reinterpret_cast(src_ptr); + size_t size_in_bytes = size * sizeof(T); + CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice)); +} + +void SynchronizeCUDADeviceOuter(const char* file, const int line); + +template +void SetCUDAMemoryOuter(T* dst_ptr, int value, size_t size, const char* file, const int line) { + CUDASUCCESS_OR_FATAL_OUTER(cudaMemset(reinterpret_cast(dst_ptr), value, size * sizeof(T))); +} + +void PrintLastCUDAErrorOuter(const char* /*file*/, const int /*line*/); + +template +void DeallocateCUDAMemoryOuter(T** ptr, const char* file, const int line) { + CUDASUCCESS_OR_FATAL_OUTER(cudaFree(reinterpret_cast(*ptr))); + *ptr = nullptr; +} + +} + #endif // USE_CUDA #endif // LIGHTGBM_CUDA_CUDA_UTILS_H_ diff --git 
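The *Outer helpers above wrap the raw CUDA runtime calls so that every allocation and copy forwards the caller's __FILE__ and __LINE__ into the fatal-error check. Assuming they compile as declared in this hunk, a typical round trip looks like the sketch below (requires USE_CUDA and a CUDA-aware build):

#include <vector>
#include <LightGBM/cuda/cuda_utils.h>

// Allocate a device buffer, upload host data, read it back, and release it.
// Any failing CUDA call aborts with the file/line of this caller.
void RoundTripExample() {
  std::vector<double> host(1024, 1.0);
  double* device_ptr = nullptr;
  LightGBM::AllocateCUDAMemoryOuter<double>(&device_ptr, host.size(), __FILE__, __LINE__);
  LightGBM::CopyFromHostToCUDADeviceOuter<double>(device_ptr, host.data(), host.size(), __FILE__, __LINE__);
  LightGBM::CopyFromCUDADeviceToHostOuter<double>(host.data(), device_ptr, host.size(), __FILE__, __LINE__);
  LightGBM::DeallocateCUDAMemoryOuter<double>(&device_ptr, __FILE__, __LINE__);
}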
a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index f9218b45b937..636c3df177f6 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -6,6 +6,7 @@ #define LIGHTGBM_DATASET_H_ #include +#include #include #include #include @@ -210,6 +211,10 @@ class Metadata { /*! \brief Disable copy */ Metadata(const Metadata&) = delete; + CUDAMetadata* cuda_metadata() const { return cuda_metadata_.get(); } + + void CreateCUDAMetadata(); + private: /*! \brief Load initial scores from file */ void LoadInitialScore(); @@ -246,6 +251,7 @@ class Metadata { bool weight_load_from_file_; bool query_load_from_file_; bool init_score_load_from_file_; + std::unique_ptr cuda_metadata_; }; @@ -739,6 +745,7 @@ class Dataset { /*! map feature (inner index) to its index in the list of numeric (non-categorical) features */ std::vector numeric_feature_map_; int num_numeric_features_; + std::string device_type_; }; } // namespace LightGBM diff --git a/src/boosting/cuda/cuda_score_updater.cpp b/src/boosting/cuda/cuda_score_updater.cpp new file mode 100644 index 000000000000..32abdb7f035c --- /dev/null +++ b/src/boosting/cuda/cuda_score_updater.cpp @@ -0,0 +1,73 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include "cuda_score_updater.hpp" + +namespace LightGBM { + +CUDAScoreUpdater::CUDAScoreUpdater(const Dataset* data, int num_tree_per_iteration): + ScoreUpdater(data, num_tree_per_iteration), num_threads_per_block_(1024) { + num_data_ = data->num_data(); + int64_t total_size = static_cast(num_data_) * num_tree_per_iteration; + InitCUDA(total_size); + has_init_score_ = false; + const double* init_score = data->metadata().init_score(); + // if exists initial score, will start from it + if (init_score != nullptr) { + if ((data->metadata().num_init_score() % num_data_) != 0 + || (data->metadata().num_init_score() / num_data_) != num_tree_per_iteration) { + Log::Fatal("Number of class for initial score error"); + } + has_init_score_ = true; + CopyFromHostToCUDADeviceOuter(cuda_score_, init_score, total_size, __FILE__, __LINE__); + } + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +void CUDAScoreUpdater::InitCUDA(const size_t total_size) { + Log::Warning("allocating cuda_score_ memory with size %d", total_size); + AllocateCUDAMemoryOuter(&cuda_score_, total_size, __FILE__, __LINE__); +} + +CUDAScoreUpdater::~CUDAScoreUpdater() { + DeallocateCUDAMemoryOuter(&cuda_score_, __FILE__, __LINE__); +} + +inline void CUDAScoreUpdater::AddScore(double val, int cur_tree_id) { + Common::FunctionTimer fun_timer("CUDAScoreUpdater::AddScore", global_timer); + const size_t offset = static_cast(num_data_) * cur_tree_id; + LaunchAddScoreConstantKernel(val, offset); +} + +inline void CUDAScoreUpdater::AddScore(const Tree* tree, int cur_tree_id) { + Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); + const size_t offset = static_cast(num_data_) * cur_tree_id; + std::vector host_score(num_data_, 0.0f); + CopyFromCUDADeviceToHostOuter(host_score.data(), cuda_score_ + offset, static_cast(num_data_), __FILE__, __LINE__); + tree->AddPredictionToScore(data_, num_data_, host_score.data()); + CopyFromHostToCUDADeviceOuter(cuda_score_ + offset, host_score.data(), static_cast(num_data_), __FILE__, __LINE__); +} + +inline void CUDAScoreUpdater::AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) { + Common::FunctionTimer 
fun_timer("ScoreUpdater::AddScore", global_timer); + const size_t offset = static_cast(num_data_) * cur_tree_id; + tree_learner->AddPredictionToScore(tree, cuda_score_ + offset); +} + +inline void CUDAScoreUpdater::AddScore(const Tree* tree, const data_size_t* data_indices, + data_size_t data_cnt, int cur_tree_id) { + // TODO(shiyu1994): bagging is not supported yet + Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); + const size_t offset = static_cast(num_data_) * cur_tree_id; + tree->AddPredictionToScore(data_, data_indices, data_cnt, cuda_score_ + offset); +} + +inline void CUDAScoreUpdater::MultiplyScore(double val, int cur_tree_id) { + Common::FunctionTimer fun_timer("CUDAScoreUpdater::MultiplyScore", global_timer); + const size_t offset = static_cast(num_data_) * cur_tree_id; + LaunchMultiplyScoreConstantKernel(val, offset); +} + +} // namespace LightGBM diff --git a/src/boosting/cuda/cuda_score_updater.cu b/src/boosting/cuda/cuda_score_updater.cu new file mode 100644 index 000000000000..72eca4b635d7 --- /dev/null +++ b/src/boosting/cuda/cuda_score_updater.cu @@ -0,0 +1,46 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include "cuda_score_updater.hpp" + +namespace LightGBM { + +__global__ void AddScoreConstantKernel( + const double val, + const size_t offset, + const data_size_t num_data, + double* score) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data) { + score[data_index] += val; + } +} + +void CUDAScoreUpdater::LaunchAddScoreConstantKernel(const double val, const size_t offset) { + const int num_blocks = (num_data_ + num_threads_per_block_) / num_threads_per_block_; + double cuda_score = 0.0f; + CopyFromCUDADeviceToHostOuter(&cuda_score, cuda_score_, 1, __FILE__, __LINE__); + Log::Warning("adding constant to cuda score updater, num_blocks = %d, num_data_ = %d, cuda_score_ = %f", num_blocks, num_data_, cuda_score); + Log::Warning("adding init score = %f", val); + AddScoreConstantKernel<<>>(val, offset, num_data_, cuda_score_); +} + +__global__ void MultiplyScoreConstantKernel( + const double val, + const size_t offset, + const data_size_t num_data, + double* score) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data) { + score[data_index] *= val; + } +} + +void CUDAScoreUpdater::LaunchMultiplyScoreConstantKernel(const double val, const size_t offset) { + const int num_blocks = (num_data_ + num_threads_per_block_) / num_threads_per_block_; + MultiplyScoreConstantKernel<<>>(val, offset, num_data_, cuda_score_); +} + +} diff --git a/src/boosting/cuda/cuda_score_updater.hpp b/src/boosting/cuda/cuda_score_updater.hpp new file mode 100644 index 000000000000..623df1c84740 --- /dev/null +++ b/src/boosting/cuda/cuda_score_updater.hpp @@ -0,0 +1,48 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#include + +#include "../score_updater.hpp" + +namespace LightGBM { + +class CUDAScoreUpdater: public ScoreUpdater { + public: + CUDAScoreUpdater(const Dataset* data, int num_tree_per_iteration); + + ~CUDAScoreUpdater(); + + inline void AddScore(double val, int cur_tree_id) override; + + inline void AddScore(const Tree* tree, int cur_tree_id) override; + + inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) override; + + inline void AddScore(const Tree* tree, const data_size_t* data_indices, + data_size_t data_cnt, int cur_tree_id) override; + + inline void MultiplyScore(double val, int cur_tree_id) override; + + inline const double* score() const override { return cuda_score_; } + + /*! \brief Disable copy */ + CUDAScoreUpdater& operator=(const CUDAScoreUpdater&) = delete; + + CUDAScoreUpdater(const CUDAScoreUpdater&) = delete; + + private: + void InitCUDA(const size_t total_size); + + void LaunchAddScoreConstantKernel(const double val, const size_t offset); + + void LaunchMultiplyScoreConstantKernel(const double val, const size_t offset); + + double* cuda_score_; + + const int num_threads_per_block_; +}; + +} // namespace LightGBM diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index aae328d0f819..ea9413a17e37 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -103,14 +103,25 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective } training_metrics_.shrink_to_fit(); - train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_)); + if (config_->device_type == std::string("cuda")) { + train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_)); + } else { + train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_)); + } num_data_ = train_data_->num_data(); // create buffer for gradients and hessians if (objective_function_ != nullptr) { size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - gradients_.resize(total_size); - hessians_.resize(total_size); + if (config_->device_type == std::string("cuda")) { + AllocateCUDAMemoryOuter(&gradients_pointer_, total_size, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&hessians_pointer_, total_size, __FILE__, __LINE__); + } else { + gradients_.resize(total_size); + hessians_.resize(total_size); + gradients_pointer_ = gradients_.data(); + hessians_pointer_ = hessians_.data(); + } } // get max feature index max_feature_idx_ = train_data_->num_total_features() - 1; @@ -143,7 +154,9 @@ void GBDT::AddValidDataset(const Dataset* valid_data, Log::Fatal("Cannot add validation data, since it has different bin mappers with training data"); } // for a validation dataset, we need its score and metric - auto new_score_updater = std::unique_ptr(new ScoreUpdater(valid_data, num_tree_per_iteration_)); + auto new_score_updater = config_->device_type == std::string("cuda") ? 
+ std::unique_ptr(new CUDAScoreUpdater(valid_data, num_tree_per_iteration_)) : + std::unique_ptr(new ScoreUpdater(valid_data, num_tree_per_iteration_)); // update score for (int i = 0; i < iter_; ++i) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { @@ -175,7 +188,7 @@ void GBDT::Boosting() { // objective function will calculate gradients and hessians int64_t num_score = 0; objective_function_-> - GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data()); + GetGradients(GetTrainingScore(&num_score), gradients_pointer_, hessians_pointer_); } data_size_t GBDT::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) { @@ -311,8 +324,8 @@ void GBDT::RefitTree(const std::vector>& tree_leaf_prediction) CHECK_LT(leaf_pred[i], models_[model_index]->num_leaves()); } size_t offset = static_cast(tree_id) * num_data_; - auto grad = gradients_.data() + offset; - auto hess = hessians_.data() + offset; + auto grad = gradients_pointer_ + offset; + auto hess = hessians_pointer_ + offset; auto new_tree = tree_learner_->FitByExistingTree(models_[model_index].get(), leaf_pred, grad, hess); train_score_updater_->AddScore(tree_learner_.get(), new_tree, tree_id); models_[model_index].reset(new_tree); @@ -374,9 +387,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true); } - //Boosting(); - gradients = gradients_.data(); - hessians = hessians_.data(); + Boosting(); + gradients = gradients_pointer_; + hessians = hessians_pointer_; } // bagging logic Bagging(iter_); @@ -391,11 +404,12 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { // need to copy gradients for bagging subset. 
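Throughout this commit the gradient and hessian buffers either stay in host vectors or are allocated directly on the device, and everything downstream only sees the raw score_t pointers gradients_pointer_ / hessians_pointer_. The selection pattern, reduced to a compilable sketch (DeviceAlloc is a hypothetical stand-in for AllocateCUDAMemoryOuter, and score_t is assumed to be float, LightGBM's default):

#include <vector>

typedef float score_t;  // assumption: single-precision scores

// The pointers later handed to the objective and tree learner alias either
// the host vectors or the device allocations, so callers stay device-agnostic.
void SetUpGradientBuffers(bool use_cuda, size_t total_size,
                          std::vector<score_t>* gradients,
                          std::vector<score_t>* hessians,
                          score_t** gradients_pointer,
                          score_t** hessians_pointer,
                          score_t* (*DeviceAlloc)(size_t)) {
  if (use_cuda) {
    *gradients_pointer = DeviceAlloc(total_size);
    *hessians_pointer = DeviceAlloc(total_size);
  } else {
    gradients->resize(total_size);
    hessians->resize(total_size);
    *gradients_pointer = gradients->data();
    *hessians_pointer = hessians->data();
  }
}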
if (is_use_subset_ && bag_data_cnt_ < num_data_) { for (int i = 0; i < bag_data_cnt_; ++i) { - gradients_[offset + i] = grad[bag_data_indices_[i]]; - hessians_[offset + i] = hess[bag_data_indices_[i]]; + // TODO(shiyu1994): bagging is not supported, the copy operation should be done in GPU + gradients_pointer_[offset + i] = grad[bag_data_indices_[i]]; + gradients_pointer_[offset + i] = hess[bag_data_indices_[i]]; } - grad = gradients_.data() + offset; - hess = hessians_.data() + offset; + grad = gradients_pointer_ + offset; + hess = hessians_pointer_ + offset; } bool is_first_tree = models_.size() < static_cast(num_tree_per_iteration_); new_tree.reset(tree_learner_->Train(grad, hess, is_first_tree)); @@ -510,8 +524,11 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { } } -std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score) const { - return metric->Eval(score, objective_function_); +std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const { + std::vector tmp_score(num_data, 0.0f); + CopyFromCUDADeviceToHostOuter(tmp_score.data(), score, static_cast(num_data), __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + return metric->Eval(tmp_score.data(), objective_function_); } std::string GBDT::OutputMetric(int iter) { @@ -523,7 +540,7 @@ std::string GBDT::OutputMetric(int iter) { if (need_output) { for (auto& sub_metric : training_metrics_) { auto name = sub_metric->GetName(); - auto scores = EvalOneMetric(sub_metric, train_score_updater_->score()); + auto scores = EvalOneMetric(sub_metric, train_score_updater_->score(), train_data_->num_data()); for (size_t k = 0; k < name.size(); ++k) { std::stringstream tmp_buf; tmp_buf << "Iteration:" << iter @@ -540,7 +557,7 @@ std::string GBDT::OutputMetric(int iter) { if (need_output || early_stopping_round_ > 0) { for (size_t i = 0; i < valid_metrics_.size(); ++i) { for (size_t j = 0; j < valid_metrics_[i].size(); ++j) { - auto test_scores = EvalOneMetric(valid_metrics_[i][j], valid_score_updater_[i]->score()); + auto test_scores = EvalOneMetric(valid_metrics_[i][j], valid_score_updater_[i]->score(), valid_score_updater_[i]->num_data()); auto name = valid_metrics_[i][j]->GetName(); for (size_t k = 0; k < name.size(); ++k) { std::stringstream tmp_buf; @@ -580,7 +597,7 @@ std::vector GBDT::GetEvalAt(int data_idx) const { std::vector ret; if (data_idx == 0) { for (auto& sub_metric : training_metrics_) { - auto scores = EvalOneMetric(sub_metric, train_score_updater_->score()); + auto scores = EvalOneMetric(sub_metric, train_score_updater_->score(), train_score_updater_->num_data()); for (auto score : scores) { ret.push_back(score); } @@ -588,7 +605,7 @@ std::vector GBDT::GetEvalAt(int data_idx) const { } else { auto used_idx = data_idx - 1; for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) { - auto test_scores = EvalOneMetric(valid_metrics_[used_idx][j], valid_score_updater_[used_idx]->score()); + auto test_scores = EvalOneMetric(valid_metrics_[used_idx][j], valid_score_updater_[used_idx]->score(), valid_score_updater_[used_idx]->num_data()); for (auto score : test_scores) { ret.push_back(score); } @@ -722,8 +739,15 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* // create buffer for gradients and hessians if (objective_function_ != nullptr) { size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - gradients_.resize(total_size); - hessians_.resize(total_size); + if 
(config_->device_type == std::string("cuda")) { + AllocateCUDAMemoryOuter(&gradients_pointer_, total_size, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&hessians_pointer_, total_size, __FILE__, __LINE__); + } else { + gradients_.resize(total_size); + hessians_.resize(total_size); + gradients_pointer_ = gradients_.data(); + hessians_pointer_ = hessians_.data(); + } } max_feature_idx_ = train_data_->num_total_features() - 1; @@ -823,8 +847,15 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { if (is_use_subset_ && bag_data_cnt_ < num_data_) { if (objective_function_ == nullptr) { size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - gradients_.resize(total_size); - hessians_.resize(total_size); + if (config_->device_type == std::string("cuda")) { + AllocateCUDAMemoryOuter(&gradients_pointer_, total_size, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&hessians_pointer_, total_size, __FILE__, __LINE__); + } else { + gradients_.resize(total_size); + hessians_.resize(total_size); + gradients_pointer_ = gradients_.data(); + hessians_pointer_ = hessians_.data(); + } } } } else { diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 998bd5353ce2..8da261389cf1 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,7 @@ #include #include +#include "cuda/cuda_score_updater.hpp" #include "score_updater.hpp" namespace LightGBM { @@ -440,7 +442,7 @@ class GBDT : public GBDTBase { * \brief eval results for one metric */ - virtual std::vector EvalOneMetric(const Metric* metric, const double* score) const; + virtual std::vector EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const; /*! * \brief Print metric result of current iteration @@ -495,7 +497,8 @@ class GBDT : public GBDTBase { /*! \brief Second order derivative of training data */ std::vector> hessians_; #endif - + score_t* gradients_pointer_; + score_t* hessians_pointer_; /*! \brief Store the indices of in-bag data */ std::vector> bag_data_indices_; /*! 
\brief Number of in-bag data */ diff --git a/src/boosting/score_updater.hpp b/src/boosting/score_updater.hpp index 7446691a4709..0e79ed762736 100644 --- a/src/boosting/score_updater.hpp +++ b/src/boosting/score_updater.hpp @@ -51,7 +51,7 @@ class ScoreUpdater { inline bool has_init_score() const { return has_init_score_; } - inline void AddScore(double val, int cur_tree_id) { + virtual inline void AddScore(double val, int cur_tree_id) { Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); const size_t offset = static_cast(num_data_) * cur_tree_id; #pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024) @@ -60,7 +60,7 @@ class ScoreUpdater { } } - inline void MultiplyScore(double val, int cur_tree_id) { + virtual inline void MultiplyScore(double val, int cur_tree_id) { const size_t offset = static_cast(num_data_) * cur_tree_id; #pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024) for (int i = 0; i < num_data_; ++i) { @@ -73,7 +73,7 @@ class ScoreUpdater { * \param tree Trained tree model * \param cur_tree_id Current tree for multiclass training */ - inline void AddScore(const Tree* tree, int cur_tree_id) { + virtual inline void AddScore(const Tree* tree, int cur_tree_id) { Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); const size_t offset = static_cast(num_data_) * cur_tree_id; tree->AddPredictionToScore(data_, num_data_, score_.data() + offset); @@ -85,7 +85,7 @@ class ScoreUpdater { * \param tree_learner * \param cur_tree_id Current tree for multiclass training */ - inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) { + virtual inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) { Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); const size_t offset = static_cast(num_data_) * cur_tree_id; tree_learner->AddPredictionToScore(tree, score_.data() + offset); @@ -98,14 +98,14 @@ class ScoreUpdater { * \param data_cnt Number of data that will be processed * \param cur_tree_id Current tree for multiclass training */ - inline void AddScore(const Tree* tree, const data_size_t* data_indices, + virtual inline void AddScore(const Tree* tree, const data_size_t* data_indices, data_size_t data_cnt, int cur_tree_id) { Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); const size_t offset = static_cast(num_data_) * cur_tree_id; tree->AddPredictionToScore(data_, data_indices, data_cnt, score_.data() + offset); } /*! \brief Pointer of score */ - inline const double* score() const { return score_.data(); } + virtual inline const double* score() const { return score_.data(); } inline data_size_t num_data() const { return num_data_; } @@ -114,7 +114,7 @@ class ScoreUpdater { /*! \brief Disable copy */ ScoreUpdater(const ScoreUpdater&) = delete; - private: + protected: /*! \brief Number of total data */ data_size_t num_data_; /*! \brief Pointer of data set */ diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp new file mode 100644 index 000000000000..a5bcf2bc1a98 --- /dev/null +++ b/src/cuda/cuda_utils.cpp @@ -0,0 +1,23 @@ +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
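The score_updater.hpp changes above make the update entry points virtual and the members protected precisely so that GBDT can keep a single ScoreUpdater pointer and swap in the CUDA implementation at runtime. Stripped to its essentials, the relationship looks like this (a sketch, not the real classes):

#include <vector>

class ScoreUpdaterSketch {
 public:
  virtual ~ScoreUpdaterSketch() {}
  // Host implementation: work directly on the host score vector.
  virtual void AddScore(double val) {
    for (double& s : score_) s += val;
  }
  virtual const double* score() const { return score_.data(); }

 protected:
  std::vector<double> score_;
};

class CUDAScoreUpdaterSketch : public ScoreUpdaterSketch {
 public:
  // Device implementation: the same call would launch a kernel on cuda_score_.
  void AddScore(double val) override { (void)val; /* launch AddScoreConstantKernel here */ }
  const double* score() const override { return cuda_score_; }

 private:
  double* cuda_score_ = nullptr;  // device memory, owned elsewhere in this sketch
};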
+ */ + +#include + +//#ifdef USE_CUDA + +namespace LightGBM { + +void SynchronizeCUDADeviceOuter(const char* file, const int line) { + CUDASUCCESS_OR_FATAL_OUTER(cudaDeviceSynchronize()); +} + +void PrintLastCUDAErrorOuter(const char* /*file*/, const int /*line*/) { + const char* error_name = cudaGetErrorName(cudaGetLastError()); + Log::Warning(error_name); +} + +} // namespace LightGBM + +//#endif // USE_CUDA diff --git a/src/io/cuda/cuda_metadata.cpp b/src/io/cuda/cuda_metadata.cpp new file mode 100644 index 000000000000..ace5ec22cc84 --- /dev/null +++ b/src/io/cuda/cuda_metadata.cpp @@ -0,0 +1,30 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include + +namespace LightGBM { + +CUDAMetadata::CUDAMetadata() {} + +CUDAMetadata::~CUDAMetadata() {} + +void CUDAMetadata::Init(const std::vector& label, + const std::vector& weight, + const std::vector& query_boundaries, + const std::vector& query_weights, + const std::vector& init_score, + const std::vector& queries) { + Log::Warning("label.size() = %d", label.size()); + InitCUDAMemoryFromHostMemoryOuter(&cuda_label_, label.data(), label.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_weights_, weight.data(), weight.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_query_boundaries_, query_boundaries.data(), query_boundaries.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_query_weights_, query_weights.data(), query_weights.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_init_score_, init_score.data(), init_score.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_queries_, queries.data(), queries.size(), __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +} // namespace LightGBM diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index fb47961a8fd7..4c56320653aa 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -425,6 +425,7 @@ void Dataset::Construct(std::vector>* bin_mappers, ++num_numeric_features_; } } + device_type_ = io_config.device_type; } void Dataset::FinishLoad() { @@ -436,6 +437,9 @@ void Dataset::FinishLoad() { feature_groups_[i]->FinishLoad(); } } + if (device_type_ == std::string("cuda")) { + metadata_.CreateCUDAMetadata(); + } is_finish_load_ = true; } @@ -767,6 +771,7 @@ void Dataset::CreateValid(const Dataset* dataset) { label_idx_ = dataset->label_idx_; real_feature_idx_ = dataset->real_feature_idx_; forced_bin_bounds_ = dataset->forced_bin_bounds_; + device_type_ = dataset->device_type_; } void Dataset::ReSize(data_size_t num_data) { diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 49fc834b87df..8a90c9d9627f 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -18,6 +18,7 @@ Metadata::Metadata() { weight_load_from_file_ = false; query_load_from_file_ = false; init_score_load_from_file_ = false; + cuda_metadata_ = nullptr; } void Metadata::Init(const char* data_filename) { @@ -472,6 +473,11 @@ void Metadata::LoadQueryWeights() { } } +void Metadata::CreateCUDAMetadata() { + cuda_metadata_.reset(new CUDAMetadata()); + cuda_metadata_->Init(label_, weights_, query_boundaries_, query_weights_, init_score_, queries_); +} + void Metadata::LoadFromMemory(const void* memory) { const char* mem_ptr = reinterpret_cast(memory); diff --git a/src/main.cpp b/src/main.cpp index 8034da826811..b2cb1f6804f4 100644 --- a/src/main.cpp +++ 
b/src/main.cpp @@ -8,10 +8,16 @@ #include "network/linkers.h" -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { bool success = false; + const std::string config_str = std::string("config=train.conf"); + char* argv = new char[config_str.size() + 1]; + for (size_t i = 0; i < config_str.size(); ++i) { + argv[i] = config_str[i]; + } + argv[config_str.size()] = '\0'; try { - LightGBM::Application app(argc, argv); + LightGBM::Application app(2, &argv - 1); app.Run(); #ifdef USE_MPI diff --git a/src/objective/binary_objective.hpp b/src/objective/binary_objective.hpp index 7d66a9950e8d..50597f9e8b70 100644 --- a/src/objective/binary_objective.hpp +++ b/src/objective/binary_objective.hpp @@ -185,7 +185,7 @@ class BinaryLogloss: public ObjectiveFunction { data_size_t NumPositiveData() const override { return num_pos_data_; } - private: + protected: /*! \brief Number of data */ data_size_t num_data_; /*! \brief Number of positive samples */ diff --git a/src/objective/cuda/cuda_binary_objective.cpp b/src/objective/cuda/cuda_binary_objective.cpp new file mode 100644 index 000000000000..96fe8593e361 --- /dev/null +++ b/src/objective/cuda/cuda_binary_objective.cpp @@ -0,0 +1,55 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA + +#include "cuda_binary_objective.hpp" + +namespace LightGBM { + +CUDABinaryLogloss::CUDABinaryLogloss(const Config& config, + std::function is_pos): +BinaryLogloss(config, is_pos) {} + +CUDABinaryLogloss::CUDABinaryLogloss(const std::vector& strs): BinaryLogloss(strs) {} + +CUDABinaryLogloss::~CUDABinaryLogloss() {} + +void CUDABinaryLogloss::Init(const Metadata& metadata, data_size_t num_data) { + BinaryLogloss::Init(metadata, num_data); + cuda_label_ = metadata.cuda_metadata()->cuda_label(); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); + AllocateCUDAMemoryOuter(&cuda_boost_from_score_, 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_boost_from_score_, 0, 1, __FILE__, __LINE__); +} + +void CUDABinaryLogloss::GetGradients(const double* scores, score_t* gradients, score_t* hessians) const { + LaunchGetGradientsKernel(scores, gradients, hessians); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + /*std::vector host_gradients(num_data_, 0.0f); + std::vector host_hessians(num_data_, 0.0f); + std::vector host_scores(num_data_, 0.0f); + CopyFromCUDADeviceToHostOuter(host_gradients.data(), gradients, static_cast(num_data_), __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(host_hessians.data(), hessians, static_cast(num_data_), __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(host_scores.data(), scores, static_cast(num_data_), __FILE__, __LINE__); + + for (size_t i = 0; i < 100; ++i) { + Log::Warning("===================================== host_gradients[%d] = %f, host_hessians[%d] = %f, host_score[%d] = %f =====================================", i, host_gradients[i], i, host_hessians[i], i, host_scores[i]); + }*/ +} + +double CUDABinaryLogloss::BoostFromScore(int) const { + LaunchBoostFromScoreKernel(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + double boost_from_score = 0.0f; + CopyFromCUDADeviceToHostOuter(&boost_from_score, cuda_boost_from_score_, 1, __FILE__, __LINE__); + Log::Warning("boost_from_score = %f", boost_from_score); + return boost_from_score; +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_binary_objective.cu 
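CUDABinaryLogloss::BoostFromScore above reduces the labels on the device and converts the positive-class ratio into the initial raw score. The arithmetic itself is the logit of the label mean divided by the sigmoid scale; written out on the host (unweighted, purely illustrative):

#include <cmath>
#include <cstdio>
#include <vector>

// Initial score for binary logloss: pavg = mean(label), init = log(pavg / (1 - pavg)) / sigmoid.
double BinaryInitScore(const std::vector<float>& labels, const double sigmoid) {
  double suml = 0.0;
  for (const float y : labels) suml += y;
  const double pavg = suml / static_cast<double>(labels.size());
  return std::log(pavg / (1.0 - pavg)) / sigmoid;
}

int main() {
  const std::vector<float> labels = {1.0f, 0.0f, 1.0f, 1.0f};
  // pavg = 0.75, so the init score is log(3) / 1.0, roughly 1.0986
  std::printf("init score = %f\n", BinaryInitScore(labels, 1.0));
  return 0;
}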
b/src/objective/cuda/cuda_binary_objective.cu new file mode 100644 index 000000000000..f418dc5c83bc --- /dev/null +++ b/src/objective/cuda/cuda_binary_objective.cu @@ -0,0 +1,83 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA + +#include "cuda_binary_objective.hpp" + +namespace LightGBM { + +__global__ void BoostFromScoreKernel_1_BinaryLogloss(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_init_score) { + __shared__ label_t shared_label[CALC_INIT_SCORE_BLOCK_SIZE_BINARY]; + const unsigned int tid = threadIdx.x; + const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY; + shared_label[tid] = 0.0f; + __syncthreads(); + for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY; ++j) { + if (i + j < num_data) { + shared_label[tid] += cuda_labels[i + j]; + } + } + __syncthreads(); + for (unsigned int s = 1; s < blockDim.x; s *= 2) { + if (tid % (2 * s) == 0 && (tid + s) < CALC_INIT_SCORE_BLOCK_SIZE_BINARY) { + shared_label[tid] += shared_label[tid + s]; + } + __syncthreads(); + } + if (tid == 0) { + atomicAdd_system(out_cuda_init_score, shared_label[0]); + } +} + +__global__ void BoostFromScoreKernel_2_BinaryLogloss(double* out_cuda_init_score, const data_size_t num_data, const double sigmoid) { + const double suml = *out_cuda_init_score; + const double sumw = static_cast(num_data); + if (threadIdx.x == 0 && blockIdx.x == 0) { + printf("******************************************* suml = %f sumw = %f *******************************************\n", suml, sumw); + } + const double pavg = suml / sumw; + const double init_score = log(pavg / (1.0f - pavg)) / sigmoid; + *out_cuda_init_score = init_score; +} + +void CUDABinaryLogloss::LaunchBoostFromScoreKernel() const { + const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE_BINARY * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY; + const int num_blocks = (num_data_ + num_data_per_block - 1) / num_data_per_block; + BoostFromScoreKernel_1_BinaryLogloss<<>>(cuda_label_, num_data_, cuda_boost_from_score_); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + BoostFromScoreKernel_2_BinaryLogloss<<<1, 1>>>(cuda_boost_from_score_, num_data_, sigmoid_); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +__global__ void GetGradientsKernel_BinaryLogloss(const double* cuda_scores, const label_t* cuda_labels, + const double sigmoid, const data_size_t num_data, + score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + if (data_index < num_data) { + const label_t cuda_label = static_cast(cuda_labels[data_index]); + const int label = cuda_label == 0 ? 
-1 : 1; + const double response = -label * sigmoid / (1.0f + std::exp(label * sigmoid * cuda_scores[data_index])); + const double abs_response = fabs(response); + cuda_out_gradients[data_index] = static_cast(response); + cuda_out_hessians[data_index] = static_cast(abs_response * (sigmoid - abs_response)); + } +} + +void CUDABinaryLogloss::LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_BINARY - 1) / GET_GRADIENTS_BLOCK_SIZE_BINARY; + GetGradientsKernel_BinaryLogloss<<>>( + scores, + cuda_label_, + sigmoid_, + num_data_, + gradients, + hessians); +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_binary_objective.hpp b/src/objective/cuda/cuda_binary_objective.hpp new file mode 100644 index 000000000000..d67f70eb3a1d --- /dev/null +++ b/src/objective/cuda/cuda_binary_objective.hpp @@ -0,0 +1,52 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_ +#define LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_ + +#ifdef USE_CUDA + +#define GET_GRADIENTS_BLOCK_SIZE_BINARY (1024) +#define CALC_INIT_SCORE_BLOCK_SIZE_BINARY (1024) +#define NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY (6) + +#include "cuda_objective_function.hpp" +#include "../binary_objective.hpp" + +namespace LightGBM { + +class CUDABinaryLogloss : public CUDAObjectiveInterface, public BinaryLogloss { + public: + explicit CUDABinaryLogloss(const Config& config, + std::function is_pos = nullptr); + + explicit CUDABinaryLogloss(const std::vector& strs); + + ~CUDABinaryLogloss(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + void GetGradients(const double* scores, score_t* gradients, score_t* hessians) const override; + + double BoostFromScore(int) const override; + + private: + void LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const; + + void LaunchBoostFromScoreKernel() const; + + // CUDA memory, held by other objects + const label_t* cuda_label_; + const label_t* cuda_weights_; + + // CUDA memory, held by this object + mutable double* cuda_boost_from_score_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_ diff --git a/src/objective/cuda/cuda_objective_function.cpp b/src/objective/cuda/cuda_objective_function.cpp new file mode 100644 index 000000000000..713e96550506 --- /dev/null +++ b/src/objective/cuda/cuda_objective_function.cpp @@ -0,0 +1,12 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#include "cuda_objective_function.hpp" +#include "cuda_binary_objective.hpp" + +namespace LightGBM { + +} // namespace LightGBM diff --git a/src/objective/cuda/cuda_objective_function.hpp b/src/objective/cuda/cuda_objective_function.hpp new file mode 100644 index 000000000000..1888ba9e4fcd --- /dev/null +++ b/src/objective/cuda/cuda_objective_function.hpp @@ -0,0 +1,25 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
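GetGradientsKernel_BinaryLogloss above evaluates, per data point, response = -label * sigmoid / (1 + exp(label * sigmoid * score)) with the label mapped to {-1, +1}, and hessian = |response| * (sigmoid - |response|). A scalar host reference of the same formulas, handy for spot-checking kernel output (unweighted):

#include <cmath>

// Per-example binary logloss gradient and hessian; label01 is the 0/1 label,
// score is the current raw prediction.
// E.g. label01 = 1, score = 0, sigmoid = 1 gives gradient = -0.5, hessian = 0.25.
void BinaryLoglossGradHess(const float label01, const double score, const double sigmoid,
                           float* gradient, float* hessian) {
  const int label = label01 == 0.0f ? -1 : 1;
  const double response = -label * sigmoid / (1.0 + std::exp(label * sigmoid * score));
  const double abs_response = std::fabs(response);
  *gradient = static_cast<float>(response);
  *hessian = static_cast<float>(abs_response * (sigmoid - abs_response));
}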
+ */ + +#ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_OBJECTIVE_HPP_ +#define LIGHTGBM_OBJECTIVE_CUDA_CUDA_OBJECTIVE_HPP_ + +#ifdef USE_CUDA + +#include +#include +#include + +namespace LightGBM { + +class CUDAObjectiveInterface { + +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_OBJECTIVE_HPP_ diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index 193353d935c3..d0de0c371ece 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -10,43 +10,51 @@ #include "regression_objective.hpp" #include "xentropy_objective.hpp" +#include "cuda/cuda_binary_objective.hpp" + namespace LightGBM { ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& type, const Config& config) { - if (type == std::string("regression")) { - return new RegressionL2loss(config); - } else if (type == std::string("regression_l1")) { - return new RegressionL1loss(config); - } else if (type == std::string("quantile")) { - return new RegressionQuantileloss(config); - } else if (type == std::string("huber")) { - return new RegressionHuberLoss(config); - } else if (type == std::string("fair")) { - return new RegressionFairLoss(config); - } else if (type == std::string("poisson")) { - return new RegressionPoissonLoss(config); - } else if (type == std::string("binary")) { - return new BinaryLogloss(config); - } else if (type == std::string("lambdarank")) { - return new LambdarankNDCG(config); - } else if (type == std::string("rank_xendcg")) { - return new RankXENDCG(config); - } else if (type == std::string("multiclass")) { - return new MulticlassSoftmax(config); - } else if (type == std::string("multiclassova")) { - return new MulticlassOVA(config); - } else if (type == std::string("cross_entropy")) { - return new CrossEntropy(config); - } else if (type == std::string("cross_entropy_lambda")) { - return new CrossEntropyLambda(config); - } else if (type == std::string("mape")) { - return new RegressionMAPELOSS(config); - } else if (type == std::string("gamma")) { - return new RegressionGammaLoss(config); - } else if (type == std::string("tweedie")) { - return new RegressionTweedieLoss(config); - } else if (type == std::string("custom")) { - return nullptr; + if (config.device_type == std::string("cuda")) { + if (type == std::string("binary")) { + return new CUDABinaryLogloss(config); + } + } else { + if (type == std::string("regression")) { + return new RegressionL2loss(config); + } else if (type == std::string("regression_l1")) { + return new RegressionL1loss(config); + } else if (type == std::string("quantile")) { + return new RegressionQuantileloss(config); + } else if (type == std::string("huber")) { + return new RegressionHuberLoss(config); + } else if (type == std::string("fair")) { + return new RegressionFairLoss(config); + } else if (type == std::string("poisson")) { + return new RegressionPoissonLoss(config); + } else if (type == std::string("binary")) { + return new BinaryLogloss(config); + } else if (type == std::string("lambdarank")) { + return new LambdarankNDCG(config); + } else if (type == std::string("rank_xendcg")) { + return new RankXENDCG(config); + } else if (type == std::string("multiclass")) { + return new MulticlassSoftmax(config); + } else if (type == std::string("multiclassova")) { + return new MulticlassOVA(config); + } else if (type == std::string("cross_entropy")) { + return new CrossEntropy(config); + } else if (type == std::string("cross_entropy_lambda")) { + 
return new CrossEntropyLambda(config); + } else if (type == std::string("mape")) { + return new RegressionMAPELOSS(config); + } else if (type == std::string("gamma")) { + return new RegressionGammaLoss(config); + } else if (type == std::string("tweedie")) { + return new RegressionTweedieLoss(config); + } else if (type == std::string("custom")) { + return nullptr; + } } Log::Fatal("Unknown objective type name: %s", type.c_str()); return nullptr; diff --git a/src/treelearner/cuda/cuda_centralized_info.cpp b/src/treelearner/cuda/cuda_centralized_info.cpp index b99e340e7fdf..97f0f5e5beca 100644 --- a/src/treelearner/cuda/cuda_centralized_info.cpp +++ b/src/treelearner/cuda/cuda_centralized_info.cpp @@ -18,9 +18,6 @@ void CUDACentralizedInfo::Init(const score_t* labels, const Dataset* train_data) InitCUDAMemoryFromHostMemory(&cuda_num_leaves_, &num_leaves_, 1); InitCUDAMemoryFromHostMemory(&cuda_num_features_, &num_features_, 1); - AllocateCUDAMemory(static_cast(num_data_), &cuda_gradients_); - AllocateCUDAMemory(static_cast(num_data_), &cuda_hessians_); - InitCUDAMemoryFromHostMemory(&cuda_labels_, labels, num_data_); if (train_data->metadata().query_boundaries() != nullptr) { @@ -32,8 +29,8 @@ void CUDACentralizedInfo::Init(const score_t* labels, const Dataset* train_data) } void CUDACentralizedInfo::BeforeTrain(const score_t* gradients, const score_t* hessians) { - CopyFromHostToCUDADevice(cuda_gradients_, gradients, static_cast(num_data_)); - CopyFromHostToCUDADevice(cuda_hessians_, hessians, static_cast(num_data_)); + cuda_gradients_ = gradients; + cuda_hessians_ = hessians; } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_centralized_info.hpp b/src/treelearner/cuda/cuda_centralized_info.hpp index 0105bb587e91..ad212dc9a14e 100644 --- a/src/treelearner/cuda/cuda_centralized_info.hpp +++ b/src/treelearner/cuda/cuda_centralized_info.hpp @@ -38,10 +38,6 @@ class CUDACentralizedInfo { const label_t* cuda_labels() const { return cuda_labels_; } - score_t* cuda_gradients_ref() { return cuda_gradients_; } - - score_t* cuda_hessians_ref() { return cuda_hessians_; } - const data_size_t* cuda_query_boundaries() { return cuda_query_boundaries_; } void Test() { @@ -67,8 +63,8 @@ class CUDACentralizedInfo { data_size_t* cuda_num_data_; int* cuda_num_leaves_; int* cuda_num_features_; - score_t* cuda_gradients_; - score_t* cuda_hessians_; + const score_t* cuda_gradients_; + const score_t* cuda_hessians_; label_t* cuda_labels_; data_size_t* cuda_query_boundaries_; }; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index cca5bc8be091..fec55bdc7571 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -12,13 +12,11 @@ namespace LightGBM { CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, const int num_leaves, const int num_threads, - const score_t* cuda_gradients, const score_t* cuda_hessians, const std::vector& feature_hist_offsets, const int min_data_in_leaf, const double min_sum_hessian_in_leaf): num_data_(train_data->num_data()), num_features_(train_data->num_features()), num_leaves_(num_leaves), num_threads_(num_threads), num_feature_groups_(train_data->num_feature_groups()), - min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf), - cuda_gradients_(cuda_gradients), cuda_hessians_(cuda_hessians) { + min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf) { 
train_data_ = train_data; int offset = 0; for (int group_id = 0; group_id < train_data->num_feature_groups(); ++group_id) { @@ -54,7 +52,9 @@ CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, num_total_bin_ = offset; } -void CUDAHistogramConstructor::BeforeTrain() { +void CUDAHistogramConstructor::BeforeTrain(const score_t* gradients, const score_t* hessians) { + cuda_gradients_ = gradients; + cuda_hessians_ = hessians; SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_); } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 7ea31661a2d9..cac22493f703 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -31,7 +31,7 @@ namespace LightGBM { class CUDAHistogramConstructor { public: CUDAHistogramConstructor(const Dataset* train_data, const int num_leaves, const int num_threads, - const score_t* cuda_gradients, const score_t* cuda_hessians, const std::vector& feature_hist_offsets, + const std::vector& feature_hist_offsets, const int min_data_in_leaf, const double min_sum_hessian_in_leaf); void Init(const Dataset* train_data, TrainingShareStates* share_state); @@ -43,7 +43,7 @@ class CUDAHistogramConstructor { const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf); - void BeforeTrain(); + void BeforeTrain(const score_t* gradients, const score_t* hessians); const hist_t* cuda_hist() const { return cuda_hist_; } diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index f5e7edb89422..9beb4e0ecae9 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -11,7 +11,6 @@ namespace LightGBM { CUDALeafSplits::CUDALeafSplits(const data_size_t num_data, const int leaf_index, - const score_t* cuda_gradients, const score_t* cuda_hessians, const int* cuda_num_data): num_data_(num_data), leaf_index_(leaf_index) { cuda_sum_of_gradients_ = nullptr; cuda_sum_of_hessians_ = nullptr; @@ -19,8 +18,6 @@ CUDALeafSplits::CUDALeafSplits(const data_size_t num_data, const int leaf_index, cuda_gain_ = nullptr; cuda_leaf_value_ = nullptr; - cuda_gradients_ = cuda_gradients; - cuda_hessians_ = cuda_hessians; cuda_data_indices_in_leaf_ = nullptr; cuda_num_data_ = cuda_num_data; } @@ -48,7 +45,8 @@ void CUDALeafSplits::Init() { CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); } -void CUDALeafSplits::InitValues(const double* cuda_sum_of_gradients, const double* cuda_sum_of_hessians, +void CUDALeafSplits::InitValues( + const double* cuda_sum_of_gradients, const double* cuda_sum_of_hessians, const data_size_t* cuda_num_data_in_leaf, const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, const double* cuda_gain, const double* cuda_leaf_value) { CopyFromCUDADeviceToCUDADevice(cuda_sum_of_gradients_, cuda_sum_of_gradients, 1); @@ -71,8 +69,12 @@ void CUDALeafSplits::InitValues() { SynchronizeCUDADevice(); } -void CUDALeafSplits::InitValues(const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, - double* root_sum_hessians) { +void CUDALeafSplits::InitValues( + const score_t* cuda_gradients, const score_t* cuda_hessians, + const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, + double* root_sum_hessians) { + cuda_gradients_ = cuda_gradients; 
+ cuda_hessians_ = cuda_hessians; SetCUDAMemory(cuda_sum_of_gradients_, 0, num_blocks_init_from_gradients_); SetCUDAMemory(cuda_sum_of_hessians_, 0, num_blocks_init_from_gradients_); LaunchInitValuesKernal(); diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index 4aa26aab9af8..adda11525d4b 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -22,18 +22,20 @@ namespace LightGBM { class CUDALeafSplits { public: CUDALeafSplits(const data_size_t num_data, const int leaf_index, - const score_t* cuda_gradients, const score_t* cuda_hessians, const int* cuda_num_data); CUDALeafSplits(); void Init(); - void InitValues(const double* cuda_sum_of_gradients, const double* cuda_sum_of_hessians, + void InitValues( + const double* cuda_sum_of_gradients, const double* cuda_sum_of_hessians, const data_size_t* cuda_num_data_in_leaf, const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, const double* cuda_gain, const double* cuda_leaf_value); - void InitValues(const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, + void InitValues( + const score_t* cuda_gradients, const score_t* cuda_hessians, + const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, double* root_sum_hessians); void InitValues(); diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 9dfbc00c63bd..77ccc52065a1 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -25,14 +25,12 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia const label_t* labels = train_data->metadata().label(); cuda_centralized_info_.reset(new CUDACentralizedInfo(num_data_, this->config_->num_leaves, num_features_)); cuda_centralized_info_->Init(labels, train_data_); - cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_, 0, cuda_centralized_info_->cuda_gradients(), - cuda_centralized_info_->cuda_hessians(), cuda_centralized_info_->cuda_num_data())); + cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_, 0, cuda_centralized_info_->cuda_num_data())); cuda_smaller_leaf_splits_->Init(); - cuda_larger_leaf_splits_.reset(new CUDALeafSplits(num_data_, -1, cuda_centralized_info_->cuda_gradients(), - cuda_centralized_info_->cuda_hessians(), cuda_centralized_info_->cuda_num_data())); + cuda_larger_leaf_splits_.reset(new CUDALeafSplits(num_data_, -1, cuda_centralized_info_->cuda_num_data())); cuda_larger_leaf_splits_->Init(); cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, this->config_->num_leaves, num_threads_, - cuda_centralized_info_->cuda_gradients(), cuda_centralized_info_->cuda_hessians(), share_state_->feature_hist_offsets(), + share_state_->feature_hist_offsets(), config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf)); cuda_histogram_constructor_->Init(train_data_, share_state_.get()); cuda_data_partition_.reset(new CUDADataPartition(num_data_, num_features_, this->config_->num_leaves, num_threads_, @@ -46,9 +44,6 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia this->config_->min_sum_hessian_in_leaf, this->config_->min_gain_to_split, cuda_centralized_info_->cuda_num_features())); cuda_best_split_finder_->Init(); - cuda_score_updater_.reset(new CUDAScoreUpdater(num_data_)); - cuda_score_updater_->Init(); - InitObjective(); leaf_best_split_feature_.resize(config_->num_leaves, -1); 
leaf_best_split_threshold_.resize(config_->num_leaves, 0); @@ -61,14 +56,15 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia void NewCUDATreeLearner::BeforeTrain() { cuda_data_partition_->BeforeTrain(nullptr); global_timer.Start("CUDACentralizedInfo::BeforeTrain"); - cuda_objective_->GetGradients(cuda_score_updater_->cuda_scores(), - cuda_centralized_info_->cuda_gradients_ref(), cuda_centralized_info_->cuda_hessians_ref()); global_timer.Stop("CUDACentralizedInfo::BeforeTrain"); - cuda_smaller_leaf_splits_->InitValues(cuda_data_partition_->cuda_data_indices(), + cuda_smaller_leaf_splits_->InitValues( + gradients_, + hessians_, + cuda_data_partition_->cuda_data_indices(), cuda_histogram_constructor_->cuda_hist_pointer(), &leaf_sum_hessians_[0]); cuda_larger_leaf_splits_->InitValues(); - cuda_histogram_constructor_->BeforeTrain(); + cuda_histogram_constructor_->BeforeTrain(gradients_, hessians_); cuda_best_split_finder_->BeforeTrain(); leaf_num_data_[0] = num_data_; leaf_data_start_[0] = 0; @@ -87,8 +83,8 @@ void NewCUDATreeLearner::FindBestSplitsFromHistograms(const std::vector& void NewCUDATreeLearner::Split(Tree* /*tree*/, int /*best_leaf*/, int* /*left_leaf*/, int* /*right_leaf*/) {} -void NewCUDATreeLearner::AddPredictionToScore(const Tree* /*tree*/, double* /*out_score*/) const { - cuda_data_partition_->UpdateTrainScore(config_->learning_rate, cuda_score_updater_->cuda_score_ref()); +void NewCUDATreeLearner::AddPredictionToScore(const Tree* /*tree*/, double* out_score) const { + cuda_data_partition_->UpdateTrainScore(config_->learning_rate, out_score); } Tree* NewCUDATreeLearner::BuildTree(const int num_leaves) { @@ -279,33 +275,6 @@ void NewCUDATreeLearner::ResetTrainingData(const Dataset* /*train_data*/, void NewCUDATreeLearner::SetBaggingData(const Dataset* /*subset*/, const data_size_t* /*used_indices*/, data_size_t /*num_data*/) {} -void NewCUDATreeLearner::InitObjective() { - if (config_->objective == std::string("binary")) { - cuda_objective_.reset(new CUDABinaryObjective(num_data_, - cuda_centralized_info_->cuda_labels(), config_->sigmoid)); - } else if (config_->objective == std::string("regression")) { - cuda_objective_.reset(new CUDARegressionObjective(num_data_, cuda_centralized_info_->cuda_labels())); - } else if (config_->objective == std::string("lambdarank")) { - cuda_objective_.reset(new CUDARankingObjective(num_data_, - cuda_centralized_info_->cuda_labels(), - cuda_centralized_info_->cuda_query_boundaries(), - train_data_->metadata().query_boundaries(), - train_data_->metadata().num_queries(), - config_->lambdarank_norm, - config_->sigmoid, - config_->lambdarank_truncation_level, - train_data_->metadata().label(), - config_->num_threads)); - } else { - Log::Fatal("Unsupported objective %s for CUDA.", config_->objective.c_str()); - } - - cuda_objective_->Init(); - cuda_objective_->CalcInitScore(); - - cuda_score_updater_->SetInitScore(cuda_objective_->cuda_init_score()); -} - } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index 827ae02f3408..ce28d8089f57 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -51,8 +51,6 @@ class NewCUDATreeLearner: public SerialTreeLearner { Tree* BuildTree(const int num_leaves); - void InitObjective(); - // number of GPUs int num_gpus_; // number of threads on CPU @@ -71,10 +69,6 @@ class NewCUDATreeLearner: public SerialTreeLearner { // 
for best split information finding, given the histograms std::unique_ptr cuda_best_split_finder_; - std::unique_ptr cuda_score_updater_; - - std::unique_ptr cuda_objective_; - std::vector leaf_best_split_feature_; std::vector leaf_best_split_threshold_; std::vector leaf_best_split_default_left_; From 0467fcef1ad0b2154ba9c9508670df36b4c83c6e Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 21 Jul 2021 04:03:54 +0000 Subject: [PATCH 038/166] before use struct --- include/LightGBM/bin.h | 4 +- include/LightGBM/cuda/cuda_column_data.hpp | 57 +- include/LightGBM/cuda/cuda_tree.hpp | 8 +- include/LightGBM/dataset.h | 17 +- include/LightGBM/feature_group.h | 15 +- include/LightGBM/tree.h | 12 +- src/boosting/cuda/cuda_score_updater.cpp | 5 +- src/io/cuda/cuda_column_data.cpp | 174 ++ src/io/cuda/cuda_metadata.cpp | 1 - src/io/cuda/cuda_tree.cpp | 90 + src/io/cuda/cuda_tree.cu | 151 ++ src/io/dataset.cpp | 116 +- src/io/dense_bin.cpp | 60 +- src/io/dense_bin.hpp | 4 +- src/io/sparse_bin.cpp | 39 +- src/io/sparse_bin.hpp | 4 +- src/io/tree.cpp | 3 + .../cuda/cuda_best_split_finder.cpp | 8 +- .../cuda/cuda_best_split_finder.cu | 8 +- .../cuda/cuda_best_split_finder.hpp | 3 + src/treelearner/cuda/cuda_binary_objective.cu | 4 +- src/treelearner/cuda/cuda_data_partition.cpp | 293 +-- src/treelearner/cuda/cuda_data_partition.cu | 1979 ++--------------- src/treelearner/cuda/cuda_data_partition.hpp | 298 +-- .../cuda/cuda_histogram_constructor.cpp | 4 +- .../cuda/cuda_histogram_constructor.cu | 6 +- src/treelearner/cuda/cuda_leaf_splits.cpp | 6 +- src/treelearner/cuda/cuda_leaf_splits.cu | 4 +- .../cuda/cuda_ranking_objective.cu | 2 +- .../cuda/cuda_regression_objective.cu | 4 +- src/treelearner/cuda/cuda_split_info.hpp | 42 + .../cuda/new_cuda_tree_learner.cpp | 15 +- 32 files changed, 1136 insertions(+), 2300 deletions(-) create mode 100644 src/io/cuda/cuda_column_data.cpp create mode 100644 src/io/cuda/cuda_tree.cpp create mode 100644 src/io/cuda/cuda_tree.cu create mode 100644 src/treelearner/cuda/cuda_split_info.hpp diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 06ed5f38fef3..631173b56f88 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -389,7 +389,9 @@ class Bin { */ virtual Bin* Clone() = 0; - virtual const uint8_t* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const = 0; + virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const = 0; + + virtual const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const = 0; }; diff --git a/include/LightGBM/cuda/cuda_column_data.hpp b/include/LightGBM/cuda/cuda_column_data.hpp index c8855d4ed2ad..84db2a0bee61 100644 --- a/include/LightGBM/cuda/cuda_column_data.hpp +++ b/include/LightGBM/cuda/cuda_column_data.hpp @@ -24,16 +24,40 @@ class CUDAColumnData { void Init(const int num_columns, const std::vector& column_data, const std::vector& column_bin_iterator, - const std::vector& column_bit_type, + const std::vector& column_bit_type, const std::vector& feature_max_bin, const std::vector& feature_min_bin, const std::vector& feature_offset, const std::vector& feature_most_freq_bin, const std::vector& feature_default_bin, + const std::vector& feature_missing_is_zero, + const std::vector& feature_missing_is_na, + const std::vector& feature_mfb_is_zero, + const std::vector& feature_mfb_is_na, const std::vector& feature_to_column); + const void* GetColumnData(const int 
column_index) const { return data_by_column_[column_index]; } + void* const* cuda_data_by_column() const { return cuda_data_by_column_; } + uint32_t feature_min_bin(const int feature_index) const { return feature_min_bin_[feature_index]; } + + uint32_t feature_max_bin(const int feature_index) const { return feature_max_bin_[feature_index]; } + + uint32_t feature_offset(const int feature_index) const { return feature_offset_[feature_index]; } + + uint32_t feature_most_freq_bin(const int feature_index) const { return feature_most_freq_bin_[feature_index]; } + + uint32_t feature_default_bin(const int feature_index) const { return feature_default_bin_[feature_index]; } + + uint8_t feature_missing_is_zero(const int feature_index) const { return feature_missing_is_zero_[feature_index]; } + + uint8_t feature_missing_is_na(const int feature_index) const { return feature_missing_is_na_[feature_index]; } + + uint8_t feature_mfb_is_zero(const int feature_index) const { return feature_mfb_is_zero_[feature_index]; } + + uint8_t feature_mfb_is_na(const int feature_index) const { return feature_mfb_is_na_[feature_index]; } + const uint32_t* cuda_feature_min_bin() const { return cuda_feature_min_bin_; } const uint32_t* cuda_feature_max_bin() const { return cuda_feature_max_bin_; } @@ -44,9 +68,21 @@ class CUDAColumnData { const uint32_t* cuda_feature_default_bin() const { return cuda_feature_default_bin_; } + const uint8_t* cuda_feature_missing_is_zero() const { return cuda_feature_missing_is_zero_; } + + const uint8_t* cuda_feature_missing_is_na() const { return cuda_feature_missing_is_na_; } + + const uint8_t* cuda_feature_mfb_is_zero() const { return cuda_feature_mfb_is_zero_; } + + const uint8_t* cuda_feature_mfb_is_na() const { return cuda_feature_mfb_is_na_; } + const int* cuda_feature_to_column() const { return cuda_feature_to_column_; } - const int8_t* cuda_column_bit_type() const { return cuda_column_bit_type_; } + const uint8_t* cuda_column_bit_type() const { return cuda_column_bit_type_; } + + int feature_to_column(const int feature_index) const { return feature_to_column_[feature_index]; } + + uint8_t column_bit_type(const int column_index) const { return column_bit_type_[column_index]; } private: template @@ -55,17 +91,30 @@ class CUDAColumnData { int num_threads_; data_size_t num_data_; int num_columns_; - std::vector column_bit_type_; + std::vector column_bit_type_; + std::vector feature_min_bin_; + std::vector feature_max_bin_; + std::vector feature_offset_; + std::vector feature_most_freq_bin_; + std::vector feature_default_bin_; + std::vector feature_missing_is_zero_; + std::vector feature_missing_is_na_; + std::vector feature_mfb_is_zero_; + std::vector feature_mfb_is_na_; void** cuda_data_by_column_; std::vector feature_to_column_; std::vector data_by_column_; - int8_t* cuda_column_bit_type_; + uint8_t* cuda_column_bit_type_; uint32_t* cuda_feature_min_bin_; uint32_t* cuda_feature_max_bin_; uint32_t* cuda_feature_offset_; uint32_t* cuda_feature_most_freq_bin_; uint32_t* cuda_feature_default_bin_; + uint8_t* cuda_feature_missing_is_zero_; + uint8_t* cuda_feature_missing_is_na_; + uint8_t* cuda_feature_mfb_is_zero_; + uint8_t* cuda_feature_mfb_is_na_; int* cuda_feature_to_column_; }; diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index 2ea852522a71..1f980fdb5115 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -68,12 +68,8 @@ class CUDATree : public Tree { void InitCUDA(); void 
LaunchAddPredictionToScoreKernel(const Dataset* data, - data_size_t num_data, - double* score) const; - - void LaunchAddPredictionToScoreKernel(const Dataset* data, - const data_size_t* used_data_indices, - data_size_t num_data, double* score) const; + const data_size_t* used_data_indices, + data_size_t num_data, double* score) const; void LaunchShrinkageKernel(const double rate); diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 636c3df177f6..73c096819852 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -6,6 +6,7 @@ #define LIGHTGBM_DATASET_H_ #include +#include #include #include #include @@ -571,7 +572,7 @@ class Dataset { return feature_groups_[group]->FeatureGroupData(); } - const uint8_t* GetColWiseData( + const void* GetColWiseData( const int feature_group_index, const int sub_feature_index, uint8_t* bit_type, @@ -579,6 +580,13 @@ class Dataset { std::vector* bin_iterator, const int num_threads) const; + const void* GetColWiseData( + const int feature_group_index, + const int sub_feature_index, + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const; + inline double RealThreshold(int i, uint32_t threshold) const { const int group = feature2group_[i]; const int sub_feature = feature2subfeature_[i]; @@ -704,7 +712,13 @@ class Dataset { return feature_groups_[feature_group_index]->feature_min_bin(sub_feature_index); } + const CUDAColumnData* cuda_column_data() const { + return cuda_column_data_.get(); + } + private: + void CreateCUDAColumnData(); + std::string data_filename_; /*! \brief Store used features */ std::vector> feature_groups_; @@ -746,6 +760,7 @@ class Dataset { std::vector numeric_feature_map_; int num_numeric_features_; std::string device_type_; + std::unique_ptr cuda_column_data_; }; } // namespace LightGBM diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index a81e06fa64e6..66cc09ed6527 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -478,7 +478,7 @@ class FeatureGroup { } } - const uint8_t* GetColWiseData(const int sub_feature_index, + const void* GetColWiseData(const int sub_feature_index, uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, @@ -492,6 +492,19 @@ class FeatureGroup { } } + const void* GetColWiseData(const int sub_feature_index, + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + if (sub_feature_index >= 0) { + CHECK(is_multi_val_); + return multi_bin_data_[sub_feature_index]->GetColWiseData(bit_type, is_sparse, bin_iterator); + } else { + CHECK(!is_multi_val_); + return bin_data_->GetColWiseData(bit_type, is_sparse, bin_iterator); + } + } + uint32_t feature_max_bin(const int sub_feature_index) { if (!is_multi_val_) { return bin_offsets_[sub_feature_index + 1] - 1; diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 4f5ede83102b..4a80f9823c0b 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -100,7 +100,7 @@ class Tree { * \param num_data Number of total data * \param score Will add prediction to score */ - void AddPredictionToScore(const Dataset* data, + virtual void AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const; @@ -111,7 +111,7 @@ class Tree { * \param num_data Number of total data * \param score Will add prediction to score */ - void AddPredictionToScore(const Dataset* data, + virtual void AddPredictionToScore(const Dataset* data, const data_size_t* used_data_indices, data_size_t num_data, double* 
score) const; @@ -184,7 +184,7 @@ class Tree { * shrinkage rate (a.k.a learning rate) is used to tune the training process * \param rate The factor of shrinkage */ - inline void Shrinkage(double rate) { + virtual inline void Shrinkage(double rate) { #pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048) for (int i = 0; i < num_leaves_ - 1; ++i) { leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] * rate); @@ -316,11 +316,13 @@ class Tree { inline bool is_linear() const { return is_linear_; } + inline bool is_cuda_tree() const { return is_cuda_tree_; } + inline void SetIsLinear(bool is_linear) { is_linear_ = is_linear; } - private: + protected: std::string NumericalDecisionIfElse(int node) const; std::string CategoricalDecisionIfElse(int node) const; @@ -527,6 +529,8 @@ class Tree { std::vector> leaf_features_; /* \brief features used in leaf linear models; indexing is relative to used_features_ */ std::vector> leaf_features_inner_; + /*! \brief Marks whether this tree is a CUDATree */ + bool is_cuda_tree_; }; inline void Tree::Split(int leaf, int feature, int real_feature, diff --git a/src/boosting/cuda/cuda_score_updater.cpp b/src/boosting/cuda/cuda_score_updater.cpp index 32abdb7f035c..a05f8edb3d10 100644 --- a/src/boosting/cuda/cuda_score_updater.cpp +++ b/src/boosting/cuda/cuda_score_updater.cpp @@ -44,10 +44,7 @@ inline void CUDAScoreUpdater::AddScore(double val, int cur_tree_id) { inline void CUDAScoreUpdater::AddScore(const Tree* tree, int cur_tree_id) { Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); const size_t offset = static_cast(num_data_) * cur_tree_id; - std::vector host_score(num_data_, 0.0f); - CopyFromCUDADeviceToHostOuter(host_score.data(), cuda_score_ + offset, static_cast(num_data_), __FILE__, __LINE__); - tree->AddPredictionToScore(data_, num_data_, host_score.data()); - CopyFromHostToCUDADeviceOuter(cuda_score_ + offset, host_score.data(), static_cast(num_data_), __FILE__, __LINE__); + tree->AddPredictionToScore(data_, num_data_, cuda_score_ + offset); } inline void CUDAScoreUpdater::AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) { diff --git a/src/io/cuda/cuda_column_data.cpp b/src/io/cuda/cuda_column_data.cpp new file mode 100644 index 000000000000..34a3631d1271 --- /dev/null +++ b/src/io/cuda/cuda_column_data.cpp @@ -0,0 +1,174 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + + #include + + namespace LightGBM { + + CUDAColumnData::CUDAColumnData(const data_size_t num_data) { + num_threads_ = OMP_NUM_THREADS(); + num_data_ = num_data; +} + +CUDAColumnData::~CUDAColumnData() {} + +template +void CUDAColumnData::InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, void** out_column_data_pointer) { + BIN_TYPE* cuda_column_data = nullptr; + if (!IS_SPARSE) { + if (IS_4BIT) { + std::vector expanded_column_data(num_data_, 0); + const BIN_TYPE* in_column_data_reinterpreted = reinterpret_cast(in_column_data); + for (data_size_t i = 0; i < num_data_; ++i) { + expanded_column_data[i] = static_cast((in_column_data_reinterpreted[i >> 1] >> ((i & 1) << 2)) & 0xf); + } + InitCUDAMemoryFromHostMemoryOuter(&cuda_column_data, + expanded_column_data.data(), + static_cast(num_data_), + __FILE__, + __LINE__); + } else { + InitCUDAMemoryFromHostMemoryOuter(&cuda_column_data, + reinterpret_cast(in_column_data), + static_cast(num_data_), + __FILE__, + __LINE__); + } + } else { + // is a sparse column: expand it into a dense buffer via the bin iterator before copying to CUDA + std::vector expanded_column_data(num_data_, 0); + for (data_size_t i = 0; i < num_data_; ++i) { + expanded_column_data[i] = static_cast(bin_iterator->Get(i)); + } + InitCUDAMemoryFromHostMemoryOuter(&cuda_column_data, + expanded_column_data.data(), + static_cast(num_data_), + __FILE__, + __LINE__); + } + *out_column_data_pointer = reinterpret_cast(cuda_column_data); +} + +void CUDAColumnData::Init(const int num_columns, + const std::vector& column_data, + const std::vector& column_bin_iterator, + const std::vector& column_bit_type, + const std::vector& feature_max_bin, + const std::vector& feature_min_bin, + const std::vector& feature_offset, + const std::vector& feature_most_freq_bin, + const std::vector& feature_default_bin, + const std::vector& feature_missing_is_zero, + const std::vector& feature_missing_is_na, + const std::vector& feature_mfb_is_zero, + const std::vector& feature_mfb_is_na, + const std::vector& feature_to_column) { + num_columns_ = num_columns; + column_bit_type_ = column_bit_type; + feature_max_bin_ = feature_max_bin; + feature_min_bin_ = feature_min_bin; + feature_offset_ = feature_offset; + feature_most_freq_bin_ = feature_most_freq_bin; + feature_default_bin_ = feature_default_bin; + feature_missing_is_zero_ = feature_missing_is_zero; + feature_missing_is_na_ = feature_missing_is_na; + feature_mfb_is_zero_ = feature_mfb_is_zero; + feature_mfb_is_na_ = feature_mfb_is_na; + data_by_column_.resize(num_columns_, nullptr); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (int column_index = 0; column_index < num_columns_; ++column_index) { + const int8_t bit_type = column_bit_type[column_index]; + if (column_data[column_index] != nullptr) { + // is dense column + if (bit_type == 4) { + column_bit_type_[column_index] = 8; + InitOneColumnData(column_data[column_index], column_bin_iterator[column_index], &data_by_column_[column_index]); + } else if (bit_type == 8) { + InitOneColumnData(column_data[column_index], column_bin_iterator[column_index], &data_by_column_[column_index]); + } else if (bit_type == 16) { + InitOneColumnData(column_data[column_index], column_bin_iterator[column_index], &data_by_column_[column_index]); + } else if (bit_type == 32) { + InitOneColumnData(column_data[column_index], column_bin_iterator[column_index], &data_by_column_[column_index]); + } else { + Log::Fatal("Unknown column bit type %d", bit_type); + } + } else { + // is sparse column + if (bit_type == 8) { + 
InitOneColumnData(column_data[column_index], column_bin_iterator[column_index], &data_by_column_[column_index]); + } else if (bit_type == 16) { + InitOneColumnData(column_data[column_index], column_bin_iterator[column_index], &data_by_column_[column_index]); + } else if (bit_type == 32) { + InitOneColumnData(column_data[column_index], column_bin_iterator[column_index], &data_by_column_[column_index]); + } else { + Log::Fatal("Unknown column bit type %d", bit_type); + } + } + } + feature_to_column_ = feature_to_column; + InitCUDAMemoryFromHostMemoryOuter(&cuda_data_by_column_, + data_by_column_.data(), + data_by_column_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_column_bit_type_, + column_bit_type_.data(), + column_bit_type_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_max_bin_, + feature_max_bin.data(), + feature_max_bin.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_min_bin_, + feature_min_bin.data(), + feature_min_bin.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_offset_, + feature_offset.data(), + feature_offset.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_most_freq_bin_, + feature_most_freq_bin.data(), + feature_most_freq_bin.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_default_bin_, + feature_default_bin.data(), + feature_default_bin.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_missing_is_zero_, + feature_missing_is_zero.data(), + feature_missing_is_zero.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_missing_is_na_, + feature_missing_is_na.data(), + feature_missing_is_na.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_mfb_is_zero_, + feature_mfb_is_zero.data(), + feature_mfb_is_zero.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_mfb_is_na_, + feature_mfb_is_na.data(), + feature_mfb_is_na.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_to_column_, + feature_to_column_.data(), + feature_to_column_.size(), + __FILE__, + __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +} // namespace LightGBM diff --git a/src/io/cuda/cuda_metadata.cpp b/src/io/cuda/cuda_metadata.cpp index ace5ec22cc84..29b66ef51b7f 100644 --- a/src/io/cuda/cuda_metadata.cpp +++ b/src/io/cuda/cuda_metadata.cpp @@ -17,7 +17,6 @@ void CUDAMetadata::Init(const std::vector& label, const std::vector& query_weights, const std::vector& init_score, const std::vector& queries) { - Log::Warning("label.size() = %d", label.size()); InitCUDAMemoryFromHostMemoryOuter(&cuda_label_, label.data(), label.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_weights_, weight.data(), weight.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_query_boundaries_, query_boundaries.data(), query_boundaries.size(), __FILE__, __LINE__); diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp new file mode 100644 index 000000000000..bb59b5fb8f70 --- /dev/null +++ b/src/io/cuda/cuda_tree.cpp @@ -0,0 +1,90 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#include + +namespace LightGBM { + +CUDATree::CUDATree(int max_leaves, bool track_branch_features, bool is_linear): +Tree(max_leaves, track_branch_features, is_linear), +num_threads_per_block_add_prediction_to_score_(1024) { + is_cuda_tree_ = true; + Log::Fatal("CUDATree can be only created from host Tree."); +} + +CUDATree::CUDATree(const Tree* host_tree): + Tree(*host_tree), + num_threads_per_block_add_prediction_to_score_(1024) { + is_cuda_tree_ = true; + InitCUDA(); +} + +CUDATree::~CUDATree() {} + +void CUDATree::InitCUDA() { + InitCUDAMemoryFromHostMemoryOuter(&cuda_left_child_, + left_child_.data(), + left_child_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_right_child_, + right_child_.data(), + right_child_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_split_feature_inner_, + split_feature_inner_.data(), + split_feature_inner_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_split_feature_, + split_feature_.data(), + split_feature_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_threshold_in_bin_, + threshold_in_bin_.data(), + threshold_in_bin_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_threshold_, + threshold_.data(), + threshold_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_decision_type_, + decision_type_.data(), + decision_type_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_leaf_value_, + leaf_value_.data(), + leaf_value_.size(), + __FILE__, + __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +void CUDATree::AddPredictionToScore(const Dataset* data, + data_size_t num_data, + double* score) const { + LaunchAddPredictionToScoreKernel(data, nullptr, num_data, score); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +void CUDATree::AddPredictionToScore(const Dataset* data, + const data_size_t* used_data_indices, + data_size_t num_data, double* score) const { + // TODO(shiyu1994): used_data_indices should reside on GPU + LaunchAddPredictionToScoreKernel(data, used_data_indices, num_data, score); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +inline void CUDATree::Shrinkage(double rate) { + Tree::Shrinkage(rate); + LaunchShrinkageKernel(rate); +} + +} // namespace LightGBM diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu new file mode 100644 index 000000000000..c995f1be43dc --- /dev/null +++ b/src/io/cuda/cuda_tree.cu @@ -0,0 +1,151 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#include + +namespace LightGBM { + +template +__global__ void AddPredictionToScoreKernel( + // dataset information + const data_size_t num_data, + void* const* cuda_data_by_column, + const uint8_t* cuda_column_bit_type, + const uint32_t* cuda_feature_min_bin, + const uint32_t* cuda_feature_max_bin, + const uint32_t* cuda_feature_offset, + const uint32_t* cuda_feature_default_bin, + const uint32_t* cuda_feature_most_freq_bin, + const int* cuda_feature_to_column, + const data_size_t* cuda_used_indices, + // tree information + const uint32_t* cuda_threshold_in_bin, + const int8_t* cuda_decision_type, + const int* cuda_split_feature_inner, + const int* cuda_left_child, + const int* cuda_right_child, + const double* cuda_leaf_value, + // output + double* score) { + const data_size_t inner_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const data_size_t data_index = USE_INDICES ? cuda_used_indices[inner_data_index] : inner_data_index; + if (data_index < num_data) { + int node = 0; + while (node >= 0) { + const int split_feature_inner = cuda_split_feature_inner[node]; + const int column = cuda_feature_to_column[split_feature_inner]; + const uint32_t default_bin = cuda_feature_default_bin[split_feature_inner]; + const uint32_t most_freq_bin = cuda_feature_most_freq_bin[split_feature_inner]; + const uint32_t max_bin = cuda_feature_max_bin[split_feature_inner]; + const uint32_t min_bin = cuda_feature_min_bin[split_feature_inner]; + const uint32_t offset = cuda_feature_offset[split_feature_inner]; + const uint8_t column_bit_type = cuda_column_bit_type[column]; + uint32_t bin = 0; + if (column_bit_type == 8) { + bin = static_cast((reinterpret_cast(cuda_data_by_column[column]))[data_index]); + } else if (column_bit_type == 16) { + bin = static_cast((reinterpret_cast(cuda_data_by_column[column]))[data_index]); + } else if (column_bit_type == 32) { + bin = static_cast((reinterpret_cast(cuda_data_by_column[column]))[data_index]); + } + if (bin >= min_bin && bin <= max_bin) { + bin = bin - min_bin + offset; + } else { + bin = most_freq_bin; + } + const int8_t decision_type = cuda_decision_type[node]; + const uint32_t threshold_in_bin = cuda_threshold_in_bin[node]; + const int8_t missing_type = ((decision_type >> 2) & 3); + const bool default_left = ((decision_type & kDefaultLeftMask) > 0); + if ((missing_type == 1 && bin == default_bin) || (missing_type == 2 && bin == max_bin)) { + if (default_left) { + node = cuda_left_child[node]; + } else { + node = cuda_right_child[node]; + } + } else { + if (bin <= threshold_in_bin) { + node = cuda_left_child[node]; + } else { + node = cuda_right_child[node]; + } + } + } + score[data_index] += cuda_leaf_value[~node]; + } +} + +void CUDATree::LaunchAddPredictionToScoreKernel( + const Dataset* data, + const data_size_t* used_data_indices, + data_size_t num_data, + double* score) const { + const CUDAColumnData* cuda_column_data = data->cuda_column_data(); + if (cuda_column_data == nullptr) { + Log::Warning("error cuda_column_data is nullptr"); + } + const int num_blocks = (num_data + num_threads_per_block_add_prediction_to_score_ - 1) / num_threads_per_block_add_prediction_to_score_; + if (used_data_indices == nullptr) { + AddPredictionToScoreKernel<<>>( + // dataset information + num_data, + cuda_column_data->cuda_data_by_column(), + cuda_column_data->cuda_column_bit_type(), + cuda_column_data->cuda_feature_min_bin(), + cuda_column_data->cuda_feature_max_bin(), + cuda_column_data->cuda_feature_offset(), + 
cuda_column_data->cuda_feature_default_bin(), + cuda_column_data->cuda_feature_most_freq_bin(), + cuda_column_data->cuda_feature_to_column(), + nullptr, + // tree information + cuda_threshold_in_bin_, + cuda_decision_type_, + cuda_split_feature_inner_, + cuda_left_child_, + cuda_right_child_, + cuda_leaf_value_, + // output + score); + } else { + AddPredictionToScoreKernel<<>>( + // dataset information + num_data, + cuda_column_data->cuda_data_by_column(), + cuda_column_data->cuda_column_bit_type(), + cuda_column_data->cuda_feature_min_bin(), + cuda_column_data->cuda_feature_max_bin(), + cuda_column_data->cuda_feature_offset(), + cuda_column_data->cuda_feature_default_bin(), + cuda_column_data->cuda_feature_most_freq_bin(), + cuda_column_data->cuda_feature_to_column(), + used_data_indices, + // tree information + cuda_threshold_in_bin_, + cuda_decision_type_, + cuda_split_feature_inner_, + cuda_left_child_, + cuda_right_child_, + cuda_leaf_value_, + // output + score); + } + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +__global__ void ShrinkageKernel(const double rate, double* cuda_leaf_value, const int num_leaves) { + const int leaf_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (leaf_index < num_leaves) { + cuda_leaf_value[leaf_index] *= rate; + } +} + +void CUDATree::LaunchShrinkageKernel(const double rate) { + const int num_threads_per_block = 1024; + const int num_blocks = (num_leaves_ + num_threads_per_block - 1) / num_threads_per_block; + ShrinkageKernel<<>>(rate, cuda_leaf_value_, num_leaves_); +} + +} // namespace LightGBM diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 4c56320653aa..9e32a0ea5bfb 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -438,7 +438,10 @@ void Dataset::FinishLoad() { } } if (device_type_ == std::string("cuda")) { + CreateCUDAColumnData(); metadata_.CreateCUDAMetadata(); + } else { + cuda_column_data_.reset(nullptr); } is_finish_load_ = true; } @@ -1476,7 +1479,7 @@ void Dataset::AddFeaturesFrom(Dataset* other) { } } -const uint8_t* Dataset::GetColWiseData( +const void* Dataset::GetColWiseData( const int feature_group_index, const int sub_feature_index, uint8_t* bit_type, @@ -1486,4 +1489,115 @@ const uint8_t* Dataset::GetColWiseData( return feature_groups_[feature_group_index]->GetColWiseData(sub_feature_index, bit_type, is_sparse, bin_iterator, num_threads); } +const void* Dataset::GetColWiseData( + const int feature_group_index, + const int sub_feature_index, + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + return feature_groups_[feature_group_index]->GetColWiseData(sub_feature_index, bit_type, is_sparse, bin_iterator); +} + +void Dataset::CreateCUDAColumnData() { + cuda_column_data_.reset(new CUDAColumnData(num_data_)); + int num_columns = 0; + std::vector column_data; + std::vector column_bin_iterator; + std::vector column_bit_type; + int feature_index = 0; + std::vector feature_to_column(num_features_, -1); + std::vector feature_max_bins(num_features_, 0); + std::vector feature_min_bins(num_features_, 0); + std::vector feature_offsets(num_features_, 0); + std::vector feature_most_freq_bins(num_features_, 0); + std::vector feature_default_bin(num_features_, 0); + std::vector feature_missing_is_zero(num_features_, 0); + std::vector feature_missing_is_na(num_features_, 0); + std::vector feature_mfb_is_zero(num_features_, 0); + std::vector feature_mfb_is_na(num_features_, 0); + for (int feature_group_index = 0; feature_group_index < num_groups_; ++feature_group_index) { + if 
(feature_groups_[feature_group_index]->is_multi_val_) { + for (int sub_feature_index = 0; sub_feature_index < feature_groups_[feature_group_index]->num_feature_; ++sub_feature_index) { + uint8_t bit_type = 0; + bool is_sparse = false; + BinIterator* bin_iterator = nullptr; + const void* one_column_data = GetColWiseData(feature_group_index, + sub_feature_index, + &bit_type, + &is_sparse, + &bin_iterator); + column_data.emplace_back(one_column_data); + column_bin_iterator.emplace_back(bin_iterator); + column_bit_type.emplace_back(bit_type); + feature_to_column[feature_index] = num_columns; + ++num_columns; + const BinMapper* feature_bin_mapper = FeatureBinMapper(feature_index); + feature_max_bins[feature_index] = feature_max_bin(feature_index); + feature_min_bins[feature_index] = feature_min_bin(feature_index); + const uint32_t most_freq_bin = feature_bin_mapper->GetMostFreqBin(); + feature_offsets[feature_index] = static_cast(most_freq_bin == 0); + feature_most_freq_bins[feature_index] = most_freq_bin; + feature_default_bin[feature_index] = feature_bin_mapper->GetDefaultBin(); + if (feature_bin_mapper->missing_type() == MissingType::Zero) { + feature_missing_is_zero.emplace_back(1); + feature_missing_is_na.emplace_back(0); + } else if (feature_bin_mapper->missing_type() == MissingType::NaN) { + feature_missing_is_zero.emplace_back(0); + feature_missing_is_na.emplace_back(1); + } else { + feature_missing_is_zero.emplace_back(0); + feature_missing_is_na.emplace_back(0); + } + ++feature_index; + } + } else { + uint8_t bit_type = 0; + bool is_sparse = false; + BinIterator* bin_iterator = nullptr; + const void* one_column_data = GetColWiseData(feature_group_index, + -1, + &bit_type, + &is_sparse, + &bin_iterator); + column_data.emplace_back(one_column_data); + column_bin_iterator.emplace_back(bin_iterator); + column_bit_type.emplace_back(bit_type); + feature_to_column[feature_index] = num_columns; + ++num_columns; + const BinMapper* feature_bin_mapper = FeatureBinMapper(feature_index); + feature_max_bins[feature_index] = feature_max_bin(feature_index); + feature_min_bins[feature_index] = feature_min_bin(feature_index); + const uint32_t most_freq_bin = feature_bin_mapper->GetMostFreqBin(); + feature_offsets[feature_index] = static_cast(most_freq_bin == 0); + feature_most_freq_bins[feature_index] = most_freq_bin; + feature_default_bin[feature_index] = feature_bin_mapper->GetDefaultBin(); + if (feature_bin_mapper->missing_type() == MissingType::Zero) { + feature_missing_is_zero.emplace_back(1); + feature_missing_is_na.emplace_back(0); + } else if (feature_bin_mapper->missing_type() == MissingType::NaN) { + feature_missing_is_zero.emplace_back(0); + feature_missing_is_na.emplace_back(1); + } else { + feature_missing_is_zero.emplace_back(0); + feature_missing_is_na.emplace_back(0); + } + ++feature_index; + } + } + cuda_column_data_->Init(num_columns, + column_data, + column_bin_iterator, + column_bit_type, + feature_max_bins, + feature_min_bins, + feature_offsets, + feature_most_freq_bins, + feature_default_bin, + feature_missing_is_zero, + feature_missing_is_na, + feature_mfb_is_zero, + feature_mfb_is_na, + feature_to_column); +} + } // namespace LightGBM diff --git a/src/io/dense_bin.cpp b/src/io/dense_bin.cpp index 7c8cb6247090..89475f57110a 100644 --- a/src/io/dense_bin.cpp +++ b/src/io/dense_bin.cpp @@ -9,7 +9,7 @@ namespace LightGBM { template <> -const uint8_t* DenseBin::GetColWiseData( +const void* DenseBin::GetColWiseData( uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, @@ 
-17,11 +17,11 @@ const uint8_t* DenseBin::GetColWiseData( *is_sparse = false; *bit_type = 8; bin_iterator->clear(); - return data_.data(); + return reinterpret_cast(data_.data()); } template <> -const uint8_t* DenseBin::GetColWiseData( +const void* DenseBin::GetColWiseData( uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, @@ -29,11 +29,11 @@ const uint8_t* DenseBin::GetColWiseData( *is_sparse = false; *bit_type = 16; bin_iterator->clear(); - return reinterpret_cast(data_.data()); + return reinterpret_cast(data_.data()); } template <> -const uint8_t* DenseBin::GetColWiseData( +const void* DenseBin::GetColWiseData( uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, @@ -41,11 +41,11 @@ const uint8_t* DenseBin::GetColWiseData( *is_sparse = false; *bit_type = 32; bin_iterator->clear(); - return reinterpret_cast(data_.data()); + return reinterpret_cast(data_.data()); } template <> -const uint8_t* DenseBin::GetColWiseData( +const void* DenseBin::GetColWiseData( uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, @@ -53,7 +53,51 @@ const uint8_t* DenseBin::GetColWiseData( *is_sparse = false; *bit_type = 4; bin_iterator->clear(); - return data_.data(); + return reinterpret_cast(data_.data()); +} + +template <> +const void* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + *is_sparse = false; + *bit_type = 8; + *bin_iterator = nullptr; + return reinterpret_cast(data_.data()); +} + +template <> +const void* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + *is_sparse = false; + *bit_type = 16; + *bin_iterator = nullptr; + return reinterpret_cast(data_.data()); +} + +template <> +const void* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + *is_sparse = false; + *bit_type = 32; + *bin_iterator = nullptr; + return reinterpret_cast(data_.data()); +} + +template <> +const void* DenseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + *is_sparse = false; + *bit_type = 4; + *bin_iterator = nullptr; + return reinterpret_cast(data_.data()); } } // namespace LightGBM diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index b68c3763ac4a..0ebcdc1a6181 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -461,7 +461,9 @@ class DenseBin : public Bin { DenseBin* Clone() override; - const uint8_t* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const override; + const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const override; + + const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const override; private: data_size_t num_data_; diff --git a/src/io/sparse_bin.cpp b/src/io/sparse_bin.cpp index a22af28e1e04..8c45fd512a04 100644 --- a/src/io/sparse_bin.cpp +++ b/src/io/sparse_bin.cpp @@ -9,7 +9,7 @@ namespace LightGBM { template <> -const uint8_t* SparseBin::GetColWiseData( +const void* SparseBin::GetColWiseData( uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, @@ -23,7 +23,7 @@ const uint8_t* SparseBin::GetColWiseData( } template <> -const uint8_t* SparseBin::GetColWiseData( +const void* SparseBin::GetColWiseData( uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, @@ -37,7 +37,7 @@ const uint8_t* SparseBin::GetColWiseData( } template <> -const uint8_t* 
SparseBin::GetColWiseData( +const void* SparseBin::GetColWiseData( uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, @@ -50,4 +50,37 @@ const uint8_t* SparseBin::GetColWiseData( return nullptr; } +template <> +const void* SparseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + *is_sparse = true; + *bit_type = 8; + *bin_iterator = new SparseBinIterator(this, 0); + return nullptr; +} + +template <> +const void* SparseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + *is_sparse = true; + *bit_type = 16; + *bin_iterator = new SparseBinIterator(this, 0); + return nullptr; +} + +template <> +const void* SparseBin::GetColWiseData( + uint8_t* bit_type, + bool* is_sparse, + BinIterator** bin_iterator) const { + *is_sparse = true; + *bit_type = 32; + *bin_iterator = new SparseBinIterator(this, 0); + return nullptr; +} + } // namespace LightGBM diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index f3c7b093c419..ad15092ac39b 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -620,7 +620,9 @@ class SparseBin : public Bin { } } - const uint8_t* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const override; + const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, std::vector* bin_iterator, const int num_threads) const override; + + const void* GetColWiseData(uint8_t* bit_type, bool* is_sparse, BinIterator** bin_iterator) const override; private: data_size_t num_data_; diff --git a/src/io/tree.cpp b/src/io/tree.cpp index 67e02af20cd8..9dec5e2c0992 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -53,6 +53,7 @@ Tree::Tree(int max_leaves, bool track_branch_features, bool is_linear) leaf_features_.resize(max_leaves_); leaf_features_inner_.resize(max_leaves_); } + is_cuda_tree_ = false; } int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, @@ -702,6 +703,8 @@ Tree::Tree(const char* str, size_t* used_len) { is_linear_ = false; } + is_cuda_tree_ = false; + if ((num_leaves_ <= 1) && !is_linear_) { return; } diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index dc376cf1c410..ef4c60b6380d 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -112,6 +112,7 @@ void CUDABestSplitFinder::Init() { const int num_task_blocks = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; const size_t cuda_best_leaf_split_info_buffer_size = static_cast(num_task_blocks) * static_cast(num_leaves_); + AllocateCUDAMemoryOuter(&cuda_leaf_best_split_info_, cuda_best_leaf_split_info_buffer_size, __FILE__, __LINE__); AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_feature_); AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_default_left_); AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_threshold_); @@ -135,6 +136,7 @@ void CUDABestSplitFinder::Init() { InitCUDAMemoryFromHostMemory(&cuda_task_out_default_left_, cpu_task_out_default_left_.data(), cpu_task_out_default_left_.size()); const size_t output_buffer_size = 2 * static_cast(num_tasks_); + AllocateCUDAMemoryOuter(&cuda_best_split_info_, output_buffer_size, __FILE__, __LINE__); AllocateCUDAMemory(output_buffer_size, &cuda_best_split_default_left_); AllocateCUDAMemory(output_buffer_size, &cuda_best_split_threshold_); 
   AllocateCUDAMemory(output_buffer_size, &cuda_best_split_gain_);
@@ -171,10 +173,10 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_le
   const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ && sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0);
   LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits, smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
-  //SynchronizeCUDADevice();
+  //SynchronizeCUDADeviceOuter(__FILE__, __LINE__);
   global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel");
   LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
-  SynchronizeCUDADevice();
+  SynchronizeCUDADeviceOuter(__FILE__, __LINE__);
   global_timer.Stop("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel");
 }
 
@@ -183,7 +185,7 @@ void CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves,
   std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index) {
   LaunchFindBestFromAllSplitsKernel(cuda_cur_num_leaves, smaller_leaf_index, larger_leaf_index, leaf_best_split_feature, leaf_best_split_threshold, leaf_best_split_default_left, best_leaf_index);
-  SynchronizeCUDADevice();
+  SynchronizeCUDADeviceOuter(__FILE__, __LINE__);
 }
 
 }  // namespace LightGBM
diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu
index 3e462cae97ab..48eacddf6541 100644
--- a/src/treelearner/cuda/cuda_best_split_finder.cu
+++ b/src/treelearner/cuda/cuda_best_split_finder.cu
@@ -571,7 +571,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel(
       cuda_best_split_right_output_,
       cuda_best_split_found_);
   }
-  SynchronizeCUDADevice();
+  SynchronizeCUDADeviceOuter(__FILE__, __LINE__);
   if (larger_leaf_index >= 0) {
     FindBestSplitsForLeafKernel<<>>(
       // input feature information
@@ -871,7 +871,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel(
       cuda_leaf_best_split_found_,
       false);
   }
-  SynchronizeCUDADevice();
+  SynchronizeCUDADeviceOuter(__FILE__, __LINE__);
   SyncBestSplitForLeafKernel<<>>(
     cpu_smaller_leaf_index,
     cpu_larger_leaf_index,
@@ -978,7 +978,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel(
       larger_only,
       num_leaves_);
   if (num_blocks_per_leaf > 1) {
-    SynchronizeCUDADevice();
+    SynchronizeCUDADeviceOuter(__FILE__, __LINE__);
     SyncBestSplitForLeafKernelAllBlocks<<<1, 1>>>(
       cpu_smaller_leaf_index,
       cpu_larger_leaf_index,
@@ -1082,7 +1082,7 @@ void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_
     cuda_best_split_info_buffer_,
     cuda_leaf_best_split_feature_, cuda_leaf_best_split_threshold_, cuda_leaf_best_split_default_left_);
   std::vector cpu_leaf_best_split_info_buffer(7);
-  SynchronizeCUDADevice();
+  SynchronizeCUDADeviceOuter(__FILE__, __LINE__);
   CopyFromCUDADeviceToHost(cpu_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 7);
   (*leaf_best_split_feature)[smaller_leaf_index] = cpu_leaf_best_split_info_buffer[0];
   (*leaf_best_split_threshold)[smaller_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[1]);
diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp
index 80c5c5390b39..4783b5296bd6 100644
--- a/src/treelearner/cuda/cuda_best_split_finder.hpp
+++ b/src/treelearner/cuda/cuda_best_split_finder.hpp
@@ -11,6 +11,7 @@
 #include "new_cuda_utils.hpp"
 #include "cuda_leaf_splits.hpp"
+#include "cuda_split_info.hpp"
 #include
 #include
@@ -159,6 +160,7 @@ class CUDABestSplitFinder {
   double* cuda_leaf_best_split_right_gain_;
   double* cuda_leaf_best_split_right_output_;
   uint8_t* cuda_leaf_best_split_found_;
+  CUDASplitInfo* cuda_leaf_best_split_info_;
   // for best split information when finding best split
   uint8_t* cuda_best_split_default_left_;
   uint32_t* cuda_best_split_threshold_;
@@ -175,6 +177,7 @@ class CUDABestSplitFinder {
   double* cuda_best_split_right_output_;
   uint8_t* cuda_best_split_found_;
   int* cuda_num_total_bin_;
+  CUDASplitInfo* cuda_best_split_info_;
   // TODO(shiyu1994): use prefix sum to accelerate best split finding
   hist_t* prefix_sum_hist_left_;
   hist_t* prefix_sum_hist_right_;
diff --git a/src/treelearner/cuda/cuda_binary_objective.cu b/src/treelearner/cuda/cuda_binary_objective.cu
index ee42adb5884d..a47e66ec4455 100644
--- a/src/treelearner/cuda/cuda_binary_objective.cu
+++ b/src/treelearner/cuda/cuda_binary_objective.cu
@@ -45,9 +45,9 @@ void CUDABinaryObjective::LaunchCalcInitScoreKernel() {
   const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE_BINARY * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY;
   const int num_blocks = (num_data_ + num_data_per_block - 1) / num_data_per_block;
   CalcInitScoreKernel_1_Binary<<>>(cuda_labels_, num_data_, cuda_init_score_);
-  SynchronizeCUDADevice();
+  SynchronizeCUDADeviceOuter(__FILE__, __LINE__);
   CalcInitScoreKernel_2_Binary<<<1, 1>>>(cuda_init_score_, num_data_, sigmoid_);
-  SynchronizeCUDADevice();
+  SynchronizeCUDADeviceOuter(__FILE__, __LINE__);
 }
 
 __global__ void GetGradientsKernel_Binary(const double* cuda_scores, const label_t* cuda_labels,
diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp
index b56c53c37061..dcb2b310d9c4 100644
--- a/src/treelearner/cuda/cuda_data_partition.cpp
+++ b/src/treelearner/cuda/cuda_data_partition.cpp
@@ -10,28 +10,27 @@ namespace LightGBM {
 
-CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_features, const int num_leaves,
-    const int num_threads, const data_size_t* cuda_num_data, const int* cuda_num_leaves,
-    const int* cuda_num_features, const std::vector& feature_hist_offsets, const Dataset* train_data,
+CUDADataPartition::CUDADataPartition(
+    const Dataset* train_data,
+    const int num_total_bin,
+    const int num_leaves,
+    const int num_threads,
+    const data_size_t* cuda_num_data,
     hist_t* cuda_hist):
-    num_data_(num_data), num_features_(num_features), num_leaves_(num_leaves), num_threads_(num_threads),
-    num_total_bin_(feature_hist_offsets.back()), cuda_num_features_(cuda_num_features),
+
+    num_data_(train_data->num_data()),
+    num_features_(train_data->num_features()),
+    num_total_bin_(num_total_bin),
+    num_leaves_(num_leaves),
+    num_threads_(num_threads),
     cuda_hist_(cuda_hist) {
+  cuda_num_data_ = cuda_num_data;
-  cuda_num_leaves_ = cuda_num_leaves;
   max_num_split_indices_blocks_ = (num_data_ + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION;
   cur_num_leaves_ = 1;
-  feature_default_bins_.resize(train_data->num_features());
-  feature_most_freq_bins_.resize(train_data->num_features());
-  feature_max_bins_.resize(train_data->num_features());
-  feature_min_bins_.resize(train_data->num_features());
-  feature_missing_is_zero_.resize(train_data->num_features());
-  feature_missing_is_na_.resize(train_data->num_features());
-  feature_mfb_is_zero_.resize(train_data->num_features());
-  feature_mfb_is_na_.resize(train_data->num_features());
-
bin_upper_bounds_.resize(train_data->num_features()); - feature_num_bins_.resize(train_data->num_features()); + bin_upper_bounds_.resize(num_features_); + feature_num_bins_.resize(num_features_); int cur_group = 0; uint32_t prev_group_bins = 0; for (int feature_index = 0; feature_index < num_features_; ++feature_index) { @@ -41,51 +40,19 @@ CUDADataPartition::CUDADataPartition(const data_size_t num_data, const int num_f cur_group = group; } const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); - feature_default_bins_[feature_index] = bin_mapper->GetDefaultBin(); - feature_most_freq_bins_[feature_index] = bin_mapper->GetMostFreqBin(); - feature_min_bins_[feature_index] = train_data->feature_min_bin(feature_index); - feature_max_bins_[feature_index] = train_data->feature_max_bin(feature_index); bin_upper_bounds_[feature_index] = bin_mapper->bin_upper_bound(); feature_num_bins_[feature_index] = bin_mapper->num_bin(); - const MissingType missing_type = bin_mapper->missing_type(); - if (missing_type == MissingType::None) { - feature_missing_is_zero_[feature_index] = 0; - feature_missing_is_na_[feature_index] = 0; - feature_mfb_is_zero_[feature_index] = 0; - feature_mfb_is_na_[feature_index] = 0; - } else if (missing_type == MissingType::Zero) { - feature_missing_is_zero_[feature_index] = 1; - feature_missing_is_na_[feature_index] = 0; - if (bin_mapper->GetMostFreqBin() == bin_mapper->GetDefaultBin()) { - feature_mfb_is_zero_[feature_index] = 1; - } else { - feature_mfb_is_zero_[feature_index] = 0; - } - feature_mfb_is_na_[feature_index] = 0; - } else if (missing_type == MissingType::NaN) { - feature_missing_is_zero_[feature_index] = 0; - feature_missing_is_na_[feature_index] = 1; - feature_mfb_is_zero_[feature_index] = 0; - if (bin_mapper->GetMostFreqBin() == bin_mapper->GetDefaultBin()) { - feature_mfb_is_na_[feature_index] = 1; - } else { - feature_mfb_is_na_[feature_index] = 0; - } - } } - num_data_in_leaf_.resize(num_leaves_, 0); - num_data_in_leaf_[0] = num_data_; - train_data_ = train_data; + cuda_column_data_ = train_data->cuda_column_data(); } -void CUDADataPartition::Init(const Dataset* train_data) { +void CUDADataPartition::Init() { // allocate CUDA memory AllocateCUDAMemory(static_cast(num_data_), &cuda_data_indices_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_data_start_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_data_end_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_num_data_); - InitCUDAValueFromConstant(&cuda_num_total_bin_, num_total_bin_); InitCUDAValueFromConstant(&cuda_cur_num_leaves_, 1); // leave some space for alignment AllocateCUDAMemory(static_cast(num_data_) + 1024 * 8, &cuda_data_to_left_); @@ -98,14 +65,6 @@ void CUDADataPartition::Init(const Dataset* train_data) { AllocateCUDAMemory(static_cast(num_leaves_), &cuda_hist_pool_); CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1); - InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bins_, feature_most_freq_bins_.data(), static_cast(num_features_)); - InitCUDAMemoryFromHostMemory(&cuda_feature_default_bins_, feature_default_bins_.data(), static_cast(num_features_)); - InitCUDAMemoryFromHostMemory(&cuda_feature_max_bins_, feature_max_bins_.data(), static_cast(num_features_)); - InitCUDAMemoryFromHostMemory(&cuda_feature_min_bins_, feature_min_bins_.data(), static_cast(num_features_)); - InitCUDAMemoryFromHostMemory(&cuda_feature_missing_is_zero_, feature_missing_is_zero_.data(), static_cast(num_features_)); - 
InitCUDAMemoryFromHostMemory(&cuda_feature_missing_is_na_, feature_missing_is_na_.data(), static_cast(num_features_)); - InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_zero_, feature_mfb_is_zero_.data(), static_cast(num_features_)); - InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_na_, feature_mfb_is_na_.data(), static_cast(num_features_)); AllocateCUDAMemory(12, &cuda_split_info_buffer_); AllocateCUDAMemory(static_cast(num_leaves_), &tree_split_leaf_index_); @@ -121,22 +80,13 @@ void CUDADataPartition::Init(const Dataset* train_data) { AllocateCUDAMemory(static_cast(num_leaves_), &tree_gain_); AllocateCUDAMemory(static_cast(num_leaves_), &tree_default_left_); - AllocateCUDAMemory(static_cast(num_leaves_), &data_partition_leaf_output_); - - CopyColWiseData(train_data); + AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_output_); - cpu_split_info_buffer_.resize(6, 0); - - cuda_streams_.resize(5); + cuda_streams_.resize(4); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[2])); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[3])); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[4])); - - const size_t max_num_blocks_in_debug = static_cast((num_data_ + 1023) / 1024); - AllocateCUDAMemory(max_num_blocks_in_debug, &cuda_gradients_sum_buffer_); - AllocateCUDAMemory(max_num_blocks_in_debug, &cuda_hessians_sum_buffer_); std::vector flatten_bin_upper_bounds; std::vector feature_num_bin_offsets; @@ -154,193 +104,6 @@ void CUDADataPartition::Init(const Dataset* train_data) { InitCUDAMemoryFromHostMemory(&cuda_bin_upper_bounds_, flatten_bin_upper_bounds.data(), flatten_bin_upper_bounds.size()); } -void CUDADataPartition::CopyColWiseData(const Dataset* train_data) { - const int num_feature_group = train_data->num_feature_groups(); - int column_index = 0; - std::vector> features_in_group(num_feature_group); - for (int feature_index = 0; feature_index < train_data->num_features(); ++feature_index) { - const int feature_group_index = train_data->Feature2Group(feature_index); - features_in_group[feature_group_index].emplace_back(feature_index); - } - - feature_index_to_column_index_.resize(num_features_, -1); - for (int feature_group_index = 0; feature_group_index < num_feature_group; ++feature_group_index) { - if (!train_data->IsMultiGroup(feature_group_index)) { - for (const int feature_index : features_in_group[feature_group_index]) { - feature_index_to_column_index_[feature_index] = column_index; - } - ++column_index; - } else { - for (const int feature_index : features_in_group[feature_group_index]) { - feature_index_to_column_index_[feature_index] = column_index; - ++column_index; - } - } - - if (!train_data->IsMultiGroup(feature_group_index)) { - uint8_t bit_type = 0; - bool is_sparse = false; - std::vector bin_iterator; - const uint8_t* column_data = train_data->GetColWiseData(feature_group_index, -1, &bit_type, &is_sparse, &bin_iterator, num_threads_); - if (column_data != nullptr) { - CHECK(!is_sparse); - if (bit_type == 4) { - std::vector true_column_data(num_data_, 0); - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (data_size_t i = 0; i < num_data_; ++i) { - true_column_data[i] = static_cast((column_data[i >> 1] >> ((i & 1) << 2)) & 0xf); - } - bit_type = 8; - uint8_t* cuda_true_column_data = nullptr; - InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), static_cast(num_data_)); - 
cuda_data_by_column_.emplace_back(cuda_true_column_data); - } else if (bit_type == 8) { - uint8_t* cuda_true_column_data = nullptr; - InitCUDAMemoryFromHostMemory(&cuda_true_column_data, column_data, static_cast(num_data_)); - cuda_data_by_column_.emplace_back(cuda_true_column_data); - } else if (bit_type == 16) { - uint16_t* cuda_true_column_data = nullptr; - const uint16_t* true_column_data = reinterpret_cast(column_data); - InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data, static_cast(num_data_)); - cuda_data_by_column_.emplace_back(cuda_true_column_data); - } else if (bit_type == 32) { - uint32_t* cuda_true_column_data = nullptr; - const uint32_t* true_column_data = reinterpret_cast(column_data); - InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data, static_cast(num_data_)); - cuda_data_by_column_.emplace_back(cuda_true_column_data); - } else { - Log::Fatal("Unknow bit type = %d", bit_type); - } - } else { - CHECK(is_sparse); - CHECK_EQ(bin_iterator.size(), static_cast(num_threads_)); - if (bit_type == 8) { - std::vector true_column_data(num_data_, 0); - uint8_t* cuda_true_column_data = nullptr; - Threading::For(0, num_data_, 512, - [&bin_iterator, &true_column_data] (const int thread_index, data_size_t start, data_size_t end) { - bin_iterator[thread_index]->Reset(start); - BinIterator* thread_bin_iterator = bin_iterator[thread_index]; - for (data_size_t data_index = start; data_index < end; ++data_index) { - true_column_data[data_index] = static_cast(thread_bin_iterator->RawGet(data_index)); - } - }); - InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), true_column_data.size()); - cuda_data_by_column_.emplace_back(cuda_true_column_data); - } else if (bit_type == 16) { - std::vector true_column_data(num_data_, 0); - uint16_t* cuda_true_column_data = nullptr; - Threading::For(0, num_data_, 512, - [&bin_iterator, &true_column_data] (const int thread_index, data_size_t start, data_size_t end) { - bin_iterator[thread_index]->Reset(start); - BinIterator* thread_bin_iterator = bin_iterator[thread_index]; - for (data_size_t data_index = start; data_index < end; ++data_index) { - true_column_data[data_index] = static_cast(thread_bin_iterator->RawGet(data_index)); - } - }); - InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), true_column_data.size()); - cuda_data_by_column_.emplace_back(cuda_true_column_data); - } else if (bit_type == 32) { - std::vector true_column_data(num_data_, 0); - uint32_t* cuda_true_column_data = nullptr; - Threading::For(0, num_data_, 512, - [&bin_iterator, &true_column_data] (const int thread_index, data_size_t start, data_size_t end) { - bin_iterator[thread_index]->Reset(start); - BinIterator* thread_bin_iterator = bin_iterator[thread_index]; - for (data_size_t data_index = start; data_index < end; ++data_index) { - true_column_data[data_index] = thread_bin_iterator->RawGet(data_index); - } - }); - InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), true_column_data.size()); - cuda_data_by_column_.emplace_back(cuda_true_column_data); - } - } - column_bit_type_.emplace_back(bit_type); - } else { - for (int sub_feature_index = 0; sub_feature_index < static_cast(features_in_group[feature_group_index].size()); ++sub_feature_index) { - uint8_t bit_type = 0; - bool is_sparse = false; - std::vector bin_iterator; - const uint8_t* column_data = train_data->GetColWiseData(feature_group_index, sub_feature_index, &bit_type, &is_sparse, &bin_iterator, 
num_threads_); - if (column_data != nullptr) { - CHECK(!is_sparse); - if (bit_type == 4) { - std::vector true_column_data(num_data_, 0); - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (data_size_t i = 0; i < num_data_; ++i) { - true_column_data[i] = static_cast((column_data[i >> 1] >> ((i & 1) << 2)) & 0xf); - } - bit_type = 8; - uint8_t* cuda_true_column_data = nullptr; - InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), static_cast(num_data_)); - cuda_data_by_column_.emplace_back(reinterpret_cast(cuda_true_column_data)); - } else if (bit_type == 8) { - uint8_t* cuda_true_column_data = nullptr; - InitCUDAMemoryFromHostMemory(&cuda_true_column_data, column_data, static_cast(num_data_)); - cuda_data_by_column_.emplace_back(reinterpret_cast(cuda_true_column_data)); - } else if (bit_type == 16) { - uint16_t* cuda_true_column_data = nullptr; - const uint16_t* true_column_data = reinterpret_cast(column_data); - InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data, static_cast(num_data_)); - cuda_data_by_column_.emplace_back(reinterpret_cast(cuda_true_column_data)); - } else if (bit_type == 32) { - uint32_t* cuda_true_column_data = nullptr; - const uint32_t* true_column_data = reinterpret_cast(column_data); - InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data, static_cast(num_data_)); - cuda_data_by_column_.emplace_back(reinterpret_cast(cuda_true_column_data)); - } else { - Log::Fatal("Unknow bit type = %d", bit_type); - } - } else { - CHECK(is_sparse); - CHECK_EQ(bin_iterator.size(), static_cast(num_threads_)); - if (bit_type == 8) { - std::vector true_column_data(num_data_, 0); - uint8_t* cuda_true_column_data = nullptr; - Threading::For(0, num_data_, 512, - [&bin_iterator, &true_column_data] (const int thread_index, data_size_t start, data_size_t end) { - bin_iterator[thread_index]->Reset(start); - BinIterator* thread_bin_iterator = bin_iterator[thread_index]; - for (data_size_t data_index = start; data_index < end; ++data_index) { - true_column_data[data_index] = static_cast(thread_bin_iterator->RawGet(data_index)); - } - }); - InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), true_column_data.size()); - cuda_data_by_column_.emplace_back(reinterpret_cast(cuda_true_column_data)); - } else if (bit_type == 16) { - std::vector true_column_data(num_data_, 0); - uint16_t* cuda_true_column_data = nullptr; - Threading::For(0, num_data_, 512, - [&bin_iterator, &true_column_data] (const int thread_index, data_size_t start, data_size_t end) { - bin_iterator[thread_index]->Reset(start); - BinIterator* thread_bin_iterator = bin_iterator[thread_index]; - for (data_size_t data_index = start; data_index < end; ++data_index) { - true_column_data[data_index] = static_cast(thread_bin_iterator->RawGet(data_index)); - } - }); - InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), true_column_data.size()); - cuda_data_by_column_.emplace_back(reinterpret_cast(cuda_true_column_data)); - } else if (bit_type == 32) { - std::vector true_column_data(num_data_, 0); - uint32_t* cuda_true_column_data = nullptr; - Threading::For(0, num_data_, 512, - [&bin_iterator, &true_column_data] (const int thread_index, data_size_t start, data_size_t end) { - bin_iterator[thread_index]->Reset(start); - BinIterator* thread_bin_iterator = bin_iterator[thread_index]; - for (data_size_t data_index = start; data_index < end; ++data_index) { - true_column_data[data_index] = 
thread_bin_iterator->RawGet(data_index); - } - }); - InitCUDAMemoryFromHostMemory(&cuda_true_column_data, true_column_data.data(), true_column_data.size()); - cuda_data_by_column_.emplace_back(reinterpret_cast(cuda_true_column_data)); - } - } - column_bit_type_.emplace_back(bit_type); - } - } - } -} - void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { if (data_indices == nullptr) { // no bagging @@ -348,15 +111,12 @@ void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { SetCUDAMemory(cuda_leaf_num_data_, 0, static_cast(num_leaves_)); SetCUDAMemory(cuda_leaf_data_start_, 0, static_cast(num_leaves_)); SetCUDAMemory(cuda_leaf_data_end_, 0, static_cast(num_leaves_)); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); CopyFromCUDADeviceToCUDADevice(cuda_leaf_num_data_, cuda_num_data_, 1); CopyFromCUDADeviceToCUDADevice(cuda_leaf_data_end_, cuda_num_data_, 1); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); cur_num_leaves_ = 1; CopyFromHostToCUDADevice(cuda_cur_num_leaves_, &cur_num_leaves_, 1); - num_data_in_leaf_.clear(); - num_data_in_leaf_.resize(num_leaves_, 0); - num_data_in_leaf_[0] = num_data_; CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1); } else { Log::Fatal("bagging is not supported by GPU"); @@ -478,17 +238,6 @@ void CUDADataPartition::UpdateTrainScore(const double learning_rate, double* cud LaunchAddPredictionToScoreKernel(learning_rate, cuda_scores); } -void CUDADataPartition::CUDACheck( - const int smaller_leaf_index, - const int larger_leaf_index, - const std::vector& num_data_in_leaf, - const CUDALeafSplits* smaller_leaf_splits, - const CUDALeafSplits* larger_leaf_splits, - const score_t* gradients, - const score_t* hessians) { - LaunchCUDACheckKernel(smaller_leaf_index, larger_leaf_index, num_data_in_leaf, smaller_leaf_splits, larger_leaf_splits, gradients, hessians); -} - void CUDADataPartition::CalcBlockDim(const data_size_t num_data_in_leaf, int* grid_dim, int* block_dim) { diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index ae4ac7695406..bcbd4046a321 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -52,7 +52,6 @@ __device__ void PrefixSum(uint32_t* elements, unsigned int n) { } __device__ void PrefixSum_1024(uint32_t* elements, unsigned int n) { - //unsigned int offset = 1; unsigned int threadIdx_x = threadIdx.x; const unsigned int conflict_free_n_minus_1 = CONFLICT_FREE_INDEX(n - 1); const uint32_t last_element = elements[conflict_free_n_minus_1]; @@ -122,32 +121,10 @@ __device__ void PrefixSum_1024(uint32_t* elements, unsigned int n) { __syncthreads(); if (threadIdx_x == 0) { - //const unsigned int src_pos = 511; - //const unsigned int dst_pos = 1023; const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(1023); const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(511); - //elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; elements[conflict_free_dst_pos] += elements[conflict_free_src_pos]; - //} - //__syncthreads(); - - /*for (int d = (n >> 1); d > 0; d >>= 1) { - if (threadIdx_x < d) { - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; - } - offset <<= 1; - __syncthreads(); - }*/ - //if (threadIdx_x == 0) { elements[conflict_free_n_minus_1] 
= 0; - //} - //__syncthreads(); - - //if (threadIdx_x == 0) { - //const unsigned int dst_pos = 1023; - //const unsigned int src_pos = 511; const uint32_t src_val = elements[conflict_free_src_pos]; elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; elements[conflict_free_dst_pos] += src_val; @@ -253,19 +230,6 @@ __device__ void PrefixSum_1024(uint32_t* elements, unsigned int n) { } __syncthreads(); - /*for (int d = 1; d < n; d <<= 1) { - offset >>= 1; - if (threadIdx_x < d) { - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); - const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); - const uint32_t src_val = elements[conflict_free_src_pos]; - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; - elements[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - }*/ if (threadIdx_x == 0) { elements[CONFLICT_FREE_INDEX(n)] = elements[conflict_free_n_minus_1] + last_element; } @@ -391,9 +355,7 @@ __global__ void UpdateDataIndexToLeafIndexKernel(const data_size_t cuda_leaf_dat } } else if (bin > th) { cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; - }/* else { - cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; - }*/ + } } else { if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; @@ -408,8 +370,6 @@ __global__ void UpdateDataIndexToLeafIndexKernel(const data_size_t cuda_leaf_dat cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else { if (!MAX_TO_LEFT) { - /*cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; - } else {*/ cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } } @@ -435,135 +395,135 @@ void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel(const data_size_t const int num_blocks, const int block_size) { if (min_bin_ref < max_bin_ref) { if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && 
!missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } 
else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } } else { if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + 
UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - 
UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } } } @@ -615,52 +575,6 @@ __global__ void GenDataToLeftBitVectorKernel0(const int best_split_feature_ref, split_indices_block_size_data_partition, thread_to_left_offset_cnt); } -// min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernelPacked0(const int best_split_feature_ref, const 
data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION * 4]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; - const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || - (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - } else if ((bin < min_bin_ref || bin > max_bin_ref)) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; - } - } else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - } else { - cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; - } - } else { - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - } - __syncthreads(); - PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, - split_indices_block_size_data_partition, thread_to_left_offset_cnt); -} - // min_bin_ref == max_bin_ref template __global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, @@ -714,569 +628,13 @@ __global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, split_indices_block_size_data_partition, thread_to_left_offset_cnt); } -// min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel0_2(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition, - 
int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[(SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION << 1) + - ((SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION << 1) + 1) / NUM_BANKS_DATA_PARTITION]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; - uint8_t bit0 = 0; - uint8_t bit1 = 0; - uint8_t bit2 = 0; - uint8_t bit3 = 0; - uint8_t bit4 = 0; - uint8_t bit5 = 0; - uint8_t bit6 = 0; - uint8_t bit7 = 0; - unsigned int local_data_index = ((blockIdx.x * blockDim.x) << 3) + (threadIdx.x << 2); - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || - (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit0 = split_missing_default_to_left; - } else if ((bin < min_bin_ref || bin > max_bin_ref)) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit0 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit0 = split_default_to_left; - } - } else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - bit0 = 0; - } else { - cuda_data_to_left[local_data_index] = 1; - bit0 = 1; - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - ++local_data_index; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || - (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit1 = split_missing_default_to_left; - } else if ((bin < min_bin_ref || bin > max_bin_ref)) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit1 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit1 = split_default_to_left; - } - } else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - bit1 = 0; - } else { - cuda_data_to_left[local_data_index] = 1; - bit1 = 1; - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - ++local_data_index; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || - (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit2 = split_missing_default_to_left; - } else if ((bin < min_bin_ref || bin > max_bin_ref)) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit2 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit2 = split_default_to_left; - } - } else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - bit2 = 0; - 
} else { - cuda_data_to_left[local_data_index] = 1; - bit2 = 1; - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - ++local_data_index; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || - (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit3 = split_missing_default_to_left; - } else if ((bin < min_bin_ref || bin > max_bin_ref)) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit3 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit3 = split_default_to_left; - } - } else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - bit3 = 0; - } else { - cuda_data_to_left[local_data_index] = 1; - bit3 = 1; - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - local_data_index = ((blockIdx.x * blockDim.x) << 3) + ((threadIdx.x + blockDim.x) << 2); - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || - (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit4 = split_missing_default_to_left; - } else if ((bin < min_bin_ref || bin > max_bin_ref)) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit4 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit4 = split_default_to_left; - } - } else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - bit4 = 0; - } else { - cuda_data_to_left[local_data_index] = 1; - bit4 = 1; - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - ++local_data_index; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || - (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit5 = split_missing_default_to_left; - } else if ((bin < min_bin_ref || bin > max_bin_ref)) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit5 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit5 = split_default_to_left; - } - } else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - bit5 = 0; - } else { - cuda_data_to_left[local_data_index] = 1; - bit5 = 1; - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - ++local_data_index; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || - (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) 
{ - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit6 = split_missing_default_to_left; - } else if ((bin < min_bin_ref || bin > max_bin_ref)) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit6 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit6 = split_default_to_left; - } - } else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - bit6 = 0; - } else { - cuda_data_to_left[local_data_index] = 1; - bit6 = 1; - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - ++local_data_index; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || - (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit7 = split_missing_default_to_left; - } else if ((bin < min_bin_ref || bin > max_bin_ref)) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit7 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit7 = split_default_to_left; - } - } else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - bit7 = 0; - } else { - cuda_data_to_left[local_data_index] = 1; - bit7 = 1; - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = bit0 + bit1 + bit2 + bit3; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x + blockDim.x)] = bit4 + bit5 + bit6 + bit7; - __syncthreads(); - ReduceSum(thread_to_left_offset_cnt, (split_indices_block_size_data_partition << 1)); - __syncthreads(); - if (threadIdx.x == 0) { - const data_size_t num_data_in_block = (((blockIdx.x + 1) * blockDim.x * 8) <= num_data_in_leaf) ? 
- static_cast(blockDim.x * 8) : - (num_data_in_leaf - static_cast(blockIdx.x * blockDim.x * 8)); - if (num_data_in_block > 0) { - const data_size_t data_to_left = static_cast(thread_to_left_offset_cnt[0]); - block_to_left_offset_buffer[blockIdx.x + 1] = data_to_left; - block_to_right_offset_buffer[blockIdx.x + 1] = num_data_in_block - data_to_left; - } else { - block_to_left_offset_buffer[blockIdx.x + 1] = 0; - block_to_right_offset_buffer[blockIdx.x + 1] = 0; - } - } -} - -// min_bin_ref == max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel16_2(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { - if (blockIdx.x == 0 && threadIdx.x == 0) { - printf("********************************************** calling GenDataToLeftBitVectorKernel16_2 **********************************************\n"); - } - __shared__ uint16_t thread_to_left_offset_cnt[(SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION << 1) + 1 + - ((SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION << 1) + 1) / NUM_BANKS_DATA_PARTITION]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; - uint8_t bit0 = 0; - uint8_t bit1 = 0; - uint8_t bit2 = 0; - uint8_t bit3 = 0; - uint8_t bit4 = 0; - uint8_t bit5 = 0; - uint8_t bit6 = 0; - uint8_t bit7 = 0; - unsigned int local_data_index = ((blockIdx.x * blockDim.x) << 3) + (threadIdx.x << 2); - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit0 = split_missing_default_to_left; - } else if (bin != max_bin_ref) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit0 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit0 = split_default_to_left; - } - } else { - if (MISSING_IS_NA && !MFB_IS_NA) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit0 = split_missing_default_to_left; - } else { - if (MAX_TO_LEFT) { - cuda_data_to_left[local_data_index] = 1; - bit0 = 1; - } else { - cuda_data_to_left[local_data_index] = 0; - bit0 = 0; - } - } - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - ++local_data_index; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit1 = split_missing_default_to_left; - } 
else if (bin != max_bin_ref) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit1 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit1 = split_default_to_left; - } - } else { - if (MISSING_IS_NA && !MFB_IS_NA) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit1 = split_missing_default_to_left; - } else { - if (MAX_TO_LEFT) { - cuda_data_to_left[local_data_index] = 1; - bit1 = 1; - } else { - cuda_data_to_left[local_data_index] = 0; - bit1 = 0; - } - } - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - ++local_data_index; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit2 = split_missing_default_to_left; - } else if (bin != max_bin_ref) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit2 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit2 = split_default_to_left; - } - } else { - if (MISSING_IS_NA && !MFB_IS_NA) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit2 = split_missing_default_to_left; - } else { - if (MAX_TO_LEFT) { - cuda_data_to_left[local_data_index] = 1; - bit2 = 1; - } else { - cuda_data_to_left[local_data_index] = 0; - bit2 = 0; - } - } - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - ++local_data_index; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit3 = split_missing_default_to_left; - } else if (bin != max_bin_ref) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit3 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit3 = split_default_to_left; - } - } else { - if (MISSING_IS_NA && !MFB_IS_NA) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit3 = split_missing_default_to_left; - } else { - if (MAX_TO_LEFT) { - cuda_data_to_left[local_data_index] = 1; - bit3 = 1; - } else { - cuda_data_to_left[local_data_index] = 0; - bit3 = 0; - } - } - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - local_data_index = ((blockIdx.x * blockDim.x) << 3) + ((threadIdx.x + blockDim.x) << 2); - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit4 = split_missing_default_to_left; - } else if (bin != max_bin_ref) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit4 
= split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit4 = split_default_to_left; - } - } else { - if (MISSING_IS_NA && !MFB_IS_NA) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit4 = split_missing_default_to_left; - } else { - if (MAX_TO_LEFT) { - cuda_data_to_left[local_data_index] = 1; - bit4 = 1; - } else { - cuda_data_to_left[local_data_index] = 0; - bit4 = 0; - } - } - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - ++local_data_index; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit5 = split_missing_default_to_left; - } else if (bin != max_bin_ref) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit5 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit5 = split_default_to_left; - } - } else { - if (MISSING_IS_NA && !MFB_IS_NA) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit5 = split_missing_default_to_left; - } else { - if (MAX_TO_LEFT) { - cuda_data_to_left[local_data_index] = 1; - bit5 = 1; - } else { - cuda_data_to_left[local_data_index] = 0; - bit5 = 0; - } - } - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - ++local_data_index; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit6 = split_missing_default_to_left; - } else if (bin != max_bin_ref) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit6 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit6 = split_default_to_left; - } - } else { - if (MISSING_IS_NA && !MFB_IS_NA) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit6 = split_missing_default_to_left; - } else { - if (MAX_TO_LEFT) { - cuda_data_to_left[local_data_index] = 1; - bit6 = 1; - } else { - cuda_data_to_left[local_data_index] = 0; - bit6 = 0; - } - } - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - ++local_data_index; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit7 = split_missing_default_to_left; - } else if (bin != max_bin_ref) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - bit7 = split_missing_default_to_left; - } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - bit7 = split_default_to_left; - } - } else { - if (MISSING_IS_NA && !MFB_IS_NA) { - cuda_data_to_left[local_data_index] = 
split_missing_default_to_left; - bit7 = split_missing_default_to_left; - } else { - if (MAX_TO_LEFT) { - cuda_data_to_left[local_data_index] = 1; - bit7 = 1; - } else { - cuda_data_to_left[local_data_index] = 0; - bit7 = 0; - } - } - } - } else { - cuda_data_to_left[local_data_index] = 0; - } - __syncthreads(); - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = bit0 + bit1 + bit2 + bit3; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x + blockDim.x)] = bit4 + bit5 + bit6 + bit7; - __syncthreads(); - ReduceSum(thread_to_left_offset_cnt, (split_indices_block_size_data_partition << 1)); - __syncthreads(); - if (threadIdx.x == 0) { - const data_size_t num_data_in_block = (((blockIdx.x + 1) * blockDim.x * 8) <= num_data_in_leaf) ? - static_cast(blockDim.x * 8) : - (num_data_in_leaf - static_cast(blockIdx.x * blockDim.x * 8)); - if (num_data_in_block > 0) { - const data_size_t data_to_left = static_cast(thread_to_left_offset_cnt[0]); - block_to_left_offset_buffer[blockIdx.x + 1] = data_to_left; - block_to_right_offset_buffer[blockIdx.x + 1] = num_data_in_block - data_to_left; - } else { - block_to_left_offset_buffer[blockIdx.x + 1] = 0; - block_to_right_offset_buffer[blockIdx.x + 1] = 0; - } - } -} - -#define GenBitVector_ARGS \ - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, \ - th, num_features_, \ - column_data, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, \ - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, \ - split_indices_block_size_data_partition_aligned, \ - cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index +#define GenBitVector_ARGS \ + split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, \ + th, num_features_, \ + column_data, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, \ + split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, \ + split_indices_block_size_data_partition_aligned, \ + cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index template void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( @@ -1302,235 +660,108 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( const int right_leaf_index, const int default_leaf_index, const int missing_default_leaf_index) { + const void* column_data_pointer = cuda_column_data_->GetColumnData(column_index); if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if 
(!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = 
reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); 
GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } } template -void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsMinInner2( - const bool missing_is_zero, - const bool missing_is_na, - const bool mfb_is_zero, - const bool mfb_is_na, - const bool max_bin_to_left, - const int column_index, - const int num_blocks_final, - const int split_indices_block_size_data_partition_aligned, - const int split_feature_index, - const data_size_t leaf_data_start, - const data_size_t num_data_in_leaf, - const uint32_t th, - const uint32_t t_zero_bin, - const uint32_t most_freq_bin, - const uint32_t max_bin, - const uint32_t min_bin, - const uint8_t split_default_to_left, - const uint8_t split_missing_default_to_left, - const int left_leaf_index, - const int right_leaf_index, - const int default_leaf_index, - const int missing_default_leaf_index) { - int grid_dim = 0; - int block_dim = 0; - CalcBlockDim(num_data_in_leaf, &grid_dim, &block_dim); - CHECK_EQ(num_blocks_final, grid_dim); - CHECK_EQ(split_indices_block_size_data_partition_aligned, block_dim); - if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = 
reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero 
&& !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); 
- GenDataToLeftBitVectorKernel16_2<<>>(GenBitVector_ARGS); - } -} - -template -void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( const bool missing_is_zero, const bool missing_is_na, const bool mfb_is_zero, @@ -1552,482 +783,129 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( const int right_leaf_index, const int default_leaf_index, const int missing_default_leaf_index) { + const void* column_data_pointer = cuda_column_data_->GetColumnData(column_index); if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - 
const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const 
BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } -} - -template -void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner2( - const bool missing_is_zero, - const bool missing_is_na, - const bool mfb_is_zero, - const bool mfb_is_na, - const int column_index, - const int num_blocks_final, - const int split_indices_block_size_data_partition_aligned, - const int split_feature_index, - const data_size_t leaf_data_start, - const data_size_t num_data_in_leaf, - const uint32_t th, - const uint32_t t_zero_bin, - const uint32_t most_freq_bin, - const uint32_t max_bin, - const uint32_t min_bin, - const uint8_t split_default_to_left, - const uint8_t split_missing_default_to_left, - const int left_leaf_index, - const int right_leaf_index, - const int default_leaf_index, - const int missing_default_leaf_index) { - int grid_dim = 0; - int block_dim = 0; - CalcBlockDim(num_data_in_leaf, &grid_dim, &block_dim); - CHECK_EQ(num_blocks_final, grid_dim); - CHECK_EQ(split_indices_block_size_data_partition_aligned, block_dim); - if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - 
GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = 
reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - GenDataToLeftBitVectorKernel0_2<<>>(GenBitVector_ARGS); - } -} - -#undef GenBitVector_ARGS - -void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, - const int split_feature_index, const uint32_t split_threshold, - const uint8_t split_default_left, const data_size_t leaf_data_start, - const int left_leaf_index, const int right_leaf_index) { - 
-  const int min_num_blocks = num_data_in_leaf <= 100 ? 1 : 80;
-  const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION);
-  int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1;
-  int split_indices_block_size_data_partition_aligned = 1;
-  while (split_indices_block_size_data_partition > 0) {
-    split_indices_block_size_data_partition_aligned <<= 1;
-    split_indices_block_size_data_partition >>= 1;
-  }
-  const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned;
-  const uint8_t missing_is_zero = feature_missing_is_zero_[split_feature_index];
-  const uint8_t missing_is_na = feature_missing_is_na_[split_feature_index];
-  const uint8_t mfb_is_zero = feature_mfb_is_zero_[split_feature_index];
-  const uint8_t mfb_is_na = feature_mfb_is_na_[split_feature_index];
-  const uint32_t default_bin = feature_default_bins_[split_feature_index];
-  const uint32_t most_freq_bin = feature_most_freq_bins_[split_feature_index];
-  const uint32_t min_bin = feature_min_bins_[split_feature_index];
-  const uint32_t max_bin = feature_max_bins_[split_feature_index];
-
-  uint32_t th = split_threshold + min_bin;
-  uint32_t t_zero_bin = min_bin + default_bin;
-  if (most_freq_bin == 0) {
-    --th;
-    --t_zero_bin;
-  }
-  uint8_t split_default_to_left = 0;
-  uint8_t split_missing_default_to_left = 0;
-  int default_leaf_index = right_leaf_index;
-  int missing_default_leaf_index = right_leaf_index;
-  if (most_freq_bin <= split_threshold) {
-    split_default_to_left = 1;
-    default_leaf_index = left_leaf_index;
-  }
-  if (missing_is_zero || missing_is_na) {
-    if (split_default_left) {
-      split_missing_default_to_left = 1;
-      missing_default_leaf_index = left_leaf_index;
-    }
-  }
-  const int column_index = feature_index_to_column_index_[split_feature_index];
-  const uint8_t bit_type = column_bit_type_[column_index];
-
-  const bool max_bin_to_left = (max_bin <= th);
-
-  if (min_bin < max_bin) {
-    if (bit_type == 8) {
-      LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner(
-        missing_is_zero,
-        missing_is_na,
-        mfb_is_zero,
-        mfb_is_na,
-        column_index,
-        num_blocks_final,
-        split_indices_block_size_data_partition_aligned,
-        split_feature_index,
-        leaf_data_start,
-        num_data_in_leaf,
-        th,
-        t_zero_bin,
-        most_freq_bin,
-        max_bin,
-        min_bin,
-        split_default_to_left,
-        split_missing_default_to_left,
-        left_leaf_index,
-        right_leaf_index,
-        default_leaf_index,
-        missing_default_leaf_index);
-    } else if (bit_type == 16) {
-      LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner(
-        missing_is_zero,
-        missing_is_na,
-        mfb_is_zero,
-        mfb_is_na,
-        column_index,
-        num_blocks_final,
-        split_indices_block_size_data_partition_aligned,
-        split_feature_index,
-        leaf_data_start,
-        num_data_in_leaf,
-        th,
-        t_zero_bin,
-        most_freq_bin,
-        max_bin,
-        min_bin,
-        split_default_to_left,
-        split_missing_default_to_left,
-        left_leaf_index,
-        right_leaf_index,
-        default_leaf_index,
-        missing_default_leaf_index);
-    } else if (bit_type == 32) {
-      LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner(
-        missing_is_zero,
-        missing_is_na,
-        mfb_is_zero,
-        mfb_is_na,
-        column_index,
-        num_blocks_final,
-        split_indices_block_size_data_partition_aligned,
-        split_feature_index,
-        leaf_data_start,
-        num_data_in_leaf,
-        th,
-        t_zero_bin,
-        most_freq_bin,
-        max_bin,
-        min_bin,
-        split_default_to_left,
-        split_missing_default_to_left,
-        left_leaf_index,
- right_leaf_index, - default_leaf_index, - missing_default_leaf_index); - } - } else { - if (bit_type == 8) { - LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( - missing_is_zero, - missing_is_na, - mfb_is_zero, - mfb_is_na, - max_bin_to_left, - column_index, - num_blocks_final, - split_indices_block_size_data_partition_aligned, - split_feature_index, - leaf_data_start, - num_data_in_leaf, - th, - t_zero_bin, - most_freq_bin, - max_bin, - min_bin, - split_default_to_left, - split_missing_default_to_left, - left_leaf_index, - right_leaf_index, - default_leaf_index, - missing_default_leaf_index); - } else if (bit_type == 16) { - LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( - missing_is_zero, - missing_is_na, - mfb_is_zero, - mfb_is_na, - max_bin_to_left, - column_index, - num_blocks_final, - split_indices_block_size_data_partition_aligned, - split_feature_index, - leaf_data_start, - num_data_in_leaf, - th, - t_zero_bin, - most_freq_bin, - max_bin, - min_bin, - split_default_to_left, - split_missing_default_to_left, - left_leaf_index, - right_leaf_index, - default_leaf_index, - missing_default_leaf_index); - } else if (bit_type == 32) { - LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( - missing_is_zero, - missing_is_na, - mfb_is_zero, - mfb_is_na, - max_bin_to_left, - column_index, - num_blocks_final, - split_indices_block_size_data_partition_aligned, - split_feature_index, - leaf_data_start, - num_data_in_leaf, - th, - t_zero_bin, - most_freq_bin, - max_bin, - min_bin, - split_default_to_left, - split_missing_default_to_left, - left_leaf_index, - right_leaf_index, - default_leaf_index, - missing_default_leaf_index); - } - } - - if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, - cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, - left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index, - static_cast(missing_is_zero), - static_cast(missing_is_na), - static_cast(mfb_is_zero), - static_cast(mfb_is_na), - max_bin_to_left, - num_blocks_final, - split_indices_block_size_data_partition_aligned); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, - cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, - left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index, - static_cast(missing_is_zero), - static_cast(missing_is_na), - static_cast(mfb_is_zero), - static_cast(mfb_is_na), - max_bin_to_left, - num_blocks_final, - split_indices_block_size_data_partition_aligned); - } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); - LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, - cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, - left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index, - static_cast(missing_is_zero), - static_cast(missing_is_na), - static_cast(mfb_is_zero), - static_cast(mfb_is_na), - max_bin_to_left, - num_blocks_final, - split_indices_block_size_data_partition_aligned); + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if 
(missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { + const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } } -void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t num_data_in_leaf, +#undef GenBitVector_ARGS + +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, const uint8_t split_default_left, const data_size_t leaf_data_start, const int left_leaf_index, const int right_leaf_index) { - int grid_dim = 0; - int block_dim = 0; - CalcBlockDim(num_data_in_leaf, &grid_dim, &block_dim); - const uint8_t missing_is_zero = feature_missing_is_zero_[split_feature_index]; - const uint8_t missing_is_na = feature_missing_is_na_[split_feature_index]; - const uint8_t mfb_is_zero = feature_mfb_is_zero_[split_feature_index]; - const uint8_t mfb_is_na = feature_mfb_is_na_[split_feature_index]; - const uint32_t default_bin = feature_default_bins_[split_feature_index]; - const uint32_t most_freq_bin = feature_most_freq_bins_[split_feature_index]; - const uint32_t min_bin = feature_min_bins_[split_feature_index]; - const uint32_t max_bin = feature_max_bins_[split_feature_index]; + const int min_num_blocks = num_data_in_leaf <= 100 ? 
1 : 80; + const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); + int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; + int split_indices_block_size_data_partition_aligned = 1; + while (split_indices_block_size_data_partition > 0) { + split_indices_block_size_data_partition_aligned <<= 1; + split_indices_block_size_data_partition >>= 1; + } + const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; + const uint8_t missing_is_zero = cuda_column_data_->feature_missing_is_zero(split_feature_index); + const uint8_t missing_is_na = cuda_column_data_->feature_missing_is_na(split_feature_index); + const uint8_t mfb_is_zero = cuda_column_data_->feature_mfb_is_zero(split_feature_index); + const uint8_t mfb_is_na = cuda_column_data_->feature_mfb_is_na(split_feature_index); + const uint32_t default_bin = cuda_column_data_->feature_default_bin(split_feature_index); + const uint32_t most_freq_bin = cuda_column_data_->feature_most_freq_bin(split_feature_index); + const uint32_t min_bin = cuda_column_data_->feature_min_bin(split_feature_index); + const uint32_t max_bin = cuda_column_data_->feature_max_bin(split_feature_index); uint32_t th = split_threshold + min_bin; uint32_t t_zero_bin = min_bin + default_bin; @@ -2049,21 +927,21 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t nu missing_default_leaf_index = left_leaf_index; } } - const int column_index = feature_index_to_column_index_[split_feature_index]; - const uint8_t bit_type = column_bit_type_[column_index]; + const int column_index = cuda_column_data_->feature_to_column(split_feature_index); + const uint8_t bit_type = cuda_column_data_->column_bit_type(column_index); const bool max_bin_to_left = (max_bin <= th); if (min_bin < max_bin) { if (bit_type == 8) { - LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner2( + LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( missing_is_zero, missing_is_na, mfb_is_zero, mfb_is_na, column_index, - grid_dim, - block_dim, + num_blocks_final, + split_indices_block_size_data_partition_aligned, split_feature_index, leaf_data_start, num_data_in_leaf, @@ -2079,14 +957,14 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t nu default_leaf_index, missing_default_leaf_index); } else if (bit_type == 16) { - LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner2( + LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( missing_is_zero, missing_is_na, mfb_is_zero, mfb_is_na, column_index, - grid_dim, - block_dim, + num_blocks_final, + split_indices_block_size_data_partition_aligned, split_feature_index, leaf_data_start, num_data_in_leaf, @@ -2102,14 +980,14 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t nu default_leaf_index, missing_default_leaf_index); } else if (bit_type == 32) { - LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner2( + LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( missing_is_zero, missing_is_na, mfb_is_zero, mfb_is_na, column_index, - grid_dim, - block_dim, + num_blocks_final, + split_indices_block_size_data_partition_aligned, split_feature_index, leaf_data_start, num_data_in_leaf, @@ -2127,15 +1005,15 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t nu } } else { if (bit_type == 8) { - 
LaunchGenDataToLeftBitVectorKernelMaxIsMinInner2( + LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( missing_is_zero, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left, column_index, - grid_dim, - block_dim, + num_blocks_final, + split_indices_block_size_data_partition_aligned, split_feature_index, leaf_data_start, num_data_in_leaf, @@ -2151,15 +1029,15 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t nu default_leaf_index, missing_default_leaf_index); } else if (bit_type == 16) { - LaunchGenDataToLeftBitVectorKernelMaxIsMinInner2( + LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( missing_is_zero, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left, column_index, - grid_dim, - block_dim, + num_blocks_final, + split_indices_block_size_data_partition_aligned, split_feature_index, leaf_data_start, num_data_in_leaf, @@ -2175,15 +1053,15 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t nu default_leaf_index, missing_default_leaf_index); } else if (bit_type == 32) { - LaunchGenDataToLeftBitVectorKernelMaxIsMinInner2( + LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( missing_is_zero, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left, column_index, - grid_dim, - block_dim, + num_blocks_final, + split_indices_block_size_data_partition_aligned, split_feature_index, leaf_data_start, num_data_in_leaf, @@ -2201,11 +1079,9 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t nu } } - int grid_dim_copy = 0; - int block_dim_copy = 0; - CalcBlockDimInCopy(num_data_in_leaf, &grid_dim_copy, &block_dim_copy); + const void* column_data_pointer = cuda_column_data_->GetColumnData(column_index); if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const uint8_t* column_data = reinterpret_cast(column_data_pointer); LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index, @@ -2214,10 +1090,10 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t nu static_cast(mfb_is_zero), static_cast(mfb_is_na), max_bin_to_left, - grid_dim_copy, - block_dim_copy); + num_blocks_final, + split_indices_block_size_data_partition_aligned); } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const uint16_t* column_data = reinterpret_cast(column_data_pointer); LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index, @@ -2226,10 +1102,10 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t nu static_cast(mfb_is_zero), static_cast(mfb_is_na), max_bin_to_left, - grid_dim_copy, - block_dim_copy); + num_blocks_final, + split_indices_block_size_data_partition_aligned); } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(cuda_data_by_column_[column_index]); + const uint32_t* column_data = reinterpret_cast(column_data_pointer); LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index, 
missing_default_leaf_index, @@ -2238,129 +1114,12 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel2(const data_size_t nu static_cast(mfb_is_zero), static_cast(mfb_is_na), max_bin_to_left, - grid_dim_copy, - block_dim_copy); - } -} - -__global__ void AggregateBlockOffsetKernel(const int* leaf_index, data_size_t* block_to_left_offset_buffer, - data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, - data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, - int* cuda_cur_num_leaves, - const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left, const double* best_split_gain, - const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, - const double* best_left_gain, const double* best_left_leaf_value, - const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, - // for leaf splits information update - int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, - double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, - double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, - const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** smaller_leaf_cuda_hist_pointer_pointer, - int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, - double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, - double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, - const int* cuda_num_total_bin, - hist_t* cuda_hist, hist_t** cuda_hist_pool, const int split_indices_block_size_data_partition) { - __shared__ uint32_t block_to_left_offset[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; - __shared__ uint32_t block_to_right_offset[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; - const int leaf_index_ref = *leaf_index; - const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref]; - const unsigned int blockDim_x = blockDim.x; - const unsigned int threadIdx_x = threadIdx.x; - const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); - const unsigned int conflict_free_threadIdx_x_plus_blockDim_x = CONFLICT_FREE_INDEX(threadIdx_x + blockDim_x); - const uint32_t num_blocks = (num_data_in_leaf + split_indices_block_size_data_partition - 1) / split_indices_block_size_data_partition; - const uint32_t num_aggregate_blocks = (num_blocks + split_indices_block_size_data_partition - 1) / split_indices_block_size_data_partition; - uint32_t left_prev_sum = 0; - for (uint32_t block_id = 0; block_id < num_aggregate_blocks; ++block_id) { - const unsigned int read_index = block_id * blockDim_x * 2 + threadIdx_x; - if (read_index < num_blocks) { - block_to_left_offset[conflict_free_threadIdx_x] = block_to_left_offset_buffer[read_index + 1]; - } else { - block_to_left_offset[conflict_free_threadIdx_x] = 0; - } - const unsigned int read_index_plus_blockDim_x = 
read_index + blockDim_x; - if (read_index_plus_blockDim_x < num_blocks) { - block_to_left_offset[conflict_free_threadIdx_x_plus_blockDim_x] = block_to_left_offset_buffer[read_index_plus_blockDim_x + 1]; - } else { - block_to_left_offset[conflict_free_threadIdx_x_plus_blockDim_x] = 0; - } - if (threadIdx_x == 0) { - block_to_left_offset[0] += left_prev_sum; - } - __syncthreads(); - PrefixSum(block_to_left_offset, split_indices_block_size_data_partition); - __syncthreads(); - if (threadIdx_x == 0) { - left_prev_sum = block_to_left_offset[CONFLICT_FREE_INDEX(split_indices_block_size_data_partition)]; - } - if (read_index < num_blocks) { - const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); - block_to_left_offset_buffer[read_index + 1] = block_to_left_offset[conflict_free_threadIdx_x_plus_1]; - } - if (read_index_plus_blockDim_x < num_blocks) { - const unsigned int conflict_free_threadIdx_x_plus_1_plus_blockDim_x = CONFLICT_FREE_INDEX(threadIdx_x + 1 + blockDim_x); - block_to_left_offset_buffer[read_index_plus_blockDim_x + 1] = block_to_left_offset[conflict_free_threadIdx_x_plus_1_plus_blockDim_x]; - } - __syncthreads(); - } - const unsigned int to_left_total_cnt = block_to_left_offset_buffer[num_blocks]; - uint32_t right_prev_sum = to_left_total_cnt; - for (uint32_t block_id = 0; block_id < num_aggregate_blocks; ++block_id) { - const unsigned int read_index = block_id * blockDim_x * 2 + threadIdx_x; - if (read_index < num_blocks) { - block_to_right_offset[conflict_free_threadIdx_x] = block_to_right_offset_buffer[read_index + 1]; - } else { - block_to_right_offset[conflict_free_threadIdx_x] = 0; - } - const unsigned int read_index_plus_blockDim_x = read_index + blockDim_x; - if (read_index_plus_blockDim_x < num_blocks) { - block_to_right_offset[conflict_free_threadIdx_x_plus_blockDim_x] = block_to_right_offset_buffer[read_index_plus_blockDim_x + 1]; - } else { - block_to_right_offset[conflict_free_threadIdx_x_plus_blockDim_x] = 0; - } - if (threadIdx_x == 0) { - block_to_right_offset[0] += right_prev_sum; - } - __syncthreads(); - PrefixSum(block_to_right_offset, split_indices_block_size_data_partition); - __syncthreads(); - if (threadIdx_x == 0) { - right_prev_sum = block_to_right_offset[CONFLICT_FREE_INDEX(split_indices_block_size_data_partition)]; - } - if (read_index < num_blocks) { - const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); - block_to_right_offset_buffer[read_index + 1] = block_to_right_offset[conflict_free_threadIdx_x_plus_1]; - } - if (read_index_plus_blockDim_x < num_blocks) { - const unsigned int conflict_free_threadIdx_x_plus_1_plus_blockDim_x = CONFLICT_FREE_INDEX(threadIdx_x + 1 + blockDim_x); - block_to_right_offset_buffer[read_index_plus_blockDim_x + 1] = block_to_right_offset[conflict_free_threadIdx_x_plus_1_plus_blockDim_x]; - } - __syncthreads(); - } - if (blockIdx.x == 0 && threadIdx.x == 0) { - ++(*cuda_cur_num_leaves); - const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; - block_to_left_offset_buffer[0] = 0; - const unsigned int to_left_total_cnt = block_to_left_offset_buffer[num_blocks]; - block_to_right_offset_buffer[0] = to_left_total_cnt; - const data_size_t old_leaf_data_end = cuda_leaf_data_end[leaf_index_ref]; - cuda_leaf_data_end[leaf_index_ref] = cuda_leaf_data_start[leaf_index_ref] + static_cast(to_left_total_cnt); - cuda_leaf_num_data[leaf_index_ref] = static_cast(to_left_total_cnt); - cuda_leaf_data_start[cur_max_leaf_index] = cuda_leaf_data_end[leaf_index_ref]; - 
cuda_leaf_data_end[cur_max_leaf_index] = old_leaf_data_end; - cuda_leaf_num_data[cur_max_leaf_index] = block_to_right_offset_buffer[num_blocks] - to_left_total_cnt; + num_blocks_final, + split_indices_block_size_data_partition_aligned); } } -__global__ void AggregateBlockOffsetKernel2(const int* leaf_index, data_size_t* block_to_left_offset_buffer, +__global__ void AggregateBlockOffsetKernel0(const int* leaf_index, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, int* cuda_cur_num_leaves, @@ -2381,13 +1140,12 @@ __global__ void AggregateBlockOffsetKernel2(const int* leaf_index, data_size_t* double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, hist_t** larger_leaf_cuda_hist_pointer_pointer, - const int* cuda_num_total_bin, hist_t* cuda_hist, hist_t** cuda_hist_pool, const data_size_t num_blocks) { - __shared__ uint32_t block_to_left_offset[AGGREGATE_BLOCK_SIZE + 2 + - (AGGREGATE_BLOCK_SIZE + 2) / NUM_BANKS_DATA_PARTITION]; - __shared__ uint32_t block_to_right_offset[AGGREGATE_BLOCK_SIZE + 2 + - (AGGREGATE_BLOCK_SIZE + 2) / NUM_BANKS_DATA_PARTITION]; + __shared__ uint32_t block_to_left_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + + (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; + __shared__ uint32_t block_to_right_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + + (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; const int leaf_index_ref = *leaf_index; const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref]; const unsigned int blockDim_x = blockDim.x; @@ -2446,7 +1204,7 @@ __global__ void AggregateBlockOffsetKernel2(const int* leaf_index, data_size_t* } } -__global__ void AggregateBlockOffsetKernel3(const int* leaf_index, data_size_t* block_to_left_offset_buffer, +__global__ void AggregateBlockOffsetKernel1(const int* leaf_index, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, int* cuda_cur_num_leaves, @@ -2467,13 +1225,12 @@ __global__ void AggregateBlockOffsetKernel3(const int* leaf_index, data_size_t* double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, hist_t** larger_leaf_cuda_hist_pointer_pointer, - const int* cuda_num_total_bin, hist_t* cuda_hist, hist_t** cuda_hist_pool, const data_size_t num_blocks, const data_size_t num_blocks_aligned) { - __shared__ uint32_t block_to_left_offset[AGGREGATE_BLOCK_SIZE + 2 + - (AGGREGATE_BLOCK_SIZE + 2) / NUM_BANKS_DATA_PARTITION]; - __shared__ uint32_t block_to_right_offset[AGGREGATE_BLOCK_SIZE + 2 + - (AGGREGATE_BLOCK_SIZE + 2) / NUM_BANKS_DATA_PARTITION]; + __shared__ uint32_t block_to_left_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + + (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; + __shared__ uint32_t block_to_right_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + + (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; const int leaf_index_ref = *leaf_index; const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref]; const unsigned int threadIdx_x = threadIdx.x; @@ 
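
The shared-memory buffers in the aggregation kernels above are over-allocated by (size + 2) / NUM_BANKS_DATA_PARTITION extra entries and addressed through CONFLICT_FREE_INDEX. That is the usual padding scheme for avoiding shared-memory bank conflicts in a tree-structured prefix sum. A minimal sketch of the idea follows; the macro body written here is the conventional definition and is an assumption of this note, not quoted from the patch.

// Assumed values match the defines in cuda_data_partition.hpp (32 banks).
#define NUM_BANKS_DATA_PARTITION (32)
#define LOG_NUM_BANKS_DATA_PARTITION (5)
// Insert one padding slot per NUM_BANKS elements so that strided accesses in
// the up-sweep/down-sweep phases of a scan hit distinct banks (assumed definition).
#define CONFLICT_FREE_INDEX(n) ((n) + ((n) >> LOG_NUM_BANKS_DATA_PARTITION))

// Toy kernel showing only the padded-load pattern; assumes blockDim.x <= 1024
// and n <= 1024.
__global__ void ConflictFreeLoadSketch(const unsigned int* in, unsigned int* out,
                                       int n) {
  // 1024 logical slots plus 1024 / 32 = 32 padding slots.
  __shared__ unsigned int buf[1024 + 1024 / NUM_BANKS_DATA_PARTITION];
  const int i = static_cast<int>(threadIdx.x);
  buf[CONFLICT_FREE_INDEX(i)] = (i < n) ? in[i] : 0u;
  __syncthreads();
  // A work-efficient scan would read and write buf[CONFLICT_FREE_INDEX(...)] here.
  if (i < n) {
    out[i] = buf[CONFLICT_FREE_INDEX(i)];
  }
}
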
-2532,19 +1289,18 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, hist_t** larger_leaf_cuda_hist_pointer_pointer, - const int* cuda_num_total_bin, + const int num_total_bin, hist_t* cuda_hist, hist_t** cuda_hist_pool, const int split_indices_block_size_data_partition, const double* cuda_bin_upper_bounds, const int* cuda_feature_num_bin_offsets, int* tree_split_leaf_index, int* tree_inner_feature_index, uint32_t* tree_threshold, double* tree_threshold_real, double* tree_left_output, double* tree_right_output, data_size_t* tree_left_count, data_size_t* tree_right_count, double* tree_left_sum_hessian, double* tree_right_sum_hessian, double* tree_gain, uint8_t* tree_default_left, - double* data_partition_leaf_output, + double* cuda_leaf_output, int* cuda_split_info_buffer) { const int leaf_index_ref = *leaf_index; const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; const unsigned int to_left_total_cnt = cuda_leaf_num_data[leaf_index_ref]; - const int cuda_num_total_bin_ref = *cuda_num_total_bin; double* cuda_split_info_buffer_for_hessians = reinterpret_cast(cuda_split_info_buffer + 8); const unsigned int global_thread_index = blockIdx.x * blockDim.x + threadIdx.x; if (global_thread_index == 0) { @@ -2570,9 +1326,9 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo } else if (global_thread_index == 10) { tree_default_left[cur_max_leaf_index - 1] = best_split_default_left[leaf_index_ref]; } else if (global_thread_index == 11) { - data_partition_leaf_output[leaf_index_ref] = best_left_leaf_value[leaf_index_ref]; + cuda_leaf_output[leaf_index_ref] = best_left_leaf_value[leaf_index_ref]; } else if (global_thread_index == 12) { - data_partition_leaf_output[cur_max_leaf_index] = best_right_leaf_value[leaf_index_ref]; + cuda_leaf_output[cur_max_leaf_index] = best_right_leaf_value[leaf_index_ref]; } else if (global_thread_index == 13) { cuda_split_info_buffer[0] = leaf_index_ref; } else if (global_thread_index == 14) { @@ -2604,7 +1360,7 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo if (global_thread_index == 0) { hist_t* parent_hist_ptr = cuda_hist_pool[leaf_index_ref]; cuda_hist_pool[cur_max_leaf_index] = parent_hist_ptr; - cuda_hist_pool[leaf_index_ref] = cuda_hist + 2 * cur_max_leaf_index * cuda_num_total_bin_ref; + cuda_hist_pool[leaf_index_ref] = cuda_hist + 2 * cur_max_leaf_index * num_total_bin; *smaller_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[leaf_index_ref]; *larger_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[cur_max_leaf_index]; } else if (global_thread_index == 1) { @@ -2670,7 +1426,7 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo } else if (global_thread_index == 13) { *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_num_data[leaf_index_ref]; } else if (global_thread_index == 14) { - cuda_hist_pool[cur_max_leaf_index] = cuda_hist + 2 * cur_max_leaf_index * cuda_num_total_bin_ref; + cuda_hist_pool[cur_max_leaf_index] = cuda_hist + 2 * cur_max_leaf_index * num_total_bin; *smaller_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[cur_max_leaf_index]; } else if (global_thread_index == 15) { *larger_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[leaf_index_ref]; @@ -2919,8 +1675,8 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* 
leaf_index, const data } global_timer.Start("CUDADataPartition::AggregateBlockOffsetKernel"); - if (num_blocks_final > AGGREGATE_BLOCK_SIZE) { - AggregateBlockOffsetKernel2<<<1, AGGREGATE_BLOCK_SIZE, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, + if (num_blocks_final > AGGREGATE_BLOCK_SIZE_DATA_PARTITION) { + AggregateBlockOffsetKernel0<<<1, AGGREGATE_BLOCK_SIZE_DATA_PARTITION, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, cuda_cur_num_leaves_, @@ -2940,12 +1696,11 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, larger_leaf_cuda_hist_pointer_pointer, - cuda_num_total_bin_, cuda_hist_, cuda_hist_pool_, num_blocks_final); } else { - AggregateBlockOffsetKernel3<<<1, num_blocks_final_aligned, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, + AggregateBlockOffsetKernel1<<<1, num_blocks_final_aligned, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, cuda_cur_num_leaves_, @@ -2965,12 +1720,11 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, larger_leaf_cuda_hist_pointer_pointer, - cuda_num_total_bin_, cuda_hist_, cuda_hist_pool_, num_blocks_final, num_blocks_final_aligned); } - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); global_timer.Start("CUDADataPartition::SplitInnerKernel"); @@ -2978,7 +1732,7 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_out_data_indices_in_leaf_, split_indices_block_size_data_partition_aligned); - //SynchronizeCUDADevice(); + //SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDADataPartition::SplitInnerKernel"); global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); @@ -3002,7 +1756,7 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, larger_leaf_cuda_hist_pointer_pointer, - cuda_num_total_bin_, + num_total_bin_, cuda_hist_, cuda_hist_pool_, split_indices_block_size_data_partition_aligned, @@ -3011,15 +1765,14 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, tree_threshold_real_, tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, - data_partition_leaf_output_, cuda_split_info_buffer_); - //SynchronizeCUDADevice(); + cuda_leaf_output_, cuda_split_info_buffer_); global_timer.Stop("CUDADataPartition::SplitTreeStructureKernel"); std::vector cpu_split_info_buffer(12); const double* cpu_sum_hessians_info = 
reinterpret_cast(cpu_split_info_buffer.data() + 8); global_timer.Start("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); CopyFromCUDADeviceToHostAsync(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 12, cuda_streams_[0]); global_timer.Stop("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); const data_size_t left_leaf_num_data = cpu_split_info_buffer[1]; const data_size_t left_leaf_data_start = cpu_split_info_buffer[2]; const data_size_t right_leaf_num_data = cpu_split_info_buffer[4]; @@ -3040,162 +1793,6 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data *larger_leaf_index = cpu_split_info_buffer[7]; } -void CUDADataPartition::LaunchSplitInnerKernel2(const int* leaf_index, const data_size_t num_data_in_leaf, - const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left, const double* best_split_gain, - const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, - const double* best_left_gain, const double* best_left_leaf_value, - const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, uint8_t* best_split_found, - // for leaf splits information update - int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, - double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, - double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, - const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** smaller_leaf_cuda_hist_pointer_pointer, - int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, - double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, - double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, - std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, - std::vector* cpu_leaf_sum_hessians, - int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index) { - int block_dim = 0; - int grid_dim = 0; - CalcBlockDim(num_data_in_leaf, &grid_dim, &block_dim); - int grid_dim_ref = grid_dim - 1; - int grid_dim_aligned = 1; - while (grid_dim_ref > 0) { - grid_dim_aligned <<= 1; - grid_dim_ref >>= 1; - } - global_timer.Start("CUDADataPartition::AggregateBlockOffsetKernel"); - - if (grid_dim > AGGREGATE_BLOCK_SIZE) { - AggregateBlockOffsetKernel2<<<1, AGGREGATE_BLOCK_SIZE, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, - cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, - cuda_leaf_num_data_, cuda_data_indices_, - cuda_cur_num_leaves_, - best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, - best_left_sum_gradients, best_left_sum_hessians, best_left_count, - best_left_gain, best_left_leaf_value, - best_right_sum_gradients, best_right_sum_hessians, best_right_count, - best_right_gain, best_right_leaf_value, - - smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, - smaller_leaf_cuda_sum_of_hessians_pointer, 
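
SplitTreeStructureKernel above packs the integer results of a split (leaf indices, data counts, data start positions) together with the two hessian sums into a single 12-slot device buffer, so a single asynchronous copy brings everything back to the host; the two doubles live in slots 8-11 and are read back through a reinterpret_cast. Below is a host-side sketch of that unpacking, assuming an int buffer; the field order is taken from the buffer reads visible above, and the struct and helper names are hypothetical.

#include <vector>

// Field layout of the 12-int split-info buffer, as read back on the host above:
//   [0] left leaf index     [1] left leaf num_data    [2] left leaf data start
//   [3] right leaf index    [4] right leaf num_data   [5] right leaf data start
//   [6] smaller leaf index  [7] larger leaf index
//   [8..11] two doubles: left and right sums of hessians
struct HostSplitInfo {  // hypothetical helper struct, not part of the patch
  int left_leaf_index, right_leaf_index, smaller_leaf_index, larger_leaf_index;
  int left_num_data, right_num_data, left_data_start, right_data_start;
  double left_sum_hessians, right_sum_hessians;
};

inline HostSplitInfo UnpackSplitInfoBuffer(const std::vector<int>& buf) {
  const double* hessians = reinterpret_cast<const double*>(buf.data() + 8);
  HostSplitInfo info;
  info.left_leaf_index = buf[0];
  info.left_num_data = buf[1];
  info.left_data_start = buf[2];
  info.right_leaf_index = buf[3];
  info.right_num_data = buf[4];
  info.right_data_start = buf[5];
  info.smaller_leaf_index = buf[6];
  info.larger_leaf_index = buf[7];
  info.left_sum_hessians = hessians[0];
  info.right_sum_hessians = hessians[1];
  return info;
}

Packing everything into one buffer keeps the device-to-host traffic at a single 48-byte asynchronous copy per split.
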
smaller_leaf_cuda_num_data_in_leaf_pointer, - smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, - smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - smaller_leaf_cuda_hist_pointer_pointer, - larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, - larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, - larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, - larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer, - cuda_num_total_bin_, - cuda_hist_, - cuda_hist_pool_, - grid_dim); - } else { - AggregateBlockOffsetKernel3<<<1, grid_dim_aligned, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, - cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, - cuda_leaf_num_data_, cuda_data_indices_, - cuda_cur_num_leaves_, - best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, - best_left_sum_gradients, best_left_sum_hessians, best_left_count, - best_left_gain, best_left_leaf_value, - best_right_sum_gradients, best_right_sum_hessians, best_right_count, - best_right_gain, best_right_leaf_value, - - smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, - smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, - smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, - smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - smaller_leaf_cuda_hist_pointer_pointer, - larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, - larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, - larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, - larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer, - cuda_num_total_bin_, - cuda_hist_, - cuda_hist_pool_, - grid_dim, grid_dim_aligned); - } - SynchronizeCUDADevice(); - global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); - global_timer.Start("CUDADataPartition::SplitInnerKernel"); - - SplitInnerKernel2<<>>( - leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, - cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - cuda_out_data_indices_in_leaf_, block_dim); - //SynchronizeCUDADevice(); - global_timer.Stop("CUDADataPartition::SplitInnerKernel"); - - global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); - SplitTreeStructureKernel<<<4, 6, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, - cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, - cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, - cuda_cur_num_leaves_, - best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, - best_left_sum_gradients, best_left_sum_hessians, best_left_count, - best_left_gain, best_left_leaf_value, - best_right_sum_gradients, best_right_sum_hessians, best_right_count, - best_right_gain, best_right_leaf_value, best_split_found, - - smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, - smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, - smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, - smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - smaller_leaf_cuda_hist_pointer_pointer, - larger_leaf_cuda_leaf_index_pointer, 
larger_leaf_cuda_sum_of_gradients_pointer, - larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, - larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, - larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer, - cuda_num_total_bin_, - cuda_hist_, - cuda_hist_pool_, block_dim, - - cuda_bin_upper_bounds_, cuda_feature_num_bin_offsets_, - - tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, tree_threshold_real_, - tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, - tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, - data_partition_leaf_output_, cuda_split_info_buffer_); - //SynchronizeCUDADevice(); - global_timer.Stop("CUDADataPartition::SplitTreeStructureKernel"); - std::vector cpu_split_info_buffer(12); - const double* cpu_sum_hessians_info = reinterpret_cast(cpu_split_info_buffer.data() + 8); - global_timer.Start("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); - CopyFromCUDADeviceToHostAsync(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 12, cuda_streams_[0]); - global_timer.Stop("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); - SynchronizeCUDADevice(); - const data_size_t left_leaf_num_data = cpu_split_info_buffer[1]; - const data_size_t left_leaf_data_start = cpu_split_info_buffer[2]; - const data_size_t right_leaf_num_data = cpu_split_info_buffer[4]; - global_timer.Start("CUDADataPartition::CopyDataIndicesKernel"); - int grid_dim_copy = 0; - int block_dim_copy = 0; - CalcBlockDimInCopy(num_data_in_leaf, &grid_dim_copy, &block_dim_copy); - CopyDataIndicesKernel<<>>( - left_leaf_num_data + right_leaf_num_data, cuda_out_data_indices_in_leaf_, cuda_data_indices_ + left_leaf_data_start); - global_timer.Stop("CUDADataPartition::CopyDataIndicesKernel"); - const int left_leaf_index = cpu_split_info_buffer[0]; - const int right_leaf_index = cpu_split_info_buffer[3]; - const data_size_t right_leaf_data_start = cpu_split_info_buffer[5]; - (*cpu_leaf_num_data)[left_leaf_index] = left_leaf_num_data; - (*cpu_leaf_data_start)[left_leaf_index] = left_leaf_data_start; - (*cpu_leaf_num_data)[right_leaf_index] = right_leaf_num_data; - (*cpu_leaf_data_start)[right_leaf_index] = right_leaf_data_start; - (*cpu_leaf_sum_hessians)[left_leaf_index] = cpu_sum_hessians_info[0]; - (*cpu_leaf_sum_hessians)[right_leaf_index] = cpu_sum_hessians_info[1]; - *smaller_leaf_index = cpu_split_info_buffer[6]; - *larger_leaf_index = cpu_split_info_buffer[7]; -} - __global__ void PrefixSumKernel(uint32_t* cuda_elements) { __shared__ uint32_t elements[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1]; const unsigned int threadIdx_x = threadIdx.x; @@ -3211,30 +1808,20 @@ __global__ void PrefixSumKernel(uint32_t* cuda_elements) { void CUDADataPartition::LaunchPrefixSumKernel(uint32_t* cuda_elements) { PrefixSumKernel<<<1, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION / 2>>>(cuda_elements); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } -__global__ void AddPredictionToScoreKernel(const double* data_partition_leaf_output, +__global__ void AddPredictionToScoreKernel(const double* cuda_leaf_output, const data_size_t* num_data_in_leaf, const data_size_t* data_indices_in_leaf, const data_size_t* leaf_data_start, const double learning_rate, double* cuda_scores, const int* cuda_data_index_to_leaf_index, const data_size_t num_data) { const unsigned int threadIdx_x = threadIdx.x; const unsigned int blockIdx_x = blockIdx.x; const unsigned int 
blockDim_x = blockDim.x; - //const data_size_t num_data = num_data_in_leaf[blockIdx_x]; - //const data_size_t* data_indices = data_indices_in_leaf + leaf_data_start[blockIdx_x]; const int data_index = static_cast(blockIdx_x * blockDim_x + threadIdx_x); - //const double leaf_prediction_value = data_partition_leaf_output[blockIdx_x] * learning_rate; - /*for (unsigned int offset = 0; offset < static_cast(num_data); offset += blockDim_x) { - const data_size_t inner_data_index = static_cast(offset + threadIdx_x); - if (inner_data_index < num_data) { - const data_size_t data_index = data_indices[inner_data_index]; - cuda_scores[data_index] += leaf_prediction_value; - } - }*/ if (data_index < num_data) { const int leaf_index = cuda_data_index_to_leaf_index[data_index]; - const double leaf_prediction_value = data_partition_leaf_output[leaf_index] * learning_rate; + const double leaf_prediction_value = cuda_leaf_output[leaf_index] * learning_rate; cuda_scores[data_index] += leaf_prediction_value; } } @@ -3242,9 +1829,9 @@ __global__ void AddPredictionToScoreKernel(const double* data_partition_leaf_out void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double learning_rate, double* cuda_scores) { global_timer.Start("CUDADataPartition::AddPredictionToScoreKernel"); const int num_blocks = (num_data_ + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; - AddPredictionToScoreKernel<<>>(data_partition_leaf_output_, + AddPredictionToScoreKernel<<>>(cuda_leaf_output_, cuda_leaf_num_data_, cuda_data_indices_, cuda_leaf_data_start_, learning_rate, cuda_scores, cuda_data_index_to_leaf_index_, num_data_); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDADataPartition::AddPredictionToScoreKernel"); } @@ -3261,106 +1848,6 @@ __global__ void CopyColWiseDataKernel(const uint8_t* row_wise_data, } } -__global__ void CUDACheckKernel(const data_size_t** data_indices_in_leaf_ptr, - const data_size_t num_data_in_leaf, - const score_t* gradients, - const score_t* hessians, - double* gradients_sum_buffer, - double* hessians_sum_buffer) { - const data_size_t* data_indices_in_leaf = *data_indices_in_leaf_ptr; - const data_size_t local_data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - __shared__ double local_gradients[1024]; - __shared__ double local_hessians[1024]; - if (local_data_index < num_data_in_leaf) { - const data_size_t global_data_index = data_indices_in_leaf[local_data_index]; - local_gradients[threadIdx.x] = gradients[global_data_index]; - local_hessians[threadIdx.x] = hessians[global_data_index]; - } else { - local_gradients[threadIdx.x] = 0.0f; - local_hessians[threadIdx.x] = 0.0f; - } - __syncthreads(); - ReduceSum(local_gradients, 1024); - __syncthreads(); - ReduceSum(local_hessians, 1024); - __syncthreads(); - if (threadIdx.x == 0) { - gradients_sum_buffer[blockIdx.x] = local_gradients[0]; - hessians_sum_buffer[blockIdx.x] = local_hessians[0]; - } -} - -__global__ void CUDACheckKernel2( - const int leaf_index, - const data_size_t* num_data_expected, - const double* sum_gradients_expected, - const double* sum_hessians_expected, - const double* gradients_sum_buffer, - const double* hessians_sum_buffer, - const int num_blocks) { - double sum_gradients = 0.0f; - double sum_hessians = 0.0f; - for (int i = 0; i < num_blocks; ++i) { - sum_gradients += gradients_sum_buffer[i]; - sum_hessians += hessians_sum_buffer[i]; - } - if (fabs(sum_gradients - *sum_gradients_expected) >= 1.0f) { - printf("error in 
leaf_index = %d\n", leaf_index); - printf("num data expected = %d\n", *num_data_expected); - printf("error sum_gradients: %f vs %f\n", sum_gradients, *sum_gradients_expected); - } - if (fabs(sum_hessians - *sum_hessians_expected) >= 1.0f) { - printf("error in leaf_index = %d\n", leaf_index); - printf("num data expected = %d\n", *num_data_expected); - printf("error sum_hessians: %f vs %f\n", sum_hessians, *sum_hessians_expected); - } -} - -void CUDADataPartition::LaunchCUDACheckKernel( - const int smaller_leaf_index, - const int larger_leaf_index, - const std::vector& num_data_in_leaf, - const CUDALeafSplits* smaller_leaf_splits, - const CUDALeafSplits* larger_leaf_splits, - const score_t* gradients, - const score_t* hessians) { - const data_size_t num_data_in_smaller_leaf = num_data_in_leaf[smaller_leaf_index]; - const int block_dim = 1024; - const int smaller_num_blocks = (num_data_in_smaller_leaf + block_dim - 1) / block_dim; - CUDACheckKernel<<>>(smaller_leaf_splits->cuda_data_indices_in_leaf(), - num_data_in_smaller_leaf, - gradients, - hessians, - cuda_gradients_sum_buffer_, - cuda_hessians_sum_buffer_); - CUDACheckKernel2<<<1, 1>>>( - smaller_leaf_index, - smaller_leaf_splits->cuda_num_data_in_leaf(), - smaller_leaf_splits->cuda_sum_of_gradients(), - smaller_leaf_splits->cuda_sum_of_hessians(), - cuda_gradients_sum_buffer_, - cuda_hessians_sum_buffer_, - smaller_num_blocks); - if (larger_leaf_index >= 0) { - const data_size_t num_data_in_larger_leaf = num_data_in_leaf[larger_leaf_index]; - const int larger_num_blocks = (num_data_in_larger_leaf + block_dim - 1) / block_dim; - CUDACheckKernel<<>>(larger_leaf_splits->cuda_data_indices_in_leaf(), - num_data_in_larger_leaf, - gradients, - hessians, - cuda_gradients_sum_buffer_, - cuda_hessians_sum_buffer_); - CUDACheckKernel2<<<1, 1>>>( - larger_leaf_index, - larger_leaf_splits->cuda_num_data_in_leaf(), - larger_leaf_splits->cuda_sum_of_gradients(), - larger_leaf_splits->cuda_sum_of_hessians(), - cuda_gradients_sum_buffer_, - cuda_hessians_sum_buffer_, - larger_num_blocks); - } -} - } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 915b808892a2..a90b50530464 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -8,28 +8,34 @@ #ifdef USE_CUDA +#include #include #include #include #include "new_cuda_utils.hpp" #include "cuda_leaf_splits.hpp" +#include "cuda_split_info.hpp" +// TODO(shiyu1994): adjust these values according to different CUDA and GPU versions #define FILL_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) #define SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION (512) #define NUM_BANKS_DATA_PARTITION (32) #define LOG_NUM_BANKS_DATA_PARTITION (5) -#define AGGREGATE_BLOCK_SIZE (1024) +#define AGGREGATE_BLOCK_SIZE_DATA_PARTITION (1024) namespace LightGBM { class CUDADataPartition { public: - CUDADataPartition(const data_size_t num_data, const int num_features, const int num_leaves, - const int num_threads, const data_size_t* cuda_num_data, const int* cuda_num_leaves, - const int* cuda_num_features, const std::vector& feature_hist_offsets, const Dataset* train_data, - hist_t* cuda_hist); + CUDADataPartition( + const Dataset* train_data, + const int num_total_bin, + const int num_leaves, + const int num_threads, + const data_size_t* cuda_num_data, + hist_t* cuda_hist); - void Init(const Dataset* train_data); + void Init(); void BeforeTrain(const data_size_t* data_indices); @@ -60,93 +66,8 @@ 
class CUDADataPartition { int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index, const int cur_max_leaf_index); - void CUDACheck( - const int smaller_leaf_index, - const int larger_leaf_index, - const std::vector& num_data_in_leaf, - const CUDALeafSplits* smaller_leaf_splits, - const CUDALeafSplits* larger_leaf_splits, - const score_t* gradients, - const score_t* hessians); - Tree* GetCPUTree(); - void Test() { - PrintLastCUDAError(); - std::vector test_data_indices(num_data_, -1); - CopyFromCUDADeviceToHost(test_data_indices.data(), cuda_data_indices_, static_cast(num_data_)); - for (data_size_t i = 0; i < num_data_; ++i) { - CHECK_EQ(i, test_data_indices[i]); - } - data_size_t test_leaf_data_start_0 = 0, test_leaf_data_end_0 = 0, test_leaf_num_data_0 = 0; - data_size_t test_leaf_data_start_1 = 0, test_leaf_data_end_1 = 0, test_leaf_num_data_1 = 0; - CopyFromCUDADeviceToHost(&test_leaf_data_start_0, cuda_leaf_data_start_, 1); - CopyFromCUDADeviceToHost(&test_leaf_data_end_0, cuda_leaf_data_end_, 1); - CopyFromCUDADeviceToHost(&test_leaf_num_data_0, cuda_leaf_num_data_, 1); - CopyFromCUDADeviceToHost(&test_leaf_data_start_1, cuda_leaf_data_start_ + 1, 1); - CopyFromCUDADeviceToHost(&test_leaf_data_end_1, cuda_leaf_data_end_ + 1, 1); - CopyFromCUDADeviceToHost(&test_leaf_num_data_1, cuda_leaf_num_data_ + 1, 1); - Log::Warning("test_leaf_data_start_0 = %d", test_leaf_data_start_0); - Log::Warning("test_leaf_data_end_0 = %d", test_leaf_data_end_0); - Log::Warning("test_leaf_num_data_0 = %d", test_leaf_num_data_0); - Log::Warning("test_leaf_data_start_1 = %d", test_leaf_data_start_1); - Log::Warning("test_leaf_data_end_1 = %d", test_leaf_data_end_1); - Log::Warning("test_leaf_num_data_1 = %d", test_leaf_num_data_1); - Log::Warning("CUDADataPartition::Test Pass"); - } - - void TestAfterSplit() { - std::vector test_bit_vector(num_data_, 0); - CopyFromCUDADeviceToHost(test_bit_vector.data(), cuda_data_to_left_, static_cast(num_data_)); - data_size_t num_data_to_left = 0; - #pragma omp parallel for schedule(static) num_threads(num_threads_) reduction(+:num_data_to_left) - for (data_size_t data_index = 0; data_index < num_data_; ++data_index) { - if (test_bit_vector[data_index]) { - ++num_data_to_left; - } - } - Log::Warning("CUDADataPartition::TestAfterSplit num_data_to_left = %d", num_data_to_left); - std::vector test_data_indices(num_data_, 0); - CopyFromCUDADeviceToHost(test_data_indices.data(), cuda_data_indices_, static_cast(num_data_)); - std::vector test_leaf_num_data(num_leaves_, 0), test_leaf_data_start(num_leaves_, 0), test_leaf_data_end(num_leaves_, 0); - CopyFromCUDADeviceToHost(test_leaf_num_data.data(), cuda_leaf_num_data_, static_cast(num_leaves_)); - CopyFromCUDADeviceToHost(test_leaf_data_start.data(), cuda_leaf_data_start_, static_cast(num_leaves_)); - CopyFromCUDADeviceToHost(test_leaf_data_end.data(), cuda_leaf_data_end_, static_cast(num_leaves_)); - const data_size_t start_pos = test_leaf_data_start[2]; - const int check_window_size = 10; - for (data_size_t i = 0; i < check_window_size; ++i) { - Log::Warning("test_data_indices[%d] = %d", i, test_data_indices[i]); - } - Log::Warning("=========================================================="); - for (data_size_t i = start_pos - check_window_size; i < start_pos; ++i) { - Log::Warning("test_data_indices[%d] = %d", i, test_data_indices[i]); - } - Log::Warning("=========================================================="); - for (data_size_t i = start_pos; i < start_pos + check_window_size; ++i) { - 
Log::Warning("test_data_indices[%d] = %d", i, test_data_indices[i]); - } - Log::Warning("=========================================================="); - const data_size_t end_pos = test_leaf_data_end[2]; - for (data_size_t i = end_pos - check_window_size; i < end_pos; ++i) { - Log::Warning("test_data_indices[%d] = %d", i, test_data_indices[i]); - } - Log::Warning("=========================================================="); - for (data_size_t i = end_pos; i < end_pos + check_window_size; ++i) { - Log::Warning("test_data_indices[%d] = %d", i, test_data_indices[i]); - } - } - - void TestPrefixSum() { - std::vector test_elements(SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION, 1); - uint32_t* cuda_elements = nullptr; - InitCUDAMemoryFromHostMemory(&cuda_elements, test_elements.data(), test_elements.size()); - LaunchPrefixSumKernel(cuda_elements); - CopyFromCUDADeviceToHost(test_elements.data(), cuda_elements, test_elements.size()); - for (int i = 0; i < SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION; ++i) { - Log::Warning("test_elements[%d] = %d", i, test_elements[i]); - } - } - void UpdateTrainScore(const double learning_rate, double* cuda_scores); const data_size_t* cuda_leaf_data_start() const { return cuda_leaf_data_start_; } @@ -155,8 +76,6 @@ class CUDADataPartition { const data_size_t* cuda_leaf_num_data() const { return cuda_leaf_num_data_; } - //const data_size_t* cuda_leaf_num_data_offsets() const { return cuda_leaf_num_data_offsets_; } - const data_size_t* cuda_data_indices() const { return cuda_data_indices_; } const int* cuda_cur_num_leaves() const { return cuda_cur_num_leaves_; } @@ -194,8 +113,6 @@ class CUDADataPartition { int* grid_dim, int* block_dim); - void CopyColWiseData(const Dataset* train_data); - void GenDataToLeftBitVector(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, const uint8_t split_default_left, const data_size_t leaf_data_start, @@ -248,38 +165,11 @@ class CUDADataPartition { std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index); - void LaunchSplitInnerKernel2(const int* leaf_index, const data_size_t num_data_in_leaf, - const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left, const double* best_split_gain, - const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, - const double* best_left_gain, const double* best_left_leaf_value, - const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, uint8_t* best_split_found, - // for leaf splits information update - int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, - double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, - double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, - const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** smaller_leaf_cuda_hist_pointer_pointer, - int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, - double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, - double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - 
hist_t** larger_leaf_cuda_hist_pointer_pointer, - std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, - std::vector* cpu_leaf_sum_hessians, - int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index); - void LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, const uint8_t split_default_left, const data_size_t leaf_data_start, const int left_leaf_index, const int right_leaf_index); - void LaunchGenDataToLeftBitVectorKernel2(const data_size_t num_data_in_leaf, - const int split_feature_index, const uint32_t split_threshold, - const uint8_t split_default_left, const data_size_t leaf_data_start, - const int left_leaf_index, const int right_leaf_index); - template void LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( const bool missing_is_zero, @@ -329,55 +219,6 @@ class CUDADataPartition { const int default_leaf_index, const int missing_default_leaf_index); - template - void LaunchGenDataToLeftBitVectorKernelMaxIsMinInner2( - const bool missing_is_zero, - const bool missing_is_na, - const bool mfb_is_zero, - const bool mfb_is_na, - const bool max_bin_to_left, - const int column_index, - const int num_blocks_final, - const int split_indices_block_size_data_partition_aligned, - const int split_feature_index, - const data_size_t leaf_data_start, - const data_size_t num_data_in_leaf, - const uint32_t th, - const uint32_t t_zero_bin, - const uint32_t most_freq_bin, - const uint32_t max_bin, - const uint32_t min_bin, - const uint8_t split_default_to_left, - const uint8_t split_missing_default_to_left, - const int left_leaf_index, - const int right_leaf_index, - const int default_leaf_index, - const int missing_default_leaf_index); - - template - void LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner2( - const bool missing_is_zero, - const bool missing_is_na, - const bool mfb_is_zero, - const bool mfb_is_na, - const int column_index, - const int num_blocks_final, - const int split_indices_block_size_data_partition_aligned, - const int split_feature_index, - const data_size_t leaf_data_start, - const data_size_t num_data_in_leaf, - const uint32_t th, - const uint32_t t_zero_bin, - const uint32_t most_freq_bin, - const uint32_t max_bin, - const uint32_t min_bin, - const uint8_t split_default_to_left, - const uint8_t split_missing_default_to_left, - const int left_leaf_index, - const int right_leaf_index, - const int default_leaf_index, - const int missing_default_leaf_index); - template void LaunchUpdateDataIndexToLeafIndexKernel(const data_size_t cuda_leaf_data_start, const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, @@ -393,92 +234,113 @@ class CUDADataPartition { void LaunchAddPredictionToScoreKernel(const double learning_rate, double* cuda_scores); - void LaunchCUDACheckKernel( - const int smaller_leaf_index, - const int larger_leaf_index, - const std::vector& num_data_in_leaf, - const CUDALeafSplits* smaller_leaf_splits, - const CUDALeafSplits* larger_leaf_splits, - const score_t* gradients, - const score_t* hessians); // Host memory + + // dataset information + /*! \brief number of training data */ const data_size_t num_data_; + /*! \brief number of features in training data */ const int num_features_; - const int num_leaves_; - const int num_threads_; + /*! 
\brief number of total bins in training data */ const int num_total_bin_; - int max_num_split_indices_blocks_; - std::vector feature_default_bins_; - std::vector feature_most_freq_bins_; - std::vector feature_max_bins_; - std::vector feature_min_bins_; - std::vector feature_missing_is_zero_; - std::vector feature_missing_is_na_; - std::vector feature_mfb_is_zero_; - std::vector feature_mfb_is_na_; - std::vector num_data_in_leaf_; - int cur_num_leaves_; - std::vector cpu_split_info_buffer_; - std::vector column_bit_type_; - std::vector feature_index_to_column_index_; - const Dataset* train_data_; + /*! \brief upper bounds of feature histogram bins */ std::vector> bin_upper_bounds_; + /*! \brief number of bins per feature */ std::vector feature_num_bins_; + /*! \brief bin data stored by column */ + const CUDAColumnData* cuda_column_data_; + + // config information + /*! \brief maximum number of leaves in a tree */ + const int num_leaves_; + /*! \brief number of threads */ + const int num_threads_; + + // tree structure information + /*! \brief current number of leaves in tree */ + int cur_num_leaves_; + + // split algorithm related + /*! \brief maximum number of blocks to aggregate after finding bit vector by blocks */ + int max_num_split_indices_blocks_; // CUDA streams + /*! \brief cuda streams used for asynchronizing kernel computing and memory copy */ std::vector cuda_streams_; + // CUDA memory, held by this object + + // tree structure information + /*! \brief data indices by leaf */ data_size_t* cuda_data_indices_; + /*! \brief start position of each leaf in cuda_data_indices_ */ data_size_t* cuda_leaf_data_start_; + /*! \brief end position of each leaf in cuda_data_indices_ */ data_size_t* cuda_leaf_data_end_; + /*! \brief number of data in each leaf */ data_size_t* cuda_leaf_num_data_; + /*! \brief currnet number of leaves in tree */ int* cuda_cur_num_leaves_; - // for split + /*! \brief records the histogram of each leaf */ + hist_t** cuda_hist_pool_; + /*! \brief records the value of each leaf */ + double* cuda_leaf_output_; + + // split data algorithm related + /*! \brief marks whether each data goes to left or right, 1 for left, and 0 for right */ uint8_t* cuda_data_to_left_; + /*! \brief maps data index to leaf index, for adding scores to training data set */ int* cuda_data_index_to_leaf_index_; + /*! \brief prefix sum of number of data going to left in all blocks */ data_size_t* cuda_block_data_to_left_offset_; + /*! \brief prefix sum of number of data going to right in all blocks */ data_size_t* cuda_block_data_to_right_offset_; + /*! \brief buffer for splitting data indices, will be copied back to cuda_data_indices_ after split */ data_size_t* cuda_out_data_indices_in_leaf_; - uint32_t* cuda_feature_default_bins_; - uint32_t* cuda_feature_most_freq_bins_; - uint32_t* cuda_feature_max_bins_; - uint32_t* cuda_feature_min_bins_; - uint8_t* cuda_feature_missing_is_zero_; - uint8_t* cuda_feature_missing_is_na_; - uint8_t* cuda_feature_mfb_is_zero_; - uint8_t* cuda_feature_mfb_is_na_; - int* cuda_num_total_bin_; - int* cuda_split_info_buffer_; // prepared to be copied to cpu - // for histogram pool - hist_t** cuda_hist_pool_; - // for tree structure + + // split tree structure algorithm related + /*! \brief buffer to store split information, prepared to be copied to cpu */ + int* cuda_split_info_buffer_; + /*! \brief the sequence of leaf indices being split during tree growing */ int* tree_split_leaf_index_; + /*! 
\brief the sequence of inner split indices during tree growing */ int* tree_inner_feature_index_; + /*! \brief the sequence of inner threshold during tree growing */ uint32_t* tree_threshold_; + /*! \brief the sequence of real threshold during tree growing */ double* tree_threshold_real_; + /*! \brief the sequence of left child output value of splits during tree growing */ double* tree_left_output_; + /*! \brief the sequence of right child output value of splits during tree growing */ double* tree_right_output_; + /*! \brief the sequence of left child data number value of splits during tree growing */ data_size_t* tree_left_count_; + /*! \brief the sequence of right child data number value of splits during tree growing */ data_size_t* tree_right_count_; + /*! \brief the sequence of left child hessian sum value of splits during tree growing */ double* tree_left_sum_hessian_; + /*! \brief the sequence of right child hessian sum value of splits during tree growing */ double* tree_right_sum_hessian_; + /*! \brief the sequence of split gains during tree growing */ double* tree_gain_; + /*! \brief the sequence of split default left during tree growing */ uint8_t* tree_default_left_; - double* data_partition_leaf_output_; + + // dataset information + /*! \brief upper bounds of bin boundaries for feature histograms */ double* cuda_bin_upper_bounds_; + /*! \brief the bin offsets of features, used to access cuda_bin_upper_bounds_ */ int* cuda_feature_num_bin_offsets_; - // for debug - double* cuda_gradients_sum_buffer_; - double* cuda_hessians_sum_buffer_; - // for train data split - std::vector cuda_data_by_column_; + // CUDA memory, held by other object + + // dataset information + /*! \brief number of data in training set, for intialization of cuda_leaf_num_data_ and cuda_leaf_data_end_ */ const data_size_t* cuda_num_data_; - const int* cuda_num_leaves_; - const int* cuda_num_features_; + /*! 
\brief beginning of histograms, for initialization of cuda_hist_pool_ */ hist_t* cuda_hist_; }; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index fec55bdc7571..d62e5d09092d 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -276,7 +276,7 @@ void CUDAHistogramConstructor::InitCUDAData(TrainingShareStates* share_state) { } else { Log::Fatal("Unknow bit type = %d", bit_type_); } - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } void CUDAHistogramConstructor::PushOneData(const uint32_t feature_bin_value, @@ -300,7 +300,7 @@ void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller } LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_num_data_in_smaller_leaf, cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data, cuda_smaller_leaf_hist, num_data_in_smaller_leaf); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Start("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); LaunchSubtractHistogramKernel(cuda_smaller_leaf_index, cuda_larger_leaf_index, cuda_smaller_leaf_sum_gradients, cuda_smaller_leaf_sum_hessians, diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 91e1db0a7725..4dbb1509687d 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -687,13 +687,13 @@ void CUDAHistogramConstructor::LaunchSubtractHistogramKernel(const int* cuda_sma cuda_feature_most_freq_bins_, smaller_leaf_sum_gradients, smaller_leaf_sum_hessians, cuda_smaller_leaf_hist, cuda_need_fix_histogram_features_, cuda_need_fix_histogram_features_num_bin_aligned_); - //SynchronizeCUDADevice(); + //SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDAHistogramConstructor::FixHistogramKernel"); global_timer.Start("CUDAHistogramConstructor::SubtractHistogramKernel"); SubtractHistogramKernel<<>>( cuda_smaller_leaf_index, cuda_larger_leaf_index, cuda_feature_mfb_offsets_, cuda_feature_num_bins_, cuda_num_total_bin_, cuda_smaller_leaf_hist, cuda_larger_leaf_hist); - //SynchronizeCUDADevice(); + //SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDAHistogramConstructor::SubtractHistogramKernel"); } @@ -717,7 +717,7 @@ void CUDAHistogramConstructor::LaunchGetOrderedGradientsKernel( const int num_blocks = (num_data_in_leaf + num_data_per_block - 1) / num_data_per_block; GetOrderedGradientsKernel<<>>(num_data_in_leaf, cuda_data_indices_in_leaf, cuda_gradients_, cuda_hessians_, cuda_ordered_gradients_, cuda_ordered_hessians_); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } } diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index 9beb4e0ecae9..3fd177d4bc35 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -56,7 +56,7 @@ void CUDALeafSplits::InitValues( CopyFromHostToCUDADevice(cuda_hist_in_leaf_, &cuda_hist_in_leaf, 1); CopyFromCUDADeviceToCUDADevice(cuda_gain_, cuda_gain, 1); CopyFromCUDADeviceToCUDADevice(cuda_leaf_value_, cuda_leaf_value, 1); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } void CUDALeafSplits::InitValues() { @@ -66,7 +66,7 @@ void CUDALeafSplits::InitValues() { 
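
ConstructHistogramForLeaf above only builds the histogram of the smaller of the two sibling leaves and then calls LaunchSubtractHistogramKernel: since the parent's histogram is the sum of its children's, the larger leaf's histogram can be obtained by subtraction, roughly halving histogram-construction work. A simplified sketch of that subtraction is below; the real kernel also handles most-frequent-bin offsets, so the signature here is illustrative only.

typedef double hist_t;

// larger := parent - smaller, done in place because the larger leaf reuses the
// parent's histogram buffer (see the cuda_hist_pool swap in
// SplitTreeStructureKernel above). Each bin holds a (gradient, hessian) pair,
// hence 2 * num_total_bin entries per leaf.
__global__ void SubtractHistogramSketch(const hist_t* smaller_leaf_hist,
                                        hist_t* larger_leaf_hist,
                                        const int num_total_bin) {
  const int i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
  if (i < 2 * num_total_bin) {
    larger_leaf_hist[i] -= smaller_leaf_hist[i];
  }
}
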
CopyFromHostToCUDADevice(cuda_leaf_index_, &larger_leaf_index, 1); SetCUDAMemory(cuda_gain_, 0, 1); SetCUDAMemory(cuda_leaf_value_, 0, 1); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } void CUDALeafSplits::InitValues( @@ -85,7 +85,7 @@ void CUDALeafSplits::InitValues( CopyFromCUDADeviceToHostAsync(root_sum_hessians, cuda_sum_of_hessians_, 1, cuda_streams_[1]); SetCUDAMemory(cuda_gain_, 0, 1); SetCUDAMemory(cuda_leaf_value_, 0, 1); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index bc1a8870dbd3..a5cea5c0e351 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -58,10 +58,10 @@ void CUDALeafSplits::LaunchInitValuesKernal() { cuda_gradients_, cuda_hessians_, cuda_num_data_, cuda_sum_of_gradients_, cuda_sum_of_hessians_); CopyFromCUDADeviceToCUDADevice(cuda_num_data_in_leaf_, cuda_num_data_, 1); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); CUDAInitValuesKernel2<<>>( cuda_sum_of_gradients_, cuda_sum_of_hessians_); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_ranking_objective.cu b/src/treelearner/cuda/cuda_ranking_objective.cu index 8f52916a8fec..ea24b458f907 100644 --- a/src/treelearner/cuda/cuda_ranking_objective.cu +++ b/src/treelearner/cuda/cuda_ranking_objective.cu @@ -574,7 +574,7 @@ void CUDARankingObjective::LaunchCalcInverseMaxDCGKernel() { truncation_level_, num_queries_, cuda_inverse_max_dcgs_); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_regression_objective.cu b/src/treelearner/cuda/cuda_regression_objective.cu index 750188bb31a2..ef771fc48d7a 100644 --- a/src/treelearner/cuda/cuda_regression_objective.cu +++ b/src/treelearner/cuda/cuda_regression_objective.cu @@ -44,9 +44,9 @@ void CUDARegressionObjective::LaunchCalcInitScoreKernel() { const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION; const int num_blocks = (num_data_ + num_data_per_block - 1) / num_data_per_block; CalcInitScoreKernel_1_Regression<<>>(cuda_labels_, num_data_, cuda_init_score_); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); CalcInitScoreKernel_2_Regression<<<1, 1>>>(cuda_init_score_, num_data_); - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } __global__ void GetGradientsKernel_Regression(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, diff --git a/src/treelearner/cuda/cuda_split_info.hpp b/src/treelearner/cuda/cuda_split_info.hpp new file mode 100644 index 000000000000..aa95d4b1049b --- /dev/null +++ b/src/treelearner/cuda/cuda_split_info.hpp @@ -0,0 +1,42 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ +#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_SPLIT_INFO_HPP_ +#define LIGHTGBM_TREELEARNER_CUDA_CUDA_SPLIT_INFO_HPP_ + +#ifdef USE_CUDA + +#include +#include "new_cuda_utils.hpp" + +namespace LightGBM { + +struct CUDASplitInfo { + public: + bool is_valid; + int leaf_index; + double gain; + int feature_index; + uint32_t threshold; + bool default_left; + + double left_sum_gradients; + double left_sum_hessians; + data_size_t left_count; + double left_gain; + double left_value; + + double right_sum_gradients; + double right_sum_hessians; + data_size_t right_count; + double right_gain; + double right_value; +}; + +} // namespace LightGBM + +#endif // USE_CUDA + +#endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_SPLIT_INFO_HPP_ diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 77ccc52065a1..125d6a892120 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -8,6 +8,7 @@ #include "new_cuda_tree_learner.hpp" +#include #include #include @@ -33,11 +34,12 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia share_state_->feature_hist_offsets(), config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf)); cuda_histogram_constructor_->Init(train_data_, share_state_.get()); - cuda_data_partition_.reset(new CUDADataPartition(num_data_, num_features_, this->config_->num_leaves, num_threads_, - cuda_centralized_info_->cuda_num_data(), cuda_centralized_info_->cuda_num_leaves(), - cuda_centralized_info_->cuda_num_features(), - share_state_->feature_hist_offsets(), train_data_, cuda_histogram_constructor_->cuda_hist_pointer())); - cuda_data_partition_->Init(train_data_); + + cuda_data_partition_.reset(new CUDADataPartition( + train_data_, share_state_->feature_hist_offsets().back(), this->config_->num_leaves, num_threads_, + cuda_centralized_info_->cuda_num_data(), + cuda_histogram_constructor_->cuda_hist_pointer())); + cuda_data_partition_->Init(); cuda_best_split_finder_.reset(new CUDABestSplitFinder(cuda_histogram_constructor_->cuda_hist(), train_data_, this->share_state_->feature_hist_offsets(), this->config_->num_leaves, this->config_->lambda_l1, this->config_->lambda_l2, this->config_->min_data_in_leaf, @@ -252,7 +254,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, split_data_indices_time += duration.count(); ++num_leaves; } - SynchronizeCUDADevice(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); const auto end = std::chrono::steady_clock::now(); const double duration = (static_cast>(end - start)).count(); const auto build_tree_start = std::chrono::steady_clock::now(); @@ -266,6 +268,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, Log::Warning("find best split time from all leaves %f", find_best_split_from_all_leaves_time); Log::Warning("split data indices time %f", split_data_indices_time); Log::Warning("build tree time %f", build_tre_duration); + tree.reset(new CUDATree(tree.get())); return tree.release(); } From f05da3c107a4a8d524acbea420bf5b6adcc3115b Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 21 Jul 2021 05:49:01 +0000 Subject: [PATCH 039/166] use structure for split information --- .../cuda/cuda_best_split_finder.cpp | 35 +- .../cuda/cuda_best_split_finder.cu | 463 +++++------------- .../cuda/cuda_best_split_finder.hpp | 81 +-- src/treelearner/cuda/cuda_data_partition.cpp | 31 +- src/treelearner/cuda/cuda_data_partition.cu | 110 ++--- src/treelearner/cuda/cuda_data_partition.hpp | 25 +- 
.../cuda/new_cuda_tree_learner.cpp | 18 +- 7 files changed, 173 insertions(+), 590 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index ef4c60b6380d..335aca3881c5 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -114,20 +114,6 @@ void CUDABestSplitFinder::Init() { AllocateCUDAMemoryOuter(&cuda_leaf_best_split_info_, cuda_best_leaf_split_info_buffer_size, __FILE__, __LINE__); AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_feature_); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_default_left_); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_threshold_); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_gain_); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_left_sum_gradient_); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_left_sum_hessian_); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_left_count_); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_left_gain_); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_left_output_); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_right_sum_gradient_); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_right_sum_hessian_); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_right_count_); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_right_gain_); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_right_output_); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_found_); InitCUDAMemoryFromHostMemory(&cuda_task_feature_index_, cpu_task_feature_index_.data(), cpu_task_feature_index_.size()); InitCUDAMemoryFromHostMemory(&cuda_task_reverse_, cpu_task_reverse_.data(), cpu_task_reverse_.size()); @@ -137,20 +123,6 @@ void CUDABestSplitFinder::Init() { const size_t output_buffer_size = 2 * static_cast(num_tasks_); AllocateCUDAMemoryOuter(&cuda_best_split_info_, output_buffer_size, __FILE__, __LINE__); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_default_left_); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_threshold_); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_gain_); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_left_sum_gradient_); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_left_sum_hessian_); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_left_count_); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_left_gain_); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_left_output_); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_right_sum_gradient_); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_right_sum_hessian_); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_right_count_); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_right_gain_); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_right_output_); - AllocateCUDAMemory(output_buffer_size, &cuda_best_split_found_); AllocateCUDAMemory(7, &cuda_best_split_info_buffer_); cuda_streams_.resize(2); @@ -158,12 +130,7 @@ void 
CUDABestSplitFinder::Init() { CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); } -void CUDABestSplitFinder::BeforeTrain() { - SetCUDAMemory(cuda_leaf_best_split_gain_, 0, static_cast(num_leaves_)); - SetCUDAMemory(cuda_best_split_found_, 0, static_cast(num_tasks_)); - SetCUDAMemory(cuda_best_split_gain_, 0, static_cast(num_tasks_)); - SetCUDAMemory(cuda_leaf_best_split_found_, 0, static_cast(num_leaves_)); -} +void CUDABestSplitFinder::BeforeTrain() {} void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_leaf_splits, const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 48eacddf6541..1c515ebac4b6 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -204,26 +204,13 @@ __device__ void FindBestSplitsForLeafKernelInner( const bool na_as_missing, const uint8_t assume_out_default_left, // output parameters - uint32_t* output_threshold, - double* output_gain, - uint8_t* output_default_left, - double* output_left_sum_gradients, - double* output_left_sum_hessians, - data_size_t* output_left_num_data, - double* output_left_gain, - double* output_left_output, - double* output_right_sum_gradients, - double* output_right_sum_hessians, - data_size_t* output_right_num_data, - double* output_right_gain, - double* output_right_output, - uint8_t* output_found) { + CUDASplitInfo* cuda_best_split_info) { const double cnt_factor = num_data / sum_hessians; const bool use_l1 = lambda_l1 > 0.0f; const double min_gain_shift = parent_gain + min_gain_to_split; - *output_found = 0; + cuda_best_split_info->is_valid = 0; __shared__ hist_t local_grad_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; __shared__ hist_t local_hess_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; @@ -334,10 +321,10 @@ __device__ void FindBestSplitsForLeafKernelInner( ReduceBestGain(local_gain, local_grad_hist, local_hess_hist, threshold_found, threshold_value); const uint8_t found = threshold_found[0]; if (found && threadIdx_x == 0) { - *output_found = 1; - *output_threshold = threshold_value[0]; - *output_gain = local_gain[0]; - *output_default_left = assume_out_default_left; + cuda_best_split_info->is_valid = 1; + cuda_best_split_info->threshold = threshold_value[0]; + cuda_best_split_info->gain = local_gain[0]; + cuda_best_split_info->default_left = assume_out_default_left; if (reverse) { const double sum_right_gradient = local_grad_hist[1]; const double sum_right_hessian = local_hess_hist[1] - K_EPSILON; @@ -349,17 +336,17 @@ __device__ void FindBestSplitsForLeafKernelInner( sum_left_hessian, lambda_l1, use_l1, lambda_l2); const double right_output = CalculateSplittedLeafOutput(sum_right_gradient, sum_right_hessian, lambda_l1, use_l1, lambda_l2); - *output_left_sum_gradients = sum_left_gradient; - *output_left_sum_hessians = sum_left_hessian; - *output_left_num_data = left_count; - *output_right_sum_gradients = sum_right_gradient; - *output_right_sum_hessians = sum_right_hessian; - *output_right_num_data = right_count; - *output_left_output = left_output; - *output_left_gain = GetLeafGainGivenOutput(sum_left_gradient, + cuda_best_split_info->left_sum_gradients = sum_left_gradient; + cuda_best_split_info->left_sum_hessians = sum_left_hessian; + 
cuda_best_split_info->left_count = left_count; + cuda_best_split_info->right_sum_gradients = sum_right_gradient; + cuda_best_split_info->right_sum_hessians = sum_right_hessian; + cuda_best_split_info->right_count = right_count; + cuda_best_split_info->left_value = left_output; + cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); - *output_right_output = right_output; - *output_right_gain = GetLeafGainGivenOutput(sum_right_gradient, + cuda_best_split_info->right_value = right_output; + cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); } else { const double sum_left_gradient = local_grad_hist[1]; @@ -372,17 +359,17 @@ __device__ void FindBestSplitsForLeafKernelInner( sum_left_hessian, lambda_l1, use_l1, lambda_l2); const double right_output = CalculateSplittedLeafOutput(sum_right_gradient, sum_right_hessian, lambda_l1, use_l1, lambda_l2); - *output_left_sum_gradients = sum_left_gradient; - *output_left_sum_hessians = sum_left_hessian; - *output_left_num_data = left_count; - *output_right_sum_gradients = sum_right_gradient; - *output_right_sum_hessians = sum_right_hessian; - *output_right_num_data = right_count; - *output_left_output = left_output; - *output_left_gain = GetLeafGainGivenOutput(sum_left_gradient, + cuda_best_split_info->left_sum_gradients = sum_left_gradient; + cuda_best_split_info->left_sum_hessians = sum_left_hessian; + cuda_best_split_info->left_count = left_count; + cuda_best_split_info->right_sum_gradients = sum_right_gradient; + cuda_best_split_info->right_sum_hessians = sum_right_hessian; + cuda_best_split_info->right_count = right_count; + cuda_best_split_info->left_value = left_output; + cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); - *output_right_output = right_output; - *output_right_gain = GetLeafGainGivenOutput(sum_right_gradient, + cuda_best_split_info->right_value = right_output; + cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); } } @@ -423,20 +410,7 @@ __global__ void FindBestSplitsForLeafKernel( const double lambda_l1, const double lambda_l2, // output - uint32_t* cuda_best_split_threshold, - uint8_t* cuda_best_split_default_left, - double* cuda_best_split_gain, - double* cuda_best_split_left_sum_gradient, - double* cuda_best_split_left_sum_hessian, - data_size_t* cuda_best_split_left_count, - double* cuda_best_split_left_gain, - double* cuda_best_split_left_output, - double* cuda_best_split_right_sum_gradient, - double* cuda_best_split_right_sum_hessian, - data_size_t* cuda_best_split_right_count, - double* cuda_best_split_right_gain, - double* cuda_best_split_right_output, - uint8_t* cuda_best_split_found) { + CUDASplitInfo* cuda_best_split_info) { const unsigned int task_index = blockIdx.x % num_tasks; const bool is_larger = static_cast(blockIdx.x >= num_tasks || larger_only); @@ -450,20 +424,7 @@ __global__ void FindBestSplitsForLeafKernel( const double sum_hessians = (is_larger ? *larger_sum_hessians_in_leaf : *smaller_sum_hessians_in_leaf) + 2 * K_EPSILON; const double num_data = is_larger ? *larger_num_data_in_leaf : *smaller_num_data_in_leaf; const unsigned int output_offset = is_larger ? 
(task_index + num_tasks) : task_index; - uint8_t* out_default_left = cuda_best_split_default_left + output_offset; - uint32_t* out_threshold = cuda_best_split_threshold + output_offset; - double* out_left_sum_gradients = cuda_best_split_left_sum_gradient + output_offset; - double* out_left_sum_hessians = cuda_best_split_left_sum_hessian + output_offset; - double* out_right_sum_gradients = cuda_best_split_right_sum_gradient + output_offset; - double* out_right_sum_hessians = cuda_best_split_right_sum_hessian + output_offset; - data_size_t* out_left_num_data = cuda_best_split_left_count + output_offset; - data_size_t* out_right_num_data = cuda_best_split_right_count + output_offset; - double* out_left_output = cuda_best_split_left_output + output_offset; - double* out_right_output = cuda_best_split_right_output + output_offset; - double* out_left_gain = cuda_best_split_left_gain + output_offset; - double* out_right_gain = cuda_best_split_right_gain + output_offset; - uint8_t* out_found = cuda_best_split_found + output_offset; - double* out_gain = cuda_best_split_gain + output_offset; + CUDASplitInfo* out = cuda_best_split_info + output_offset; const hist_t* hist_ptr = (is_larger ? *larger_leaf_hist : *smaller_leaf_hist) + feature_hist_offsets[inner_feature_index] * 2; FindBestSplitsForLeafKernelInner( // input feature information @@ -490,20 +451,7 @@ __global__ void FindBestSplitsForLeafKernel( na_as_missing, assume_out_default_left, // output parameters - out_threshold, - out_gain, - out_default_left, - out_left_sum_gradients, - out_left_sum_hessians, - out_left_num_data, - out_left_gain, - out_left_output, - out_right_sum_gradients, - out_right_sum_hessians, - out_right_num_data, - out_right_gain, - out_right_output, - out_found); + out); } void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( @@ -556,20 +504,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( lambda_l1_, lambda_l2_, // output parameters - cuda_best_split_threshold_, - cuda_best_split_default_left_, - cuda_best_split_gain_, - cuda_best_split_left_sum_gradient_, - cuda_best_split_left_sum_hessian_, - cuda_best_split_left_count_, - cuda_best_split_left_gain_, - cuda_best_split_left_output_, - cuda_best_split_right_sum_gradient_, - cuda_best_split_right_sum_hessian_, - cuda_best_split_right_count_, - cuda_best_split_right_gain_, - cuda_best_split_right_output_, - cuda_best_split_found_); + cuda_best_split_info_); } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); if (larger_leaf_index >= 0) { @@ -608,20 +543,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( lambda_l1_, lambda_l2_, // output parameters - cuda_best_split_threshold_, - cuda_best_split_default_left_, - cuda_best_split_gain_, - cuda_best_split_left_sum_gradient_, - cuda_best_split_left_sum_hessian_, - cuda_best_split_left_count_, - cuda_best_split_left_gain_, - cuda_best_split_left_output_, - cuda_best_split_right_sum_gradient_, - cuda_best_split_right_sum_hessian_, - cuda_best_split_right_count_, - cuda_best_split_right_gain_, - cuda_best_split_right_output_, - cuda_best_split_found_); + cuda_best_split_info_); } } @@ -643,31 +565,11 @@ __device__ void ReduceBestSplit(uint8_t* found, double* gain, uint32_t* shared_r } __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const int larger_leaf_index, - const int* cuda_num_features, int* cuda_leaf_best_split_feature, uint8_t* cuda_leaf_best_split_default_left, - uint32_t* cuda_leaf_best_split_threshold, double* cuda_leaf_best_split_gain, - double* 
cuda_leaf_best_split_left_sum_gradient, double* cuda_leaf_best_split_left_sum_hessian, - data_size_t* cuda_leaf_best_split_left_count, double* cuda_leaf_best_split_left_gain, - double* cuda_leaf_best_split_left_output, - double* cuda_leaf_best_split_right_sum_gradient, double* cuda_leaf_best_split_right_sum_hessian, - data_size_t* cuda_leaf_best_split_right_count, double* cuda_leaf_best_split_right_gain, - double* cuda_leaf_best_split_right_output, - uint8_t* cuda_leaf_best_split_found, + const int* cuda_num_features, int* cuda_leaf_best_split_feature, + CUDASplitInfo* cuda_leaf_best_split_info, // input parameters const int* cuda_task_feature_index, - const uint8_t* cuda_best_split_default_left, - const uint32_t* cuda_best_split_threshold, - const double* cuda_best_split_gain, - const double* cuda_best_split_left_sum_gradient, - const double* cuda_best_split_left_sum_hessian, - const data_size_t* cuda_best_split_left_count, - const double* cuda_best_split_left_gain, - const double* cuda_best_split_left_output, - const double* cuda_best_split_right_sum_gradient, - const double* cuda_best_split_right_sum_hessian, - const data_size_t* cuda_best_split_right_count, - const double* cuda_best_split_right_gain, - const double* cuda_best_split_right_output, - const uint8_t* cuda_best_split_found, + const CUDASplitInfo* cuda_best_split_info, const uint32_t* cuda_feature_default_bins, const int num_tasks, const int num_tasks_aligned, @@ -687,8 +589,8 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i const int task_index = static_cast(leaf_block_index * blockDim.x + threadIdx_x); const uint32_t read_index = is_smaller ? static_cast(task_index) : static_cast(task_index + num_tasks); if (task_index < num_tasks) { - best_found[threadIdx_x] = cuda_best_split_found[read_index]; - best_gain[threadIdx_x] = cuda_best_split_gain[read_index]; + best_found[threadIdx_x] = cuda_best_split_info[read_index].is_valid; + best_gain[threadIdx_x] = cuda_best_split_info[read_index].gain; shared_read_index[threadIdx_x] = read_index; } else { best_found[threadIdx_x] = 0; @@ -700,26 +602,28 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i const int leaf_index_ref = is_smaller ? smaller_leaf_index : larger_leaf_index; const unsigned buffer_write_pos = static_cast(leaf_index_ref) + leaf_block_index * num_leaves; const uint32_t best_read_index = shared_read_index[0]; + CUDASplitInfo* cuda_split_info = cuda_leaf_best_split_info + buffer_write_pos; + const CUDASplitInfo* best_split_info = cuda_best_split_info + best_read_index; if (best_found[0]) { - cuda_leaf_best_split_gain[buffer_write_pos] = best_gain[0]; + cuda_split_info->gain = best_gain[0]; cuda_leaf_best_split_feature[buffer_write_pos] = is_smaller ? 
cuda_task_feature_index[best_read_index] : cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; - cuda_leaf_best_split_default_left[buffer_write_pos] = cuda_best_split_default_left[best_read_index]; - cuda_leaf_best_split_threshold[buffer_write_pos] = cuda_best_split_threshold[best_read_index]; - cuda_leaf_best_split_left_sum_gradient[buffer_write_pos] = cuda_best_split_left_sum_gradient[best_read_index]; - cuda_leaf_best_split_left_sum_hessian[buffer_write_pos] = cuda_best_split_left_sum_hessian[best_read_index]; - cuda_leaf_best_split_left_count[buffer_write_pos] = cuda_best_split_left_count[best_read_index]; - cuda_leaf_best_split_left_gain[buffer_write_pos] = cuda_best_split_left_gain[best_read_index]; - cuda_leaf_best_split_left_output[buffer_write_pos] = cuda_best_split_left_output[best_read_index]; - cuda_leaf_best_split_right_sum_gradient[buffer_write_pos] = cuda_best_split_right_sum_gradient[best_read_index]; - cuda_leaf_best_split_right_sum_hessian[buffer_write_pos] = cuda_best_split_right_sum_hessian[best_read_index]; - cuda_leaf_best_split_right_count[buffer_write_pos] = cuda_best_split_right_count[best_read_index]; - cuda_leaf_best_split_right_gain[buffer_write_pos] = cuda_best_split_right_gain[best_read_index]; - cuda_leaf_best_split_right_output[buffer_write_pos] = cuda_best_split_right_output[best_read_index]; - cuda_leaf_best_split_found[buffer_write_pos] = 1; + cuda_split_info->default_left = best_split_info->default_left; + cuda_split_info->threshold = best_split_info->threshold; + cuda_split_info->left_sum_gradients = best_split_info->left_sum_gradients; + cuda_split_info->left_sum_hessians = best_split_info->left_sum_hessians; + cuda_split_info->left_count = best_split_info->left_count; + cuda_split_info->left_gain = best_split_info->left_gain; + cuda_split_info->left_value = best_split_info->left_value; + cuda_split_info->right_sum_gradients = best_split_info->right_sum_gradients; + cuda_split_info->right_sum_hessians = best_split_info->right_sum_hessians; + cuda_split_info->right_count = best_split_info->right_count; + cuda_split_info->right_gain = best_split_info->right_gain; + cuda_split_info->right_value = best_split_info->right_value; + cuda_split_info->is_valid = true; } else { - cuda_leaf_best_split_gain[buffer_write_pos] = K_MIN_SCORE; - cuda_leaf_best_split_found[buffer_write_pos] = 0; + cuda_split_info->gain = K_MIN_SCORE; + cuda_split_info->is_valid = false; } } } @@ -729,38 +633,33 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( const int larger_leaf_index, const unsigned int num_blocks_per_leaf, const int num_leaves, - int* cuda_leaf_best_split_feature, uint8_t* cuda_leaf_best_split_default_left, - uint32_t* cuda_leaf_best_split_threshold, double* cuda_leaf_best_split_gain, - double* cuda_leaf_best_split_left_sum_gradient, double* cuda_leaf_best_split_left_sum_hessian, - data_size_t* cuda_leaf_best_split_left_count, double* cuda_leaf_best_split_left_gain, - double* cuda_leaf_best_split_left_output, - double* cuda_leaf_best_split_right_sum_gradient, double* cuda_leaf_best_split_right_sum_hessian, - data_size_t* cuda_leaf_best_split_right_count, double* cuda_leaf_best_split_right_gain, - double* cuda_leaf_best_split_right_output, - uint8_t* cuda_leaf_best_split_found, + int* cuda_leaf_best_split_feature, + CUDASplitInfo* cuda_leaf_best_split_info, const bool larger_only) { if (!larger_only) { if (blockIdx.x == 0) { for (unsigned int block_index = 1; block_index < num_blocks_per_leaf; ++block_index) { const unsigned int leaf_read_pos = 
static_cast(smaller_leaf_index) + block_index * static_cast(num_leaves); - if ((cuda_leaf_best_split_found[leaf_read_pos] == 1 && cuda_leaf_best_split_found[smaller_leaf_index] == 1 && - cuda_leaf_best_split_gain[leaf_read_pos] > cuda_leaf_best_split_gain[smaller_leaf_index]) || - (cuda_leaf_best_split_found[smaller_leaf_index] == 0 && cuda_leaf_best_split_found[leaf_read_pos] == 1)) { - cuda_leaf_best_split_found[smaller_leaf_index] = cuda_leaf_best_split_found[leaf_read_pos]; + CUDASplitInfo* smaller_leaf_split_info = cuda_leaf_best_split_info + smaller_leaf_index; + const CUDASplitInfo* other_split_info = cuda_leaf_best_split_info + leaf_read_pos; + if ((other_split_info->is_valid && smaller_leaf_split_info->is_valid && + other_split_info->gain > smaller_leaf_split_info->gain) || + (!smaller_leaf_split_info->is_valid && other_split_info->is_valid)) { + smaller_leaf_split_info->is_valid = other_split_info->is_valid; cuda_leaf_best_split_feature[smaller_leaf_index] = cuda_leaf_best_split_feature[leaf_read_pos]; - cuda_leaf_best_split_default_left[smaller_leaf_index] = cuda_leaf_best_split_default_left[leaf_read_pos]; - cuda_leaf_best_split_threshold[smaller_leaf_index] = cuda_leaf_best_split_threshold[leaf_read_pos]; - cuda_leaf_best_split_gain[smaller_leaf_index] = cuda_leaf_best_split_gain[leaf_read_pos]; - cuda_leaf_best_split_left_sum_gradient[smaller_leaf_index] = cuda_leaf_best_split_left_sum_gradient[leaf_read_pos]; - cuda_leaf_best_split_left_sum_hessian[smaller_leaf_index] = cuda_leaf_best_split_left_sum_hessian[leaf_read_pos]; - cuda_leaf_best_split_left_count[smaller_leaf_index] = cuda_leaf_best_split_left_count[leaf_read_pos]; - cuda_leaf_best_split_left_gain[smaller_leaf_index] = cuda_leaf_best_split_left_gain[leaf_read_pos]; - cuda_leaf_best_split_left_output[smaller_leaf_index] = cuda_leaf_best_split_left_output[leaf_read_pos]; - cuda_leaf_best_split_right_sum_gradient[smaller_leaf_index] = cuda_leaf_best_split_right_sum_gradient[leaf_read_pos]; - cuda_leaf_best_split_right_sum_hessian[smaller_leaf_index] = cuda_leaf_best_split_right_sum_hessian[leaf_read_pos]; - cuda_leaf_best_split_right_count[smaller_leaf_index] = cuda_leaf_best_split_right_count[leaf_read_pos]; - cuda_leaf_best_split_right_gain[smaller_leaf_index] = cuda_leaf_best_split_right_gain[leaf_read_pos]; - cuda_leaf_best_split_right_output[smaller_leaf_index] = cuda_leaf_best_split_right_output[leaf_read_pos]; + smaller_leaf_split_info->default_left = other_split_info->default_left; + smaller_leaf_split_info->threshold = other_split_info->threshold; + smaller_leaf_split_info->gain = other_split_info->gain; + smaller_leaf_split_info->left_sum_gradients = other_split_info->left_sum_gradients; + smaller_leaf_split_info->left_sum_hessians = other_split_info->left_sum_hessians; + smaller_leaf_split_info->left_count = other_split_info->left_count; + smaller_leaf_split_info->left_gain = other_split_info->left_gain; + smaller_leaf_split_info->left_value = other_split_info->left_value; + smaller_leaf_split_info->right_sum_gradients = other_split_info->right_sum_gradients; + smaller_leaf_split_info->right_sum_hessians = other_split_info->right_sum_hessians; + smaller_leaf_split_info->right_count = other_split_info->right_count; + smaller_leaf_split_info->right_gain = other_split_info->right_gain; + smaller_leaf_split_info->right_value = other_split_info->right_value; } } } @@ -769,24 +668,26 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( if (blockIdx.x == 1 || larger_only) { for (unsigned int block_index = 1; 
block_index < num_blocks_per_leaf; ++block_index) { const unsigned int leaf_read_pos = static_cast(larger_leaf_index) + block_index * static_cast(num_leaves); - if ((cuda_leaf_best_split_found[leaf_read_pos] == 1 && cuda_leaf_best_split_found[larger_leaf_index] == 1 && - cuda_leaf_best_split_gain[leaf_read_pos] > cuda_leaf_best_split_gain[larger_leaf_index]) || - (cuda_leaf_best_split_found[larger_leaf_index] == 0 && cuda_leaf_best_split_found[leaf_read_pos] == 1)) { - cuda_leaf_best_split_found[larger_leaf_index] = cuda_leaf_best_split_found[leaf_read_pos]; + CUDASplitInfo* larger_leaf_split_info = cuda_leaf_best_split_info + larger_leaf_index; + const CUDASplitInfo* other_split_info = cuda_leaf_best_split_info + leaf_read_pos; + if ((other_split_info->is_valid && larger_leaf_split_info->is_valid && + other_split_info->gain > larger_leaf_split_info->gain) || + (!larger_leaf_split_info->is_valid && other_split_info->is_valid)) { + larger_leaf_split_info->is_valid = other_split_info->is_valid; cuda_leaf_best_split_feature[larger_leaf_index] = cuda_leaf_best_split_feature[leaf_read_pos]; - cuda_leaf_best_split_default_left[larger_leaf_index] = cuda_leaf_best_split_default_left[leaf_read_pos]; - cuda_leaf_best_split_threshold[larger_leaf_index] = cuda_leaf_best_split_threshold[leaf_read_pos]; - cuda_leaf_best_split_gain[larger_leaf_index] = cuda_leaf_best_split_gain[leaf_read_pos]; - cuda_leaf_best_split_left_sum_gradient[larger_leaf_index] = cuda_leaf_best_split_left_sum_gradient[leaf_read_pos]; - cuda_leaf_best_split_left_sum_hessian[larger_leaf_index] = cuda_leaf_best_split_left_sum_hessian[leaf_read_pos]; - cuda_leaf_best_split_left_count[larger_leaf_index] = cuda_leaf_best_split_left_count[leaf_read_pos]; - cuda_leaf_best_split_left_gain[larger_leaf_index] = cuda_leaf_best_split_left_gain[leaf_read_pos]; - cuda_leaf_best_split_left_output[larger_leaf_index] = cuda_leaf_best_split_left_output[leaf_read_pos]; - cuda_leaf_best_split_right_sum_gradient[larger_leaf_index] = cuda_leaf_best_split_right_sum_gradient[leaf_read_pos]; - cuda_leaf_best_split_right_sum_hessian[larger_leaf_index] = cuda_leaf_best_split_right_sum_hessian[leaf_read_pos]; - cuda_leaf_best_split_right_count[larger_leaf_index] = cuda_leaf_best_split_right_count[leaf_read_pos]; - cuda_leaf_best_split_right_gain[larger_leaf_index] = cuda_leaf_best_split_right_gain[leaf_read_pos]; - cuda_leaf_best_split_right_output[larger_leaf_index] = cuda_leaf_best_split_right_output[leaf_read_pos]; + larger_leaf_split_info->default_left = other_split_info->default_left; + larger_leaf_split_info->threshold = other_split_info->threshold; + larger_leaf_split_info->gain = other_split_info->gain; + larger_leaf_split_info->left_sum_gradients = other_split_info->left_sum_gradients; + larger_leaf_split_info->left_sum_hessians = other_split_info->left_sum_hessians; + larger_leaf_split_info->left_count = other_split_info->left_count; + larger_leaf_split_info->left_gain = other_split_info->left_gain; + larger_leaf_split_info->left_value = other_split_info->left_value; + larger_leaf_split_info->right_sum_gradients = other_split_info->right_sum_gradients; + larger_leaf_split_info->right_sum_hessians = other_split_info->right_sum_hessians; + larger_leaf_split_info->right_count = other_split_info->right_count; + larger_leaf_split_info->right_gain = other_split_info->right_gain; + larger_leaf_split_info->right_value = other_split_info->right_value; } } } @@ -813,35 +714,9 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( 
cpu_larger_leaf_index, cuda_num_features_, cuda_leaf_best_split_feature_, - cuda_leaf_best_split_default_left_, - cuda_leaf_best_split_threshold_, - cuda_leaf_best_split_gain_, - cuda_leaf_best_split_left_sum_gradient_, - cuda_leaf_best_split_left_sum_hessian_, - cuda_leaf_best_split_left_count_, - cuda_leaf_best_split_left_gain_, - cuda_leaf_best_split_left_output_, - cuda_leaf_best_split_right_sum_gradient_, - cuda_leaf_best_split_right_sum_hessian_, - cuda_leaf_best_split_right_count_, - cuda_leaf_best_split_right_gain_, - cuda_leaf_best_split_right_output_, - cuda_leaf_best_split_found_, + cuda_leaf_best_split_info_, cuda_task_feature_index_, - cuda_best_split_default_left_, - cuda_best_split_threshold_, - cuda_best_split_gain_, - cuda_best_split_left_sum_gradient_, - cuda_best_split_left_sum_hessian_, - cuda_best_split_left_count_, - cuda_best_split_left_gain_, - cuda_best_split_left_output_, - cuda_best_split_right_sum_gradient_, - cuda_best_split_right_sum_hessian_, - cuda_best_split_right_count_, - cuda_best_split_right_gain_, - cuda_best_split_right_output_, - cuda_best_split_found_, + cuda_best_split_info_, cuda_feature_default_bins_, num_tasks_, num_tasks_aligned, @@ -855,20 +730,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( num_blocks_per_leaf, num_leaves_, cuda_leaf_best_split_feature_, - cuda_leaf_best_split_default_left_, - cuda_leaf_best_split_threshold_, - cuda_leaf_best_split_gain_, - cuda_leaf_best_split_left_sum_gradient_, - cuda_leaf_best_split_left_sum_hessian_, - cuda_leaf_best_split_left_count_, - cuda_leaf_best_split_left_gain_, - cuda_leaf_best_split_left_output_, - cuda_leaf_best_split_right_sum_gradient_, - cuda_leaf_best_split_right_sum_hessian_, - cuda_leaf_best_split_right_count_, - cuda_leaf_best_split_right_gain_, - cuda_leaf_best_split_right_output_, - cuda_leaf_best_split_found_, + cuda_leaf_best_split_info_, false); } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); @@ -877,35 +739,9 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( cpu_larger_leaf_index, cuda_num_features_, cuda_leaf_best_split_feature_, - cuda_leaf_best_split_default_left_, - cuda_leaf_best_split_threshold_, - cuda_leaf_best_split_gain_, - cuda_leaf_best_split_left_sum_gradient_, - cuda_leaf_best_split_left_sum_hessian_, - cuda_leaf_best_split_left_count_, - cuda_leaf_best_split_left_gain_, - cuda_leaf_best_split_left_output_, - cuda_leaf_best_split_right_sum_gradient_, - cuda_leaf_best_split_right_sum_hessian_, - cuda_leaf_best_split_right_count_, - cuda_leaf_best_split_right_gain_, - cuda_leaf_best_split_right_output_, - cuda_leaf_best_split_found_, + cuda_leaf_best_split_info_, cuda_task_feature_index_, - cuda_best_split_default_left_, - cuda_best_split_threshold_, - cuda_best_split_gain_, - cuda_best_split_left_sum_gradient_, - cuda_best_split_left_sum_hessian_, - cuda_best_split_left_count_, - cuda_best_split_left_gain_, - cuda_best_split_left_output_, - cuda_best_split_right_sum_gradient_, - cuda_best_split_right_sum_hessian_, - cuda_best_split_right_count_, - cuda_best_split_right_gain_, - cuda_best_split_right_output_, - cuda_best_split_found_, + cuda_best_split_info_, cuda_feature_default_bins_, num_tasks_, num_tasks_aligned, @@ -919,20 +755,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( num_blocks_per_leaf, num_leaves_, cuda_leaf_best_split_feature_, - cuda_leaf_best_split_default_left_, - cuda_leaf_best_split_threshold_, - cuda_leaf_best_split_gain_, - cuda_leaf_best_split_left_sum_gradient_, - 
cuda_leaf_best_split_left_sum_hessian_, - cuda_leaf_best_split_left_count_, - cuda_leaf_best_split_left_gain_, - cuda_leaf_best_split_left_output_, - cuda_leaf_best_split_right_sum_gradient_, - cuda_leaf_best_split_right_sum_hessian_, - cuda_leaf_best_split_right_count_, - cuda_leaf_best_split_right_gain_, - cuda_leaf_best_split_right_output_, - cuda_leaf_best_split_found_, + cuda_leaf_best_split_info_, true); } } else { @@ -942,35 +765,9 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( cpu_larger_leaf_index, cuda_num_features_, cuda_leaf_best_split_feature_, - cuda_leaf_best_split_default_left_, - cuda_leaf_best_split_threshold_, - cuda_leaf_best_split_gain_, - cuda_leaf_best_split_left_sum_gradient_, - cuda_leaf_best_split_left_sum_hessian_, - cuda_leaf_best_split_left_count_, - cuda_leaf_best_split_left_gain_, - cuda_leaf_best_split_left_output_, - cuda_leaf_best_split_right_sum_gradient_, - cuda_leaf_best_split_right_sum_hessian_, - cuda_leaf_best_split_right_count_, - cuda_leaf_best_split_right_gain_, - cuda_leaf_best_split_right_output_, - cuda_leaf_best_split_found_, + cuda_leaf_best_split_info_, cuda_task_feature_index_, - cuda_best_split_default_left_, - cuda_best_split_threshold_, - cuda_best_split_gain_, - cuda_best_split_left_sum_gradient_, - cuda_best_split_left_sum_hessian_, - cuda_best_split_left_count_, - cuda_best_split_left_gain_, - cuda_best_split_left_output_, - cuda_best_split_right_sum_gradient_, - cuda_best_split_right_sum_hessian_, - cuda_best_split_right_count_, - cuda_best_split_right_gain_, - cuda_best_split_right_output_, - cuda_best_split_found_, + cuda_best_split_info_, cuda_feature_default_bins_, num_tasks_, num_tasks_aligned, @@ -985,36 +782,16 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( num_blocks_per_leaf, num_leaves_, cuda_leaf_best_split_feature_, - cuda_leaf_best_split_default_left_, - cuda_leaf_best_split_threshold_, - cuda_leaf_best_split_gain_, - cuda_leaf_best_split_left_sum_gradient_, - cuda_leaf_best_split_left_sum_hessian_, - cuda_leaf_best_split_left_count_, - cuda_leaf_best_split_left_gain_, - cuda_leaf_best_split_left_output_, - cuda_leaf_best_split_right_sum_gradient_, - cuda_leaf_best_split_right_sum_hessian_, - cuda_leaf_best_split_right_count_, - cuda_leaf_best_split_right_gain_, - cuda_leaf_best_split_right_output_, - cuda_leaf_best_split_found_, + cuda_leaf_best_split_info_, larger_only); } } } __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, - const double* cuda_leaf_best_split_gain, int* out_best_leaf, - const int* cuda_leaf_best_split_feature, const uint32_t* cuda_leaf_best_split_threshold, - const uint32_t* cuda_feature_default_bins, - const double* cuda_leaf_best_split_left_sum_gradient, - const double* cuda_leaf_best_split_left_sum_hessian, - const double* cuda_leaf_best_split_right_sum_gradient, - const double* cuda_leaf_best_split_right_sum_hessian, - const data_size_t* cuda_leaf_best_split_left_count, - const data_size_t* cuda_leaf_best_split_right_count, - const uint8_t* cuda_leaf_best_split_found, + int* out_best_leaf, + const int* cuda_leaf_best_split_feature, + const CUDASplitInfo* cuda_leaf_best_split_info, int* cuda_best_split_info_buffer) { const int cuda_cur_num_leaves_ref = *cuda_cur_num_leaves; __shared__ double thread_best_gain[NUM_THREADS_FIND_BEST_LEAF]; @@ -1028,8 +805,8 @@ __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, const int start = num_leaves_per_thread * threadIdx_x; const int end = min(start + num_leaves_per_thread, 
cuda_cur_num_leaves_ref); for (int leaf_index = threadIdx_x; leaf_index < cuda_cur_num_leaves_ref; leaf_index += cur_num_valid_threads) { - const double leaf_best_gain = cuda_leaf_best_split_gain[leaf_index]; - if (cuda_leaf_best_split_found[leaf_index] && leaf_best_gain > thread_best_gain[threadIdx_x]) { + const double leaf_best_gain = cuda_leaf_best_split_info[leaf_index].gain; + if (cuda_leaf_best_split_info[leaf_index].is_valid && leaf_best_gain > thread_best_gain[threadIdx_x]) { thread_best_gain[threadIdx_x] = leaf_best_gain; thread_best_leaf[threadIdx_x] = leaf_index; } @@ -1045,22 +822,22 @@ __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, __global__ void PrepareLeafBestSplitInfo(const int smaller_leaf_index, const int larger_leaf_index, int* cuda_best_split_info_buffer, const int* cuda_leaf_best_split_feature, - const uint32_t* cuda_leaf_best_split_threshold, const uint8_t* cuda_leaf_best_split_default_left) { + const CUDASplitInfo* cuda_leaf_best_split_info) { const unsigned int threadIdx_x = blockIdx.x; if (threadIdx_x == 0) { cuda_best_split_info_buffer[0] = cuda_leaf_best_split_feature[smaller_leaf_index]; } else if (threadIdx_x == 1) { - cuda_best_split_info_buffer[1] = cuda_leaf_best_split_threshold[smaller_leaf_index]; + cuda_best_split_info_buffer[1] = cuda_leaf_best_split_info[smaller_leaf_index].threshold; } else if (threadIdx_x == 2) { - cuda_best_split_info_buffer[2] = cuda_leaf_best_split_default_left[smaller_leaf_index]; + cuda_best_split_info_buffer[2] = cuda_leaf_best_split_info[smaller_leaf_index].default_left; } if (larger_leaf_index >= 0) { if (threadIdx_x == 3) { cuda_best_split_info_buffer[3] = cuda_leaf_best_split_feature[larger_leaf_index]; } else if (threadIdx_x == 4) { - cuda_best_split_info_buffer[4] = cuda_leaf_best_split_threshold[larger_leaf_index]; + cuda_best_split_info_buffer[4] = cuda_leaf_best_split_info[larger_leaf_index].threshold; } else if (threadIdx_x == 5) { - cuda_best_split_info_buffer[5] = cuda_leaf_best_split_default_left[larger_leaf_index]; + cuda_best_split_info_buffer[5] = cuda_leaf_best_split_info[larger_leaf_index].default_left; } } } @@ -1068,19 +845,13 @@ __global__ void PrepareLeafBestSplitInfo(const int smaller_leaf_index, const int void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, const int smaller_leaf_index, const int larger_leaf_index, std::vector* leaf_best_split_feature, std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index) { - FindBestFromAllSplitsKernel<<<1, NUM_THREADS_FIND_BEST_LEAF, 0, cuda_streams_[1]>>>(cuda_cur_num_leaves, cuda_leaf_best_split_gain_, cuda_best_leaf_, - cuda_leaf_best_split_feature_, cuda_leaf_best_split_threshold_, cuda_feature_default_bins_, - cuda_leaf_best_split_left_sum_gradient_, - cuda_leaf_best_split_left_sum_hessian_, - cuda_leaf_best_split_right_sum_gradient_, - cuda_leaf_best_split_right_sum_hessian_, - cuda_leaf_best_split_left_count_, - cuda_leaf_best_split_right_count_, - cuda_leaf_best_split_found_, + FindBestFromAllSplitsKernel<<<1, NUM_THREADS_FIND_BEST_LEAF, 0, cuda_streams_[1]>>>(cuda_cur_num_leaves, cuda_best_leaf_, + cuda_leaf_best_split_feature_, + cuda_leaf_best_split_info_, cuda_best_split_info_buffer_); PrepareLeafBestSplitInfo<<<6, 1, 0, cuda_streams_[0]>>>(smaller_leaf_index, larger_leaf_index, cuda_best_split_info_buffer_, cuda_leaf_best_split_feature_, - cuda_leaf_best_split_threshold_, cuda_leaf_best_split_default_left_); + cuda_leaf_best_split_info_); 
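  // Illustrative summary of the readback below, hedged: slots 0-5 follow directly from
  // PrepareLeafBestSplitInfo above, while the meaning of slot 6 is an assumption here.
  // cuda_best_split_info_buffer_ is a 7-int staging area so that a single
  // device-to-host copy returns this iteration's split decisions:
  //   [0..2] smaller leaf: best split feature, threshold, default_left
  //   [3..5] larger leaf:  best split feature, threshold, default_left
  //   [6]    assumed to carry the best leaf index chosen by FindBestFromAllSplitsKernel,
  //          which also receives this buffer.
  // Thresholds and default_left flags are now read from the CUDASplitInfo records
  // rather than from the removed per-field arrays.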
std::vector cpu_leaf_best_split_info_buffer(7); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); CopyFromCUDADeviceToHost(cpu_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 7); diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 4783b5296bd6..4dcb6fbcae15 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -51,56 +51,7 @@ class CUDABestSplitFinder { const int* cuda_leaf_best_split_feature() const { return cuda_leaf_best_split_feature_; } - const uint32_t* cuda_leaf_best_split_threshold() const { return cuda_leaf_best_split_threshold_; } - - const uint8_t* cuda_leaf_best_split_default_left() const { return cuda_leaf_best_split_default_left_; } - - const double* cuda_leaf_best_split_gain() const { return cuda_leaf_best_split_gain_; } - - const double* cuda_leaf_best_split_left_sum_gradient() const { return cuda_leaf_best_split_left_sum_gradient_; } - - const double* cuda_leaf_best_split_left_sum_hessian() const { return cuda_leaf_best_split_left_sum_hessian_; } - - const data_size_t* cuda_leaf_best_split_left_count() const { return cuda_leaf_best_split_left_count_; } - - const double* cuda_leaf_best_split_left_gain() const { return cuda_leaf_best_split_left_gain_; } - - const double* cuda_leaf_best_split_left_output() const { return cuda_leaf_best_split_left_output_; } - - const double* cuda_leaf_best_split_right_sum_gradient() const { return cuda_leaf_best_split_right_sum_gradient_; } - - const double* cuda_leaf_best_split_right_sum_hessian() const { return cuda_leaf_best_split_right_sum_hessian_; } - - const data_size_t* cuda_leaf_best_split_right_count() const { return cuda_leaf_best_split_right_count_; } - - const double* cuda_leaf_best_split_right_gain() const { return cuda_leaf_best_split_right_gain_; } - - const double* cuda_leaf_best_split_right_output() const { return cuda_leaf_best_split_right_output_; } - - uint8_t* cuda_leaf_best_split_found() const { return cuda_leaf_best_split_found_; } - - void TestAfterInit() { - PrintLastCUDAError(); - } - - void TestAfterFindBestSplits() { - PrintLastCUDAError(); - const size_t feature_best_split_info_buffer_size = static_cast(num_features_) * 4; - std::vector test_best_split_threshold(feature_best_split_info_buffer_size, 0); - std::vector test_best_split_found(feature_best_split_info_buffer_size, 0); - CopyFromCUDADeviceToHost(test_best_split_threshold.data(), - cuda_best_split_threshold_, feature_best_split_info_buffer_size); - CopyFromCUDADeviceToHost(test_best_split_found.data(), - cuda_best_split_found_, feature_best_split_info_buffer_size); - for (size_t i = 0; i < feature_best_split_info_buffer_size; ++i) { - Log::Warning("test_best_split_threshold[%d] = %d", i, test_best_split_threshold[i]); - Log::Warning("test_best_split_found[%d] = %d", i, test_best_split_found[i]); - } - - int test_best_leaf = 0; - CopyFromCUDADeviceToHost(&test_best_leaf, cuda_best_leaf_, 1); - Log::Warning("test_best_leaf = %d", test_best_leaf); - } + CUDASplitInfo* cuda_leaf_best_split_info() { return cuda_leaf_best_split_info_; } private: void LaunchFindBestSplitsForLeafKernel(const CUDALeafSplits* smaller_leaf_splits, @@ -146,38 +97,10 @@ class CUDABestSplitFinder { // for per leaf best split information int* cuda_best_leaf_; int* cuda_leaf_best_split_feature_; - uint8_t* cuda_leaf_best_split_default_left_; - uint32_t* cuda_leaf_best_split_threshold_; - double* cuda_leaf_best_split_gain_; - double* 
cuda_leaf_best_split_left_sum_gradient_; - double* cuda_leaf_best_split_left_sum_hessian_; - data_size_t* cuda_leaf_best_split_left_count_; - double* cuda_leaf_best_split_left_gain_; - double* cuda_leaf_best_split_left_output_; - double* cuda_leaf_best_split_right_sum_gradient_; - double* cuda_leaf_best_split_right_sum_hessian_; - data_size_t* cuda_leaf_best_split_right_count_; - double* cuda_leaf_best_split_right_gain_; - double* cuda_leaf_best_split_right_output_; - uint8_t* cuda_leaf_best_split_found_; CUDASplitInfo* cuda_leaf_best_split_info_; // for best split information when finding best split - uint8_t* cuda_best_split_default_left_; - uint32_t* cuda_best_split_threshold_; - double* cuda_best_split_gain_; - double* cuda_best_split_left_sum_gradient_; - double* cuda_best_split_left_sum_hessian_; - data_size_t* cuda_best_split_left_count_; - double* cuda_best_split_left_gain_; - double* cuda_best_split_left_output_; - double* cuda_best_split_right_sum_gradient_; - double* cuda_best_split_right_sum_hessian_; - data_size_t* cuda_best_split_right_count_; - double* cuda_best_split_right_gain_; - double* cuda_best_split_right_output_; - uint8_t* cuda_best_split_found_; - int* cuda_num_total_bin_; CUDASplitInfo* cuda_best_split_info_; + int* cuda_num_total_bin_; // TODO(shiyu1994): use prefix sum to accelerate best split finding hist_t* prefix_sum_hist_left_; hist_t* prefix_sum_hist_right_; diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index dcb2b310d9c4..1b31b8398388 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -124,15 +124,8 @@ void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { } void CUDADataPartition::Split(const int* leaf_id, - const double* best_split_gain, const int* best_split_feature, - const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left, - const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, - const double* best_left_gain, const double* best_left_leaf_value, - const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, - uint8_t* best_split_found, + CUDASplitInfo* best_split_info, // for leaf splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -165,11 +158,8 @@ void CUDADataPartition::Split(const int* leaf_id, global_timer.Start("SplitInner"); SplitInner(leaf_id, num_data_in_leaf, - best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, - best_left_sum_gradients, best_left_sum_hessians, best_left_count, - best_left_gain, best_left_leaf_value, - best_right_sum_gradients, best_right_sum_hessians, best_right_count, - best_right_gain, best_right_leaf_value, best_split_found, + best_split_feature, + best_split_info, smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, @@ -192,12 +182,8 @@ void CUDADataPartition::GenDataToLeftBitVector(const data_size_t num_data_in_lea } void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t 
num_data_in_leaf, - const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left, const double* best_split_gain, - const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, - const double* best_left_gain, const double* best_left_leaf_value, - const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, uint8_t* best_split_found, + const int* best_split_feature, + CUDASplitInfo* best_split_info, // for leaf splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -213,11 +199,8 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index) { LaunchSplitInnerKernel(leaf_index, num_data_in_leaf, - best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, - best_left_sum_gradients, best_left_sum_hessians, best_left_count, - best_left_gain, best_left_leaf_value, - best_right_sum_gradients, best_right_sum_hessians, best_right_count, - best_right_gain, best_right_leaf_value, best_split_found, + best_split_feature, + best_split_info, smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index bcbd4046a321..fdac907330fc 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -1123,12 +1123,6 @@ __global__ void AggregateBlockOffsetKernel0(const int* leaf_index, data_size_t* data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, int* cuda_cur_num_leaves, - const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left, const double* best_split_gain, - const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, - const double* best_left_gain, const double* best_left_leaf_value, - const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, // for leaf splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -1208,12 +1202,6 @@ __global__ void AggregateBlockOffsetKernel1(const int* leaf_index, data_size_t* data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, int* cuda_cur_num_leaves, - const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left, const double* best_split_gain, - const double* best_left_sum_gradients, const 
double* best_left_sum_hessians, const data_size_t* best_left_count, - const double* best_left_gain, const double* best_left_leaf_value, - const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, // for leaf splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -1272,12 +1260,8 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, int* cuda_cur_num_leaves, - const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left, const double* best_split_gain, - const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, - const double* best_left_gain, const double* best_left_leaf_value, - const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, uint8_t* best_split_found, + const int* best_split_feature, + CUDASplitInfo* best_split_info, // for leaf splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -1303,32 +1287,33 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo const unsigned int to_left_total_cnt = cuda_leaf_num_data[leaf_index_ref]; double* cuda_split_info_buffer_for_hessians = reinterpret_cast(cuda_split_info_buffer + 8); const unsigned int global_thread_index = blockIdx.x * blockDim.x + threadIdx.x; + const CUDASplitInfo* leaf_split_info = best_split_info + leaf_index_ref; if (global_thread_index == 0) { tree_split_leaf_index[cur_max_leaf_index - 1] = leaf_index_ref; } else if (global_thread_index == 1) { tree_inner_feature_index[cur_max_leaf_index - 1] = best_split_feature[leaf_index_ref]; } else if (global_thread_index == 2) { - tree_threshold[cur_max_leaf_index - 1] = best_split_threshold[leaf_index_ref]; + tree_threshold[cur_max_leaf_index - 1] = leaf_split_info->threshold; } else if (global_thread_index == 3) { - tree_left_output[cur_max_leaf_index - 1] = best_left_leaf_value[leaf_index_ref]; + tree_left_output[cur_max_leaf_index - 1] = leaf_split_info->left_value; } else if (global_thread_index == 4) { - tree_right_output[cur_max_leaf_index - 1] = best_right_leaf_value[leaf_index_ref]; + tree_right_output[cur_max_leaf_index - 1] = leaf_split_info->right_value; } else if (global_thread_index == 5) { - tree_left_count[cur_max_leaf_index - 1] = best_left_count[leaf_index_ref]; + tree_left_count[cur_max_leaf_index - 1] = leaf_split_info->left_count; } else if (global_thread_index == 6) { - tree_right_count[cur_max_leaf_index - 1] = best_right_count[leaf_index_ref]; + tree_right_count[cur_max_leaf_index - 1] = leaf_split_info->right_count; } else if (global_thread_index == 7) { - tree_left_sum_hessian[cur_max_leaf_index - 1] = best_left_sum_hessians[leaf_index_ref]; + tree_left_sum_hessian[cur_max_leaf_index - 1] = 
leaf_split_info->left_sum_hessians; } else if (global_thread_index == 8) { - tree_right_sum_hessian[cur_max_leaf_index - 1] = best_right_sum_hessians[leaf_index_ref]; + tree_right_sum_hessian[cur_max_leaf_index - 1] = leaf_split_info->right_sum_hessians; } else if (global_thread_index == 9) { - tree_gain[cur_max_leaf_index - 1] = best_split_gain[leaf_index_ref]; + tree_gain[cur_max_leaf_index - 1] = leaf_split_info->gain; } else if (global_thread_index == 10) { - tree_default_left[cur_max_leaf_index - 1] = best_split_default_left[leaf_index_ref]; + tree_default_left[cur_max_leaf_index - 1] = leaf_split_info->default_left; } else if (global_thread_index == 11) { - cuda_leaf_output[leaf_index_ref] = best_left_leaf_value[leaf_index_ref]; + cuda_leaf_output[leaf_index_ref] = leaf_split_info->left_value; } else if (global_thread_index == 12) { - cuda_leaf_output[cur_max_leaf_index] = best_right_leaf_value[leaf_index_ref]; + cuda_leaf_output[cur_max_leaf_index] = leaf_split_info->right_value; } else if (global_thread_index == 13) { cuda_split_info_buffer[0] = leaf_index_ref; } else if (global_thread_index == 14) { @@ -1342,15 +1327,15 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo } else if (global_thread_index == 18) { cuda_split_info_buffer[5] = cuda_leaf_data_start[cur_max_leaf_index]; } else if (global_thread_index == 19) { - cuda_split_info_buffer_for_hessians[0] = best_left_sum_hessians[leaf_index_ref]; + cuda_split_info_buffer_for_hessians[0] = leaf_split_info->left_sum_hessians; } else if (global_thread_index == 20) { - cuda_split_info_buffer_for_hessians[1] = best_right_sum_hessians[leaf_index_ref]; + cuda_split_info_buffer_for_hessians[1] = leaf_split_info->right_sum_hessians; } else if (global_thread_index == 21) { - best_split_found[leaf_index_ref] = 0; + best_split_info[leaf_index_ref].is_valid = false; } else if (global_thread_index == 22) { - best_split_found[cur_max_leaf_index] = 0; + best_split_info[cur_max_leaf_index].is_valid = false; } else if (global_thread_index == 23) { - const uint32_t threshold_int = best_split_threshold[leaf_index_ref]; + const uint32_t threshold_int = leaf_split_info->threshold; const int split_inner_feature_index = best_split_feature[leaf_index_ref]; const double threshold_real = cuda_bin_upper_bounds[cuda_feature_num_bin_offsets[split_inner_feature_index] + threshold_int]; tree_threshold_real[cur_max_leaf_index - 1] = threshold_real; @@ -1364,29 +1349,29 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo *smaller_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[leaf_index_ref]; *larger_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[cur_max_leaf_index]; } else if (global_thread_index == 1) { - *smaller_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; + *smaller_leaf_cuda_sum_of_gradients_pointer = leaf_split_info->left_sum_gradients; } else if (global_thread_index == 2) { - *smaller_leaf_cuda_sum_of_hessians_pointer = best_left_sum_hessians[leaf_index_ref]; + *smaller_leaf_cuda_sum_of_hessians_pointer = leaf_split_info->left_sum_hessians; } else if (global_thread_index == 3) { *smaller_leaf_cuda_num_data_in_leaf_pointer = to_left_total_cnt; } else if (global_thread_index == 4) { - *smaller_leaf_cuda_gain_pointer = best_left_gain[leaf_index_ref]; + *smaller_leaf_cuda_gain_pointer = leaf_split_info->left_gain; } else if (global_thread_index == 5) { - *smaller_leaf_cuda_leaf_value_pointer = best_left_leaf_value[leaf_index_ref]; + 
*smaller_leaf_cuda_leaf_value_pointer = leaf_split_info->left_value; } else if (global_thread_index == 6) { *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices; } else if (global_thread_index == 7) { *larger_leaf_cuda_leaf_index_pointer = cur_max_leaf_index; } else if (global_thread_index == 8) { - *larger_leaf_cuda_sum_of_gradients_pointer = best_right_sum_gradients[leaf_index_ref]; + *larger_leaf_cuda_sum_of_gradients_pointer = leaf_split_info->right_sum_gradients; } else if (global_thread_index == 9) { - *larger_leaf_cuda_sum_of_hessians_pointer = best_right_sum_hessians[leaf_index_ref]; + *larger_leaf_cuda_sum_of_hessians_pointer = leaf_split_info->right_sum_hessians; } else if (global_thread_index == 10) { *larger_leaf_cuda_num_data_in_leaf_pointer = cuda_leaf_num_data[cur_max_leaf_index]; } else if (global_thread_index == 11) { - *larger_leaf_cuda_gain_pointer = best_right_gain[leaf_index_ref]; + *larger_leaf_cuda_gain_pointer = leaf_split_info->right_gain; } else if (global_thread_index == 12) { - *larger_leaf_cuda_leaf_value_pointer = best_right_leaf_value[leaf_index_ref]; + *larger_leaf_cuda_leaf_value_pointer = leaf_split_info->right_value; } else if (global_thread_index == 13) { *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_num_data[leaf_index_ref]; } else if (global_thread_index == 14) { @@ -1400,29 +1385,29 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo if (global_thread_index == 0) { *larger_leaf_cuda_leaf_index_pointer = leaf_index_ref; } else if (global_thread_index == 1) { - *larger_leaf_cuda_sum_of_gradients_pointer = best_left_sum_gradients[leaf_index_ref]; + *larger_leaf_cuda_sum_of_gradients_pointer = leaf_split_info->left_sum_gradients; } else if (global_thread_index == 2) { - *larger_leaf_cuda_sum_of_hessians_pointer = best_left_sum_hessians[leaf_index_ref]; + *larger_leaf_cuda_sum_of_hessians_pointer = leaf_split_info->left_sum_hessians; } else if (global_thread_index == 3) { *larger_leaf_cuda_num_data_in_leaf_pointer = to_left_total_cnt; } else if (global_thread_index == 4) { - *larger_leaf_cuda_gain_pointer = best_left_gain[leaf_index_ref]; + *larger_leaf_cuda_gain_pointer = leaf_split_info->left_gain; } else if (global_thread_index == 5) { - *larger_leaf_cuda_leaf_value_pointer = best_left_leaf_value[leaf_index_ref]; + *larger_leaf_cuda_leaf_value_pointer = leaf_split_info->left_value; } else if (global_thread_index == 6) { *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices; } else if (global_thread_index == 7) { *smaller_leaf_cuda_leaf_index_pointer = cur_max_leaf_index; } else if (global_thread_index == 8) { - *smaller_leaf_cuda_sum_of_gradients_pointer = best_right_sum_gradients[leaf_index_ref]; + *smaller_leaf_cuda_sum_of_gradients_pointer = leaf_split_info->right_sum_gradients; } else if (global_thread_index == 9) { - *smaller_leaf_cuda_sum_of_hessians_pointer = best_right_sum_hessians[leaf_index_ref]; + *smaller_leaf_cuda_sum_of_hessians_pointer = leaf_split_info->right_sum_hessians; } else if (global_thread_index == 10) { *smaller_leaf_cuda_num_data_in_leaf_pointer = cuda_leaf_num_data[cur_max_leaf_index]; } else if (global_thread_index == 11) { - *smaller_leaf_cuda_gain_pointer = best_right_gain[leaf_index_ref]; + *smaller_leaf_cuda_gain_pointer = leaf_split_info->right_gain; } else if (global_thread_index == 12) { - *smaller_leaf_cuda_leaf_value_pointer = best_right_leaf_value[leaf_index_ref]; + 
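For readers tracking the signature changes above: the long tail of per-leaf arrays (threshold, default_left, gain, the left/right sums, counts, gains and outputs, plus the best_split_found flag) is collapsed into a single record per leaf, passed as CUDASplitInfo* best_split_info. A minimal self-contained sketch of such a record follows; its member names simply mirror the accesses in the SplitTreeStructureKernel hunks (leaf_split_info->threshold, ->left_value, ->is_valid, ...), while the actual CUDASplitInfo defined elsewhere in this patch series may use a different layout and types, so treat this as an illustration only. Note that the split feature index is still carried in a separate best_split_feature array at this point in the series.

#include <cstdint>

typedef int32_t data_size_t;  // assumed: LightGBM's row-count type

// Hypothetical sketch: one best-split candidate per leaf, indexed as
// best_split_info[leaf_index] by the kernels above.
struct CUDASplitInfo {
  bool is_valid;              // replaces the old best_split_found flag
  uint32_t threshold;         // bin index; mapped to a real value via cuda_bin_upper_bounds
  uint8_t default_left;       // direction taken by missing values
  double gain;                // gain of the chosen split
  // left child statistics
  double left_sum_gradients;
  double left_sum_hessians;
  data_size_t left_count;
  double left_gain;
  double left_value;
  // right child statistics
  double right_sum_gradients;
  double right_sum_hessians;
  data_size_t right_count;
  double right_gain;
  double right_value;
};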
*smaller_leaf_cuda_leaf_value_pointer = leaf_split_info->right_value; } else if (global_thread_index == 13) { *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_num_data[leaf_index_ref]; } else if (global_thread_index == 14) { @@ -1638,12 +1623,8 @@ __global__ void CopyDataIndicesKernel( } void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data_size_t num_data_in_leaf, - const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left, const double* best_split_gain, - const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, - const double* best_left_gain, const double* best_left_leaf_value, - const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, uint8_t* best_split_found, + const int* best_split_feature, + CUDASplitInfo* best_split_info, // for leaf splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -1680,11 +1661,6 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, cuda_cur_num_leaves_, - best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, - best_left_sum_gradients, best_left_sum_hessians, best_left_count, - best_left_gain, best_left_leaf_value, - best_right_sum_gradients, best_right_sum_hessians, best_right_count, - best_right_gain, best_right_leaf_value, smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -1704,11 +1680,6 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, cuda_cur_num_leaves_, - best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, - best_left_sum_gradients, best_left_sum_hessians, best_left_count, - best_left_gain, best_left_leaf_value, - best_right_sum_gradients, best_right_sum_hessians, best_right_count, - best_right_gain, best_right_leaf_value, smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -1740,11 +1711,8 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, cuda_cur_num_leaves_, - best_split_feature, best_split_threshold, best_split_default_left, best_split_gain, - best_left_sum_gradients, best_left_sum_hessians, best_left_count, - best_left_gain, best_left_leaf_value, - best_right_sum_gradients, best_right_sum_hessians, best_right_count, - best_right_gain, best_right_leaf_value, best_split_found, + best_split_feature, + best_split_info, smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, diff --git 
a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index a90b50530464..e3a28eb2082b 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -39,13 +39,8 @@ class CUDADataPartition { void BeforeTrain(const data_size_t* data_indices); - void Split(const int* leaf_id, const double* best_split_gain, const int* best_split_feature, - const uint32_t* best_split_threshold, const uint8_t* best_split_default_left, - const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, - const double* best_left_gain, const double* best_left_leaf_value, - const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, - uint8_t* best_split_found, + void Split(const int* leaf_id, const int* best_split_feature, + CUDASplitInfo* best_split_info, // for splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -119,12 +114,8 @@ class CUDADataPartition { const int left_leaf_index, const int right_leaf_index); void SplitInner(const int* leaf_index, const data_size_t num_data_in_leaf, - const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left, const double* best_split_gain, - const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, - const double* best_left_gain, const double* best_left_leaf_value, - const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, uint8_t* best_split_found, + const int* best_split_feature, + CUDASplitInfo* best_split_info, // for leaf splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, @@ -144,12 +135,8 @@ class CUDADataPartition { void LaunchFillDataIndicesBeforeTrain(); void LaunchSplitInnerKernel(const int* leaf_index, const data_size_t num_data_in_leaf, - const int* best_split_feature, const uint32_t* best_split_threshold, - const uint8_t* best_split_default_left, const double* best_split_gain, - const double* best_left_sum_gradients, const double* best_left_sum_hessians, const data_size_t* best_left_count, - const double* best_left_gain, const double* best_left_leaf_value, - const double* best_right_sum_gradients, const double* best_right_sum_hessians, const data_size_t* best_right_count, - const double* best_right_gain, const double* best_right_leaf_value, uint8_t* best_split_found, + const int* best_split_feature, + CUDASplitInfo* best_split_info, // for leaf splits information update int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 125d6a892120..19e6d8ee7367 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ 
-204,24 +204,8 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, global_timer.Start("NewCUDATreeLearner::Split"); start = std::chrono::steady_clock::now(); cuda_data_partition_->Split(cuda_best_split_finder_->cuda_best_leaf(), - cuda_best_split_finder_->cuda_leaf_best_split_gain(), cuda_best_split_finder_->cuda_leaf_best_split_feature(), - cuda_best_split_finder_->cuda_leaf_best_split_threshold(), - cuda_best_split_finder_->cuda_leaf_best_split_default_left(), - - cuda_best_split_finder_->cuda_leaf_best_split_left_sum_gradient(), - cuda_best_split_finder_->cuda_leaf_best_split_left_sum_hessian(), - cuda_best_split_finder_->cuda_leaf_best_split_left_count(), - cuda_best_split_finder_->cuda_leaf_best_split_left_gain(), - cuda_best_split_finder_->cuda_leaf_best_split_left_output(), - cuda_best_split_finder_->cuda_leaf_best_split_right_sum_gradient(), - cuda_best_split_finder_->cuda_leaf_best_split_right_sum_hessian(), - cuda_best_split_finder_->cuda_leaf_best_split_right_count(), - cuda_best_split_finder_->cuda_leaf_best_split_right_gain(), - cuda_best_split_finder_->cuda_leaf_best_split_right_output(), - - cuda_best_split_finder_->cuda_leaf_best_split_found(), - + cuda_best_split_finder_->cuda_leaf_best_split_info(), cuda_smaller_leaf_splits_->cuda_leaf_index_pointer(), cuda_smaller_leaf_splits_->cuda_sum_of_gradients_pointer(), cuda_smaller_leaf_splits_->cuda_sum_of_hessians_pointer(), From 400622a2aa7d719aa3720f56efc36431665eed82 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 21 Jul 2021 07:33:54 +0000 Subject: [PATCH 040/166] use structure for leaf splits --- .../cuda/cuda_best_split_finder.cpp | 4 +- .../cuda/cuda_best_split_finder.cu | 50 +- .../cuda/cuda_best_split_finder.hpp | 6 +- src/treelearner/cuda/cuda_data_partition.cpp | 50 +- src/treelearner/cuda/cuda_data_partition.cu | 269 ++--------- src/treelearner/cuda/cuda_data_partition.hpp | 36 +- .../cuda/cuda_histogram_constructor.cpp | 13 +- .../cuda/cuda_histogram_constructor.cu | 454 +++--------------- .../cuda/cuda_histogram_constructor.hpp | 56 +-- src/treelearner/cuda/cuda_leaf_splits.cpp | 57 +-- src/treelearner/cuda/cuda_leaf_splits.cu | 65 ++- src/treelearner/cuda/cuda_leaf_splits.hpp | 70 +-- .../cuda/new_cuda_tree_learner.cpp | 37 +- 13 files changed, 250 insertions(+), 917 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 335aca3881c5..bba4d9ae59b6 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -132,8 +132,8 @@ void CUDABestSplitFinder::Init() { void CUDABestSplitFinder::BeforeTrain() {} -void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplits* smaller_leaf_splits, - const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, +void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplitsStruct* smaller_leaf_splits, + const CUDALeafSplitsStruct* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf) { const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ && sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 1c515ebac4b6..42cf165e7155 100644 --- 
a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -392,17 +392,9 @@ __global__ void FindBestSplitsForLeafKernel( const uint8_t* task_out_default_left, // input leaf information const int smaller_leaf_index, - const double* smaller_leaf_gain, - const double* smaller_sum_gradients_in_leaf, - const double* smaller_sum_hessians_in_leaf, - const data_size_t* smaller_num_data_in_leaf, - hist_t** smaller_leaf_hist, + const CUDALeafSplitsStruct* smaller_leaf_splits, const int larger_leaf_index, - const double* larger_leaf_gain, - const double* larger_sum_gradients_in_leaf, - const double* larger_sum_hessians_in_leaf, - const data_size_t* larger_num_data_in_leaf, - hist_t** larger_leaf_hist, + const CUDALeafSplitsStruct* larger_leaf_splits, // input config parameter values const data_size_t min_data_in_leaf, const double min_sum_hessian_in_leaf, @@ -419,13 +411,13 @@ __global__ void FindBestSplitsForLeafKernel( const bool skip_default_bin = static_cast(task_skip_default_bin[task_index]); const bool na_as_missing = static_cast(task_na_as_missing[task_index]); const bool assume_out_default_left = task_out_default_left[task_index]; - const double parent_gain = is_larger ? *larger_leaf_gain : *smaller_leaf_gain; - const double sum_gradients = is_larger ? *larger_sum_gradients_in_leaf : *smaller_sum_gradients_in_leaf; - const double sum_hessians = (is_larger ? *larger_sum_hessians_in_leaf : *smaller_sum_hessians_in_leaf) + 2 * K_EPSILON; - const double num_data = is_larger ? *larger_num_data_in_leaf : *smaller_num_data_in_leaf; + const double parent_gain = is_larger ? larger_leaf_splits->gain : smaller_leaf_splits->gain; + const double sum_gradients = is_larger ? larger_leaf_splits->sum_of_gradients : smaller_leaf_splits->sum_of_gradients; + const double sum_hessians = (is_larger ? larger_leaf_splits->sum_of_hessians : smaller_leaf_splits->sum_of_hessians) + 2 * K_EPSILON; + const double num_data = is_larger ? larger_leaf_splits->num_data_in_leaf : smaller_leaf_splits->num_data_in_leaf; const unsigned int output_offset = is_larger ? (task_index + num_tasks) : task_index; CUDASplitInfo* out = cuda_best_split_info + output_offset; - const hist_t* hist_ptr = (is_larger ? *larger_leaf_hist : *smaller_leaf_hist) + feature_hist_offsets[inner_feature_index] * 2; + const hist_t* hist_ptr = (is_larger ? 
larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + feature_hist_offsets[inner_feature_index] * 2; FindBestSplitsForLeafKernelInner( // input feature information hist_ptr, @@ -455,8 +447,8 @@ __global__ void FindBestSplitsForLeafKernel( } void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( - const CUDALeafSplits* smaller_leaf_splits, - const CUDALeafSplits* larger_leaf_splits, + const CUDALeafSplitsStruct* smaller_leaf_splits, + const CUDALeafSplitsStruct* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, const bool is_smaller_leaf_valid, @@ -486,17 +478,9 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_task_out_default_left_, // input leaf information smaller_leaf_index, - smaller_leaf_splits->cuda_gain(), - smaller_leaf_splits->cuda_sum_of_gradients(), - smaller_leaf_splits->cuda_sum_of_hessians(), - smaller_leaf_splits->cuda_num_data_in_leaf(), - smaller_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), + smaller_leaf_splits, larger_leaf_index, - larger_leaf_splits->cuda_gain(), - larger_leaf_splits->cuda_sum_of_gradients(), - larger_leaf_splits->cuda_sum_of_hessians(), - larger_leaf_splits->cuda_num_data_in_leaf(), - larger_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), + larger_leaf_splits, // configuration parameter values min_data_in_leaf_, min_sum_hessian_in_leaf_, @@ -525,17 +509,9 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_task_out_default_left_, // input leaf information smaller_leaf_index, - smaller_leaf_splits->cuda_gain(), - smaller_leaf_splits->cuda_sum_of_gradients(), - smaller_leaf_splits->cuda_sum_of_hessians(), - smaller_leaf_splits->cuda_num_data_in_leaf(), - smaller_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), + smaller_leaf_splits, larger_leaf_index, - larger_leaf_splits->cuda_gain(), - larger_leaf_splits->cuda_sum_of_gradients(), - larger_leaf_splits->cuda_sum_of_hessians(), - larger_leaf_splits->cuda_num_data_in_leaf(), - larger_leaf_splits->cuda_hist_in_leaf_pointer_pointer(), + larger_leaf_splits, // configuration parameter values min_data_in_leaf_, min_sum_hessian_in_leaf_, diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 4dcb6fbcae15..838294883484 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -38,7 +38,7 @@ class CUDABestSplitFinder { void BeforeTrain(); - void FindBestSplitsForLeaf(const CUDALeafSplits* smaller_leaf_splits, const CUDALeafSplits* larger_leaf_splits, + void FindBestSplitsForLeaf(const CUDALeafSplitsStruct* smaller_leaf_splits, const CUDALeafSplitsStruct* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf); @@ -54,8 +54,8 @@ class CUDABestSplitFinder { CUDASplitInfo* cuda_leaf_best_split_info() { return cuda_leaf_best_split_info_; } private: - void LaunchFindBestSplitsForLeafKernel(const CUDALeafSplits* smaller_leaf_splits, - const CUDALeafSplits* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, + void LaunchFindBestSplitsForLeafKernel(const CUDALeafSplitsStruct* smaller_leaf_splits, + const CUDALeafSplitsStruct* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, const bool is_smaller_leaf_valid, const bool is_larger_leaf_valid); void 
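In the FindBestSplitsForLeafKernel hunk above, parent_gain, sum_gradients, sum_hessians and num_data now come straight out of the leaf-splits struct before the histogram scan. For orientation, the quantity that scan maximizes is the usual second-order split gain; the stand-alone sketch below shows that arithmetic with lambda_l2 as the only regularizer, leaving out L1, min_gain_to_split and the other constraints the real scoring code applies, so it is a simplified stand-in rather than a copy of the CUDA implementation.

#include <cstdio>

// Standard second-order leaf score G^2 / (H + lambda_l2).
static double LeafScore(double sum_gradients, double sum_hessians, double lambda_l2) {
  return (sum_gradients * sum_gradients) / (sum_hessians + lambda_l2);
}

// Leaf output -G / (H + lambda_l2); this is the kind of quantity fields such as
// left_value and right_value of the split record hold, before any shrinkage.
static double LeafOutput(double sum_gradients, double sum_hessians, double lambda_l2) {
  return -sum_gradients / (sum_hessians + lambda_l2);
}

// Gain of splitting a parent with statistics (G_l + G_r, H_l + H_r) into
// children (G_l, H_l) and (G_r, H_r).
static double SplitGain(double G_l, double H_l, double G_r, double H_r, double lambda_l2) {
  const double parent = LeafScore(G_l + G_r, H_l + H_r, lambda_l2);
  return LeafScore(G_l, H_l, lambda_l2) + LeafScore(G_r, H_r, lambda_l2) - parent;
}

int main() {
  // toy numbers: most of the negative gradient mass falls on the left child
  printf("gain = %f, left output = %f\n",
         SplitGain(-12.0, 20.0, 2.0, 5.0, 1.0),
         LeafOutput(-12.0, 20.0, 1.0));
  return 0;
}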
LaunchSyncBestSplitForLeafKernel( diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 1b31b8398388..1f316706ed84 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -127,16 +127,8 @@ void CUDADataPartition::Split(const int* leaf_id, const int* best_split_feature, CUDASplitInfo* best_split_info, // for leaf splits information update - int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, - double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, - double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, - const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** smaller_leaf_cuda_hist_pointer_pointer, - int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, - double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, - double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, + CUDALeafSplitsStruct* smaller_leaf_splits, + CUDALeafSplitsStruct* larger_leaf_splits, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, @@ -160,16 +152,9 @@ void CUDADataPartition::Split(const int* leaf_id, SplitInner(leaf_id, num_data_in_leaf, best_split_feature, best_split_info, - smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, - smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, - smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, - smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - smaller_leaf_cuda_hist_pointer_pointer, - larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, - larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, - larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, - larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_num_data, cpu_leaf_data_start, cpu_leaf_sum_hessians, + smaller_leaf_splits, + larger_leaf_splits, + cpu_leaf_num_data, cpu_leaf_data_start, cpu_leaf_sum_hessians, smaller_leaf_index, larger_leaf_index, cpu_leaf_index); global_timer.Stop("SplitInner"); } @@ -185,32 +170,17 @@ void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_ const int* best_split_feature, CUDASplitInfo* best_split_info, // for leaf splits information update - int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, - double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, - double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, - const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** smaller_leaf_cuda_hist_pointer_pointer, - int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, - double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, - double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** 
larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, + CUDALeafSplitsStruct* smaller_leaf_splits, + CUDALeafSplitsStruct* larger_leaf_splits, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index) { LaunchSplitInnerKernel(leaf_index, num_data_in_leaf, best_split_feature, best_split_info, - smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, - smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, - smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, - smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - smaller_leaf_cuda_hist_pointer_pointer, - larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, - larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, - larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, - larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer, cpu_leaf_num_data, cpu_leaf_data_start, cpu_leaf_sum_hessians, + smaller_leaf_splits, + larger_leaf_splits, + cpu_leaf_num_data, cpu_leaf_data_start, cpu_leaf_sum_hessians, smaller_leaf_index, larger_leaf_index, cpu_leaf_index); ++cur_num_leaves_; } diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index fdac907330fc..6292dc802ada 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -1123,18 +1123,6 @@ __global__ void AggregateBlockOffsetKernel0(const int* leaf_index, data_size_t* data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, int* cuda_cur_num_leaves, - // for leaf splits information update - int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, - double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, - double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, - const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** smaller_leaf_cuda_hist_pointer_pointer, - int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, - double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, - double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, - hist_t* cuda_hist, hist_t** cuda_hist_pool, const data_size_t num_blocks) { __shared__ uint32_t block_to_left_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; @@ -1202,18 +1190,6 @@ __global__ void AggregateBlockOffsetKernel1(const int* leaf_index, data_size_t* data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, int* cuda_cur_num_leaves, - // for leaf splits information update - int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, - double* 
smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, - double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, - const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** smaller_leaf_cuda_hist_pointer_pointer, - int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, - double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, - double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, - hist_t* cuda_hist, hist_t** cuda_hist_pool, const data_size_t num_blocks, const data_size_t num_blocks_aligned) { __shared__ uint32_t block_to_left_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; @@ -1263,16 +1239,8 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo const int* best_split_feature, CUDASplitInfo* best_split_info, // for leaf splits information update - int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, - double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, - double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, - const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** smaller_leaf_cuda_hist_pointer_pointer, - int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, - double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, - double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, + CUDALeafSplitsStruct* smaller_leaf_splits, + CUDALeafSplitsStruct* larger_leaf_splits, const int num_total_bin, hist_t* cuda_hist, hist_t** cuda_hist_pool, const int split_indices_block_size_data_partition, @@ -1346,75 +1314,75 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo hist_t* parent_hist_ptr = cuda_hist_pool[leaf_index_ref]; cuda_hist_pool[cur_max_leaf_index] = parent_hist_ptr; cuda_hist_pool[leaf_index_ref] = cuda_hist + 2 * cur_max_leaf_index * num_total_bin; - *smaller_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[leaf_index_ref]; - *larger_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[cur_max_leaf_index]; + smaller_leaf_splits->hist_in_leaf = cuda_hist_pool[leaf_index_ref]; + larger_leaf_splits->hist_in_leaf = cuda_hist_pool[cur_max_leaf_index]; } else if (global_thread_index == 1) { - *smaller_leaf_cuda_sum_of_gradients_pointer = leaf_split_info->left_sum_gradients; + smaller_leaf_splits->sum_of_gradients = leaf_split_info->left_sum_gradients; } else if (global_thread_index == 2) { - *smaller_leaf_cuda_sum_of_hessians_pointer = leaf_split_info->left_sum_hessians; + smaller_leaf_splits->sum_of_hessians = leaf_split_info->left_sum_hessians; } else if (global_thread_index == 3) { - *smaller_leaf_cuda_num_data_in_leaf_pointer = to_left_total_cnt; + smaller_leaf_splits->num_data_in_leaf = to_left_total_cnt; } else if (global_thread_index == 4) { - *smaller_leaf_cuda_gain_pointer = leaf_split_info->left_gain; + 
smaller_leaf_splits->gain = leaf_split_info->left_gain; } else if (global_thread_index == 5) { - *smaller_leaf_cuda_leaf_value_pointer = leaf_split_info->left_value; + smaller_leaf_splits->leaf_value = leaf_split_info->left_value; } else if (global_thread_index == 6) { - *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices; + smaller_leaf_splits->data_indices_in_leaf = cuda_data_indices; } else if (global_thread_index == 7) { - *larger_leaf_cuda_leaf_index_pointer = cur_max_leaf_index; + larger_leaf_splits->leaf_index = cur_max_leaf_index; } else if (global_thread_index == 8) { - *larger_leaf_cuda_sum_of_gradients_pointer = leaf_split_info->right_sum_gradients; + larger_leaf_splits->sum_of_gradients = leaf_split_info->right_sum_gradients; } else if (global_thread_index == 9) { - *larger_leaf_cuda_sum_of_hessians_pointer = leaf_split_info->right_sum_hessians; + larger_leaf_splits->sum_of_hessians = leaf_split_info->right_sum_hessians; } else if (global_thread_index == 10) { - *larger_leaf_cuda_num_data_in_leaf_pointer = cuda_leaf_num_data[cur_max_leaf_index]; + larger_leaf_splits->num_data_in_leaf = cuda_leaf_num_data[cur_max_leaf_index]; } else if (global_thread_index == 11) { - *larger_leaf_cuda_gain_pointer = leaf_split_info->right_gain; + larger_leaf_splits->gain = leaf_split_info->right_gain; } else if (global_thread_index == 12) { - *larger_leaf_cuda_leaf_value_pointer = leaf_split_info->right_value; + larger_leaf_splits->leaf_value = leaf_split_info->right_value; } else if (global_thread_index == 13) { - *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_num_data[leaf_index_ref]; + larger_leaf_splits->data_indices_in_leaf = cuda_data_indices + cuda_leaf_num_data[leaf_index_ref]; } else if (global_thread_index == 14) { cuda_split_info_buffer[6] = leaf_index_ref; } else if (global_thread_index == 15) { cuda_split_info_buffer[7] = cur_max_leaf_index; } else if (global_thread_index == 16) { - *smaller_leaf_cuda_leaf_index_pointer = leaf_index_ref; + smaller_leaf_splits->leaf_index = leaf_index_ref; } } else { if (global_thread_index == 0) { - *larger_leaf_cuda_leaf_index_pointer = leaf_index_ref; + larger_leaf_splits->leaf_index = leaf_index_ref; } else if (global_thread_index == 1) { - *larger_leaf_cuda_sum_of_gradients_pointer = leaf_split_info->left_sum_gradients; + larger_leaf_splits->sum_of_gradients = leaf_split_info->left_sum_gradients; } else if (global_thread_index == 2) { - *larger_leaf_cuda_sum_of_hessians_pointer = leaf_split_info->left_sum_hessians; + larger_leaf_splits->sum_of_hessians = leaf_split_info->left_sum_hessians; } else if (global_thread_index == 3) { - *larger_leaf_cuda_num_data_in_leaf_pointer = to_left_total_cnt; + larger_leaf_splits->num_data_in_leaf = to_left_total_cnt; } else if (global_thread_index == 4) { - *larger_leaf_cuda_gain_pointer = leaf_split_info->left_gain; + larger_leaf_splits->gain = leaf_split_info->left_gain; } else if (global_thread_index == 5) { - *larger_leaf_cuda_leaf_value_pointer = leaf_split_info->left_value; + larger_leaf_splits->leaf_value = leaf_split_info->left_value; } else if (global_thread_index == 6) { - *larger_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices; + larger_leaf_splits->data_indices_in_leaf = cuda_data_indices; } else if (global_thread_index == 7) { - *smaller_leaf_cuda_leaf_index_pointer = cur_max_leaf_index; + smaller_leaf_splits->leaf_index = cur_max_leaf_index; } else if (global_thread_index == 8) { - 
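This is where the patch's subject, "use structure for leaf splits", becomes visible: the eight separate smaller_/larger_leaf_cuda_*_pointer arguments turn into plain member writes on two CUDALeafSplitsStruct objects. Judging only from the fields read and written in these hunks, a self-contained sketch of the struct looks like the following; the typedefs are assumptions made for compilation, and the real definition (in cuda_leaf_splits.hpp, per the diffstat) may carry additional members.

#include <cstdint>

typedef int32_t data_size_t;  // assumed: LightGBM row-count type
typedef double hist_t;        // assumed: histogram entry type

// Hypothetical sketch: per-leaf state shared by the data partition, the
// histogram constructor and the best-split finder. Members are exactly those
// touched in the hunks above and below.
struct CUDALeafSplitsStruct {
  int leaf_index;
  double sum_of_gradients;
  double sum_of_hessians;
  data_size_t num_data_in_leaf;
  double gain;
  double leaf_value;
  const data_size_t* data_indices_in_leaf;  // slice of the global data index buffer
  hist_t* hist_in_leaf;                     // histogram buffer taken from cuda_hist_pool
};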
*smaller_leaf_cuda_sum_of_gradients_pointer = leaf_split_info->right_sum_gradients; + smaller_leaf_splits->sum_of_gradients = leaf_split_info->right_sum_gradients; } else if (global_thread_index == 9) { - *smaller_leaf_cuda_sum_of_hessians_pointer = leaf_split_info->right_sum_hessians; + smaller_leaf_splits->sum_of_hessians = leaf_split_info->right_sum_hessians; } else if (global_thread_index == 10) { - *smaller_leaf_cuda_num_data_in_leaf_pointer = cuda_leaf_num_data[cur_max_leaf_index]; + smaller_leaf_splits->num_data_in_leaf = cuda_leaf_num_data[cur_max_leaf_index]; } else if (global_thread_index == 11) { - *smaller_leaf_cuda_gain_pointer = leaf_split_info->right_gain; + smaller_leaf_splits->gain = leaf_split_info->right_gain; } else if (global_thread_index == 12) { - *smaller_leaf_cuda_leaf_value_pointer = leaf_split_info->right_value; + smaller_leaf_splits->leaf_value = leaf_split_info->right_value; } else if (global_thread_index == 13) { - *smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer = cuda_data_indices + cuda_leaf_num_data[leaf_index_ref]; + smaller_leaf_splits->data_indices_in_leaf = cuda_data_indices + cuda_leaf_num_data[leaf_index_ref]; } else if (global_thread_index == 14) { cuda_hist_pool[cur_max_leaf_index] = cuda_hist + 2 * cur_max_leaf_index * num_total_bin; - *smaller_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[cur_max_leaf_index]; + smaller_leaf_splits->hist_in_leaf = cuda_hist_pool[cur_max_leaf_index]; } else if (global_thread_index == 15) { - *larger_leaf_cuda_hist_pointer_pointer = cuda_hist_pool[leaf_index_ref]; + larger_leaf_splits->hist_in_leaf = cuda_hist_pool[leaf_index_ref]; } else if (global_thread_index == 16) { cuda_split_info_buffer[6] = cur_max_leaf_index; } else if (global_thread_index == 17) { @@ -1493,124 +1461,6 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ } } -__global__ void SplitInnerKernel2(const int* leaf_index, const int* cuda_cur_num_leaves, - const data_size_t* cuda_leaf_data_start, const data_size_t* cuda_leaf_num_data, - const data_size_t* cuda_data_indices, const uint8_t* split_to_left_bit_vector, - const data_size_t* block_to_left_offset_buffer, const data_size_t* block_to_right_offset_buffer, - data_size_t* out_data_indices_in_leaf, const int split_indices_block_size_data_partition) { - __shared__ uint16_t thread_to_left_pos[(SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION << 1) + 1 + - ((SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION << 1) + 2) / NUM_BANKS_DATA_PARTITION]; - __shared__ uint16_t thread_to_right_pos[(SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION << 1)]; - const int leaf_index_ref = *leaf_index; - const data_size_t leaf_num_data_offset = cuda_leaf_data_start[leaf_index_ref]; - const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[leaf_index_ref] + cuda_leaf_num_data[(*cuda_cur_num_leaves) - 1]; - const unsigned int threadIdx_x = threadIdx.x; - const unsigned int blockDim_x = blockDim.x; - const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); - const unsigned int global_thread_index = blockIdx.x * blockDim_x * 2 + threadIdx_x; - const data_size_t* cuda_data_indices_in_leaf = cuda_data_indices + leaf_num_data_offset; - const uint32_t* split_to_left_bit_vector_uint32 = reinterpret_cast(split_to_left_bit_vector); - const uint32_t bit32_0 = split_to_left_bit_vector_uint32[global_thread_index]; - const uint8_t bit_0 = static_cast(bit32_0 & 0xf); - uint8_t bit_1 = static_cast((bit32_0 >> 8) & 0xf); - uint8_t bit_2 = static_cast((bit32_0 >> 16) & 0xf); - 
uint8_t bit_3 = static_cast((bit32_0 >> 24) & 0xf); - const uint8_t bit_1_acc = bit_1 + bit_0; - const uint8_t bit_2_acc = bit_1_acc + bit_2; - const uint8_t bit_3_acc = bit_2_acc + bit_3; - thread_to_left_pos[conflict_free_threadIdx_x_plus_1] = bit_3_acc; - const unsigned int conflict_free_threadIdx_x_plus_blockDim_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + blockDim_x + 1); - const unsigned int global_thread_index_plus_blockDim_x = global_thread_index + blockDim_x; - const uint32_t bit32_1 = split_to_left_bit_vector_uint32[global_thread_index_plus_blockDim_x]; - const uint8_t bit_4 = static_cast(bit32_1 & 0xf); - uint8_t bit_5 = static_cast((bit32_1 >> 8) & 0xf); - uint8_t bit_6 = static_cast((bit32_1 >> 16) & 0xf); - uint8_t bit_7 = static_cast((bit32_1 >> 24) & 0xf); - const uint8_t bit_5_acc = bit_4 + bit_5; - const uint8_t bit_6_acc = bit_5_acc + bit_6; - const uint8_t bit_7_acc = bit_6_acc + bit_7; - thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] = bit_7_acc; - __syncthreads(); - const uint32_t to_right_block_offset = block_to_right_offset_buffer[blockIdx.x]; - const uint32_t to_left_block_offset = block_to_left_offset_buffer[blockIdx.x]; - if (threadIdx_x == 0) { - thread_to_left_pos[0] = 0; - thread_to_right_pos[0] = 0; - } - __syncthreads(); - PrefixSum(thread_to_left_pos, (split_indices_block_size_data_partition << 1)); - __syncthreads(); - if (threadIdx_x > 0) { - thread_to_right_pos[threadIdx_x] = ((threadIdx_x * 4) - thread_to_left_pos[conflict_free_threadIdx_x_plus_1]); - } - thread_to_right_pos[threadIdx_x + blockDim_x] = (((threadIdx_x + blockDim_x) * 4) - thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]); - __syncthreads(); - data_size_t* left_out_data_indices_in_leaf = out_data_indices_in_leaf + to_left_block_offset; - data_size_t* right_out_data_indices_in_leaf = out_data_indices_in_leaf + to_right_block_offset; - const data_size_t global_thread_index_base = global_thread_index * 4; - const data_size_t global_thread_index_plus_blockDim_x_base = global_thread_index_plus_blockDim_x * 4; - const uint16_t to_left_pos_offset_0 = thread_to_left_pos[conflict_free_threadIdx_x_plus_1]; - const uint16_t to_right_pos_offset_0 = thread_to_right_pos[threadIdx_x]; - const uint16_t to_left_pos_offset_1 = thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]; - const uint16_t to_right_pos_offset_1 = thread_to_right_pos[threadIdx_x + blockDim_x]; - if (global_thread_index_base < num_data_in_leaf_ref) { - if (bit_0 == 1) { - left_out_data_indices_in_leaf[to_left_pos_offset_0] = cuda_data_indices_in_leaf[global_thread_index_base]; - } else { - right_out_data_indices_in_leaf[to_right_pos_offset_0] = cuda_data_indices_in_leaf[global_thread_index_base]; - } - } - if (global_thread_index_base + 1 < num_data_in_leaf_ref) { - if (bit_1 == 1) { - left_out_data_indices_in_leaf[to_left_pos_offset_0 + bit_0] = cuda_data_indices_in_leaf[global_thread_index_base + 1]; - } else { - right_out_data_indices_in_leaf[to_right_pos_offset_0 + 1 - bit_0] = cuda_data_indices_in_leaf[global_thread_index_base + 1]; - } - } - if (global_thread_index_base + 2 < num_data_in_leaf_ref) { - if (bit_2 == 1) { - left_out_data_indices_in_leaf[to_left_pos_offset_0 + bit_1_acc] = cuda_data_indices_in_leaf[global_thread_index_base + 2]; - } else { - right_out_data_indices_in_leaf[to_right_pos_offset_0 + 2 - bit_1_acc] = cuda_data_indices_in_leaf[global_thread_index_base + 2]; - } - } - if (global_thread_index_base + 3 < num_data_in_leaf_ref) { - if (bit_3 == 1) { - 
left_out_data_indices_in_leaf[to_left_pos_offset_0 + bit_2_acc] = cuda_data_indices_in_leaf[global_thread_index_base + 3]; - } else { - right_out_data_indices_in_leaf[to_right_pos_offset_0 + 3 - bit_2_acc] = cuda_data_indices_in_leaf[global_thread_index_base + 3]; - } - } - if (global_thread_index_plus_blockDim_x_base < num_data_in_leaf_ref) { - if (bit_4 == 1) { - left_out_data_indices_in_leaf[to_left_pos_offset_1] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base]; - } else { - right_out_data_indices_in_leaf[to_right_pos_offset_1] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base]; - } - } - if (global_thread_index_plus_blockDim_x_base + 1 < num_data_in_leaf_ref) { - if (bit_5 == 1) { - left_out_data_indices_in_leaf[to_left_pos_offset_1 + bit_4] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base + 1]; - } else { - right_out_data_indices_in_leaf[to_right_pos_offset_1 + 1 - bit_4] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base + 1]; - } - } - if (global_thread_index_plus_blockDim_x_base + 2 < num_data_in_leaf_ref) { - if (bit_6 == 1) { - left_out_data_indices_in_leaf[to_left_pos_offset_1 + bit_5_acc] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base + 2]; - } else { - right_out_data_indices_in_leaf[to_right_pos_offset_1 + 2 - bit_5_acc] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base + 2]; - } - } - if (global_thread_index_plus_blockDim_x_base + 3 < num_data_in_leaf_ref) { - if (bit_7 == 1) { - left_out_data_indices_in_leaf[to_left_pos_offset_1 + bit_6_acc] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base + 3]; - } else { - right_out_data_indices_in_leaf[to_right_pos_offset_1 + 3 - bit_6_acc] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x_base + 3]; - } - } -} - __global__ void CopyDataIndicesKernel( const data_size_t num_data_in_leaf, const data_size_t* out_data_indices_in_leaf, @@ -1626,16 +1476,8 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data const int* best_split_feature, CUDASplitInfo* best_split_info, // for leaf splits information update - int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, - double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, - double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, - const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** smaller_leaf_cuda_hist_pointer_pointer, - int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, - double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, - double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, + CUDALeafSplitsStruct* smaller_leaf_splits, + CUDALeafSplitsStruct* larger_leaf_splits, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index) { @@ -1661,38 +1503,12 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, cuda_cur_num_leaves_, - - 
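The SplitInnerKernel2 variant deleted above (and the SplitInnerKernel that remains) both rest on the same idea: every row in the leaf gets a "goes left" bit, a prefix sum over those bits gives each left-bound row a stable slot, and position minus left-count gives the slot of each right-bound row. The host-side sketch below shows only that index arithmetic, without the shared-memory banking, four-rows-per-thread unrolling or per-block offset buffers of the CUDA kernels, so it is an illustration of the partition scheme rather than a drop-in replacement.

#include <cstdint>
#include <cstdio>
#include <vector>

typedef int32_t data_size_t;  // assumed: LightGBM row-count type

// Stable partition of data_indices by a per-row "goes left" bit, using an
// exclusive prefix sum to compute output positions -- the serial analogue of
// SplitInnerKernel's block-local scan plus block offsets.
static void PartitionByBit(const std::vector<data_size_t>& data_indices,
                           const std::vector<uint8_t>& to_left_bit,
                           std::vector<data_size_t>* out_indices) {
  const size_t n = data_indices.size();
  out_indices->resize(n);
  // exclusive prefix sum of the to-left bits
  std::vector<data_size_t> left_pos(n, 0);
  for (size_t i = 1; i < n; ++i) {
    left_pos[i] = left_pos[i - 1] + static_cast<data_size_t>(to_left_bit[i - 1]);
  }
  const data_size_t num_left =
      n == 0 ? 0 : left_pos[n - 1] + static_cast<data_size_t>(to_left_bit[n - 1]);
  for (size_t i = 0; i < n; ++i) {
    if (to_left_bit[i]) {
      (*out_indices)[left_pos[i]] = data_indices[i];
    } else {
      // rows not sent left keep their relative order in the right block
      (*out_indices)[num_left + static_cast<data_size_t>(i) - left_pos[i]] = data_indices[i];
    }
  }
}

int main() {
  std::vector<data_size_t> idx = {10, 11, 12, 13, 14, 15};
  std::vector<uint8_t> bit = {1, 0, 1, 1, 0, 0};
  std::vector<data_size_t> out;
  PartitionByBit(idx, bit, &out);
  for (data_size_t v : out) printf("%d ", v);  // prints: 10 12 13 11 14 15
  printf("\n");
  return 0;
}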
smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, - smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, - smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, - smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - smaller_leaf_cuda_hist_pointer_pointer, - larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, - larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, - larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, - larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer, - cuda_hist_, - cuda_hist_pool_, num_blocks_final); } else { AggregateBlockOffsetKernel1<<<1, num_blocks_final_aligned, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, cuda_cur_num_leaves_, - - smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, - smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, - smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, - smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - smaller_leaf_cuda_hist_pointer_pointer, - larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, - larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, - larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, - larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer, - cuda_hist_, - cuda_hist_pool_, num_blocks_final, num_blocks_final_aligned); } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); @@ -1713,17 +1529,8 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data cuda_cur_num_leaves_, best_split_feature, best_split_info, - - smaller_leaf_cuda_leaf_index_pointer, smaller_leaf_cuda_sum_of_gradients_pointer, - smaller_leaf_cuda_sum_of_hessians_pointer, smaller_leaf_cuda_num_data_in_leaf_pointer, - smaller_leaf_cuda_gain_pointer, smaller_leaf_cuda_leaf_value_pointer, - smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - smaller_leaf_cuda_hist_pointer_pointer, - larger_leaf_cuda_leaf_index_pointer, larger_leaf_cuda_sum_of_gradients_pointer, - larger_leaf_cuda_sum_of_hessians_pointer, larger_leaf_cuda_num_data_in_leaf_pointer, - larger_leaf_cuda_gain_pointer, larger_leaf_cuda_leaf_value_pointer, - larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - larger_leaf_cuda_hist_pointer_pointer, + smaller_leaf_splits, + larger_leaf_splits, num_total_bin_, cuda_hist_, cuda_hist_pool_, split_indices_block_size_data_partition_aligned, diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index e3a28eb2082b..ccbfd6bedd84 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -42,16 +42,8 @@ class CUDADataPartition { void Split(const int* leaf_id, const int* best_split_feature, CUDASplitInfo* best_split_info, // for splits information update - int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, - double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, - double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, - const data_size_t** 
smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** smaller_leaf_cuda_hist_pointer_pointer, - int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, - double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, - double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, + CUDALeafSplitsStruct* smaller_leaf_splits, + CUDALeafSplitsStruct* larger_leaf_splits, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, @@ -117,16 +109,8 @@ class CUDADataPartition { const int* best_split_feature, CUDASplitInfo* best_split_info, // for leaf splits information update - int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, - double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, - double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, - const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** smaller_leaf_cuda_hist_pointer_pointer, - int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, - double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, - double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, + CUDALeafSplitsStruct* smaller_leaf_splits, + CUDALeafSplitsStruct* larger_leaf_splits, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index); @@ -138,16 +122,8 @@ class CUDADataPartition { const int* best_split_feature, CUDASplitInfo* best_split_info, // for leaf splits information update - int* smaller_leaf_cuda_leaf_index_pointer, double* smaller_leaf_cuda_sum_of_gradients_pointer, - double* smaller_leaf_cuda_sum_of_hessians_pointer, data_size_t* smaller_leaf_cuda_num_data_in_leaf_pointer, - double* smaller_leaf_cuda_gain_pointer, double* smaller_leaf_cuda_leaf_value_pointer, - const data_size_t** smaller_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** smaller_leaf_cuda_hist_pointer_pointer, - int* larger_leaf_cuda_leaf_index_pointer, double* larger_leaf_cuda_sum_of_gradients_pointer, - double* larger_leaf_cuda_sum_of_hessians_pointer, data_size_t* larger_leaf_cuda_num_data_in_leaf_pointer, - double* larger_leaf_cuda_gain_pointer, double* larger_leaf_cuda_leaf_value_pointer, - const data_size_t** larger_leaf_cuda_data_indices_in_leaf_pointer_pointer, - hist_t** larger_leaf_cuda_hist_pointer_pointer, + CUDALeafSplitsStruct* smaller_leaf_splits, + CUDALeafSplitsStruct* larger_leaf_splits, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index d62e5d09092d..98fe0daf582e 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -288,23 +288,18 @@ void 
CUDAHistogramConstructor::PushOneData(const uint32_t feature_bin_value, data_[index] = feature_bin_value_uint8; } -void CUDAHistogramConstructor::ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const data_size_t* cuda_num_data_in_smaller_leaf, - const int* cuda_larger_leaf_index, const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** /*cuda_data_indices_in_larger_leaf*/, - const double* cuda_smaller_leaf_sum_gradients, const double* cuda_smaller_leaf_sum_hessians, hist_t** cuda_smaller_leaf_hist, - const double* cuda_larger_leaf_sum_gradients, const double* cuda_larger_leaf_sum_hessians, hist_t** cuda_larger_leaf_hist, +void CUDAHistogramConstructor::ConstructHistogramForLeaf( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const CUDALeafSplitsStruct* cuda_larger_leaf_splits, const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf) { if ((num_data_in_smaller_leaf <= min_data_in_leaf_ || sum_hessians_in_smaller_leaf <= min_sum_hessian_in_leaf_) && (num_data_in_larger_leaf <= min_data_in_leaf_ || sum_hessians_in_larger_leaf <= min_sum_hessian_in_leaf_)) { return; } - LaunchConstructHistogramKernel(cuda_smaller_leaf_index, cuda_num_data_in_smaller_leaf, - cuda_data_indices_in_smaller_leaf, cuda_leaf_num_data, cuda_smaller_leaf_hist, num_data_in_smaller_leaf); + LaunchConstructHistogramKernel(cuda_smaller_leaf_splits, cuda_leaf_num_data, num_data_in_smaller_leaf); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Start("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); - LaunchSubtractHistogramKernel(cuda_smaller_leaf_index, - cuda_larger_leaf_index, cuda_smaller_leaf_sum_gradients, cuda_smaller_leaf_sum_hessians, - cuda_larger_leaf_sum_gradients, cuda_larger_leaf_sum_hessians, cuda_smaller_leaf_hist, cuda_larger_leaf_hist); + LaunchSubtractHistogramKernel(cuda_smaller_leaf_splits, cuda_larger_leaf_splits); global_timer.Stop("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 4dbb1509687d..061c23eaae6f 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -79,11 +79,9 @@ __device__ void ReduceSumHistogramConstructorMerge(hist_t* array, const size_t s template __global__ void CUDAConstructHistogramDenseKernel( - const int* leaf_index, + const CUDALeafSplitsStruct* smaller_leaf_splits, const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t** data_indices_ptr, - hist_t** feature_histogram, const int* num_feature_groups, const data_size_t* leaf_num_data, const BIN_TYPE* data, @@ -92,11 +90,11 @@ __global__ void CUDAConstructHistogramDenseKernel( const int* feature_partition_column_index_offsets, const data_size_t num_data) { - const int leaf_index_ref = *leaf_index; + const int leaf_index_ref = smaller_leaf_splits->leaf_index; const int dim_y = static_cast(gridDim.y * blockDim.y); const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf_ref + dim_y - 1) / dim_y; - const data_size_t* data_indices_ref = *data_indices_ptr; + const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; __shared__ float 
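ConstructHistogramForLeaf above only builds the smaller leaf's histogram directly and then calls LaunchSubtractHistogramKernel: the larger sibling's histogram is obtained as parent minus smaller, bin by bin, which is the usual histogram-subtraction trick. A host-side sketch of that elementwise operation is shown below, assuming hist_t is double and that gradient and hessian sums are interleaved (two entries per bin) as in the kernels here; the CUDA version presumably spreads the same per-bin subtraction across threads and reuses the parent's buffer, which the cuda_hist_pool swap above hands to the larger leaf.

#include <vector>

typedef double hist_t;  // assumed: histogram entry type

// The subtraction trick: given the parent's histogram and the histogram of the
// smaller child, the larger child's histogram is parent - smaller, bin by bin.
// Histograms store gradient and hessian sums interleaved: two entries per bin.
static void SubtractHistogram(const hist_t* parent_hist,
                              const hist_t* smaller_leaf_hist,
                              hist_t* larger_leaf_hist,
                              int num_total_bin) {
  for (int i = 0; i < 2 * num_total_bin; ++i) {
    larger_leaf_hist[i] = parent_hist[i] - smaller_leaf_hist[i];
  }
}

int main() {
  const int num_total_bin = 2;
  std::vector<hist_t> parent = {5.0, 9.0, 3.0, 4.0};   // (grad, hess) pairs for 2 bins
  std::vector<hist_t> smaller = {2.0, 3.5, 1.0, 1.5};
  std::vector<hist_t> larger(4, 0.0);
  SubtractHistogram(parent.data(), smaller.data(), larger.data(), num_total_bin);
  // larger now holds {3.0, 5.5, 2.0, 2.5}
  return 0;
}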
shared_hist[SHRAE_HIST_SIZE]; const unsigned int num_threads_per_block = blockDim.x * blockDim.y; const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; @@ -136,7 +134,7 @@ __global__ void CUDAConstructHistogramDenseKernel( } } __syncthreads(); - hist_t* feature_histogram_ptr = (*feature_histogram) + (partition_hist_start << 1); + hist_t* feature_histogram_ptr = smaller_leaf_splits->hist_in_leaf + (partition_hist_start << 1); for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); } @@ -144,11 +142,9 @@ __global__ void CUDAConstructHistogramDenseKernel( template __global__ void CUDAConstructHistogramSparseKernel( - const int* leaf_index, + const CUDALeafSplitsStruct* smaller_leaf_splits, const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t** data_indices_ptr, - hist_t** feature_histogram, const int* num_feature_groups, const data_size_t* leaf_num_data, const BIN_TYPE* data, @@ -157,11 +153,11 @@ __global__ void CUDAConstructHistogramSparseKernel( const uint32_t* column_hist_offsets_full, const data_size_t num_data) { - const int leaf_index_ref = *leaf_index; + const int leaf_index_ref = smaller_leaf_splits->leaf_index; const int dim_y = static_cast(gridDim.y * blockDim.y); const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf_ref + dim_y - 1) / dim_y; - const data_size_t* data_indices_ref = *data_indices_ptr; + const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; __shared__ float shared_hist[SHRAE_HIST_SIZE]; const unsigned int num_threads_per_block = blockDim.x * blockDim.y; const DATA_PTR_TYPE* block_row_ptr = row_ptr + blockIdx.x * (num_data + 1); @@ -200,177 +196,15 @@ __global__ void CUDAConstructHistogramSparseKernel( inner_data_index += blockDim.y; } __syncthreads(); - hist_t* feature_histogram_ptr = (*feature_histogram) + (partition_hist_start << 1); + hist_t* feature_histogram_ptr = smaller_leaf_splits->hist_in_leaf + (partition_hist_start << 1); for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); } } -template -__global__ void CUDAConstructHistogramDenseKernel2( - const int* leaf_index, - const score_t* cuda_gradients, - const score_t* cuda_hessians, - const data_size_t** data_indices_ptr, - const int* num_feature_groups, - const data_size_t* leaf_num_data, - const BIN_TYPE* data, - const uint32_t* column_hist_offsets, - const uint32_t* column_hist_offsets_full, - const int* feature_partition_column_index_offsets, - const data_size_t num_data, - hist_t* histogram_buffer, - const int total_num_bin) { - - const int leaf_index_ref = *leaf_index; - const int dim_y = static_cast(gridDim.y * blockDim.y); - const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; - const data_size_t num_data_per_thread = (num_data_in_smaller_leaf_ref + dim_y - 1) / dim_y; - const data_size_t* data_indices_ref = *data_indices_ptr; - __shared__ float shared_hist[SHRAE_HIST_SIZE]; - const unsigned int num_threads_per_block = blockDim.x * blockDim.y; - const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; - const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1]; - const BIN_TYPE* data_ptr = data + partition_column_start * num_data; - const int 
num_columns_in_partition = partition_column_end - partition_column_start; - const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; - const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; - const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; - const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; - for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { - shared_hist[i] = 0.0f; - } - __syncthreads(); - const unsigned int threadIdx_y = threadIdx.y; - const unsigned int blockIdx_y = blockIdx.y; - const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; - const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; - data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf_ref - block_start, num_data_per_thread * static_cast(blockDim.y))); - const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; - const data_size_t remainder = block_num_data % blockDim.y; - const data_size_t num_iteration_this = remainder == 0 ? num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); - data_size_t inner_data_index = static_cast(threadIdx_y); - const int column_index = static_cast(threadIdx.x) + partition_column_start; - if (threadIdx.x < static_cast(num_columns_in_partition)) { - float* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); - for (data_size_t i = 0; i < num_iteration_this; ++i) { - const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; - const score_t grad = cuda_gradients[data_index]; - const score_t hess = cuda_hessians[data_index]; - const uint32_t bin = static_cast(data_ptr[data_index * num_columns_in_partition + threadIdx.x]); - const uint32_t pos = bin << 1; - float* pos_ptr = shared_hist_ptr + pos; - atomicAdd_block(pos_ptr, grad); - atomicAdd_block(pos_ptr + 1, hess); - inner_data_index += blockDim.y; - } - } - __syncthreads(); - hist_t* feature_histogram_ptr = histogram_buffer + total_num_bin * (blockIdx.y % USED_HISTOGRAM_BUFFER_NUM) * 2 + (partition_hist_start << 1); - for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { - atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); - } -} - -template -__global__ void CUDAConstructHistogramSparseKernel2( - const int* leaf_index, - const score_t* cuda_gradients, - const score_t* cuda_hessians, - const data_size_t** data_indices_ptr, - const int* num_feature_groups, - const data_size_t* leaf_num_data, - const BIN_TYPE* data, - const DATA_PTR_TYPE* row_ptr, - const DATA_PTR_TYPE* partition_ptr, - const uint32_t* column_hist_offsets_full, - const data_size_t num_data, - hist_t* histogram_buffer, - const int total_num_bin) { - - const int leaf_index_ref = *leaf_index; - const int dim_y = static_cast(gridDim.y * blockDim.y); - const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; - const data_size_t num_data_per_thread = (num_data_in_smaller_leaf_ref + dim_y - 1) / dim_y; - const data_size_t* data_indices_ref = *data_indices_ptr; - __shared__ float shared_hist[SHRAE_HIST_SIZE]; - const unsigned int num_threads_per_block = blockDim.x * blockDim.y; - const DATA_PTR_TYPE* block_row_ptr = row_ptr + blockIdx.x * (num_data + 1); - const BIN_TYPE* data_ptr = data + partition_ptr[blockIdx.x]; - const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; - const uint32_t 
partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; - const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; - const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; - for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { - shared_hist[i] = 0.0f; - } - __syncthreads(); - const unsigned int threadIdx_y = threadIdx.y; - const unsigned int blockIdx_y = blockIdx.y; - const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; - const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; - data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf_ref - block_start, num_data_per_thread * static_cast(blockDim.y))); - const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; - const data_size_t remainder = block_num_data % blockDim.y; - const data_size_t num_iteration_this = remainder == 0 ? num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); - data_size_t inner_data_index = static_cast(threadIdx_y); - for (data_size_t i = 0; i < num_iteration_this; ++i) { - const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; - const DATA_PTR_TYPE row_start = block_row_ptr[data_index]; - const DATA_PTR_TYPE row_end = block_row_ptr[data_index + 1]; - const DATA_PTR_TYPE row_size = row_end - row_start; - if (threadIdx.x < row_size) { - const score_t grad = cuda_gradients[data_index]; - const score_t hess = cuda_hessians[data_index]; - const uint32_t bin = static_cast(data_ptr[row_start + threadIdx.x]); - const uint32_t pos = bin << 1; - float* pos_ptr = shared_hist + pos; - atomicAdd_block(pos_ptr, grad); - atomicAdd_block(pos_ptr + 1, hess); - } - inner_data_index += blockDim.y; - } - __syncthreads(); - hist_t* feature_histogram_ptr = histogram_buffer + total_num_bin * (blockIdx.y % USED_HISTOGRAM_BUFFER_NUM) * 2 + (partition_hist_start << 1); - for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { - atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); - } -} - -__global__ void MergeHistogramBufferKernel( - hist_t* histogram_buffer, - const int num_total_bin, - const int num_bin_per_block, - hist_t** output_histogram_ptr) { - hist_t* output_histogram = *output_histogram_ptr; - __shared__ hist_t gradient_buffer[1024]; - __shared__ hist_t hessian_buffer[1024]; - const uint32_t threadIdx_x = threadIdx.x; - const uint32_t blockIdx_x = blockIdx.x; - const uint32_t bin_index = threadIdx_x / USED_HISTOGRAM_BUFFER_NUM + num_bin_per_block * blockIdx_x; - const uint32_t histogram_position = (num_total_bin * (threadIdx_x % USED_HISTOGRAM_BUFFER_NUM) + bin_index) << 1; - if (bin_index < num_total_bin) { - gradient_buffer[threadIdx_x] = histogram_buffer[histogram_position]; - hessian_buffer[threadIdx_x] = histogram_buffer[histogram_position + 1]; - } - const uint32_t start = threadIdx_x / USED_HISTOGRAM_BUFFER_NUM * USED_HISTOGRAM_BUFFER_NUM; - __syncthreads(); - ReduceSumHistogramConstructorMerge(gradient_buffer + start, USED_HISTOGRAM_BUFFER_NUM); - ReduceSumHistogramConstructorMerge(hessian_buffer + start, USED_HISTOGRAM_BUFFER_NUM); - __syncthreads(); - const unsigned int global_histogram_position = bin_index << 1; - if (threadIdx_x % USED_HISTOGRAM_BUFFER_NUM == 0 && bin_index < num_total_bin) { - output_histogram[global_histogram_position] = gradient_buffer[threadIdx_x]; - output_histogram[global_histogram_position + 1] = 
hessian_buffer[threadIdx_x]; - } -} - void CUDAHistogramConstructor::LaunchConstructHistogramKernel( - const int* cuda_smaller_leaf_index, - const data_size_t* cuda_smaller_leaf_num_data, - const data_size_t** cuda_data_indices_in_smaller_leaf, + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const data_size_t* cuda_leaf_num_data, - hist_t** cuda_leaf_hist, const data_size_t num_data_in_smaller_leaf) { int grid_dim_x = 0; int grid_dim_y = 0; @@ -382,24 +216,30 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( if (is_sparse_) { if (bit_type_ == 8) { if (data_ptr_bit_type_ == 16) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, cuda_row_ptr_uint16_t_, cuda_partition_ptr_uint16_t_, cuda_column_hist_offsets_full_, num_data_); } else if (data_ptr_bit_type_ == 32) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, cuda_row_ptr_uint32_t_, cuda_partition_ptr_uint32_t_, cuda_column_hist_offsets_full_, num_data_); } else if (data_ptr_bit_type_ == 64) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, cuda_row_ptr_uint64_t_, cuda_partition_ptr_uint64_t_, @@ -408,24 +248,30 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( } } else if (bit_type_ == 16) { if (data_ptr_bit_type_ == 16) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint16_t_, cuda_row_ptr_uint16_t_, cuda_partition_ptr_uint16_t_, cuda_column_hist_offsets_full_, num_data_); } else if (data_ptr_bit_type_ == 32) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint16_t_, cuda_row_ptr_uint32_t_, cuda_partition_ptr_uint32_t_, cuda_column_hist_offsets_full_, num_data_); } else if (data_ptr_bit_type_ == 64) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_num_feature_groups_, 
cuda_leaf_num_data, cuda_data_uint16_t_, cuda_row_ptr_uint64_t_, cuda_partition_ptr_uint64_t_, @@ -434,24 +280,30 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( } } else if (bit_type_ == 32) { if (data_ptr_bit_type_ == 16) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint32_t_, cuda_row_ptr_uint16_t_, cuda_partition_ptr_uint16_t_, cuda_column_hist_offsets_full_, num_data_); } else if (data_ptr_bit_type_ == 32) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint32_t_, cuda_row_ptr_uint32_t_, cuda_partition_ptr_uint32_t_, cuda_column_hist_offsets_full_, num_data_); } else if (data_ptr_bit_type_ == 64) { - CUDAConstructHistogramSparseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint32_t_, cuda_row_ptr_uint64_t_, cuda_partition_ptr_uint64_t_, @@ -461,22 +313,28 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( } } else { if (bit_type_ == 8) { - CUDAConstructHistogramDenseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, + CUDAConstructHistogramDenseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, cuda_column_hist_offsets_, cuda_column_hist_offsets_full_, cuda_feature_partition_column_index_offsets_, num_data_); } else if (bit_type_ == 16) { - CUDAConstructHistogramDenseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint16_t_, + CUDAConstructHistogramDenseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint16_t_, cuda_column_hist_offsets_, cuda_column_hist_offsets_full_, cuda_feature_partition_column_index_offsets_, num_data_); } else if (bit_type_ == 32) { - CUDAConstructHistogramDenseKernel<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_leaf_hist, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint32_t_, + CUDAConstructHistogramDenseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint32_t_, cuda_column_hist_offsets_, cuda_column_hist_offsets_full_, cuda_feature_partition_column_index_offsets_, @@ -485,146 +343,16 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( } } -void CUDAHistogramConstructor::LaunchConstructHistogramKernel2( - const int* 
cuda_smaller_leaf_index, - const data_size_t* cuda_smaller_leaf_num_data, - const data_size_t** cuda_data_indices_in_smaller_leaf, - const data_size_t* cuda_leaf_num_data, - hist_t** cuda_leaf_hist, - const data_size_t num_data_in_smaller_leaf) { - int grid_dim_x = 0; - int grid_dim_y = 0; - int block_dim_x = 0; - int block_dim_y = 0; - CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_in_smaller_leaf); - dim3 grid_dim(grid_dim_x, grid_dim_y); - dim3 block_dim(block_dim_x, block_dim_y); - SetCUDAMemory(block_cuda_hist_buffer_, 0, 2 * num_total_bin_ * USED_HISTOGRAM_BUFFER_NUM); - global_timer.Start("CUDAConstructHistogramKernel2"); - if (is_sparse_) { - if (bit_type_ == 8) { - if (data_ptr_bit_type_ == 16) { - CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, - cuda_data_uint8_t_, - cuda_row_ptr_uint16_t_, - cuda_partition_ptr_uint16_t_, - cuda_column_hist_offsets_full_, - num_data_, block_cuda_hist_buffer_, num_total_bin_); - } else if (data_ptr_bit_type_ == 32) { - CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, - cuda_data_uint8_t_, - cuda_row_ptr_uint32_t_, - cuda_partition_ptr_uint32_t_, - cuda_column_hist_offsets_full_, - num_data_, block_cuda_hist_buffer_, num_total_bin_); - } else if (data_ptr_bit_type_ == 64) { - CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, - cuda_data_uint8_t_, - cuda_row_ptr_uint64_t_, - cuda_partition_ptr_uint64_t_, - cuda_column_hist_offsets_full_, - num_data_, block_cuda_hist_buffer_, num_total_bin_); - } - } else if (bit_type_ == 16) { - if (data_ptr_bit_type_ == 16) { - CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, - cuda_data_uint16_t_, - cuda_row_ptr_uint16_t_, - cuda_partition_ptr_uint16_t_, - cuda_column_hist_offsets_full_, - num_data_, block_cuda_hist_buffer_, num_total_bin_); - } else if (data_ptr_bit_type_ == 32) { - CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, - cuda_data_uint16_t_, - cuda_row_ptr_uint32_t_, - cuda_partition_ptr_uint32_t_, - cuda_column_hist_offsets_full_, - num_data_, block_cuda_hist_buffer_, num_total_bin_); - } else if (data_ptr_bit_type_ == 64) { - CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, - cuda_data_uint16_t_, - cuda_row_ptr_uint64_t_, - cuda_partition_ptr_uint64_t_, - cuda_column_hist_offsets_full_, - num_data_, block_cuda_hist_buffer_, num_total_bin_); - } - } else if (bit_type_ == 32) { - if (data_ptr_bit_type_ == 16) { - CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, - cuda_data_uint32_t_, - cuda_row_ptr_uint16_t_, - cuda_partition_ptr_uint16_t_, - cuda_column_hist_offsets_full_, - num_data_, block_cuda_hist_buffer_, num_total_bin_); - } else if 
(data_ptr_bit_type_ == 32) { - CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, - cuda_data_uint32_t_, - cuda_row_ptr_uint32_t_, - cuda_partition_ptr_uint32_t_, - cuda_column_hist_offsets_full_, - num_data_, block_cuda_hist_buffer_, num_total_bin_); - } else if (data_ptr_bit_type_ == 64) { - CUDAConstructHistogramSparseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, - cuda_data_uint32_t_, - cuda_row_ptr_uint64_t_, - cuda_partition_ptr_uint64_t_, - cuda_column_hist_offsets_full_, - num_data_, block_cuda_hist_buffer_, num_total_bin_); - } - } - } else { - if (bit_type_ == 8) { - CUDAConstructHistogramDenseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, - cuda_column_hist_offsets_, - cuda_column_hist_offsets_full_, - cuda_feature_partition_column_index_offsets_, - num_data_, block_cuda_hist_buffer_, num_total_bin_); - } else if (bit_type_ == 16) { - CUDAConstructHistogramDenseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint16_t_, - cuda_column_hist_offsets_, - cuda_column_hist_offsets_full_, - cuda_feature_partition_column_index_offsets_, - num_data_, block_cuda_hist_buffer_, num_total_bin_); - } else if (bit_type_ == 32) { - CUDAConstructHistogramDenseKernel2<<>>(cuda_smaller_leaf_index, cuda_gradients_, cuda_hessians_, - cuda_data_indices_in_smaller_leaf, cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint32_t_, - cuda_column_hist_offsets_, - cuda_column_hist_offsets_full_, - cuda_feature_partition_column_index_offsets_, - num_data_, block_cuda_hist_buffer_, num_total_bin_); - } - } - global_timer.Stop("CUDAConstructHistogramKernel2"); - const int merge_block_dim = 1024; - const int num_bin_per_block = merge_block_dim / USED_HISTOGRAM_BUFFER_NUM; - const int num_blocks = (num_total_bin_ + num_bin_per_block - 1) / num_bin_per_block; - global_timer.Start("MergeHistogramBufferKernel"); - MergeHistogramBufferKernel<<>>( - block_cuda_hist_buffer_, num_total_bin_, num_bin_per_block, cuda_leaf_hist); - global_timer.Stop("MergeHistogramBufferKernel"); -} - -__global__ void SubtractHistogramKernel(const int* /*cuda_smaller_leaf_index*/, - const int* cuda_larger_leaf_index, const uint8_t* cuda_feature_mfb_offsets, - const uint32_t* cuda_feature_num_bins, const int* cuda_num_total_bin, - hist_t** cuda_smaller_leaf_hist, hist_t** cuda_larger_leaf_hist) { +__global__ void SubtractHistogramKernel( + const int* cuda_num_total_bin, + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits) { const int cuda_num_total_bin_ref = *cuda_num_total_bin; const unsigned int global_thread_index = threadIdx.x + blockIdx.x * blockDim.x; - const int cuda_larger_leaf_index_ref = *cuda_larger_leaf_index; + const int cuda_larger_leaf_index_ref = cuda_larger_leaf_splits->leaf_index; if (cuda_larger_leaf_index_ref >= 0) { - const hist_t* smaller_leaf_hist = *cuda_smaller_leaf_hist; - hist_t* larger_leaf_hist = *cuda_larger_leaf_hist; + const hist_t* smaller_leaf_hist = cuda_smaller_leaf_splits->hist_in_leaf; + hist_t* larger_leaf_hist = cuda_larger_leaf_splits->hist_in_leaf; if 
(global_thread_index < 2 * cuda_num_total_bin_ref) { larger_leaf_hist[global_thread_index] -= smaller_leaf_hist[global_thread_index]; } @@ -635,10 +363,9 @@ __global__ void FixHistogramKernel( const uint32_t* cuda_feature_num_bins, const uint32_t* cuda_feature_hist_offsets, const uint32_t* cuda_feature_most_freq_bins, - const double* smaller_leaf_sum_gradients, const double* smaller_leaf_sum_hessians, - hist_t** cuda_smaller_leaf_hist, const int* cuda_need_fix_histogram_features, - const uint32_t* cuda_need_fix_histogram_features_num_bin_aligned) { + const uint32_t* cuda_need_fix_histogram_features_num_bin_aligned, + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits) { const unsigned int blockIdx_x = blockIdx.x; const int feature_index = cuda_need_fix_histogram_features[blockIdx_x]; __shared__ double hist_gradients[FIX_HISTOGRAM_SHARED_MEM_SIZE + 1]; @@ -646,9 +373,9 @@ __global__ void FixHistogramKernel( const uint32_t num_bin_aligned = cuda_need_fix_histogram_features_num_bin_aligned[blockIdx_x]; const uint32_t feature_hist_offset = cuda_feature_hist_offsets[feature_index]; const uint32_t most_freq_bin = cuda_feature_most_freq_bins[feature_index]; - const double leaf_sum_gradients = *smaller_leaf_sum_gradients; - const double leaf_sum_hessians = *smaller_leaf_sum_hessians; - hist_t* feature_hist = (*cuda_smaller_leaf_hist) + feature_hist_offset * 2; + const double leaf_sum_gradients = cuda_smaller_leaf_splits->sum_of_gradients; + const double leaf_sum_hessians = cuda_smaller_leaf_splits->sum_of_hessians; + hist_t* feature_hist = cuda_smaller_leaf_splits->hist_in_leaf + feature_hist_offset * 2; const unsigned int threadIdx_x = threadIdx.x; const uint32_t num_bin = cuda_feature_num_bins[feature_index]; const uint32_t hist_pos = threadIdx_x << 1; @@ -674,53 +401,30 @@ __global__ void FixHistogramKernel( } } -void CUDAHistogramConstructor::LaunchSubtractHistogramKernel(const int* cuda_smaller_leaf_index, - const int* cuda_larger_leaf_index, const double* smaller_leaf_sum_gradients, const double* smaller_leaf_sum_hessians, - const double* larger_leaf_sum_gradients, const double* larger_leaf_sum_hessians, - hist_t** cuda_smaller_leaf_hist, hist_t** cuda_larger_leaf_hist) { +void CUDAHistogramConstructor::LaunchSubtractHistogramKernel( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits) { const int num_subtract_threads = 2 * num_total_bin_; const int num_subtract_blocks = (num_subtract_threads + SUBTRACT_BLOCK_SIZE - 1) / SUBTRACT_BLOCK_SIZE; global_timer.Start("CUDAHistogramConstructor::FixHistogramKernel"); FixHistogramKernel<<>>( cuda_feature_num_bins_, cuda_feature_hist_offsets_, - cuda_feature_most_freq_bins_, smaller_leaf_sum_gradients, smaller_leaf_sum_hessians, - cuda_smaller_leaf_hist, cuda_need_fix_histogram_features_, - cuda_need_fix_histogram_features_num_bin_aligned_); + cuda_feature_most_freq_bins_, + cuda_need_fix_histogram_features_, + cuda_need_fix_histogram_features_num_bin_aligned_, + cuda_smaller_leaf_splits); //SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDAHistogramConstructor::FixHistogramKernel"); global_timer.Start("CUDAHistogramConstructor::SubtractHistogramKernel"); SubtractHistogramKernel<<>>( - cuda_smaller_leaf_index, cuda_larger_leaf_index, cuda_feature_mfb_offsets_, - cuda_feature_num_bins_, cuda_num_total_bin_, cuda_smaller_leaf_hist, cuda_larger_leaf_hist); + cuda_num_total_bin_, + cuda_smaller_leaf_splits, + cuda_larger_leaf_splits); //SynchronizeCUDADeviceOuter(__FILE__, 
__LINE__); global_timer.Stop("CUDAHistogramConstructor::SubtractHistogramKernel"); } -__global__ void GetOrderedGradientsKernel(const data_size_t num_data_in_leaf, const data_size_t** cuda_data_indices_in_leaf, - const score_t* cuda_gradients, const score_t* cuda_hessians, - score_t* cuda_ordered_gradients, score_t* cuda_ordered_hessians) { - const data_size_t* cuda_data_indices_in_leaf_ref = *cuda_data_indices_in_leaf; - const unsigned int local_data_index = threadIdx.x + blockIdx.x * blockDim.x; - if (local_data_index < static_cast(num_data_in_leaf)) { - const data_size_t global_data_index = cuda_data_indices_in_leaf_ref[local_data_index]; - cuda_ordered_gradients[local_data_index] = cuda_gradients[global_data_index]; - cuda_ordered_hessians[local_data_index] = cuda_hessians[global_data_index]; - } -} - -void CUDAHistogramConstructor::LaunchGetOrderedGradientsKernel( - const data_size_t num_data_in_leaf, - const data_size_t** cuda_data_indices_in_leaf) { - if (num_data_in_leaf < num_data_) { - const int num_data_per_block = 1024; - const int num_blocks = (num_data_in_leaf + num_data_per_block - 1) / num_data_per_block; - GetOrderedGradientsKernel<<>>(num_data_in_leaf, cuda_data_indices_in_leaf, - cuda_gradients_, cuda_hessians_, cuda_ordered_gradients_, cuda_ordered_hessians_); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - } -} - } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index cac22493f703..34f7764c411b 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -14,6 +14,7 @@ #include #include "new_cuda_utils.hpp" +#include "cuda_leaf_splits.hpp" #include @@ -36,10 +37,8 @@ class CUDAHistogramConstructor { void Init(const Dataset* train_data, TrainingShareStates* share_state); - void ConstructHistogramForLeaf(const int* cuda_smaller_leaf_index, const data_size_t* cuda_num_data_in_smaller_leaf, const int* cuda_larger_leaf_index, - const data_size_t** cuda_data_indices_in_smaller_leaf, const data_size_t** cuda_data_indices_in_larger_leaf, - const double* cuda_smaller_leaf_sum_gradients, const double* cuda_smaller_leaf_sum_hessians, hist_t** cuda_smaller_leaf_hist, - const double* cuda_larger_leaf_sum_gradients, const double* cuda_larger_leaf_sum_hessians, hist_t** cuda_larger_leaf_hist, + void ConstructHistogramForLeaf( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const CUDALeafSplitsStruct* cuda_larger_leaf_splits, const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf); @@ -53,58 +52,19 @@ class CUDAHistogramConstructor { const uint8_t* cuda_data() const { return cuda_data_uint8_t_; } - void TestAfterInit() { - /*std::vector test_data(data_.size(), 0); - CopyFromCUDADeviceToHost(test_data.data(), cuda_data_, data_.size()); - for (size_t i = 0; i < 100; ++i) { - Log::Warning("CUDAHistogramConstructor::TestAfterInit test_data[%d] = %d", i, test_data[i]); - }*/ - } - - void TestAfterConstructHistogram() { - PrintLastCUDAError(); - std::vector test_hist(num_total_bin_ * 2, 0.0f); - /*CopyFromCUDADeviceToHost(test_hist.data(), cuda_hist_, static_cast(num_total_bin_) * 2); - for (int i = 0; i < 100; ++i) { - Log::Warning("bin %d grad %f hess %f", i, test_hist[2 * i], test_hist[2 * i + 1]); - }*/ - const hist_t* leaf_2_cuda_hist_ptr = 
cuda_hist_;// + 3 * 2 * num_total_bin_; - Log::Warning("cuda_hist_ptr = %ld", leaf_2_cuda_hist_ptr); - CopyFromCUDADeviceToHost(test_hist.data(), leaf_2_cuda_hist_ptr, 2 * num_total_bin_); - std::ofstream fout("leaf_2_cuda_hist.txt"); - for (int i = 0; i < num_total_bin_; ++i) { - Log::Warning("bin %d grad %f hess %f", i, test_hist[2 * i], test_hist[2 * i + 1]); - fout << "bin " << i << " grad " << test_hist[2 * i] << " hess " << test_hist[2 * i + 1] << "\n"; - } - fout.close(); - } - private: - void LaunchGetOrderedGradientsKernel( - const data_size_t num_data_in_leaf, - const data_size_t** cuda_data_indices_in_leaf); void CalcConstructHistogramKernelDim(int* grid_dim_x, int* grid_dim_y, int* block_dim_x, int* block_dim_y, const data_size_t num_data_in_smaller_leaf); - void LaunchConstructHistogramKernel(const int* cuda_leaf_index, - const data_size_t* cuda_smaller_leaf_num_data, - const data_size_t** cuda_data_indices_in_leaf, - const data_size_t* cuda_leaf_num_data, - hist_t** cuda_leaf_hist, - const data_size_t num_data_in_smaller_leaf); - - void LaunchConstructHistogramKernel2(const int* cuda_leaf_index, - const data_size_t* cuda_smaller_leaf_num_data, - const data_size_t** cuda_data_indices_in_leaf, + void LaunchConstructHistogramKernel( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const data_size_t* cuda_leaf_num_data, - hist_t** cuda_leaf_hist, const data_size_t num_data_in_smaller_leaf); - void LaunchSubtractHistogramKernel(const int* cuda_smaller_leaf_index, - const int* cuda_larger_leaf_index, const double* smaller_leaf_sum_gradients, const double* smaller_leaf_sum_hessians, - const double* larger_leaf_sum_gradients, const double* larger_leaf_sum_hessians, - hist_t** cuda_smaller_leaf_hist, hist_t** cuda_larger_leaf_hist); + void LaunchSubtractHistogramKernel( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits); void InitCUDAData(TrainingShareStates* share_state); diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index 3fd177d4bc35..c17f7d8ba399 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -12,13 +12,7 @@ namespace LightGBM { CUDALeafSplits::CUDALeafSplits(const data_size_t num_data, const int leaf_index, const int* cuda_num_data): num_data_(num_data), leaf_index_(leaf_index) { - cuda_sum_of_gradients_ = nullptr; - cuda_sum_of_hessians_ = nullptr; - cuda_num_data_in_leaf_ = nullptr; - cuda_gain_ = nullptr; - cuda_leaf_value_ = nullptr; - - cuda_data_indices_in_leaf_ = nullptr; + cuda_struct_ = nullptr; cuda_num_data_ = cuda_num_data; } @@ -27,45 +21,18 @@ void CUDALeafSplits::Init() { // allocate more memory for sum reduction in CUDA // only the first element records the final sum - AllocateCUDAMemory(num_blocks_init_from_gradients_, &cuda_sum_of_gradients_); - AllocateCUDAMemory(num_blocks_init_from_gradients_, &cuda_sum_of_hessians_); - - InitCUDAMemoryFromHostMemory(&cuda_num_data_in_leaf_, &num_data_, 1); - // TODO(shiyu1994): should initialize root gain for min_gain_shift - InitCUDAValueFromConstant(&cuda_gain_, 0.0f); - // since smooth is not used, so the output value for root node is useless - InitCUDAValueFromConstant(&cuda_leaf_value_, 0.0f); - AllocateCUDAMemory(1, &cuda_data_indices_in_leaf_); - AllocateCUDAMemory(1, &cuda_hist_in_leaf_); + AllocateCUDAMemory(num_blocks_init_from_gradients_, &cuda_sum_of_gradients_buffer_); + AllocateCUDAMemory(num_blocks_init_from_gradients_, 
&cuda_sum_of_hessians_buffer_); - InitCUDAMemoryFromHostMemory(&cuda_leaf_index_, &leaf_index_, 1); + AllocateCUDAMemoryOuter(&cuda_struct_, 1, __FILE__, __LINE__); cuda_streams_.resize(2); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); } -void CUDALeafSplits::InitValues( - const double* cuda_sum_of_gradients, const double* cuda_sum_of_hessians, - const data_size_t* cuda_num_data_in_leaf, const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, - const double* cuda_gain, const double* cuda_leaf_value) { - CopyFromCUDADeviceToCUDADevice(cuda_sum_of_gradients_, cuda_sum_of_gradients, 1); - CopyFromCUDADeviceToCUDADevice(cuda_sum_of_hessians_, cuda_sum_of_hessians, 1); - CopyFromCUDADeviceToCUDADevice(cuda_num_data_in_leaf_, cuda_num_data_in_leaf, 1); - CopyFromHostToCUDADevice(cuda_data_indices_in_leaf_, &cuda_data_indices_in_leaf, 1); - CopyFromHostToCUDADevice(cuda_hist_in_leaf_, &cuda_hist_in_leaf, 1); - CopyFromCUDADeviceToCUDADevice(cuda_gain_, cuda_gain, 1); - CopyFromCUDADeviceToCUDADevice(cuda_leaf_value_, cuda_leaf_value, 1); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - void CUDALeafSplits::InitValues() { - SetCUDAMemory(cuda_sum_of_gradients_, 0, num_blocks_init_from_gradients_); - SetCUDAMemory(cuda_sum_of_hessians_, 0, num_blocks_init_from_gradients_); - const int larger_leaf_index = -1; - CopyFromHostToCUDADevice(cuda_leaf_index_, &larger_leaf_index, 1); - SetCUDAMemory(cuda_gain_, 0, 1); - SetCUDAMemory(cuda_leaf_value_, 0, 1); + LaunchInitValuesEmptyKernel(); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } @@ -75,16 +42,10 @@ void CUDALeafSplits::InitValues( double* root_sum_hessians) { cuda_gradients_ = cuda_gradients; cuda_hessians_ = cuda_hessians; - SetCUDAMemory(cuda_sum_of_gradients_, 0, num_blocks_init_from_gradients_); - SetCUDAMemory(cuda_sum_of_hessians_, 0, num_blocks_init_from_gradients_); - LaunchInitValuesKernal(); - SetCUDAMemory(cuda_leaf_index_, 0, 1); - CopyFromHostToCUDADeviceAsync(cuda_data_indices_in_leaf_, &cuda_data_indices_in_leaf, 1, cuda_streams_[0]); - CopyFromHostToCUDADeviceAsync(cuda_hist_in_leaf_, &cuda_hist_in_leaf, 1, cuda_streams_[0]); - CopyFromHostToCUDADeviceAsync(cuda_num_data_in_leaf_, &num_data_, 1, cuda_streams_[0]); - CopyFromCUDADeviceToHostAsync(root_sum_hessians, cuda_sum_of_hessians_, 1, cuda_streams_[1]); - SetCUDAMemory(cuda_gain_, 0, 1); - SetCUDAMemory(cuda_leaf_value_, 0, 1); + SetCUDAMemory(cuda_sum_of_gradients_buffer_, 0, num_blocks_init_from_gradients_); + SetCUDAMemory(cuda_sum_of_hessians_buffer_, 0, num_blocks_init_from_gradients_); + LaunchInitValuesKernal(cuda_data_indices_in_leaf, cuda_hist_in_leaf); + CopyFromCUDADeviceToHostAsync(root_sum_hessians, cuda_sum_of_hessians_buffer_, 1, cuda_streams_[1]); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index a5cea5c0e351..77a5c515a086 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -40,27 +40,60 @@ __global__ void CUDAInitValuesKernel1(const score_t* cuda_gradients, const score } } -__global__ void CUDAInitValuesKernel2(double* cuda_sum_of_gradients, double* cuda_sum_of_hessians) { - if (blockIdx.x == 0) { - double sum_of_gradients = 0.0f; - double sum_of_hessians = 0.0f; - for (unsigned int i = 1; i < gridDim.x; ++i) { - sum_of_gradients += cuda_sum_of_gradients[i]; - sum_of_hessians += cuda_sum_of_hessians[i]; - } - 
cuda_sum_of_gradients[0] += sum_of_gradients; - cuda_sum_of_hessians[0] += sum_of_hessians; +__global__ void CUDAInitValuesKernel2( + double* cuda_sum_of_gradients, + double* cuda_sum_of_hessians, + const data_size_t num_data, + const data_size_t* cuda_data_indices_in_leaf, + hist_t* cuda_hist_in_leaf, + CUDALeafSplitsStruct* cuda_struct) { + double sum_of_gradients = 0.0f; + double sum_of_hessians = 0.0f; + for (unsigned int i = 0; i < gridDim.x; ++i) { + sum_of_gradients += cuda_sum_of_gradients[i]; + sum_of_hessians += cuda_sum_of_hessians[i]; } + cuda_sum_of_gradients[0] = sum_of_gradients; + cuda_sum_of_hessians[0] = sum_of_hessians; + cuda_struct->leaf_index = 0; + cuda_struct->sum_of_gradients = sum_of_gradients; + cuda_struct->sum_of_hessians = sum_of_hessians; + cuda_struct->num_data_in_leaf = num_data; + cuda_struct->gain = kMinScore; + cuda_struct->leaf_value = 0.0f; + cuda_struct->data_indices_in_leaf = cuda_data_indices_in_leaf; + cuda_struct->hist_in_leaf = cuda_hist_in_leaf; +} + +__global__ void InitValuesEmptyKernel(CUDALeafSplitsStruct* cuda_struct) { + cuda_struct->leaf_index = -1; + cuda_struct->sum_of_gradients = 0.0f; + cuda_struct->sum_of_hessians = 0.0f; + cuda_struct->num_data_in_leaf = 0; + cuda_struct->gain = kMinScore; + cuda_struct->leaf_value = 0.0f; + cuda_struct->data_indices_in_leaf = nullptr; + cuda_struct->hist_in_leaf = nullptr; +} + +void CUDALeafSplits::LaunchInitValuesEmptyKernel() { + InitValuesEmptyKernel<<<1, 1>>>(cuda_struct_); } -void CUDALeafSplits::LaunchInitValuesKernal() { +void CUDALeafSplits::LaunchInitValuesKernal( + const data_size_t* cuda_data_indices_in_leaf, + hist_t* cuda_hist_in_leaf) { CUDAInitValuesKernel1<<>>( - cuda_gradients_, cuda_hessians_, cuda_num_data_, cuda_sum_of_gradients_, - cuda_sum_of_hessians_); - CopyFromCUDADeviceToCUDADevice(cuda_num_data_in_leaf_, cuda_num_data_, 1); + cuda_gradients_, cuda_hessians_, cuda_num_data_, cuda_sum_of_gradients_buffer_, + cuda_sum_of_hessians_buffer_); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CUDAInitValuesKernel2<<>>( - cuda_sum_of_gradients_, cuda_sum_of_hessians_); + CUDAInitValuesKernel2<<<1, 1>>>( + cuda_sum_of_gradients_buffer_, + cuda_sum_of_hessians_buffer_, + num_data_, + cuda_data_indices_in_leaf, + cuda_hist_in_leaf, + cuda_struct_); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index adda11525d4b..2e1b81e86fbe 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -19,6 +19,18 @@ namespace LightGBM { +struct CUDALeafSplitsStruct { + public: + int leaf_index; + double sum_of_gradients; + double sum_of_hessians; + data_size_t num_data_in_leaf; + double gain; + double leaf_value; + const data_size_t* data_indices_in_leaf; + hist_t* hist_in_leaf; +}; + class CUDALeafSplits { public: CUDALeafSplits(const data_size_t num_data, const int leaf_index, @@ -28,11 +40,6 @@ class CUDALeafSplits { void Init(); - void InitValues( - const double* cuda_sum_of_gradients, const double* cuda_sum_of_hessians, - const data_size_t* cuda_num_data_in_leaf, const data_size_t* cuda_data_indices_in_leaf, - hist_t* cuda_hist_in_leaf, const double* cuda_gain, const double* cuda_leaf_value); - void InitValues( const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, @@ -40,45 +47,15 @@ class CUDALeafSplits { void InitValues(); - const int* cuda_leaf_index() const { 
return cuda_leaf_index_; } - - const data_size_t** cuda_data_indices_in_leaf() const { return cuda_data_indices_in_leaf_; } - - const double* cuda_gain() const { return cuda_gain_; } - - const double* cuda_sum_of_gradients() const { return cuda_sum_of_gradients_; } - - const double* cuda_sum_of_hessians() const { return cuda_sum_of_hessians_; } - - const data_size_t* cuda_num_data_in_leaf() const { return cuda_num_data_in_leaf_; } + const CUDALeafSplitsStruct* GetCUDAStruct() const { return cuda_struct_; } - int* cuda_leaf_index_pointer() const { return cuda_leaf_index_; } - - double* cuda_sum_of_gradients_pointer() const { return cuda_sum_of_gradients_; } - - double* cuda_sum_of_hessians_pointer() const { return cuda_sum_of_hessians_; } - - data_size_t* cuda_num_data_in_leaf_pointer() const { return cuda_num_data_in_leaf_; } - - double* cuda_gain_pointer() const { return cuda_gain_; } - - double* cuda_leaf_value_pointer() const { return cuda_leaf_value_; } - - const data_size_t** cuda_data_indices_in_leaf_pointer_pointer() { return cuda_data_indices_in_leaf_; } - - hist_t** cuda_hist_in_leaf_pointer_pointer() const { return cuda_hist_in_leaf_; } - - void Test() { - PrintLastCUDAError(); - double test_sum_of_gradients = 0.0f, test_sum_of_hessians = 0.0f; - CopyFromCUDADeviceToHost(&test_sum_of_gradients, cuda_sum_of_gradients_, 1); - CopyFromCUDADeviceToHost(&test_sum_of_hessians, cuda_sum_of_hessians_, 1); - Log::Warning("CUDALeafSplits::Test test_sum_of_gradients = %f", test_sum_of_gradients); - Log::Warning("CUDALeafSplits::Test test_sum_of_hessians = %f", test_sum_of_hessians); - } + CUDALeafSplitsStruct* GetCUDAStructRef() { return cuda_struct_; } private: - void LaunchInitValuesKernal(); + void LaunchInitValuesEmptyKernel(); + + void LaunchInitValuesKernal(const data_size_t* cuda_data_indices_in_leaf, + hist_t* cuda_hist_in_leaf); // Host memory const int num_data_; @@ -87,16 +64,11 @@ class CUDALeafSplits { std::vector cuda_streams_; // CUDA memory, held by this object - int* cuda_leaf_index_; - double* cuda_sum_of_gradients_; - double* cuda_sum_of_hessians_; - data_size_t* cuda_num_data_in_leaf_; - double* cuda_gain_; - double* cuda_leaf_value_; + CUDALeafSplitsStruct* cuda_struct_; + double* cuda_sum_of_gradients_buffer_; + double* cuda_sum_of_hessians_buffer_; // CUDA memory, held by other object - const data_size_t** cuda_data_indices_in_leaf_; - hist_t** cuda_hist_in_leaf_; const score_t* cuda_gradients_; const score_t* cuda_hessians_; const int* cuda_num_data_; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 19e6d8ee7367..91bd9355b248 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -156,17 +156,8 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, const double sum_hessians_in_smaller_leaf = leaf_sum_hessians_[smaller_leaf_index_]; const double sum_hessians_in_larger_leaf = larger_leaf_index_ < 0 ? 
0 : leaf_sum_hessians_[larger_leaf_index_]; cuda_histogram_constructor_->ConstructHistogramForLeaf( - cuda_smaller_leaf_splits_->cuda_leaf_index(), - cuda_smaller_leaf_splits_->cuda_num_data_in_leaf(), - cuda_larger_leaf_splits_->cuda_leaf_index(), - cuda_smaller_leaf_splits_->cuda_data_indices_in_leaf(), - cuda_larger_leaf_splits_->cuda_data_indices_in_leaf(), - cuda_smaller_leaf_splits_->cuda_sum_of_gradients_pointer(), - cuda_smaller_leaf_splits_->cuda_sum_of_hessians_pointer(), - cuda_smaller_leaf_splits_->cuda_hist_in_leaf_pointer_pointer(), - cuda_larger_leaf_splits_->cuda_sum_of_gradients_pointer(), - cuda_larger_leaf_splits_->cuda_sum_of_hessians_pointer(), - cuda_larger_leaf_splits_->cuda_hist_in_leaf_pointer_pointer(), + cuda_smaller_leaf_splits_->GetCUDAStruct(), + cuda_larger_leaf_splits_->GetCUDAStruct(), cuda_data_partition_->cuda_leaf_num_data(), num_data_in_smaller_leaf, num_data_in_larger_leaf, @@ -178,8 +169,10 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, construct_histogram_time += duration.count(); global_timer.Start("NewCUDATreeLearner::FindBestSplitsForLeaf"); start = std::chrono::steady_clock::now(); - cuda_best_split_finder_->FindBestSplitsForLeaf(cuda_smaller_leaf_splits_.get(), - cuda_larger_leaf_splits_.get(), smaller_leaf_index_, larger_leaf_index_, + cuda_best_split_finder_->FindBestSplitsForLeaf( + cuda_smaller_leaf_splits_->GetCUDAStruct(), + cuda_larger_leaf_splits_->GetCUDAStruct(), + smaller_leaf_index_, larger_leaf_index_, num_data_in_smaller_leaf, num_data_in_larger_leaf, sum_hessians_in_smaller_leaf, sum_hessians_in_larger_leaf); end = std::chrono::steady_clock::now(); @@ -206,22 +199,8 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, cuda_data_partition_->Split(cuda_best_split_finder_->cuda_best_leaf(), cuda_best_split_finder_->cuda_leaf_best_split_feature(), cuda_best_split_finder_->cuda_leaf_best_split_info(), - cuda_smaller_leaf_splits_->cuda_leaf_index_pointer(), - cuda_smaller_leaf_splits_->cuda_sum_of_gradients_pointer(), - cuda_smaller_leaf_splits_->cuda_sum_of_hessians_pointer(), - cuda_smaller_leaf_splits_->cuda_num_data_in_leaf_pointer(), - cuda_smaller_leaf_splits_->cuda_gain_pointer(), - cuda_smaller_leaf_splits_->cuda_leaf_value_pointer(), - cuda_smaller_leaf_splits_->cuda_data_indices_in_leaf_pointer_pointer(), - cuda_smaller_leaf_splits_->cuda_hist_in_leaf_pointer_pointer(), - cuda_larger_leaf_splits_->cuda_leaf_index_pointer(), - cuda_larger_leaf_splits_->cuda_sum_of_gradients_pointer(), - cuda_larger_leaf_splits_->cuda_sum_of_hessians_pointer(), - cuda_larger_leaf_splits_->cuda_num_data_in_leaf_pointer(), - cuda_larger_leaf_splits_->cuda_gain_pointer(), - cuda_larger_leaf_splits_->cuda_leaf_value_pointer(), - cuda_larger_leaf_splits_->cuda_data_indices_in_leaf_pointer_pointer(), - cuda_larger_leaf_splits_->cuda_hist_in_leaf_pointer_pointer(), + cuda_smaller_leaf_splits_->GetCUDAStructRef(), + cuda_larger_leaf_splits_->GetCUDAStructRef(), &leaf_num_data_, &leaf_data_start_, &leaf_sum_hessians_, From d9d3aa9a1264c93f1dd5ef7b84c05b691fe65acf Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 21 Jul 2021 15:06:21 +0000 Subject: [PATCH 041/166] return CUDASplitInfo directly after finding best split --- .../cuda/cuda_best_split_finder.cpp | 4 +- .../cuda/cuda_best_split_finder.cu | 39 ++--- .../cuda/cuda_best_split_finder.hpp | 5 +- src/treelearner/cuda/cuda_data_partition.cpp | 18 +- src/treelearner/cuda/cuda_data_partition.cu | 165 +++++++++--------- src/treelearner/cuda/cuda_data_partition.hpp | 52 +++--- 
src/treelearner/cuda/cuda_leaf_splits.cpp | 1 + src/treelearner/cuda/cuda_leaf_splits.cu | 8 +- src/treelearner/cuda/cuda_split_info.hpp | 2 +- .../cuda/new_cuda_tree_learner.cpp | 7 +- 10 files changed, 149 insertions(+), 152 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index bba4d9ae59b6..624746aafc1b 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -113,7 +113,6 @@ void CUDABestSplitFinder::Init() { const size_t cuda_best_leaf_split_info_buffer_size = static_cast(num_task_blocks) * static_cast(num_leaves_); AllocateCUDAMemoryOuter(&cuda_leaf_best_split_info_, cuda_best_leaf_split_info_buffer_size, __FILE__, __LINE__); - AllocateCUDAMemory(cuda_best_leaf_split_info_buffer_size, &cuda_leaf_best_split_feature_); InitCUDAMemoryFromHostMemory(&cuda_task_feature_index_, cpu_task_feature_index_.data(), cpu_task_feature_index_.size()); InitCUDAMemoryFromHostMemory(&cuda_task_reverse_, cpu_task_reverse_.data(), cpu_task_reverse_.size()); @@ -147,12 +146,13 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplitsStruct* smal global_timer.Stop("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); } -void CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int smaller_leaf_index, +const CUDASplitInfo* CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int smaller_leaf_index, const int larger_leaf_index, std::vector* leaf_best_split_feature, std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index) { LaunchFindBestFromAllSplitsKernel(cuda_cur_num_leaves, smaller_leaf_index, larger_leaf_index, leaf_best_split_feature, leaf_best_split_threshold, leaf_best_split_default_left, best_leaf_index); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + return cuda_leaf_best_split_info_ + (*best_leaf_index); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 42cf165e7155..133d4c87ce90 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -210,7 +210,7 @@ __device__ void FindBestSplitsForLeafKernelInner( const bool use_l1 = lambda_l1 > 0.0f; const double min_gain_shift = parent_gain + min_gain_to_split; - cuda_best_split_info->is_valid = 0; + cuda_best_split_info->is_valid = false; __shared__ hist_t local_grad_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; __shared__ hist_t local_hess_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; @@ -321,7 +321,7 @@ __device__ void FindBestSplitsForLeafKernelInner( ReduceBestGain(local_gain, local_grad_hist, local_hess_hist, threshold_found, threshold_value); const uint8_t found = threshold_found[0]; if (found && threadIdx_x == 0) { - cuda_best_split_info->is_valid = 1; + cuda_best_split_info->is_valid = true; cuda_best_split_info->threshold = threshold_value[0]; cuda_best_split_info->gain = local_gain[0]; cuda_best_split_info->default_left = assume_out_default_left; @@ -523,7 +523,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( } } -__device__ void ReduceBestSplit(uint8_t* found, double* gain, uint32_t* shared_read_index, +__device__ void ReduceBestSplit(bool* found, double* gain, uint32_t* shared_read_index, 
uint32_t num_features_aligned) { const uint32_t threadIdx_x = threadIdx.x; for (unsigned int s = 1; s < num_features_aligned; s <<= 1) { @@ -541,7 +541,7 @@ __device__ void ReduceBestSplit(uint8_t* found, double* gain, uint32_t* shared_r } __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const int larger_leaf_index, - const int* cuda_num_features, int* cuda_leaf_best_split_feature, + const int* cuda_num_features, CUDASplitInfo* cuda_leaf_best_split_info, // input parameters const int* cuda_task_feature_index, @@ -556,7 +556,7 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i const uint32_t threadIdx_x = threadIdx.x; const uint32_t blockIdx_x = blockIdx.x; - __shared__ uint8_t best_found[NUM_TASKS_PER_SYNC_BLOCK]; + __shared__ bool best_found[NUM_TASKS_PER_SYNC_BLOCK]; __shared__ double best_gain[NUM_TASKS_PER_SYNC_BLOCK]; __shared__ uint32_t shared_read_index[NUM_TASKS_PER_SYNC_BLOCK]; @@ -569,7 +569,7 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i best_gain[threadIdx_x] = cuda_best_split_info[read_index].gain; shared_read_index[threadIdx_x] = read_index; } else { - best_found[threadIdx_x] = 0; + best_found[threadIdx_x] = false; } __syncthreads(); @@ -582,7 +582,7 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i const CUDASplitInfo* best_split_info = cuda_best_split_info + best_read_index; if (best_found[0]) { cuda_split_info->gain = best_gain[0]; - cuda_leaf_best_split_feature[buffer_write_pos] = is_smaller ? cuda_task_feature_index[best_read_index] : + cuda_split_info->inner_feature_index = is_smaller ? cuda_task_feature_index[best_read_index] : cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; cuda_split_info->default_left = best_split_info->default_left; cuda_split_info->threshold = best_split_info->threshold; @@ -609,7 +609,6 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( const int larger_leaf_index, const unsigned int num_blocks_per_leaf, const int num_leaves, - int* cuda_leaf_best_split_feature, CUDASplitInfo* cuda_leaf_best_split_info, const bool larger_only) { if (!larger_only) { @@ -622,7 +621,7 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( other_split_info->gain > smaller_leaf_split_info->gain) || (!smaller_leaf_split_info->is_valid && other_split_info->is_valid)) { smaller_leaf_split_info->is_valid = other_split_info->is_valid; - cuda_leaf_best_split_feature[smaller_leaf_index] = cuda_leaf_best_split_feature[leaf_read_pos]; + smaller_leaf_split_info->inner_feature_index = other_split_info->inner_feature_index; smaller_leaf_split_info->default_left = other_split_info->default_left; smaller_leaf_split_info->threshold = other_split_info->threshold; smaller_leaf_split_info->gain = other_split_info->gain; @@ -650,7 +649,7 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( other_split_info->gain > larger_leaf_split_info->gain) || (!larger_leaf_split_info->is_valid && other_split_info->is_valid)) { larger_leaf_split_info->is_valid = other_split_info->is_valid; - cuda_leaf_best_split_feature[larger_leaf_index] = cuda_leaf_best_split_feature[leaf_read_pos]; + larger_leaf_split_info->inner_feature_index = other_split_info->inner_feature_index; larger_leaf_split_info->default_left = other_split_info->default_left; larger_leaf_split_info->threshold = other_split_info->threshold; larger_leaf_split_info->gain = other_split_info->gain; @@ -689,7 +688,6 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( 
cpu_smaller_leaf_index, cpu_larger_leaf_index, cuda_num_features_, - cuda_leaf_best_split_feature_, cuda_leaf_best_split_info_, cuda_task_feature_index_, cuda_best_split_info_, @@ -705,7 +703,6 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( cpu_larger_leaf_index, num_blocks_per_leaf, num_leaves_, - cuda_leaf_best_split_feature_, cuda_leaf_best_split_info_, false); } @@ -714,7 +711,6 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( cpu_smaller_leaf_index, cpu_larger_leaf_index, cuda_num_features_, - cuda_leaf_best_split_feature_, cuda_leaf_best_split_info_, cuda_task_feature_index_, cuda_best_split_info_, @@ -730,7 +726,6 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( cpu_larger_leaf_index, num_blocks_per_leaf, num_leaves_, - cuda_leaf_best_split_feature_, cuda_leaf_best_split_info_, true); } @@ -740,7 +735,6 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( cpu_smaller_leaf_index, cpu_larger_leaf_index, cuda_num_features_, - cuda_leaf_best_split_feature_, cuda_leaf_best_split_info_, cuda_task_feature_index_, cuda_best_split_info_, @@ -757,7 +751,6 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( cpu_larger_leaf_index, num_blocks_per_leaf, num_leaves_, - cuda_leaf_best_split_feature_, cuda_leaf_best_split_info_, larger_only); } @@ -766,7 +759,6 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, int* out_best_leaf, - const int* cuda_leaf_best_split_feature, const CUDASplitInfo* cuda_leaf_best_split_info, int* cuda_best_split_info_buffer) { const int cuda_cur_num_leaves_ref = *cuda_cur_num_leaves; @@ -791,17 +783,19 @@ __global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, __syncthreads(); ReduceBestGainForLeaves(thread_best_gain, thread_best_leaf, cur_num_valid_threads); if (threadIdx_x == 0) { - *out_best_leaf = thread_best_leaf[0]; + const int best_leaf_index = thread_best_leaf[0]; + *out_best_leaf = best_leaf_index; + //cuda_leaf_best_split_info[best_leaf_index].leaf_index = best_leaf_index; cuda_best_split_info_buffer[6] = thread_best_leaf[0]; } } __global__ void PrepareLeafBestSplitInfo(const int smaller_leaf_index, const int larger_leaf_index, - int* cuda_best_split_info_buffer, const int* cuda_leaf_best_split_feature, + int* cuda_best_split_info_buffer, const CUDASplitInfo* cuda_leaf_best_split_info) { const unsigned int threadIdx_x = blockIdx.x; if (threadIdx_x == 0) { - cuda_best_split_info_buffer[0] = cuda_leaf_best_split_feature[smaller_leaf_index]; + cuda_best_split_info_buffer[0] = cuda_leaf_best_split_info[smaller_leaf_index].inner_feature_index; } else if (threadIdx_x == 1) { cuda_best_split_info_buffer[1] = cuda_leaf_best_split_info[smaller_leaf_index].threshold; } else if (threadIdx_x == 2) { @@ -809,7 +803,7 @@ __global__ void PrepareLeafBestSplitInfo(const int smaller_leaf_index, const int } if (larger_leaf_index >= 0) { if (threadIdx_x == 3) { - cuda_best_split_info_buffer[3] = cuda_leaf_best_split_feature[larger_leaf_index]; + cuda_best_split_info_buffer[3] = cuda_leaf_best_split_info[larger_leaf_index].inner_feature_index; } else if (threadIdx_x == 4) { cuda_best_split_info_buffer[4] = cuda_leaf_best_split_info[larger_leaf_index].threshold; } else if (threadIdx_x == 5) { @@ -822,11 +816,10 @@ void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_ const int smaller_leaf_index, const int larger_leaf_index, std::vector* leaf_best_split_feature, std::vector* 
leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index) { FindBestFromAllSplitsKernel<<<1, NUM_THREADS_FIND_BEST_LEAF, 0, cuda_streams_[1]>>>(cuda_cur_num_leaves, cuda_best_leaf_, - cuda_leaf_best_split_feature_, cuda_leaf_best_split_info_, cuda_best_split_info_buffer_); PrepareLeafBestSplitInfo<<<6, 1, 0, cuda_streams_[0]>>>(smaller_leaf_index, larger_leaf_index, - cuda_best_split_info_buffer_, cuda_leaf_best_split_feature_, + cuda_best_split_info_buffer_, cuda_leaf_best_split_info_); std::vector cpu_leaf_best_split_info_buffer(7); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 838294883484..12891166c3c5 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -43,14 +43,12 @@ class CUDABestSplitFinder { const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf); - void FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int smaller_leaf_index, + const CUDASplitInfo* FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int smaller_leaf_index, const int larger_leaf_index, std::vector* leaf_best_split_feature, std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index); const int* cuda_best_leaf() const { return cuda_best_leaf_; } - const int* cuda_leaf_best_split_feature() const { return cuda_leaf_best_split_feature_; } - CUDASplitInfo* cuda_leaf_best_split_info() { return cuda_leaf_best_split_info_; } private: @@ -96,7 +94,6 @@ class CUDABestSplitFinder { // CUDA memory, held by this object // for per leaf best split information int* cuda_best_leaf_; - int* cuda_leaf_best_split_feature_; CUDASplitInfo* cuda_leaf_best_split_info_; // for best split information when finding best split CUDASplitInfo* cuda_best_split_info_; diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 1f316706ed84..60b6d48e705f 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -123,9 +123,8 @@ void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { } } -void CUDADataPartition::Split(const int* leaf_id, - const int* best_split_feature, - CUDASplitInfo* best_split_info, +void CUDADataPartition::Split( + const CUDASplitInfo* best_split_info, // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, @@ -149,8 +148,7 @@ void CUDADataPartition::Split(const int* leaf_id, global_timer.Stop("GenDataToLeftBitVector"); global_timer.Start("SplitInner"); - SplitInner(leaf_id, num_data_in_leaf, - best_split_feature, + SplitInner(num_data_in_leaf, best_split_info, smaller_leaf_splits, larger_leaf_splits, @@ -166,17 +164,17 @@ void CUDADataPartition::GenDataToLeftBitVector(const data_size_t num_data_in_lea LaunchGenDataToLeftBitVectorKernel(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start, left_leaf_index, right_leaf_index); } -void CUDADataPartition::SplitInner(const int* leaf_index, const data_size_t num_data_in_leaf, - const int* best_split_feature, - CUDASplitInfo* best_split_info, +void CUDADataPartition::SplitInner( + const data_size_t num_data_in_leaf, + const CUDASplitInfo* best_split_info, // for leaf 
splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index) { - LaunchSplitInnerKernel(leaf_index, num_data_in_leaf, - best_split_feature, + LaunchSplitInnerKernel( + num_data_in_leaf, best_split_info, smaller_leaf_splits, larger_leaf_splits, diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 6292dc802ada..0fa6299f768f 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -1119,7 +1119,9 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num } } -__global__ void AggregateBlockOffsetKernel0(const int* leaf_index, data_size_t* block_to_left_offset_buffer, +__global__ void AggregateBlockOffsetKernel0( + const int leaf_index, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, int* cuda_cur_num_leaves, @@ -1128,8 +1130,7 @@ __global__ void AggregateBlockOffsetKernel0(const int* leaf_index, data_size_t* (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; __shared__ uint32_t block_to_right_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; - const int leaf_index_ref = *leaf_index; - const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref]; + const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index]; const unsigned int blockDim_x = blockDim.x; const unsigned int threadIdx_x = threadIdx.x; const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); @@ -1177,16 +1178,18 @@ __global__ void AggregateBlockOffsetKernel0(const int* leaf_index, data_size_t* if (blockIdx.x == 0 && threadIdx.x == 0) { ++(*cuda_cur_num_leaves); const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; - const data_size_t old_leaf_data_end = cuda_leaf_data_end[leaf_index_ref]; - cuda_leaf_data_end[leaf_index_ref] = cuda_leaf_data_start[leaf_index_ref] + static_cast(to_left_total_count); - cuda_leaf_num_data[leaf_index_ref] = static_cast(to_left_total_count); - cuda_leaf_data_start[cur_max_leaf_index] = cuda_leaf_data_end[leaf_index_ref]; + const data_size_t old_leaf_data_end = cuda_leaf_data_end[leaf_index]; + cuda_leaf_data_end[leaf_index] = cuda_leaf_data_start[leaf_index] + static_cast(to_left_total_count); + cuda_leaf_num_data[leaf_index] = static_cast(to_left_total_count); + cuda_leaf_data_start[cur_max_leaf_index] = cuda_leaf_data_end[leaf_index]; cuda_leaf_data_end[cur_max_leaf_index] = old_leaf_data_end; cuda_leaf_num_data[cur_max_leaf_index] = num_data_in_leaf - static_cast(to_left_total_count); } } -__global__ void AggregateBlockOffsetKernel1(const int* leaf_index, data_size_t* block_to_left_offset_buffer, +__global__ void AggregateBlockOffsetKernel1( + const int leaf_index, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, int* cuda_cur_num_leaves, @@ -1195,8 +1198,7 @@ __global__ void AggregateBlockOffsetKernel1(const int* leaf_index, data_size_t* 
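In the hunks above, AggregateBlockOffsetKernel0 and AggregateBlockOffsetKernel1 stop receiving const int* leaf_index and take the leaf index by value, since the splitting leaf is already known on the host at launch time. A minimal sketch of that calling-convention change, with hypothetical names (LeafStatsKernelOld, LeafStatsKernelNew, LaunchLeafStats); data_size_t is assumed to be a 32-bit int as elsewhere in LightGBM.

#include <cuda_runtime.h>

typedef int data_size_t;

// before: every thread dereferences a device pointer to learn which leaf to work on
__global__ void LeafStatsKernelOld(const int* leaf_index,
                                   const data_size_t* cuda_leaf_num_data,
                                   data_size_t* out_num_data) {
  const int leaf_index_ref = *leaf_index;
  *out_num_data = cuda_leaf_num_data[leaf_index_ref];
}

// after: the index travels in the kernel's argument list
__global__ void LeafStatsKernelNew(const int leaf_index,
                                   const data_size_t* cuda_leaf_num_data,
                                   data_size_t* out_num_data) {
  *out_num_data = cuda_leaf_num_data[leaf_index];
}

void LaunchLeafStats(const int cpu_leaf_index,
                     const data_size_t* cuda_leaf_num_data,
                     data_size_t* out_num_data) {
  // the host-side scalar is passed with the launch, so no extra device read is needed
  LeafStatsKernelNew<<<1, 1>>>(cpu_leaf_index, cuda_leaf_num_data, out_num_data);
}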
(AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; __shared__ uint32_t block_to_right_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; - const int leaf_index_ref = *leaf_index; - const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index_ref]; + const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index]; const unsigned int threadIdx_x = threadIdx.x; const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); @@ -1223,21 +1225,20 @@ __global__ void AggregateBlockOffsetKernel1(const int* leaf_index, data_size_t* if (blockIdx.x == 0 && threadIdx.x == 0) { ++(*cuda_cur_num_leaves); const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; - const data_size_t old_leaf_data_end = cuda_leaf_data_end[leaf_index_ref]; - cuda_leaf_data_end[leaf_index_ref] = cuda_leaf_data_start[leaf_index_ref] + static_cast(to_left_total_count); - cuda_leaf_num_data[leaf_index_ref] = static_cast(to_left_total_count); - cuda_leaf_data_start[cur_max_leaf_index] = cuda_leaf_data_end[leaf_index_ref]; + const data_size_t old_leaf_data_end = cuda_leaf_data_end[leaf_index]; + cuda_leaf_data_end[leaf_index] = cuda_leaf_data_start[leaf_index] + static_cast(to_left_total_count); + cuda_leaf_num_data[leaf_index] = static_cast(to_left_total_count); + cuda_leaf_data_start[cur_max_leaf_index] = cuda_leaf_data_end[leaf_index]; cuda_leaf_data_end[cur_max_leaf_index] = old_leaf_data_end; cuda_leaf_num_data[cur_max_leaf_index] = num_data_in_leaf - static_cast(to_left_total_count); } } -__global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* block_to_left_offset_buffer, +__global__ void SplitTreeStructureKernel(const int leaf_index, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, int* cuda_cur_num_leaves, - const int* best_split_feature, - CUDASplitInfo* best_split_info, + const CUDASplitInfo* best_split_info, // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, @@ -1250,44 +1251,42 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo double* tree_left_sum_hessian, double* tree_right_sum_hessian, double* tree_gain, uint8_t* tree_default_left, double* cuda_leaf_output, int* cuda_split_info_buffer) { - const int leaf_index_ref = *leaf_index; const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; - const unsigned int to_left_total_cnt = cuda_leaf_num_data[leaf_index_ref]; + const unsigned int to_left_total_cnt = cuda_leaf_num_data[leaf_index]; double* cuda_split_info_buffer_for_hessians = reinterpret_cast(cuda_split_info_buffer + 8); const unsigned int global_thread_index = blockIdx.x * blockDim.x + threadIdx.x; - const CUDASplitInfo* leaf_split_info = best_split_info + leaf_index_ref; if (global_thread_index == 0) { - tree_split_leaf_index[cur_max_leaf_index - 1] = leaf_index_ref; + tree_split_leaf_index[cur_max_leaf_index - 1] = leaf_index; } else if (global_thread_index == 1) { - tree_inner_feature_index[cur_max_leaf_index - 1] = best_split_feature[leaf_index_ref]; + tree_inner_feature_index[cur_max_leaf_index - 1] = best_split_info->inner_feature_index; } else if (global_thread_index == 2) { - 
tree_threshold[cur_max_leaf_index - 1] = leaf_split_info->threshold; + tree_threshold[cur_max_leaf_index - 1] = best_split_info->threshold; } else if (global_thread_index == 3) { - tree_left_output[cur_max_leaf_index - 1] = leaf_split_info->left_value; + tree_left_output[cur_max_leaf_index - 1] = best_split_info->left_value; } else if (global_thread_index == 4) { - tree_right_output[cur_max_leaf_index - 1] = leaf_split_info->right_value; + tree_right_output[cur_max_leaf_index - 1] = best_split_info->right_value; } else if (global_thread_index == 5) { - tree_left_count[cur_max_leaf_index - 1] = leaf_split_info->left_count; + tree_left_count[cur_max_leaf_index - 1] = best_split_info->left_count; } else if (global_thread_index == 6) { - tree_right_count[cur_max_leaf_index - 1] = leaf_split_info->right_count; + tree_right_count[cur_max_leaf_index - 1] = best_split_info->right_count; } else if (global_thread_index == 7) { - tree_left_sum_hessian[cur_max_leaf_index - 1] = leaf_split_info->left_sum_hessians; + tree_left_sum_hessian[cur_max_leaf_index - 1] = best_split_info->left_sum_hessians; } else if (global_thread_index == 8) { - tree_right_sum_hessian[cur_max_leaf_index - 1] = leaf_split_info->right_sum_hessians; + tree_right_sum_hessian[cur_max_leaf_index - 1] = best_split_info->right_sum_hessians; } else if (global_thread_index == 9) { - tree_gain[cur_max_leaf_index - 1] = leaf_split_info->gain; + tree_gain[cur_max_leaf_index - 1] = best_split_info->gain; } else if (global_thread_index == 10) { - tree_default_left[cur_max_leaf_index - 1] = leaf_split_info->default_left; + tree_default_left[cur_max_leaf_index - 1] = best_split_info->default_left; } else if (global_thread_index == 11) { - cuda_leaf_output[leaf_index_ref] = leaf_split_info->left_value; + cuda_leaf_output[leaf_index] = best_split_info->left_value; } else if (global_thread_index == 12) { - cuda_leaf_output[cur_max_leaf_index] = leaf_split_info->right_value; + cuda_leaf_output[cur_max_leaf_index] = best_split_info->right_value; } else if (global_thread_index == 13) { - cuda_split_info_buffer[0] = leaf_index_ref; + cuda_split_info_buffer[0] = leaf_index; } else if (global_thread_index == 14) { - cuda_split_info_buffer[1] = cuda_leaf_num_data[leaf_index_ref]; + cuda_split_info_buffer[1] = cuda_leaf_num_data[leaf_index]; } else if (global_thread_index == 15) { - cuda_split_info_buffer[2] = cuda_leaf_data_start[leaf_index_ref]; + cuda_split_info_buffer[2] = cuda_leaf_data_start[leaf_index]; } else if (global_thread_index == 16) { cuda_split_info_buffer[3] = cur_max_leaf_index; } else if (global_thread_index == 17) { @@ -1295,103 +1294,99 @@ __global__ void SplitTreeStructureKernel(const int* leaf_index, data_size_t* blo } else if (global_thread_index == 18) { cuda_split_info_buffer[5] = cuda_leaf_data_start[cur_max_leaf_index]; } else if (global_thread_index == 19) { - cuda_split_info_buffer_for_hessians[0] = leaf_split_info->left_sum_hessians; + cuda_split_info_buffer_for_hessians[0] = best_split_info->left_sum_hessians; } else if (global_thread_index == 20) { - cuda_split_info_buffer_for_hessians[1] = leaf_split_info->right_sum_hessians; + cuda_split_info_buffer_for_hessians[1] = best_split_info->right_sum_hessians; } else if (global_thread_index == 21) { - best_split_info[leaf_index_ref].is_valid = false; - } else if (global_thread_index == 22) { - best_split_info[cur_max_leaf_index].is_valid = false; - } else if (global_thread_index == 23) { - const uint32_t threshold_int = leaf_split_info->threshold; - const int 
split_inner_feature_index = best_split_feature[leaf_index_ref]; + const uint32_t threshold_int = best_split_info->threshold; + const int split_inner_feature_index = best_split_info->inner_feature_index; const double threshold_real = cuda_bin_upper_bounds[cuda_feature_num_bin_offsets[split_inner_feature_index] + threshold_int]; tree_threshold_real[cur_max_leaf_index - 1] = threshold_real; - } + } - if (cuda_leaf_num_data[leaf_index_ref] < cuda_leaf_num_data[cur_max_leaf_index]) { + if (cuda_leaf_num_data[leaf_index] < cuda_leaf_num_data[cur_max_leaf_index]) { if (global_thread_index == 0) { - hist_t* parent_hist_ptr = cuda_hist_pool[leaf_index_ref]; + hist_t* parent_hist_ptr = cuda_hist_pool[leaf_index]; cuda_hist_pool[cur_max_leaf_index] = parent_hist_ptr; - cuda_hist_pool[leaf_index_ref] = cuda_hist + 2 * cur_max_leaf_index * num_total_bin; - smaller_leaf_splits->hist_in_leaf = cuda_hist_pool[leaf_index_ref]; + cuda_hist_pool[leaf_index] = cuda_hist + 2 * cur_max_leaf_index * num_total_bin; + smaller_leaf_splits->hist_in_leaf = cuda_hist_pool[leaf_index]; larger_leaf_splits->hist_in_leaf = cuda_hist_pool[cur_max_leaf_index]; } else if (global_thread_index == 1) { - smaller_leaf_splits->sum_of_gradients = leaf_split_info->left_sum_gradients; + smaller_leaf_splits->sum_of_gradients = best_split_info->left_sum_gradients; } else if (global_thread_index == 2) { - smaller_leaf_splits->sum_of_hessians = leaf_split_info->left_sum_hessians; + smaller_leaf_splits->sum_of_hessians = best_split_info->left_sum_hessians; } else if (global_thread_index == 3) { smaller_leaf_splits->num_data_in_leaf = to_left_total_cnt; } else if (global_thread_index == 4) { - smaller_leaf_splits->gain = leaf_split_info->left_gain; + smaller_leaf_splits->gain = best_split_info->left_gain; } else if (global_thread_index == 5) { - smaller_leaf_splits->leaf_value = leaf_split_info->left_value; + smaller_leaf_splits->leaf_value = best_split_info->left_value; } else if (global_thread_index == 6) { smaller_leaf_splits->data_indices_in_leaf = cuda_data_indices; } else if (global_thread_index == 7) { larger_leaf_splits->leaf_index = cur_max_leaf_index; } else if (global_thread_index == 8) { - larger_leaf_splits->sum_of_gradients = leaf_split_info->right_sum_gradients; + larger_leaf_splits->sum_of_gradients = best_split_info->right_sum_gradients; } else if (global_thread_index == 9) { - larger_leaf_splits->sum_of_hessians = leaf_split_info->right_sum_hessians; + larger_leaf_splits->sum_of_hessians = best_split_info->right_sum_hessians; } else if (global_thread_index == 10) { larger_leaf_splits->num_data_in_leaf = cuda_leaf_num_data[cur_max_leaf_index]; } else if (global_thread_index == 11) { - larger_leaf_splits->gain = leaf_split_info->right_gain; + larger_leaf_splits->gain = best_split_info->right_gain; } else if (global_thread_index == 12) { - larger_leaf_splits->leaf_value = leaf_split_info->right_value; + larger_leaf_splits->leaf_value = best_split_info->right_value; } else if (global_thread_index == 13) { - larger_leaf_splits->data_indices_in_leaf = cuda_data_indices + cuda_leaf_num_data[leaf_index_ref]; + larger_leaf_splits->data_indices_in_leaf = cuda_data_indices + cuda_leaf_num_data[leaf_index]; } else if (global_thread_index == 14) { - cuda_split_info_buffer[6] = leaf_index_ref; + cuda_split_info_buffer[6] = leaf_index; } else if (global_thread_index == 15) { cuda_split_info_buffer[7] = cur_max_leaf_index; } else if (global_thread_index == 16) { - smaller_leaf_splits->leaf_index = leaf_index_ref; + 
smaller_leaf_splits->leaf_index = leaf_index; } } else { if (global_thread_index == 0) { - larger_leaf_splits->leaf_index = leaf_index_ref; + larger_leaf_splits->leaf_index = leaf_index; } else if (global_thread_index == 1) { - larger_leaf_splits->sum_of_gradients = leaf_split_info->left_sum_gradients; + larger_leaf_splits->sum_of_gradients = best_split_info->left_sum_gradients; } else if (global_thread_index == 2) { - larger_leaf_splits->sum_of_hessians = leaf_split_info->left_sum_hessians; + larger_leaf_splits->sum_of_hessians = best_split_info->left_sum_hessians; } else if (global_thread_index == 3) { larger_leaf_splits->num_data_in_leaf = to_left_total_cnt; } else if (global_thread_index == 4) { - larger_leaf_splits->gain = leaf_split_info->left_gain; + larger_leaf_splits->gain = best_split_info->left_gain; } else if (global_thread_index == 5) { - larger_leaf_splits->leaf_value = leaf_split_info->left_value; + larger_leaf_splits->leaf_value = best_split_info->left_value; } else if (global_thread_index == 6) { larger_leaf_splits->data_indices_in_leaf = cuda_data_indices; } else if (global_thread_index == 7) { smaller_leaf_splits->leaf_index = cur_max_leaf_index; } else if (global_thread_index == 8) { - smaller_leaf_splits->sum_of_gradients = leaf_split_info->right_sum_gradients; + smaller_leaf_splits->sum_of_gradients = best_split_info->right_sum_gradients; } else if (global_thread_index == 9) { - smaller_leaf_splits->sum_of_hessians = leaf_split_info->right_sum_hessians; + smaller_leaf_splits->sum_of_hessians = best_split_info->right_sum_hessians; } else if (global_thread_index == 10) { smaller_leaf_splits->num_data_in_leaf = cuda_leaf_num_data[cur_max_leaf_index]; } else if (global_thread_index == 11) { - smaller_leaf_splits->gain = leaf_split_info->right_gain; + smaller_leaf_splits->gain = best_split_info->right_gain; } else if (global_thread_index == 12) { - smaller_leaf_splits->leaf_value = leaf_split_info->right_value; + smaller_leaf_splits->leaf_value = best_split_info->right_value; } else if (global_thread_index == 13) { - smaller_leaf_splits->data_indices_in_leaf = cuda_data_indices + cuda_leaf_num_data[leaf_index_ref]; + smaller_leaf_splits->data_indices_in_leaf = cuda_data_indices + cuda_leaf_num_data[leaf_index]; } else if (global_thread_index == 14) { cuda_hist_pool[cur_max_leaf_index] = cuda_hist + 2 * cur_max_leaf_index * num_total_bin; smaller_leaf_splits->hist_in_leaf = cuda_hist_pool[cur_max_leaf_index]; } else if (global_thread_index == 15) { - larger_leaf_splits->hist_in_leaf = cuda_hist_pool[leaf_index_ref]; + larger_leaf_splits->hist_in_leaf = cuda_hist_pool[leaf_index]; } else if (global_thread_index == 16) { cuda_split_info_buffer[6] = cur_max_leaf_index; } else if (global_thread_index == 17) { - cuda_split_info_buffer[7] = leaf_index_ref; + cuda_split_info_buffer[7] = leaf_index; } } } -__global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_leaves, +__global__ void SplitInnerKernel(const int leaf_index, const int* cuda_cur_num_leaves, const data_size_t* cuda_leaf_data_start, const data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, const uint8_t* split_to_left_bit_vector, const data_size_t* block_to_left_offset_buffer, const data_size_t* block_to_right_offset_buffer, @@ -1402,9 +1397,8 @@ __global__ void SplitInnerKernel(const int* leaf_index, const int* cuda_cur_num_ __shared__ uint16_t thread_to_right_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; uint8_t first_to_left = 0; uint8_t second_to_left = 0; - const int 
leaf_index_ref = *leaf_index; - const data_size_t leaf_num_data_offset = cuda_leaf_data_start[leaf_index_ref]; - const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[leaf_index_ref] + cuda_leaf_num_data[(*cuda_cur_num_leaves) - 1]; + const data_size_t leaf_num_data_offset = cuda_leaf_data_start[leaf_index]; + const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[leaf_index] + cuda_leaf_num_data[(*cuda_cur_num_leaves) - 1]; const unsigned int threadIdx_x = threadIdx.x; const unsigned int blockDim_x = blockDim.x; const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); @@ -1472,9 +1466,9 @@ __global__ void CopyDataIndicesKernel( } } -void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data_size_t num_data_in_leaf, - const int* best_split_feature, - CUDASplitInfo* best_split_info, +void CUDADataPartition::LaunchSplitInnerKernel( + const data_size_t num_data_in_leaf, + const CUDASplitInfo* best_split_info, // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, @@ -1499,13 +1493,17 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data global_timer.Start("CUDADataPartition::AggregateBlockOffsetKernel"); if (num_blocks_final > AGGREGATE_BLOCK_SIZE_DATA_PARTITION) { - AggregateBlockOffsetKernel0<<<1, AGGREGATE_BLOCK_SIZE_DATA_PARTITION, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, + AggregateBlockOffsetKernel0<<<1, AGGREGATE_BLOCK_SIZE_DATA_PARTITION, 0, cuda_streams_[0]>>>( + cpu_leaf_index, + cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, cuda_cur_num_leaves_, num_blocks_final); } else { - AggregateBlockOffsetKernel1<<<1, num_blocks_final_aligned, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, + AggregateBlockOffsetKernel1<<<1, num_blocks_final_aligned, 0, cuda_streams_[0]>>>( + cpu_leaf_index, + cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, cuda_cur_num_leaves_, @@ -1516,18 +1514,17 @@ void CUDADataPartition::LaunchSplitInnerKernel(const int* leaf_index, const data global_timer.Start("CUDADataPartition::SplitInnerKernel"); SplitInnerKernel<<>>( - leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, + cpu_leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_out_data_indices_in_leaf_, split_indices_block_size_data_partition_aligned); //SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDADataPartition::SplitInnerKernel"); global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); - SplitTreeStructureKernel<<<4, 6, 0, cuda_streams_[0]>>>(leaf_index, cuda_block_data_to_left_offset_, + SplitTreeStructureKernel<<<4, 6, 0, cuda_streams_[0]>>>(cpu_leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, cuda_cur_num_leaves_, - best_split_feature, best_split_info, smaller_leaf_splits, larger_leaf_splits, diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index ccbfd6bedd84..22b49f44d849 100644 
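SplitInnerKernel, shown just above, scatters the splitting leaf's data indices into cuda_out_data_indices_in_leaf_ using the per-block offsets produced by the aggregate kernels and the per-datum bit vector from GenDataToLeftBitVector. A deliberately naive sketch of that partition step follows; PartitionLeafIndicesKernel, to_left, block_left_offset and block_right_offset are assumed names, the offsets are taken to be relative to the leaf's slice, and block_right_offset is assumed to already include the leaf's total left count.

#include <cstdint>

typedef int data_size_t;

__global__ void PartitionLeafIndicesKernel(
    const data_size_t leaf_data_start,      // first position of the leaf's slice
    const data_size_t num_data_in_leaf,
    const uint8_t* to_left,                 // 1 if the datum goes to the left child
    const data_size_t* block_left_offset,   // per-block exclusive prefix sums
    const data_size_t* block_right_offset,  // assumed to start after all left items
    const data_size_t* in_indices,
    data_size_t* out_indices) {
  const data_size_t i = static_cast<data_size_t>(blockIdx.x * blockDim.x + threadIdx.x);
  if (i >= num_data_in_leaf) {
    return;
  }
  // naive per-thread position: count earlier items of this block going each way
  // (the real kernel computes these positions with a shared-memory prefix sum)
  const data_size_t block_start = static_cast<data_size_t>(blockIdx.x * blockDim.x);
  data_size_t left_before = 0;
  data_size_t right_before = 0;
  for (data_size_t j = block_start; j < i; ++j) {
    if (to_left[j]) {
      ++left_before;
    } else {
      ++right_before;
    }
  }
  const data_size_t src = in_indices[leaf_data_start + i];
  if (to_left[i]) {
    out_indices[leaf_data_start + block_left_offset[blockIdx.x] + left_before] = src;
  } else {
    out_indices[leaf_data_start + block_right_offset[blockIdx.x] + right_before] = src;
  }
}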
--- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -39,19 +39,23 @@ class CUDADataPartition { void BeforeTrain(const data_size_t* data_indices); - void Split(const int* leaf_id, const int* best_split_feature, - CUDASplitInfo* best_split_info, - // for splits information update + void Split( + // input best split info + const CUDASplitInfo* best_split_info, + // for leaf information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, - std::vector* cpu_leaf_num_data, - std::vector* cpu_leaf_data_start, - std::vector* cpu_leaf_sum_hessians, - const std::vector& cpu_leaf_best_split_feature, - const std::vector& cpu_leaf_best_split_threshold, - const std::vector& cpu_leaf_best_split_default_left, - int* smaller_leaf_index, int* larger_leaf_index, - const int cpu_leaf_index, const int cur_max_leaf_index); + // gather information for CPU, used for launching kernels + std::vector* leaf_num_data, + std::vector* leaf_data_start, + std::vector* leaf_sum_hessians, + const std::vector& leaf_best_split_feature, + const std::vector& leaf_best_split_threshold, + const std::vector& leaf_best_split_default_left, + int* smaller_leaf_index, + int* larger_leaf_index, + const int leaf_index, + const int cur_max_leaf_index); Tree* GetCPUTree(); @@ -105,28 +109,34 @@ class CUDADataPartition { const uint8_t split_default_left, const data_size_t leaf_data_start, const int left_leaf_index, const int right_leaf_index); - void SplitInner(const int* leaf_index, const data_size_t num_data_in_leaf, - const int* best_split_feature, - CUDASplitInfo* best_split_info, + void SplitInner( + const data_size_t num_data_in_leaf, + const CUDASplitInfo* best_split_info, // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, - std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + std::vector* cpu_leaf_num_data, + std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, - int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index); + int* smaller_leaf_index, + int* larger_leaf_index, + const int leaf_index); // kernel launch functions void LaunchFillDataIndicesBeforeTrain(); - void LaunchSplitInnerKernel(const int* leaf_index, const data_size_t num_data_in_leaf, - const int* best_split_feature, - CUDASplitInfo* best_split_info, + void LaunchSplitInnerKernel( + const data_size_t num_data_in_leaf, + const CUDASplitInfo* best_split_info, // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, - std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + std::vector* cpu_leaf_num_data, + std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, - int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index); + int* smaller_leaf_index, + int* larger_leaf_index, + const int cpu_leaf_index); void LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index c17f7d8ba399..13a512278d51 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -45,6 +45,7 @@ void CUDALeafSplits::InitValues( SetCUDAMemory(cuda_sum_of_gradients_buffer_, 0, num_blocks_init_from_gradients_); SetCUDAMemory(cuda_sum_of_hessians_buffer_, 0, 
num_blocks_init_from_gradients_); LaunchInitValuesKernal(cuda_data_indices_in_leaf, cuda_hist_in_leaf); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); CopyFromCUDADeviceToHostAsync(root_sum_hessians, cuda_sum_of_hessians_buffer_, 1, cuda_streams_[1]); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index 77a5c515a086..c29176ad7363 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -41,6 +41,7 @@ __global__ void CUDAInitValuesKernel1(const score_t* cuda_gradients, const score } __global__ void CUDAInitValuesKernel2( + const int num_blocks_to_reduce, double* cuda_sum_of_gradients, double* cuda_sum_of_hessians, const data_size_t num_data, @@ -49,7 +50,7 @@ __global__ void CUDAInitValuesKernel2( CUDALeafSplitsStruct* cuda_struct) { double sum_of_gradients = 0.0f; double sum_of_hessians = 0.0f; - for (unsigned int i = 0; i < gridDim.x; ++i) { + for (unsigned int i = 0; i < num_blocks_to_reduce; ++i) { sum_of_gradients += cuda_sum_of_gradients[i]; sum_of_hessians += cuda_sum_of_hessians[i]; } @@ -59,7 +60,7 @@ __global__ void CUDAInitValuesKernel2( cuda_struct->sum_of_gradients = sum_of_gradients; cuda_struct->sum_of_hessians = sum_of_hessians; cuda_struct->num_data_in_leaf = num_data; - cuda_struct->gain = kMinScore; + cuda_struct->gain = 0.0f; cuda_struct->leaf_value = 0.0f; cuda_struct->data_indices_in_leaf = cuda_data_indices_in_leaf; cuda_struct->hist_in_leaf = cuda_hist_in_leaf; @@ -70,7 +71,7 @@ __global__ void InitValuesEmptyKernel(CUDALeafSplitsStruct* cuda_struct) { cuda_struct->sum_of_gradients = 0.0f; cuda_struct->sum_of_hessians = 0.0f; cuda_struct->num_data_in_leaf = 0; - cuda_struct->gain = kMinScore; + cuda_struct->gain = 0.0f; cuda_struct->leaf_value = 0.0f; cuda_struct->data_indices_in_leaf = nullptr; cuda_struct->hist_in_leaf = nullptr; @@ -88,6 +89,7 @@ void CUDALeafSplits::LaunchInitValuesKernal( cuda_sum_of_hessians_buffer_); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); CUDAInitValuesKernel2<<<1, 1>>>( + num_blocks_init_from_gradients_, cuda_sum_of_gradients_buffer_, cuda_sum_of_hessians_buffer_, num_data_, diff --git a/src/treelearner/cuda/cuda_split_info.hpp b/src/treelearner/cuda/cuda_split_info.hpp index aa95d4b1049b..08849345459c 100644 --- a/src/treelearner/cuda/cuda_split_info.hpp +++ b/src/treelearner/cuda/cuda_split_info.hpp @@ -18,7 +18,7 @@ struct CUDASplitInfo { bool is_valid; int leaf_index; double gain; - int feature_index; + int inner_feature_index; uint32_t threshold; bool default_left; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 91bd9355b248..ba80dd057057 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -181,7 +181,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, find_best_split_time += duration.count(); start = std::chrono::steady_clock::now(); global_timer.Start("NewCUDATreeLearner::FindBestFromAllSplits"); - cuda_best_split_finder_->FindBestFromAllSplits(cuda_data_partition_->cuda_cur_num_leaves(), + const CUDASplitInfo* best_split_info = cuda_best_split_finder_->FindBestFromAllSplits(cuda_data_partition_->cuda_cur_num_leaves(), smaller_leaf_index_, larger_leaf_index_, &leaf_best_split_feature_, &leaf_best_split_threshold_, &leaf_best_split_default_left_, &best_leaf_index_); global_timer.Stop("NewCUDATreeLearner::FindBestFromAllSplits"); @@ -196,9 
+196,8 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, global_timer.Start("NewCUDATreeLearner::Split"); start = std::chrono::steady_clock::now(); - cuda_data_partition_->Split(cuda_best_split_finder_->cuda_best_leaf(), - cuda_best_split_finder_->cuda_leaf_best_split_feature(), - cuda_best_split_finder_->cuda_leaf_best_split_info(), + cuda_data_partition_->Split( + best_split_info, cuda_smaller_leaf_splits_->GetCUDAStructRef(), cuda_larger_leaf_splits_->GetCUDAStructRef(), &leaf_num_data_, From 45cf7a77335e9d1e2fa80502b3c9813e3b64c856 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 22 Jul 2021 04:07:31 +0000 Subject: [PATCH 042/166] split with CUDATree directly --- .../LightGBM}/cuda/cuda_split_info.hpp | 1 - include/LightGBM/cuda/cuda_tree.hpp | 26 +++ src/io/cuda/cuda_tree.cpp | 81 ++++++++- src/io/cuda/cuda_tree.cu | 136 ++++++++++++++ .../cuda/cuda_best_split_finder.cpp | 5 +- .../cuda/cuda_best_split_finder.cu | 15 +- .../cuda/cuda_best_split_finder.hpp | 6 +- src/treelearner/cuda/cuda_data_partition.cpp | 45 ++--- src/treelearner/cuda/cuda_data_partition.cu | 172 +++++++----------- src/treelearner/cuda/cuda_data_partition.hpp | 76 +------- .../cuda/cuda_histogram_constructor.cpp | 4 +- .../cuda/cuda_histogram_constructor.cu | 43 ++--- .../cuda/cuda_histogram_constructor.hpp | 3 +- .../cuda/new_cuda_tree_learner.cpp | 79 ++------ .../cuda/new_cuda_tree_learner.hpp | 2 - 15 files changed, 390 insertions(+), 304 deletions(-) rename {src/treelearner => include/LightGBM}/cuda/cuda_split_info.hpp (96%) diff --git a/src/treelearner/cuda/cuda_split_info.hpp b/include/LightGBM/cuda/cuda_split_info.hpp similarity index 96% rename from src/treelearner/cuda/cuda_split_info.hpp rename to include/LightGBM/cuda/cuda_split_info.hpp index 08849345459c..61b3438c063f 100644 --- a/src/treelearner/cuda/cuda_split_info.hpp +++ b/include/LightGBM/cuda/cuda_split_info.hpp @@ -9,7 +9,6 @@ #ifdef USE_CUDA #include -#include "new_cuda_utils.hpp" namespace LightGBM { diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index 1f980fdb5115..4607d7c7e8be 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -7,7 +7,9 @@ #define LIGHTGBM_IO_CUDA_CUDA_TREE_HPP_ #include +#include #include +#include "../bin.h" namespace LightGBM { @@ -25,6 +27,12 @@ class CUDATree : public Tree { ~CUDATree() noexcept; + int Split(const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info); + /*! 
* \brief Adding prediction value of this tree model to scores * \param data The dataset @@ -65,8 +73,16 @@ class CUDATree : public Tree { inline void Shrinkage(double rate) override; private: + void InitCUDAMemory(); + void InitCUDA(); + void LaunchSplitKernel(const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info); + void LaunchAddPredictionToScoreKernel(const Dataset* data, const data_size_t* used_data_indices, data_size_t num_data, double* score) const; @@ -77,10 +93,20 @@ class CUDATree : public Tree { int* cuda_right_child_; int* cuda_split_feature_inner_; int* cuda_split_feature_; + int* cuda_leaf_depth_; + int* cuda_leaf_parent_; uint32_t* cuda_threshold_in_bin_; double* cuda_threshold_; + double* cuda_internal_weight_; + double* cuda_internal_value_; int8_t* cuda_decision_type_; double* cuda_leaf_value_; + data_size_t* cuda_leaf_count_; + double* cuda_leaf_weight_; + data_size_t* cuda_internal_count_; + double* cuda_split_gain_; + + cudaStream_t cuda_stream_; const int num_threads_per_block_add_prediction_to_score_; }; diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index bb59b5fb8f70..4e290b235945 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -11,7 +11,7 @@ CUDATree::CUDATree(int max_leaves, bool track_branch_features, bool is_linear): Tree(max_leaves, track_branch_features, is_linear), num_threads_per_block_add_prediction_to_score_(1024) { is_cuda_tree_ = true; - Log::Fatal("CUDATree can be only created from host Tree."); + InitCUDAMemory(); } CUDATree::CUDATree(const Tree* host_tree): @@ -23,6 +23,75 @@ CUDATree::CUDATree(const Tree* host_tree): CUDATree::~CUDATree() {} +void CUDATree::InitCUDAMemory() { + AllocateCUDAMemoryOuter(&cuda_left_child_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_right_child_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_split_feature_inner_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_split_feature_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_leaf_depth_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_leaf_parent_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_threshold_in_bin_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_threshold_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_decision_type_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_leaf_value_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_internal_weight_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_internal_value_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_leaf_weight_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_leaf_count_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_internal_count_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_split_gain_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + SetCUDAMemoryOuter(cuda_leaf_parent_, -1, 1, __FILE__, __LINE__); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); +} + void CUDATree::InitCUDA() { 
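InitCUDAMemory above reserves one slot per possible leaf for every node array and marks the root's parent with -1, so the later device-side Split calls never have to reallocate while the tree grows. A minimal sketch of that setup, assuming AllocateCUDAMemoryOuter and SetCUDAMemoryOuter behave like thin wrappers over cudaMalloc and cudaMemset-style initialization; AllocateAndZero and InitTreeArrays are invented helpers.

#include <cuda_runtime.h>
#include <cstdio>

// allocate device memory for `count` elements and zero-initialize it
template <typename T>
void AllocateAndZero(T** ptr, size_t count) {
  if (cudaMalloc(reinterpret_cast<void**>(ptr), count * sizeof(T)) != cudaSuccess ||
      cudaMemset(*ptr, 0, count * sizeof(T)) != cudaSuccess) {
    fprintf(stderr, "CUDA allocation failed\n");
  }
}

void InitTreeArrays(const int max_leaves, int** cuda_left_child, int** cuda_right_child,
                    double** cuda_leaf_value, int** cuda_leaf_parent) {
  const size_t n = static_cast<size_t>(max_leaves);
  AllocateAndZero(cuda_left_child, n);
  AllocateAndZero(cuda_right_child, n);
  AllocateAndZero(cuda_leaf_value, n);
  AllocateAndZero(cuda_leaf_parent, n);
  // the root (leaf 0) has no parent; mark it with -1 as the patch does
  const int root_parent = -1;
  cudaMemcpy(*cuda_leaf_parent, &root_parent, sizeof(int), cudaMemcpyHostToDevice);
}

Sizing every array by max_leaves_ up front trades a little memory for the ability to grow the tree with nothing but kernel launches on cuda_stream_.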
InitCUDAMemoryFromHostMemoryOuter(&cuda_left_child_, left_child_.data(), @@ -67,6 +136,16 @@ void CUDATree::InitCUDA() { SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } +int CUDATree::Split(const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info) { + LaunchSplitKernel(leaf_index, real_feature_index, real_threshold, missing_type, cuda_split_info); + ++num_leaves_; + return num_leaves_ - 1; +} + void CUDATree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const { diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index c995f1be43dc..bdebf2f3a0ba 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -7,6 +7,137 @@ namespace LightGBM { +__device__ void SetDecisionType(int8_t* decision_type, bool input, int8_t mask) { + if (input) { + (*decision_type) |= mask; + } else { + (*decision_type) &= (127 - mask); + } +} + +__device__ void SetMissingType(int8_t* decision_type, int8_t input) { + (*decision_type) &= 3; + (*decision_type) |= (input << 2); +} + +__global__ void SplitKernel(// split information + const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info, + // tree structure + const int num_leaves, + int* leaf_parent, + int* leaf_depth, + int* left_child, + int* right_child, + int* split_feature_inner, + int* split_feature, + double* split_gain, + double* internal_weight, + double* internal_value, + data_size_t* internal_count, + double* leaf_weight, + double* leaf_value, + data_size_t* leaf_count, + int8_t* decision_type, + uint32_t* threshold_in_bin, + double* threshold) { + const int new_node_index = num_leaves - 1; + const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const int parent_index = leaf_parent[leaf_index]; + if (thread_index == 0) { + if (parent_index >= 0) { + // if cur node is left child + if (left_child[parent_index] == ~leaf_index) { + left_child[parent_index] = new_node_index; + } else { + right_child[parent_index] = new_node_index; + } + } + } else if (thread_index == 1) { + // add new node + split_feature_inner[new_node_index] = cuda_split_info->inner_feature_index; + } else if (thread_index == 2) { + split_feature[new_node_index] = real_feature_index; + } else if (thread_index == 3) { + split_gain[new_node_index] = cuda_split_info->gain; + } else if (thread_index == 4) { + // add two new leaves + left_child[new_node_index] = ~leaf_index; + } else if (thread_index == 5) { + right_child[new_node_index] = ~num_leaves; + } else if (thread_index == 6) { + // update new leaves + leaf_parent[leaf_index] = new_node_index; + } else if (thread_index == 7) { + leaf_parent[num_leaves] = new_node_index; + } else if (thread_index == 8) { + // save current leaf value to internal node before change + internal_weight[new_node_index] = leaf_weight[leaf_index]; + leaf_weight[leaf_index] = cuda_split_info->left_sum_hessians; + } else if (thread_index == 9) { + internal_value[new_node_index] = leaf_value[leaf_index]; + leaf_value[leaf_index] = std::isnan(cuda_split_info->left_value) ? 
0.0f : cuda_split_info->left_value; + } else if (thread_index == 10) { + internal_count[new_node_index] = cuda_split_info->left_count + cuda_split_info->right_count; + } else if (thread_index == 11) { + leaf_count[leaf_index] = cuda_split_info->left_count; + } else if (thread_index == 12) { + leaf_value[num_leaves] = std::isnan(cuda_split_info->right_value) ? 0.0f : cuda_split_info->right_value; + } else if (thread_index == 13) { + leaf_weight[num_leaves] = cuda_split_info->right_sum_hessians; + } else if (thread_index == 14) { + leaf_count[num_leaves] = cuda_split_info->right_count; + } else if (thread_index == 15) { + // update leaf depth + leaf_depth[num_leaves] = leaf_depth[leaf_index] + 1; + leaf_depth[leaf_index]++; + } else if (thread_index == 16) { + decision_type[new_node_index] = 0; + SetDecisionType(&decision_type[new_node_index], false, kCategoricalMask); + SetDecisionType(&decision_type[new_node_index], cuda_split_info->default_left, kDefaultLeftMask); + SetMissingType(&decision_type[new_node_index], static_cast(missing_type)); + } else if (thread_index == 17) { + threshold_in_bin[new_node_index] = cuda_split_info->threshold; + } else if (thread_index == 18) { + threshold[new_node_index] = real_threshold; + } +} + +void CUDATree::LaunchSplitKernel(const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info) { + SplitKernel<<<4, 5, 0, cuda_stream_>>>( + // split information + leaf_index, + real_feature_index, + real_threshold, + missing_type, + cuda_split_info, + // tree structure + num_leaves_, + cuda_leaf_parent_, + cuda_leaf_depth_, + cuda_left_child_, + cuda_right_child_, + cuda_split_feature_inner_, + cuda_split_feature_, + cuda_split_gain_, + cuda_internal_weight_, + cuda_internal_value_, + cuda_internal_count_, + cuda_leaf_weight_, + cuda_leaf_value_, + cuda_leaf_count_, + cuda_decision_type_, + cuda_threshold_in_bin_, + cuda_threshold_); +} + template __global__ void AddPredictionToScoreKernel( // dataset information @@ -33,6 +164,7 @@ __global__ void AddPredictionToScoreKernel( const data_size_t data_index = USE_INDICES ? 
cuda_used_indices[inner_data_index] : inner_data_index; if (data_index < num_data) { int node = 0; + int num_iter = 0; while (node >= 0) { const int split_feature_inner = cuda_split_feature_inner[node]; const int column = cuda_feature_to_column[split_feature_inner]; @@ -72,6 +204,10 @@ __global__ void AddPredictionToScoreKernel( node = cuda_right_child[node]; } } + ++num_iter; + if (num_iter >= 1000) { + printf("error num_iter = %d, node = %d, ~node = %d\n", num_iter, node, ~node); + } } score[data_index] += cuda_leaf_value[~node]; } diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 624746aafc1b..850a2c3831c0 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -139,17 +139,16 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplitsStruct* smal const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ && sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0); LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits, smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); - //SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); } -const CUDASplitInfo* CUDABestSplitFinder::FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int smaller_leaf_index, +const CUDASplitInfo* CUDABestSplitFinder::FindBestFromAllSplits(const int cur_num_leaves, const int smaller_leaf_index, const int larger_leaf_index, std::vector* leaf_best_split_feature, std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index) { - LaunchFindBestFromAllSplitsKernel(cuda_cur_num_leaves, smaller_leaf_index, larger_leaf_index, + LaunchFindBestFromAllSplitsKernel(cur_num_leaves, smaller_leaf_index, larger_leaf_index, leaf_best_split_feature, leaf_best_split_threshold, leaf_best_split_default_left, best_leaf_index); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); return cuda_leaf_best_split_info_ + (*best_leaf_index); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 133d4c87ce90..25bfd5223f3d 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -757,22 +757,21 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( } } -__global__ void FindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, +__global__ void FindBestFromAllSplitsKernel(const int cur_num_leaves, int* out_best_leaf, const CUDASplitInfo* cuda_leaf_best_split_info, int* cuda_best_split_info_buffer) { - const int cuda_cur_num_leaves_ref = *cuda_cur_num_leaves; __shared__ double thread_best_gain[NUM_THREADS_FIND_BEST_LEAF]; __shared__ int thread_best_leaf[NUM_THREADS_FIND_BEST_LEAF]; const unsigned int threadIdx_x = threadIdx.x; thread_best_gain[threadIdx_x] = K_MIN_SCORE; thread_best_leaf[threadIdx_x] = -1; - const int num_leaves_per_thread = (cuda_cur_num_leaves_ref + NUM_THREADS_FIND_BEST_LEAF - 1) / NUM_THREADS_FIND_BEST_LEAF; - const int cur_num_valid_threads = (cuda_cur_num_leaves_ref + num_leaves_per_thread - 1) / 
num_leaves_per_thread; + const int num_leaves_per_thread = (cur_num_leaves + NUM_THREADS_FIND_BEST_LEAF - 1) / NUM_THREADS_FIND_BEST_LEAF; + const int cur_num_valid_threads = (cur_num_leaves + num_leaves_per_thread - 1) / num_leaves_per_thread; if (threadIdx_x < static_cast(cur_num_valid_threads)) { const int start = num_leaves_per_thread * threadIdx_x; - const int end = min(start + num_leaves_per_thread, cuda_cur_num_leaves_ref); - for (int leaf_index = threadIdx_x; leaf_index < cuda_cur_num_leaves_ref; leaf_index += cur_num_valid_threads) { + const int end = min(start + num_leaves_per_thread, cur_num_leaves); + for (int leaf_index = threadIdx_x; leaf_index < cur_num_leaves; leaf_index += cur_num_valid_threads) { const double leaf_best_gain = cuda_leaf_best_split_info[leaf_index].gain; if (cuda_leaf_best_split_info[leaf_index].is_valid && leaf_best_gain > thread_best_gain[threadIdx_x]) { thread_best_gain[threadIdx_x] = leaf_best_gain; @@ -812,10 +811,10 @@ __global__ void PrepareLeafBestSplitInfo(const int smaller_leaf_index, const int } } -void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, +void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int cur_num_leaves, const int smaller_leaf_index, const int larger_leaf_index, std::vector* leaf_best_split_feature, std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index) { - FindBestFromAllSplitsKernel<<<1, NUM_THREADS_FIND_BEST_LEAF, 0, cuda_streams_[1]>>>(cuda_cur_num_leaves, cuda_best_leaf_, + FindBestFromAllSplitsKernel<<<1, NUM_THREADS_FIND_BEST_LEAF, 0, cuda_streams_[1]>>>(cur_num_leaves, cuda_best_leaf_, cuda_leaf_best_split_info_, cuda_best_split_info_buffer_); PrepareLeafBestSplitInfo<<<6, 1, 0, cuda_streams_[0]>>>(smaller_leaf_index, larger_leaf_index, diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 12891166c3c5..3c4a3ad9d719 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -11,9 +11,9 @@ #include "new_cuda_utils.hpp" #include "cuda_leaf_splits.hpp" -#include "cuda_split_info.hpp" #include +#include #include #include @@ -43,7 +43,7 @@ class CUDABestSplitFinder { const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf); - const CUDASplitInfo* FindBestFromAllSplits(const int* cuda_cur_num_leaves, const int smaller_leaf_index, + const CUDASplitInfo* FindBestFromAllSplits(const int cur_num_leaves, const int smaller_leaf_index, const int larger_leaf_index, std::vector* leaf_best_split_feature, std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index); @@ -62,7 +62,7 @@ class CUDABestSplitFinder { const bool is_smaller_leaf_valid, const bool is_larger_leaf_valid); - void LaunchFindBestFromAllSplitsKernel(const int* cuda_cur_num_leaves, const int smaller_leaf_index, + void LaunchFindBestFromAllSplitsKernel(const int cur_num_leaves, const int smaller_leaf_index, const int larger_leaf_index, std::vector* leaf_best_split_feature, std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index); diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 60b6d48e705f..64e0496f8ea5 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ 
b/src/treelearner/cuda/cuda_data_partition.cpp @@ -53,7 +53,6 @@ void CUDADataPartition::Init() { AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_data_start_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_data_end_); AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_num_data_); - InitCUDAValueFromConstant(&cuda_cur_num_leaves_, 1); // leave some space for alignment AllocateCUDAMemory(static_cast(num_data_) + 1024 * 8, &cuda_data_to_left_); AllocateCUDAMemory(static_cast(num_data_), &cuda_data_index_to_leaf_index_); @@ -67,19 +66,6 @@ void CUDADataPartition::Init() { AllocateCUDAMemory(12, &cuda_split_info_buffer_); - AllocateCUDAMemory(static_cast(num_leaves_), &tree_split_leaf_index_); - AllocateCUDAMemory(static_cast(num_leaves_), &tree_inner_feature_index_); - AllocateCUDAMemory(static_cast(num_leaves_), &tree_threshold_); - AllocateCUDAMemory(static_cast(num_leaves_), &tree_threshold_real_); - AllocateCUDAMemory(static_cast(num_leaves_), &tree_left_output_); - AllocateCUDAMemory(static_cast(num_leaves_), &tree_right_output_); - AllocateCUDAMemory(static_cast(num_leaves_), &tree_left_count_); - AllocateCUDAMemory(static_cast(num_leaves_), &tree_right_count_); - AllocateCUDAMemory(static_cast(num_leaves_), &tree_left_sum_hessian_); - AllocateCUDAMemory(static_cast(num_leaves_), &tree_right_sum_hessian_); - AllocateCUDAMemory(static_cast(num_leaves_), &tree_gain_); - AllocateCUDAMemory(static_cast(num_leaves_), &tree_default_left_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_output_); cuda_streams_.resize(4); @@ -115,8 +101,6 @@ void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { CopyFromCUDADeviceToCUDADevice(cuda_leaf_num_data_, cuda_num_data_, 1); CopyFromCUDADeviceToCUDADevice(cuda_leaf_data_end_, cuda_num_data_, 1); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - cur_num_leaves_ = 1; - CopyFromHostToCUDADevice(cuda_cur_num_leaves_, &cur_num_leaves_, 1); CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1); } else { Log::Fatal("bagging is not supported by GPU"); @@ -125,6 +109,8 @@ void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { void CUDADataPartition::Split( const CUDASplitInfo* best_split_info, + const int left_leaf_index, + const int right_leaf_index, // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, @@ -134,26 +120,27 @@ void CUDADataPartition::Split( const std::vector& cpu_leaf_best_split_feature, const std::vector& cpu_leaf_best_split_threshold, const std::vector& cpu_leaf_best_split_default_left, - int* smaller_leaf_index, int* larger_leaf_index, - const int cpu_leaf_index, const int cur_max_leaf_index) { + int* smaller_leaf_index, int* larger_leaf_index) { global_timer.Start("GenDataToLeftBitVector"); global_timer.Start("SplitInner Copy CUDA To Host"); - const data_size_t num_data_in_leaf = cpu_leaf_num_data->at(cpu_leaf_index); - const int split_feature_index = cpu_leaf_best_split_feature[cpu_leaf_index]; - const uint32_t split_threshold = cpu_leaf_best_split_threshold[cpu_leaf_index]; - const uint8_t split_default_left = cpu_leaf_best_split_default_left[cpu_leaf_index]; - const data_size_t leaf_data_start = cpu_leaf_data_start->at(cpu_leaf_index); + const data_size_t num_data_in_leaf = cpu_leaf_num_data->at(left_leaf_index); + const int split_feature_index = cpu_leaf_best_split_feature[left_leaf_index]; + const uint32_t split_threshold = cpu_leaf_best_split_threshold[left_leaf_index]; + const uint8_t 
split_default_left = cpu_leaf_best_split_default_left[left_leaf_index]; + const data_size_t leaf_data_start = cpu_leaf_data_start->at(left_leaf_index); global_timer.Stop("SplitInner Copy CUDA To Host"); - GenDataToLeftBitVector(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start, cpu_leaf_index, cur_max_leaf_index); + GenDataToLeftBitVector(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start, left_leaf_index, right_leaf_index); global_timer.Stop("GenDataToLeftBitVector"); global_timer.Start("SplitInner"); SplitInner(num_data_in_leaf, best_split_info, + left_leaf_index, + right_leaf_index, smaller_leaf_splits, larger_leaf_splits, cpu_leaf_num_data, cpu_leaf_data_start, cpu_leaf_sum_hessians, - smaller_leaf_index, larger_leaf_index, cpu_leaf_index); + smaller_leaf_index, larger_leaf_index); global_timer.Stop("SplitInner"); } @@ -167,19 +154,23 @@ void CUDADataPartition::GenDataToLeftBitVector(const data_size_t num_data_in_lea void CUDADataPartition::SplitInner( const data_size_t num_data_in_leaf, const CUDASplitInfo* best_split_info, + const int left_leaf_index, + const int right_leaf_index, // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, - int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index) { + int* smaller_leaf_index, int* larger_leaf_index) { LaunchSplitInnerKernel( num_data_in_leaf, best_split_info, + left_leaf_index, + right_leaf_index, smaller_leaf_splits, larger_leaf_splits, cpu_leaf_num_data, cpu_leaf_data_start, cpu_leaf_sum_hessians, - smaller_leaf_index, larger_leaf_index, cpu_leaf_index); + smaller_leaf_index, larger_leaf_index); ++cur_num_leaves_; } diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 0fa6299f768f..c0b2e2e79428 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -1120,17 +1120,17 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num } __global__ void AggregateBlockOffsetKernel0( - const int leaf_index, + const int left_leaf_index, + const int right_leaf_index, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, - int* cuda_cur_num_leaves, const data_size_t num_blocks) { __shared__ uint32_t block_to_left_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; __shared__ uint32_t block_to_right_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; - const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index]; + const data_size_t num_data_in_leaf = cuda_leaf_num_data[left_leaf_index]; const unsigned int blockDim_x = blockDim.x; const unsigned int threadIdx_x = threadIdx.x; const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); @@ -1176,29 +1176,27 @@ __global__ void AggregateBlockOffsetKernel0( } __syncthreads(); if (blockIdx.x == 0 && threadIdx.x == 0) { - ++(*cuda_cur_num_leaves); - const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; - const data_size_t old_leaf_data_end = cuda_leaf_data_end[leaf_index]; - 
cuda_leaf_data_end[leaf_index] = cuda_leaf_data_start[leaf_index] + static_cast(to_left_total_count); - cuda_leaf_num_data[leaf_index] = static_cast(to_left_total_count); - cuda_leaf_data_start[cur_max_leaf_index] = cuda_leaf_data_end[leaf_index]; - cuda_leaf_data_end[cur_max_leaf_index] = old_leaf_data_end; - cuda_leaf_num_data[cur_max_leaf_index] = num_data_in_leaf - static_cast(to_left_total_count); + const data_size_t old_leaf_data_end = cuda_leaf_data_end[left_leaf_index]; + cuda_leaf_data_end[left_leaf_index] = cuda_leaf_data_start[left_leaf_index] + static_cast(to_left_total_count); + cuda_leaf_num_data[left_leaf_index] = static_cast(to_left_total_count); + cuda_leaf_data_start[right_leaf_index] = cuda_leaf_data_end[left_leaf_index]; + cuda_leaf_data_end[right_leaf_index] = old_leaf_data_end; + cuda_leaf_num_data[right_leaf_index] = num_data_in_leaf - static_cast(to_left_total_count); } } __global__ void AggregateBlockOffsetKernel1( - const int leaf_index, + const int left_leaf_index, + const int right_leaf_index, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, - int* cuda_cur_num_leaves, const data_size_t num_blocks, const data_size_t num_blocks_aligned) { __shared__ uint32_t block_to_left_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; __shared__ uint32_t block_to_right_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; - const data_size_t num_data_in_leaf = cuda_leaf_num_data[leaf_index]; + const data_size_t num_data_in_leaf = cuda_leaf_num_data[left_leaf_index]; const unsigned int threadIdx_x = threadIdx.x; const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); @@ -1223,21 +1221,20 @@ __global__ void AggregateBlockOffsetKernel1( } __syncthreads(); if (blockIdx.x == 0 && threadIdx.x == 0) { - ++(*cuda_cur_num_leaves); - const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; - const data_size_t old_leaf_data_end = cuda_leaf_data_end[leaf_index]; - cuda_leaf_data_end[leaf_index] = cuda_leaf_data_start[leaf_index] + static_cast(to_left_total_count); - cuda_leaf_num_data[leaf_index] = static_cast(to_left_total_count); - cuda_leaf_data_start[cur_max_leaf_index] = cuda_leaf_data_end[leaf_index]; - cuda_leaf_data_end[cur_max_leaf_index] = old_leaf_data_end; - cuda_leaf_num_data[cur_max_leaf_index] = num_data_in_leaf - static_cast(to_left_total_count); + const data_size_t old_leaf_data_end = cuda_leaf_data_end[left_leaf_index]; + cuda_leaf_data_end[left_leaf_index] = cuda_leaf_data_start[left_leaf_index] + static_cast(to_left_total_count); + cuda_leaf_num_data[left_leaf_index] = static_cast(to_left_total_count); + cuda_leaf_data_start[right_leaf_index] = cuda_leaf_data_end[left_leaf_index]; + cuda_leaf_data_end[right_leaf_index] = old_leaf_data_end; + cuda_leaf_num_data[right_leaf_index] = num_data_in_leaf - static_cast(to_left_total_count); } } -__global__ void SplitTreeStructureKernel(const int leaf_index, data_size_t* block_to_left_offset_buffer, +__global__ void SplitTreeStructureKernel(const int left_leaf_index, + const int right_leaf_index, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, data_size_t* 
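For reference, the single-thread bookkeeping at the end of AggregateBlockOffsetKernel0/1 amounts to the following host-style restatement (an illustrative sketch only; the function name is hypothetical, the field names mirror the kernel above): the left child keeps the first to_left_total_count indices of the parent's range and the new right child takes the remainder.

// Sketch, not part of the patch: leaf-range update after a split.
void UpdateLeafRanges(int left_leaf_index, int right_leaf_index,
                      data_size_t to_left_total_count,
                      data_size_t* leaf_data_start, data_size_t* leaf_data_end,
                      data_size_t* leaf_num_data) {
  const data_size_t old_left_data_end = leaf_data_end[left_leaf_index];
  const data_size_t num_data_in_parent = leaf_num_data[left_leaf_index];
  // left child keeps the head of the parent's index range
  leaf_data_end[left_leaf_index] = leaf_data_start[left_leaf_index] + to_left_total_count;
  leaf_num_data[left_leaf_index] = to_left_total_count;
  // right child takes the tail
  leaf_data_start[right_leaf_index] = leaf_data_end[left_leaf_index];
  leaf_data_end[right_leaf_index] = old_left_data_end;
  leaf_num_data[right_leaf_index] = num_data_in_parent - to_left_total_count;
}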
cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, - int* cuda_cur_num_leaves, const CUDASplitInfo* best_split_info, // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, @@ -1246,71 +1243,40 @@ __global__ void SplitTreeStructureKernel(const int leaf_index, data_size_t* bloc hist_t* cuda_hist, hist_t** cuda_hist_pool, const int split_indices_block_size_data_partition, const double* cuda_bin_upper_bounds, const int* cuda_feature_num_bin_offsets, - int* tree_split_leaf_index, int* tree_inner_feature_index, uint32_t* tree_threshold, double* tree_threshold_real, - double* tree_left_output, double* tree_right_output, data_size_t* tree_left_count, data_size_t* tree_right_count, - double* tree_left_sum_hessian, double* tree_right_sum_hessian, double* tree_gain, uint8_t* tree_default_left, double* cuda_leaf_output, int* cuda_split_info_buffer) { - const int cur_max_leaf_index = (*cuda_cur_num_leaves) - 1; - const unsigned int to_left_total_cnt = cuda_leaf_num_data[leaf_index]; + const unsigned int to_left_total_cnt = cuda_leaf_num_data[left_leaf_index]; double* cuda_split_info_buffer_for_hessians = reinterpret_cast(cuda_split_info_buffer + 8); const unsigned int global_thread_index = blockIdx.x * blockDim.x + threadIdx.x; if (global_thread_index == 0) { - tree_split_leaf_index[cur_max_leaf_index - 1] = leaf_index; + cuda_leaf_output[left_leaf_index] = best_split_info->left_value; } else if (global_thread_index == 1) { - tree_inner_feature_index[cur_max_leaf_index - 1] = best_split_info->inner_feature_index; + cuda_leaf_output[right_leaf_index] = best_split_info->right_value; } else if (global_thread_index == 2) { - tree_threshold[cur_max_leaf_index - 1] = best_split_info->threshold; + cuda_split_info_buffer[0] = left_leaf_index; } else if (global_thread_index == 3) { - tree_left_output[cur_max_leaf_index - 1] = best_split_info->left_value; + cuda_split_info_buffer[1] = cuda_leaf_num_data[left_leaf_index]; } else if (global_thread_index == 4) { - tree_right_output[cur_max_leaf_index - 1] = best_split_info->right_value; + cuda_split_info_buffer[2] = cuda_leaf_data_start[left_leaf_index]; } else if (global_thread_index == 5) { - tree_left_count[cur_max_leaf_index - 1] = best_split_info->left_count; + cuda_split_info_buffer[3] = right_leaf_index; } else if (global_thread_index == 6) { - tree_right_count[cur_max_leaf_index - 1] = best_split_info->right_count; + cuda_split_info_buffer[4] = cuda_leaf_num_data[right_leaf_index]; } else if (global_thread_index == 7) { - tree_left_sum_hessian[cur_max_leaf_index - 1] = best_split_info->left_sum_hessians; + cuda_split_info_buffer[5] = cuda_leaf_data_start[right_leaf_index]; } else if (global_thread_index == 8) { - tree_right_sum_hessian[cur_max_leaf_index - 1] = best_split_info->right_sum_hessians; - } else if (global_thread_index == 9) { - tree_gain[cur_max_leaf_index - 1] = best_split_info->gain; - } else if (global_thread_index == 10) { - tree_default_left[cur_max_leaf_index - 1] = best_split_info->default_left; - } else if (global_thread_index == 11) { - cuda_leaf_output[leaf_index] = best_split_info->left_value; - } else if (global_thread_index == 12) { - cuda_leaf_output[cur_max_leaf_index] = best_split_info->right_value; - } else if (global_thread_index == 13) { - cuda_split_info_buffer[0] = leaf_index; - } else if (global_thread_index == 14) { - cuda_split_info_buffer[1] = cuda_leaf_num_data[leaf_index]; - } else if (global_thread_index == 15) { - 
cuda_split_info_buffer[2] = cuda_leaf_data_start[leaf_index]; - } else if (global_thread_index == 16) { - cuda_split_info_buffer[3] = cur_max_leaf_index; - } else if (global_thread_index == 17) { - cuda_split_info_buffer[4] = cuda_leaf_num_data[cur_max_leaf_index]; - } else if (global_thread_index == 18) { - cuda_split_info_buffer[5] = cuda_leaf_data_start[cur_max_leaf_index]; - } else if (global_thread_index == 19) { cuda_split_info_buffer_for_hessians[0] = best_split_info->left_sum_hessians; - } else if (global_thread_index == 20) { + } else if (global_thread_index == 9) { cuda_split_info_buffer_for_hessians[1] = best_split_info->right_sum_hessians; - } else if (global_thread_index == 21) { - const uint32_t threshold_int = best_split_info->threshold; - const int split_inner_feature_index = best_split_info->inner_feature_index; - const double threshold_real = cuda_bin_upper_bounds[cuda_feature_num_bin_offsets[split_inner_feature_index] + threshold_int]; - tree_threshold_real[cur_max_leaf_index - 1] = threshold_real; } - if (cuda_leaf_num_data[leaf_index] < cuda_leaf_num_data[cur_max_leaf_index]) { + if (cuda_leaf_num_data[left_leaf_index] < cuda_leaf_num_data[right_leaf_index]) { if (global_thread_index == 0) { - hist_t* parent_hist_ptr = cuda_hist_pool[leaf_index]; - cuda_hist_pool[cur_max_leaf_index] = parent_hist_ptr; - cuda_hist_pool[leaf_index] = cuda_hist + 2 * cur_max_leaf_index * num_total_bin; - smaller_leaf_splits->hist_in_leaf = cuda_hist_pool[leaf_index]; - larger_leaf_splits->hist_in_leaf = cuda_hist_pool[cur_max_leaf_index]; + hist_t* parent_hist_ptr = cuda_hist_pool[left_leaf_index]; + cuda_hist_pool[right_leaf_index] = parent_hist_ptr; + cuda_hist_pool[left_leaf_index] = cuda_hist + 2 * right_leaf_index * num_total_bin; + smaller_leaf_splits->hist_in_leaf = cuda_hist_pool[left_leaf_index]; + larger_leaf_splits->hist_in_leaf = cuda_hist_pool[right_leaf_index]; } else if (global_thread_index == 1) { smaller_leaf_splits->sum_of_gradients = best_split_info->left_sum_gradients; } else if (global_thread_index == 2) { @@ -1324,29 +1290,29 @@ __global__ void SplitTreeStructureKernel(const int leaf_index, data_size_t* bloc } else if (global_thread_index == 6) { smaller_leaf_splits->data_indices_in_leaf = cuda_data_indices; } else if (global_thread_index == 7) { - larger_leaf_splits->leaf_index = cur_max_leaf_index; + larger_leaf_splits->leaf_index = right_leaf_index; } else if (global_thread_index == 8) { larger_leaf_splits->sum_of_gradients = best_split_info->right_sum_gradients; } else if (global_thread_index == 9) { larger_leaf_splits->sum_of_hessians = best_split_info->right_sum_hessians; } else if (global_thread_index == 10) { - larger_leaf_splits->num_data_in_leaf = cuda_leaf_num_data[cur_max_leaf_index]; + larger_leaf_splits->num_data_in_leaf = cuda_leaf_num_data[right_leaf_index]; } else if (global_thread_index == 11) { larger_leaf_splits->gain = best_split_info->right_gain; } else if (global_thread_index == 12) { larger_leaf_splits->leaf_value = best_split_info->right_value; } else if (global_thread_index == 13) { - larger_leaf_splits->data_indices_in_leaf = cuda_data_indices + cuda_leaf_num_data[leaf_index]; + larger_leaf_splits->data_indices_in_leaf = cuda_data_indices + cuda_leaf_num_data[left_leaf_index]; } else if (global_thread_index == 14) { - cuda_split_info_buffer[6] = leaf_index; + cuda_split_info_buffer[6] = left_leaf_index; } else if (global_thread_index == 15) { - cuda_split_info_buffer[7] = cur_max_leaf_index; + cuda_split_info_buffer[7] = right_leaf_index; } 
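The histogram-pool swap performed by thread 0 above is what makes histogram subtraction work: when the left child ends up smaller, the parent's accumulated histogram stays with the larger (right) child, the smaller child gets a fresh slot, and the larger child's histogram is later recovered by LaunchSubtractHistogramKernel. A minimal sketch of that swap (hypothetical helper name; names otherwise mirror the kernel):

// Sketch: assign histogram slots when the left child is the smaller one.
__device__ void AssignHistogramSlots(int left_leaf_index, int right_leaf_index,
                                     int num_total_bin, hist_t* cuda_hist,
                                     hist_t** cuda_hist_pool) {
  hist_t* parent_hist_ptr = cuda_hist_pool[left_leaf_index];
  // larger (right) child reuses the parent's histogram, to be corrected by subtraction
  cuda_hist_pool[right_leaf_index] = parent_hist_ptr;
  // smaller (left) child gets a fresh slot and is constructed from scratch
  cuda_hist_pool[left_leaf_index] = cuda_hist + 2 * right_leaf_index * num_total_bin;
}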
else if (global_thread_index == 16) { - smaller_leaf_splits->leaf_index = leaf_index; + smaller_leaf_splits->leaf_index = left_leaf_index; } } else { if (global_thread_index == 0) { - larger_leaf_splits->leaf_index = leaf_index; + larger_leaf_splits->leaf_index = left_leaf_index; } else if (global_thread_index == 1) { larger_leaf_splits->sum_of_gradients = best_split_info->left_sum_gradients; } else if (global_thread_index == 2) { @@ -1360,45 +1326,44 @@ __global__ void SplitTreeStructureKernel(const int leaf_index, data_size_t* bloc } else if (global_thread_index == 6) { larger_leaf_splits->data_indices_in_leaf = cuda_data_indices; } else if (global_thread_index == 7) { - smaller_leaf_splits->leaf_index = cur_max_leaf_index; + smaller_leaf_splits->leaf_index = right_leaf_index; } else if (global_thread_index == 8) { smaller_leaf_splits->sum_of_gradients = best_split_info->right_sum_gradients; } else if (global_thread_index == 9) { smaller_leaf_splits->sum_of_hessians = best_split_info->right_sum_hessians; } else if (global_thread_index == 10) { - smaller_leaf_splits->num_data_in_leaf = cuda_leaf_num_data[cur_max_leaf_index]; + smaller_leaf_splits->num_data_in_leaf = cuda_leaf_num_data[right_leaf_index]; } else if (global_thread_index == 11) { smaller_leaf_splits->gain = best_split_info->right_gain; } else if (global_thread_index == 12) { smaller_leaf_splits->leaf_value = best_split_info->right_value; } else if (global_thread_index == 13) { - smaller_leaf_splits->data_indices_in_leaf = cuda_data_indices + cuda_leaf_num_data[leaf_index]; + smaller_leaf_splits->data_indices_in_leaf = cuda_data_indices + cuda_leaf_num_data[left_leaf_index]; } else if (global_thread_index == 14) { - cuda_hist_pool[cur_max_leaf_index] = cuda_hist + 2 * cur_max_leaf_index * num_total_bin; - smaller_leaf_splits->hist_in_leaf = cuda_hist_pool[cur_max_leaf_index]; + cuda_hist_pool[right_leaf_index] = cuda_hist + 2 * right_leaf_index * num_total_bin; + smaller_leaf_splits->hist_in_leaf = cuda_hist_pool[right_leaf_index]; } else if (global_thread_index == 15) { - larger_leaf_splits->hist_in_leaf = cuda_hist_pool[leaf_index]; + larger_leaf_splits->hist_in_leaf = cuda_hist_pool[left_leaf_index]; } else if (global_thread_index == 16) { - cuda_split_info_buffer[6] = cur_max_leaf_index; + cuda_split_info_buffer[6] = right_leaf_index; } else if (global_thread_index == 17) { - cuda_split_info_buffer[7] = leaf_index; + cuda_split_info_buffer[7] = left_leaf_index; } } } -__global__ void SplitInnerKernel(const int leaf_index, const int* cuda_cur_num_leaves, +__global__ void SplitInnerKernel(const int left_leaf_index, const int right_leaf_index, const data_size_t* cuda_leaf_data_start, const data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, const uint8_t* split_to_left_bit_vector, const data_size_t* block_to_left_offset_buffer, const data_size_t* block_to_right_offset_buffer, data_size_t* out_data_indices_in_leaf, const int split_indices_block_size_data_partition) { - //__shared__ uint8_t thread_split_to_left_bit_vector[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; __shared__ uint16_t thread_to_left_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; __shared__ uint16_t thread_to_right_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; uint8_t first_to_left = 0; uint8_t second_to_left = 0; - const data_size_t leaf_num_data_offset = cuda_leaf_data_start[leaf_index]; - const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[leaf_index] + 
cuda_leaf_num_data[(*cuda_cur_num_leaves) - 1]; + const data_size_t leaf_num_data_offset = cuda_leaf_data_start[left_leaf_index]; + const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[left_leaf_index] + cuda_leaf_num_data[right_leaf_index]; const unsigned int threadIdx_x = threadIdx.x; const unsigned int blockDim_x = blockDim.x; const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); @@ -1469,12 +1434,15 @@ __global__ void CopyDataIndicesKernel( void CUDADataPartition::LaunchSplitInnerKernel( const data_size_t num_data_in_leaf, const CUDASplitInfo* best_split_info, + const int left_leaf_index, + const int right_leaf_index, // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, - std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, + std::vector* cpu_leaf_num_data, + std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, - int* smaller_leaf_index, int* larger_leaf_index, const int cpu_leaf_index) { + int* smaller_leaf_index, int* larger_leaf_index) { const int min_num_blocks = num_data_in_leaf <= 100 ? 1 : 80; const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; @@ -1494,19 +1462,19 @@ void CUDADataPartition::LaunchSplitInnerKernel( if (num_blocks_final > AGGREGATE_BLOCK_SIZE_DATA_PARTITION) { AggregateBlockOffsetKernel0<<<1, AGGREGATE_BLOCK_SIZE_DATA_PARTITION, 0, cuda_streams_[0]>>>( - cpu_leaf_index, + left_leaf_index, + right_leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, - cuda_cur_num_leaves_, num_blocks_final); } else { AggregateBlockOffsetKernel1<<<1, num_blocks_final_aligned, 0, cuda_streams_[0]>>>( - cpu_leaf_index, + left_leaf_index, + right_leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, - cuda_cur_num_leaves_, num_blocks_final, num_blocks_final_aligned); } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); @@ -1514,17 +1482,17 @@ void CUDADataPartition::LaunchSplitInnerKernel( global_timer.Start("CUDADataPartition::SplitInnerKernel"); SplitInnerKernel<<>>( - cpu_leaf_index, cuda_cur_num_leaves_, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, + left_leaf_index, right_leaf_index, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_out_data_indices_in_leaf_, split_indices_block_size_data_partition_aligned); //SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDADataPartition::SplitInnerKernel"); global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); - SplitTreeStructureKernel<<<4, 6, 0, cuda_streams_[0]>>>(cpu_leaf_index, cuda_block_data_to_left_offset_, + SplitTreeStructureKernel<<<4, 5, 0, cuda_streams_[0]>>>(left_leaf_index, right_leaf_index, + cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_out_data_indices_in_leaf_, - cuda_cur_num_leaves_, best_split_info, smaller_leaf_splits, larger_leaf_splits, @@ -1533,10 +1501,6 @@ void 
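SplitInnerKernel performs a stable two-way partition of the parent leaf's data indices using per-block left/right offsets plus shared-memory prefix sums. The parallel details are elided here, but the effect is equivalent to the following sequential sketch (illustrative only, hypothetical function name):

// Sketch: the stable partition that SplitInnerKernel computes in parallel.
void StablePartition(const data_size_t* parent_indices, data_size_t num_data,
                     const uint8_t* to_left_bit_vector,
                     data_size_t num_left,  // == to_left_total_count
                     data_size_t* out_indices) {
  data_size_t left_pos = 0;
  data_size_t right_pos = num_left;
  for (data_size_t i = 0; i < num_data; ++i) {
    if (to_left_bit_vector[i]) {
      out_indices[left_pos++] = parent_indices[i];  // left child block, order preserved
    } else {
      out_indices[right_pos++] = parent_indices[i];  // right child block, order preserved
    }
  }
}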
CUDADataPartition::LaunchSplitInnerKernel( cuda_hist_pool_, split_indices_block_size_data_partition_aligned, cuda_bin_upper_bounds_, cuda_feature_num_bin_offsets_, - - tree_split_leaf_index_, tree_inner_feature_index_, tree_threshold_, tree_threshold_real_, - tree_left_output_, tree_right_output_, tree_left_count_, tree_right_count_, - tree_left_sum_hessian_, tree_right_sum_hessian_, tree_gain_, tree_default_left_, cuda_leaf_output_, cuda_split_info_buffer_); global_timer.Stop("CUDADataPartition::SplitTreeStructureKernel"); std::vector cpu_split_info_buffer(12); @@ -1552,8 +1516,6 @@ void CUDADataPartition::LaunchSplitInnerKernel( CopyDataIndicesKernel<<>>( left_leaf_num_data + right_leaf_num_data, cuda_out_data_indices_in_leaf_, cuda_data_indices_ + left_leaf_data_start); global_timer.Stop("CUDADataPartition::CopyDataIndicesKernel"); - const int left_leaf_index = cpu_split_info_buffer[0]; - const int right_leaf_index = cpu_split_info_buffer[3]; const data_size_t right_leaf_data_start = cpu_split_info_buffer[5]; (*cpu_leaf_num_data)[left_leaf_index] = left_leaf_num_data; (*cpu_leaf_data_start)[left_leaf_index] = left_leaf_data_start; diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 22b49f44d849..27b8d1267f40 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -14,7 +14,7 @@ #include #include "new_cuda_utils.hpp" #include "cuda_leaf_splits.hpp" -#include "cuda_split_info.hpp" +#include // TODO(shiyu1994): adjust these values according to different CUDA and GPU versions #define FILL_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) @@ -42,6 +42,8 @@ class CUDADataPartition { void Split( // input best split info const CUDASplitInfo* best_split_info, + const int left_leaf_index, + const int right_leaf_index, // for leaf information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, @@ -53,48 +55,14 @@ class CUDADataPartition { const std::vector& leaf_best_split_threshold, const std::vector& leaf_best_split_default_left, int* smaller_leaf_index, - int* larger_leaf_index, - const int leaf_index, - const int cur_max_leaf_index); + int* larger_leaf_index); Tree* GetCPUTree(); void UpdateTrainScore(const double learning_rate, double* cuda_scores); - const data_size_t* cuda_leaf_data_start() const { return cuda_leaf_data_start_; } - - const data_size_t* cuda_leaf_data_end() const { return cuda_leaf_data_end_; } - - const data_size_t* cuda_leaf_num_data() const { return cuda_leaf_num_data_; } - const data_size_t* cuda_data_indices() const { return cuda_data_indices_; } - const int* cuda_cur_num_leaves() const { return cuda_cur_num_leaves_; } - - const int* tree_split_leaf_index() const { return tree_split_leaf_index_; } - - const int* tree_inner_feature_index() const { return tree_inner_feature_index_; } - - const uint32_t* tree_threshold() const { return tree_threshold_; } - - const double* tree_threshold_real() const { return tree_threshold_real_; } - - const double* tree_left_output() const { return tree_left_output_; } - - const double* tree_right_output() const { return tree_right_output_; } - - const data_size_t* tree_left_count() const { return tree_left_count_; } - - const data_size_t* tree_right_count() const { return tree_right_count_; } - - const double* tree_left_sum_hessian() const { return tree_left_sum_hessian_; } - - const double* tree_right_sum_hessian() const { return tree_right_sum_hessian_; } - - const double* tree_gain() const { 
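The host-side readback above relies on the layout that SplitTreeStructureKernel writes into cuda_split_info_buffer_ (12 ints, with two doubles aliased at offset 8):

//   cuda_split_info_buffer_ layout after SplitTreeStructureKernel:
//   [0] left leaf index      [1] left leaf num_data    [2] left leaf data start
//   [3] right leaf index     [4] right leaf num_data   [5] right leaf data start
//   [6] smaller leaf index   [7] larger leaf index
//   [8..11] reinterpreted as double[2]: left / right sum of hessians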
return tree_gain_; } - - const uint8_t* tree_default_left() const { return tree_default_left_; } - private: void CalcBlockDim(const data_size_t num_data_in_leaf, int* grid_dim, @@ -112,6 +80,8 @@ class CUDADataPartition { void SplitInner( const data_size_t num_data_in_leaf, const CUDASplitInfo* best_split_info, + const int left_leaf_index, + const int right_leaf_index, // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, @@ -119,8 +89,7 @@ class CUDADataPartition { std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, - int* larger_leaf_index, - const int leaf_index); + int* larger_leaf_index); // kernel launch functions void LaunchFillDataIndicesBeforeTrain(); @@ -128,6 +97,8 @@ class CUDADataPartition { void LaunchSplitInnerKernel( const data_size_t num_data_in_leaf, const CUDASplitInfo* best_split_info, + const int left_leaf_index, + const int right_leaf_index, // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, @@ -135,8 +106,7 @@ class CUDADataPartition { std::vector* cpu_leaf_data_start, std::vector* cpu_leaf_sum_hessians, int* smaller_leaf_index, - int* larger_leaf_index, - const int cpu_leaf_index); + int* larger_leaf_index); void LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, @@ -254,8 +224,6 @@ class CUDADataPartition { data_size_t* cuda_leaf_data_end_; /*! \brief number of data in each leaf */ data_size_t* cuda_leaf_num_data_; - /*! \brief currnet number of leaves in tree */ - int* cuda_cur_num_leaves_; /*! \brief records the histogram of each leaf */ hist_t** cuda_hist_pool_; /*! \brief records the value of each leaf */ @@ -276,30 +244,6 @@ class CUDADataPartition { // split tree structure algorithm related /*! \brief buffer to store split information, prepared to be copied to cpu */ int* cuda_split_info_buffer_; - /*! \brief the sequence of leaf indices being split during tree growing */ - int* tree_split_leaf_index_; - /*! \brief the sequence of inner split indices during tree growing */ - int* tree_inner_feature_index_; - /*! \brief the sequence of inner threshold during tree growing */ - uint32_t* tree_threshold_; - /*! \brief the sequence of real threshold during tree growing */ - double* tree_threshold_real_; - /*! \brief the sequence of left child output value of splits during tree growing */ - double* tree_left_output_; - /*! \brief the sequence of right child output value of splits during tree growing */ - double* tree_right_output_; - /*! \brief the sequence of left child data number value of splits during tree growing */ - data_size_t* tree_left_count_; - /*! \brief the sequence of right child data number value of splits during tree growing */ - data_size_t* tree_right_count_; - /*! \brief the sequence of left child hessian sum value of splits during tree growing */ - double* tree_left_sum_hessian_; - /*! \brief the sequence of right child hessian sum value of splits during tree growing */ - double* tree_right_sum_hessian_; - /*! \brief the sequence of split gains during tree growing */ - double* tree_gain_; - /*! \brief the sequence of split default left during tree growing */ - uint8_t* tree_default_left_; // dataset information /*! 
\brief upper bounds of bin boundaries for feature histograms */ diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 98fe0daf582e..9be270b14621 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -290,13 +290,13 @@ void CUDAHistogramConstructor::PushOneData(const uint32_t feature_bin_value, void CUDAHistogramConstructor::ConstructHistogramForLeaf( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const CUDALeafSplitsStruct* cuda_larger_leaf_splits, - const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, + const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf) { if ((num_data_in_smaller_leaf <= min_data_in_leaf_ || sum_hessians_in_smaller_leaf <= min_sum_hessian_in_leaf_) && (num_data_in_larger_leaf <= min_data_in_leaf_ || sum_hessians_in_larger_leaf <= min_sum_hessian_in_leaf_)) { return; } - LaunchConstructHistogramKernel(cuda_smaller_leaf_splits, cuda_leaf_num_data, num_data_in_smaller_leaf); + LaunchConstructHistogramKernel(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Start("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); LaunchSubtractHistogramKernel(cuda_smaller_leaf_splits, cuda_larger_leaf_splits); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 061c23eaae6f..cdd4921568a8 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -83,17 +83,14 @@ __global__ void CUDAConstructHistogramDenseKernel( const score_t* cuda_gradients, const score_t* cuda_hessians, const int* num_feature_groups, - const data_size_t* leaf_num_data, const BIN_TYPE* data, const uint32_t* column_hist_offsets, const uint32_t* column_hist_offsets_full, const int* feature_partition_column_index_offsets, const data_size_t num_data) { - - const int leaf_index_ref = smaller_leaf_splits->leaf_index; const int dim_y = static_cast(gridDim.y * blockDim.y); - const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; - const data_size_t num_data_per_thread = (num_data_in_smaller_leaf_ref + dim_y - 1) / dim_y; + const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; __shared__ float shared_hist[SHRAE_HIST_SIZE]; const unsigned int num_threads_per_block = blockDim.x * blockDim.y; @@ -113,7 +110,7 @@ __global__ void CUDAConstructHistogramDenseKernel( const unsigned int blockIdx_y = blockIdx.y; const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; - data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf_ref - block_start, num_data_per_thread * static_cast(blockDim.y))); + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * static_cast(blockDim.y))); const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; const data_size_t remainder = 
block_num_data % blockDim.y; const data_size_t num_iteration_this = remainder == 0 ? num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); @@ -146,17 +143,14 @@ __global__ void CUDAConstructHistogramSparseKernel( const score_t* cuda_gradients, const score_t* cuda_hessians, const int* num_feature_groups, - const data_size_t* leaf_num_data, const BIN_TYPE* data, const DATA_PTR_TYPE* row_ptr, const DATA_PTR_TYPE* partition_ptr, const uint32_t* column_hist_offsets_full, const data_size_t num_data) { - - const int leaf_index_ref = smaller_leaf_splits->leaf_index; const int dim_y = static_cast(gridDim.y * blockDim.y); - const data_size_t num_data_in_smaller_leaf_ref = leaf_num_data[leaf_index_ref]; - const data_size_t num_data_per_thread = (num_data_in_smaller_leaf_ref + dim_y - 1) / dim_y; + const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; __shared__ float shared_hist[SHRAE_HIST_SIZE]; const unsigned int num_threads_per_block = blockDim.x * blockDim.y; @@ -174,7 +168,7 @@ __global__ void CUDAConstructHistogramSparseKernel( const unsigned int blockIdx_y = blockIdx.y; const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; - data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf_ref - block_start, num_data_per_thread * static_cast(blockDim.y))); + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * static_cast(blockDim.y))); const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; const data_size_t remainder = block_num_data % blockDim.y; const data_size_t num_iteration_this = remainder == 0 ? 
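The work partitioning in both histogram kernels now derives the row count directly from smaller_leaf_splits->num_data_in_leaf instead of indexing cuda_leaf_num_data. In summary (restating the index math shown above):

//   dim_y               = gridDim.y * blockDim.y
//   num_data_per_thread = ceil(num_data_in_smaller_leaf / dim_y)
//   block_start         = blockIdx.y * blockDim.y * num_data_per_thread
//   block_num_data      = max(0, min(num_data_in_smaller_leaf - block_start,
//                                    num_data_per_thread * blockDim.y))
// i.e. the y dimension of the launch grid tiles the rows of the smaller leaf,
// and each block clamps its chunk at the leaf boundary.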
num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); @@ -204,7 +198,6 @@ __global__ void CUDAConstructHistogramSparseKernel( void CUDAHistogramConstructor::LaunchConstructHistogramKernel( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf) { int grid_dim_x = 0; int grid_dim_y = 0; @@ -219,7 +212,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_num_feature_groups_, cuda_data_uint8_t_, cuda_row_ptr_uint16_t_, cuda_partition_ptr_uint16_t_, @@ -229,7 +222,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_num_feature_groups_, cuda_data_uint8_t_, cuda_row_ptr_uint32_t_, cuda_partition_ptr_uint32_t_, @@ -239,7 +232,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_num_feature_groups_, cuda_data_uint8_t_, cuda_row_ptr_uint64_t_, cuda_partition_ptr_uint64_t_, @@ -251,7 +244,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_num_feature_groups_, cuda_data_uint16_t_, cuda_row_ptr_uint16_t_, cuda_partition_ptr_uint16_t_, @@ -261,7 +254,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_num_feature_groups_, cuda_data_uint16_t_, cuda_row_ptr_uint32_t_, cuda_partition_ptr_uint32_t_, @@ -271,7 +264,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_num_feature_groups_, cuda_data_uint16_t_, cuda_row_ptr_uint64_t_, cuda_partition_ptr_uint64_t_, @@ -283,7 +276,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_num_feature_groups_, cuda_data_uint32_t_, cuda_row_ptr_uint16_t_, cuda_partition_ptr_uint16_t_, @@ -293,7 +286,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_num_feature_groups_, cuda_data_uint32_t_, cuda_row_ptr_uint32_t_, cuda_partition_ptr_uint32_t_, @@ -303,7 +296,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_leaf_num_data, + cuda_num_feature_groups_, cuda_data_uint32_t_, cuda_row_ptr_uint64_t_, cuda_partition_ptr_uint64_t_, @@ -316,7 +309,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, 
cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint8_t_, + cuda_num_feature_groups_, cuda_data_uint8_t_, cuda_column_hist_offsets_, cuda_column_hist_offsets_full_, cuda_feature_partition_column_index_offsets_, @@ -325,7 +318,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint16_t_, + cuda_num_feature_groups_, cuda_data_uint16_t_, cuda_column_hist_offsets_, cuda_column_hist_offsets_full_, cuda_feature_partition_column_index_offsets_, @@ -334,7 +327,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_leaf_num_data, cuda_data_uint32_t_, + cuda_num_feature_groups_, cuda_data_uint32_t_, cuda_column_hist_offsets_, cuda_column_hist_offsets_full_, cuda_feature_partition_column_index_offsets_, diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 34f7764c411b..a3cfecc9ef6e 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -39,7 +39,7 @@ class CUDAHistogramConstructor { void ConstructHistogramForLeaf( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const CUDALeafSplitsStruct* cuda_larger_leaf_splits, - const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, + const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf); void BeforeTrain(const score_t* gradients, const score_t* hessians); @@ -59,7 +59,6 @@ class CUDAHistogramConstructor { void LaunchConstructHistogramKernel( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const data_size_t* cuda_leaf_num_data, const data_size_t num_data_in_smaller_leaf); void LaunchSubtractHistogramKernel( diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index ba80dd057057..34a4c0b92ead 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -89,50 +89,6 @@ void NewCUDATreeLearner::AddPredictionToScore(const Tree* /*tree*/, double* out_ cuda_data_partition_->UpdateTrainScore(config_->learning_rate, out_score); } -Tree* NewCUDATreeLearner::BuildTree(const int num_leaves) { - std::unique_ptr tree(new Tree(config_->num_leaves, false, false)); - std::vector leaf_index(config_->num_leaves); - std::vector inner_feature_index(config_->num_leaves); - std::vector threshold(config_->num_leaves); - std::vector left_output(config_->num_leaves); - std::vector right_output(config_->num_leaves); - std::vector left_count(config_->num_leaves); - std::vector right_count(config_->num_leaves); - std::vector left_sum_hessian(config_->num_leaves); - std::vector right_sum_hessian(config_->num_leaves); - std::vector gain(config_->num_leaves); - std::vector default_left(config_->num_leaves); - CopyFromCUDADeviceToHost(leaf_index.data(), cuda_data_partition_->tree_split_leaf_index(), config_->num_leaves); - CopyFromCUDADeviceToHost(inner_feature_index.data(), cuda_data_partition_->tree_inner_feature_index(), config_->num_leaves); - CopyFromCUDADeviceToHost(threshold.data(), 
cuda_data_partition_->tree_threshold(), config_->num_leaves); - CopyFromCUDADeviceToHost(left_output.data(), cuda_data_partition_->tree_left_output(), config_->num_leaves); - CopyFromCUDADeviceToHost(right_output.data(), cuda_data_partition_->tree_right_output(), config_->num_leaves); - CopyFromCUDADeviceToHost(left_count.data(), cuda_data_partition_->tree_left_count(), config_->num_leaves); - CopyFromCUDADeviceToHost(right_count.data(), cuda_data_partition_->tree_right_count(), config_->num_leaves); - CopyFromCUDADeviceToHost(left_sum_hessian.data(), cuda_data_partition_->tree_left_sum_hessian(), config_->num_leaves); - CopyFromCUDADeviceToHost(right_sum_hessian.data(), cuda_data_partition_->tree_right_sum_hessian(), config_->num_leaves); - CopyFromCUDADeviceToHost(gain.data(), cuda_data_partition_->tree_gain(), config_->num_leaves); - CopyFromCUDADeviceToHost(default_left.data(), cuda_data_partition_->tree_default_left(), config_->num_leaves); - for (int i = 0; i < num_leaves - 1; ++i) { - tree->Split( - leaf_index[i], - inner_feature_index[i], - train_data_->RealFeatureIndex(inner_feature_index[i]), - threshold[i], - train_data_->RealThreshold(inner_feature_index[i], threshold[i]), - left_output[i], - right_output[i], - left_count[i], - right_count[i], - left_sum_hessian[i], - right_sum_hessian[i], - gain[i], - train_data_->FeatureBinMapper(inner_feature_index[i])->missing_type(), - static_cast(default_left[i])); - } - return tree.release(); -} - Tree* NewCUDATreeLearner::Train(const score_t* gradients, const score_t* hessians, bool /*is_first_tree*/) { gradients_ = gradients; @@ -147,7 +103,8 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, double find_best_split_time = 0.0f; double find_best_split_from_all_leaves_time = 0.0f; double split_data_indices_time = 0.0f; - int num_leaves = 1; + const bool track_branch_features = !(config_->interaction_constraints_vector.empty()); + std::unique_ptr tree(new CUDATree(config_->num_leaves, track_branch_features, config_->linear_tree)); for (int i = 0; i < config_->num_leaves - 1; ++i) { global_timer.Start("NewCUDATreeLearner::ConstructHistogramForLeaf"); auto start = std::chrono::steady_clock::now(); @@ -158,7 +115,6 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, cuda_histogram_constructor_->ConstructHistogramForLeaf( cuda_smaller_leaf_splits_->GetCUDAStruct(), cuda_larger_leaf_splits_->GetCUDAStruct(), - cuda_data_partition_->cuda_leaf_num_data(), num_data_in_smaller_leaf, num_data_in_larger_leaf, sum_hessians_in_smaller_leaf, @@ -181,9 +137,14 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, find_best_split_time += duration.count(); start = std::chrono::steady_clock::now(); global_timer.Start("NewCUDATreeLearner::FindBestFromAllSplits"); - const CUDASplitInfo* best_split_info = cuda_best_split_finder_->FindBestFromAllSplits(cuda_data_partition_->cuda_cur_num_leaves(), - smaller_leaf_index_, larger_leaf_index_, - &leaf_best_split_feature_, &leaf_best_split_threshold_, &leaf_best_split_default_left_, &best_leaf_index_); + const CUDASplitInfo* best_split_info = cuda_best_split_finder_->FindBestFromAllSplits( + tree->num_leaves(), + smaller_leaf_index_, + larger_leaf_index_, + &leaf_best_split_feature_, + &leaf_best_split_threshold_, + &leaf_best_split_default_left_, + &best_leaf_index_); global_timer.Stop("NewCUDATreeLearner::FindBestFromAllSplits"); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); @@ -196,8 +157,17 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, 
global_timer.Start("NewCUDATreeLearner::Split"); start = std::chrono::steady_clock::now(); + int right_leaf_index = tree->Split(best_leaf_index_, + train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]), + train_data_->RealThreshold(leaf_best_split_feature_[best_leaf_index_], + leaf_best_split_threshold_[best_leaf_index_]), + train_data_->FeatureBinMapper(leaf_best_split_feature_[best_leaf_index_])->missing_type(), + best_split_info); + cuda_data_partition_->Split( best_split_info, + best_leaf_index_, + right_leaf_index, cuda_smaller_leaf_splits_->GetCUDAStructRef(), cuda_larger_leaf_splits_->GetCUDAStructRef(), &leaf_num_data_, @@ -207,30 +177,21 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, leaf_best_split_threshold_, leaf_best_split_default_left_, &smaller_leaf_index_, - &larger_leaf_index_, - best_leaf_index_, - num_leaves); + &larger_leaf_index_); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); global_timer.Stop("NewCUDATreeLearner::Split"); split_data_indices_time += duration.count(); - ++num_leaves; } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); const auto end = std::chrono::steady_clock::now(); const double duration = (static_cast>(end - start)).count(); - const auto build_tree_start = std::chrono::steady_clock::now(); - std::unique_ptr tree(BuildTree(num_leaves)); - const auto build_tree_end = std::chrono::steady_clock::now(); - const auto build_tre_duration = (static_cast>(build_tree_end - build_tree_start)).count(); Log::Warning("Train time %f", duration); Log::Warning("before train time %f", static_cast>(before_train_end - before_train_start).count()); Log::Warning("construct histogram time %f", construct_histogram_time); Log::Warning("find best split time %f", find_best_split_time); Log::Warning("find best split time from all leaves %f", find_best_split_from_all_leaves_time); Log::Warning("split data indices time %f", split_data_indices_time); - Log::Warning("build tree time %f", build_tre_duration); - tree.reset(new CUDATree(tree.get())); return tree.release(); } diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index ce28d8089f57..3e3ae7e7d486 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -49,8 +49,6 @@ class NewCUDATreeLearner: public SerialTreeLearner { void BeforeTrain() override; - Tree* BuildTree(const int num_leaves); - // number of GPUs int num_gpus_; // number of threads on CPU From 9dea18d71896063879f01e42afaaed293f790120 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 26 Jul 2021 10:58:04 +0000 Subject: [PATCH 043/166] use cuda row data in cuda histogram constructor --- include/LightGBM/cuda/cuda_algorithms.hpp | 30 ++ include/LightGBM/cuda/cuda_row_data.hpp | 139 ++++++ include/LightGBM/train_share_states.h | 4 +- .../cuda/cuda_best_split_finder.cpp | 218 ++++++--- .../cuda/cuda_best_split_finder.cu | 74 ++- .../cuda/cuda_best_split_finder.hpp | 79 +-- src/treelearner/cuda/cuda_data_partition.cpp | 92 ++-- src/treelearner/cuda/cuda_data_partition.cu | 97 ++-- src/treelearner/cuda/cuda_data_partition.hpp | 109 +++-- .../cuda/cuda_histogram_constructor.cpp | 449 ++---------------- .../cuda/cuda_histogram_constructor.cu | 176 ++++--- .../cuda/cuda_histogram_constructor.hpp | 111 ++--- .../cuda/new_cuda_tree_learner.cpp | 83 ++-- 13 files changed, 760 insertions(+), 901 deletions(-) create mode 100644 include/LightGBM/cuda/cuda_algorithms.hpp create mode 100644 
include/LightGBM/cuda/cuda_row_data.hpp diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp new file mode 100644 index 000000000000..53ab11ba315e --- /dev/null +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -0,0 +1,30 @@ +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ +#ifndef LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_ +#define LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_ + +#ifdef USE_CUDA + +#include +#include +#include + +#include + +namespace LightGBM { + +template +__device__ ReduceSum(T* values, size_t n); + +template +__device__ ReduceMax(T* values, size_t n); + +template +__device__ PrefixSum(T* values, size_t n); + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_ diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp new file mode 100644 index 000000000000..1ea850d5b2f8 --- /dev/null +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -0,0 +1,139 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#ifndef LIGHTGBM_CUDA_ROW_DATA_HPP_ +#define LIGHTGBM_CUDA_ROW_DATA_HPP_ + +#include +#include +#include +#include +#include + +#include "../train_share_states.h" + +#define SHRAE_HIST_SIZE (6144 * 2) + +namespace LightGBM { + +class CUDARowData { + public: + CUDARowData(const Dataset* train_data, + const TrainingShareStates* train_share_state); + + void Init(const Dataset* train_data, + TrainingShareStates* train_share_state); + + int num_feature_partitions() const { return num_feature_partitions_; } + + int max_num_column_per_partition() const { return max_num_column_per_partition_; } + + bool is_sparse() const { return is_sparse_; } + + uint8_t bit_type() const { return bit_type_; } + + uint8_t row_ptr_bit_type() const { return row_ptr_bit_type_; } + + const uint8_t* cuda_data_uint8() const { return cuda_data_uint8_t_; } + + const uint16_t* cuda_data_uint16() const { return cuda_data_uint16_t_; } + + const uint32_t* cuda_data_uint32() const { return cuda_data_uint32_t_; } + + const uint16_t* cuda_row_ptr_uint16() const { return cuda_row_ptr_uint16_t_; } + + const uint32_t* cuda_row_ptr_uint32() const { return cuda_row_ptr_uint32_t_; } + + const uint64_t* cuda_row_ptr_uint64() const { return cuda_row_ptr_uint64_t_; } + + const uint16_t* cuda_partition_ptr_uint16() const { return cuda_partition_ptr_uint16_t_; } + + const uint32_t* cuda_partition_ptr_uint32() const { return cuda_partition_ptr_uint32_t_; } + + const uint64_t* cuda_partition_ptr_uint64() const { return cuda_partition_ptr_uint64_t_; } + + const int* cuda_feature_partition_column_index_offsets() const { return cuda_feature_partition_column_index_offsets_; } + + const uint32_t* cuda_column_hist_offsets() const { return cuda_column_hist_offsets_; } + + const uint32_t* cuda_partition_hist_offsets() const { return cuda_partition_hist_offsets_; } + + private: + void DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state); + + template + void GetDenseDataPartitioned(const BIN_TYPE* row_wise_data, std::vector* partitioned_data); + + template + void GetSparseDataPartitioned(const BIN_TYPE* row_wise_data, + const ROW_PTR_TYPE* row_ptr, + std::vector>* partitioned_data, + std::vector>* partitioned_row_ptr, + std::vector* partition_ptr); + + template 
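cuda_algorithms.hpp only declares the device-side ReduceSum / ReduceMax / PrefixSum templates. For orientation, a typical block-level tree reduction that such a helper might use looks like the sketch below (assumptions: all threads of one block call it with the same n, n is a power of two not larger than the block size, and values lives in shared memory; this is illustrative, not the header's actual implementation, and the name is hypothetical):

template <typename T>
__device__ T ReduceSumSketch(T* values, size_t n) {
  // classic in-place tree reduction over shared memory
  const unsigned int tid = threadIdx.x;
  for (size_t offset = n / 2; offset > 0; offset >>= 1) {
    if (tid < offset) {
      values[tid] += values[tid + offset];
    }
    __syncthreads();
  }
  return values[0];
}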
+ void InitSparseData(const BIN_TYPE* host_data, + const ROW_PTR_TYPE* host_row_ptr, + BIN_TYPE** cuda_data, + ROW_PTR_TYPE** cuda_row_ptr, + ROW_PTR_TYPE** cuda_partition_ptr); + + /*! \brief number of threads to use */ + int num_threads_; + /*! \brief number of training data */ + data_size_t num_data_; + /*! \brief number of bins of all features */ + int num_total_bin_; + /*! \brief number of feature groups in dataset */ + int num_feature_group_; + /*! \brief number of features in dataset */ + int num_feature_; + /*! \brief number of bits used to store each bin value */ + uint8_t bit_type_; + /*! \brief number of bits used to store each row pointer value */ + uint8_t row_ptr_bit_type_; + /*! \brief is sparse row wise data */ + bool is_sparse_; + /*! \brief start column index of each feature partition */ + std::vector feature_partition_column_index_offsets_; + /*! \brief histogram offset of each column */ + std::vector column_hist_offsets_; + /*! \brief hisotgram offset of each partition */ + std::vector partition_hist_offsets_; + /*! \brief maximum number of columns among all feature partitions */ + int max_num_column_per_partition_; + /*! \brief number of partitions */ + int num_feature_partitions_; + + // CUDA memory + + /*! \brief row-wise data stored in CUDA, 8 bits */ + uint8_t* cuda_data_uint8_t_; + /*! \brief row-wise data stored in CUDA, 16 bits */ + uint16_t* cuda_data_uint16_t_; + /*! \brief row-wise data stored in CUDA, 32 bits */ + uint32_t* cuda_data_uint32_t_; + /*! \brief row pointer stored in CUDA, 16 bits */ + uint16_t* cuda_row_ptr_uint16_t_; + /*! \brief row pointer stored in CUDA, 32 bits */ + uint32_t* cuda_row_ptr_uint32_t_; + /*! \brief row pointer stored in CUDA, 64 bits */ + uint64_t* cuda_row_ptr_uint64_t_; + /*! \brief partition bin offsets, 16 bits */ + uint16_t* cuda_partition_ptr_uint16_t_; + /*! \brief partition bin offsets, 32 bits */ + uint32_t* cuda_partition_ptr_uint32_t_; + /*! \brief partition bin offsets, 64 bits */ + uint64_t* cuda_partition_ptr_uint64_t_; + /*! \brief start column index of each feature partition */ + int* cuda_feature_partition_column_index_offsets_; + /*! \brief histogram offset of each column */ + uint32_t* cuda_column_hist_offsets_; + /*! 
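// Note on sizes: SHRAE_HIST_SIZE is 6144 * 2 floats, i.e. 48 KB of shared memory
// per block -- one gradient and one hessian accumulator for up to 6144 bins.
// This is presumably the budget DivideCUDAFeatureGroups() packs columns against
// when it forms the feature partitions described by
// feature_partition_column_index_offsets_ / partition_hist_offsets_ above.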
\brief hisotgram offset of each partition */ + uint32_t* cuda_partition_hist_offsets_; +}; + +} // namespace LightGBM +#endif // LIGHTGBM_CUDA_COLUMN_DATA_HPP_ diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h index 4521bb474de6..ff303b86c32c 100644 --- a/include/LightGBM/train_share_states.h +++ b/include/LightGBM/train_share_states.h @@ -178,9 +178,9 @@ struct TrainingShareStates { int num_hist_total_bin() { return num_hist_total_bin_; } - const std::vector& feature_hist_offsets() { return feature_hist_offsets_; } + const std::vector& feature_hist_offsets() const { return feature_hist_offsets_; } - const std::vector& column_hist_offsets() { return column_hist_offsets_; } + const std::vector& column_hist_offsets() const { return column_hist_offsets_; } bool IsSparseRowwise() { return (multi_val_bin_wrapper_ != nullptr && multi_val_bin_wrapper_->IsSparse()); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 850a2c3831c0..983bbe1064ac 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -11,15 +11,21 @@ namespace LightGBM { -CUDABestSplitFinder::CUDABestSplitFinder(const hist_t* cuda_hist, const Dataset* train_data, - const std::vector& feature_hist_offsets, const int num_leaves, - const double lambda_l1, const double lambda_l2, const data_size_t min_data_in_leaf, - const double min_sum_hessian_in_leaf, const double min_gain_to_split, - const int* cuda_num_features): - num_features_(train_data->num_features()), num_leaves_(num_leaves), - num_total_bin_(feature_hist_offsets.back()), feature_hist_offsets_(feature_hist_offsets), lambda_l1_(lambda_l1), lambda_l2_(lambda_l2), - min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf), min_gain_to_split_(min_gain_to_split), - cuda_hist_(cuda_hist), cuda_num_features_(cuda_num_features) { +CUDABestSplitFinder::CUDABestSplitFinder( + const hist_t* cuda_hist, + const Dataset* train_data, + const std::vector& feature_hist_offsets, + const Config* config): + num_features_(train_data->num_features()), + num_leaves_(config->num_leaves), + num_total_bin_(feature_hist_offsets.back()), + feature_hist_offsets_(feature_hist_offsets), + lambda_l1_(config->lambda_l1), + lambda_l2_(config->lambda_l2), + min_data_in_leaf_(config->min_data_in_leaf), + min_sum_hessian_in_leaf_(config->min_sum_hessian_in_leaf), + min_gain_to_split_(config->min_gain_to_split), + cuda_hist_(cuda_hist) { feature_missing_type_.resize(num_features_); feature_mfb_offsets_.resize(num_features_); feature_default_bins_.resize(num_features_); @@ -43,67 +49,77 @@ CUDABestSplitFinder::CUDABestSplitFinder(const hist_t* cuda_hist, const Dataset* } void CUDABestSplitFinder::Init() { - AllocateCUDAMemory(1, &cuda_best_leaf_); - - AllocateCUDAMemory(feature_hist_offsets_.size() * 2, &cuda_feature_hist_offsets_); - CopyFromHostToCUDADevice(cuda_feature_hist_offsets_, feature_hist_offsets_.data(), feature_hist_offsets_.size()); - - AllocateCUDAMemory(feature_mfb_offsets_.size(), &cuda_feature_mfb_offsets_); - CopyFromHostToCUDADevice(cuda_feature_mfb_offsets_, feature_mfb_offsets_.data(), feature_mfb_offsets_.size()); - - AllocateCUDAMemory(feature_default_bins_.size(), &cuda_feature_default_bins_); - CopyFromHostToCUDADevice(cuda_feature_default_bins_, feature_default_bins_.data(), feature_default_bins_.size()); - - AllocateCUDAMemory(1, &cuda_num_total_bin_); - 
CopyFromHostToCUDADevice(cuda_num_total_bin_, &num_total_bin_, 1); - - AllocateCUDAMemory(num_features_, &cuda_feature_missing_type_); - CopyFromHostToCUDADevice(cuda_feature_missing_type_, feature_missing_type_.data(), static_cast(num_features_)); - - InitCUDAMemoryFromHostMemory(&cuda_feature_num_bins_, feature_num_bins_.data(), static_cast(num_features_)); - - AllocateCUDAMemory(num_total_bin_ * 2, &prefix_sum_hist_left_); - AllocateCUDAMemory(num_total_bin_ * 2, &prefix_sum_hist_right_); - + AllocateCUDAMemoryOuter(&cuda_feature_hist_offsets_, + feature_hist_offsets_.size() * 2, + __FILE__, + __LINE__); + CopyFromHostToCUDADeviceOuter(cuda_feature_hist_offsets_, + feature_hist_offsets_.data(), + feature_hist_offsets_.size(), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_feature_mfb_offsets_, + feature_mfb_offsets_.size(), + __FILE__, + __LINE__); + CopyFromHostToCUDADeviceOuter(cuda_feature_mfb_offsets_, + feature_mfb_offsets_.data(), + feature_mfb_offsets_.size(), + __FILE__, + __LINE__); + AllocateCUDAMemoryOuter(&cuda_feature_default_bins_, + feature_default_bins_.size(), + __FILE__, + __LINE__); + CopyFromHostToCUDADeviceOuter(cuda_feature_default_bins_, + feature_default_bins_.data(), + feature_default_bins_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_num_bins_, + feature_num_bins_.data(), + static_cast(num_features_), + __FILE__, + __LINE__); num_tasks_ = 0; for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { const uint32_t num_bin = feature_num_bins_[inner_feature_index]; const uint8_t missing_type = feature_missing_type_[inner_feature_index]; - if (num_bin > 2 && missing_type != 0) { - if (missing_type == 1) { - cpu_task_reverse_.emplace_back(0); - cpu_task_reverse_.emplace_back(1); - cpu_task_skip_default_bin_.emplace_back(1); - cpu_task_skip_default_bin_.emplace_back(1); - cpu_task_na_as_missing_.emplace_back(0); - cpu_task_na_as_missing_.emplace_back(0); - cpu_task_feature_index_.emplace_back(inner_feature_index); - cpu_task_feature_index_.emplace_back(inner_feature_index); - cpu_task_out_default_left_.emplace_back(0); - cpu_task_out_default_left_.emplace_back(1); + if (num_bin > 2 && missing_type != MissingType::None) { + if (missing_type == MissingType::Zero) { + host_task_reverse_.emplace_back(0); + host_task_reverse_.emplace_back(1); + host_task_skip_default_bin_.emplace_back(1); + host_task_skip_default_bin_.emplace_back(1); + host_task_na_as_missing_.emplace_back(0); + host_task_na_as_missing_.emplace_back(0); + host_task_feature_index_.emplace_back(inner_feature_index); + host_task_feature_index_.emplace_back(inner_feature_index); + host_task_out_default_left_.emplace_back(0); + host_task_out_default_left_.emplace_back(1); num_tasks_ += 2; } else { - cpu_task_reverse_.emplace_back(0); - cpu_task_reverse_.emplace_back(1); - cpu_task_skip_default_bin_.emplace_back(0); - cpu_task_skip_default_bin_.emplace_back(0); - cpu_task_na_as_missing_.emplace_back(1); - cpu_task_na_as_missing_.emplace_back(1); - cpu_task_feature_index_.emplace_back(inner_feature_index); - cpu_task_feature_index_.emplace_back(inner_feature_index); - cpu_task_out_default_left_.emplace_back(0); - cpu_task_out_default_left_.emplace_back(1); + host_task_reverse_.emplace_back(0); + host_task_reverse_.emplace_back(1); + host_task_skip_default_bin_.emplace_back(0); + host_task_skip_default_bin_.emplace_back(0); + host_task_na_as_missing_.emplace_back(1); + host_task_na_as_missing_.emplace_back(1); + 
host_task_feature_index_.emplace_back(inner_feature_index); + host_task_feature_index_.emplace_back(inner_feature_index); + host_task_out_default_left_.emplace_back(0); + host_task_out_default_left_.emplace_back(1); num_tasks_ += 2; } } else { - cpu_task_reverse_.emplace_back(1); - cpu_task_skip_default_bin_.emplace_back(0); - cpu_task_na_as_missing_.emplace_back(0); - cpu_task_feature_index_.emplace_back(inner_feature_index); + host_task_reverse_.emplace_back(1); + host_task_skip_default_bin_.emplace_back(0); + host_task_na_as_missing_.emplace_back(0); + host_task_feature_index_.emplace_back(inner_feature_index); if (missing_type != 2) { - cpu_task_out_default_left_.emplace_back(1); + host_task_out_default_left_.emplace_back(1); } else { - cpu_task_out_default_left_.emplace_back(0); + host_task_out_default_left_.emplace_back(0); } ++num_tasks_; } @@ -112,18 +128,40 @@ void CUDABestSplitFinder::Init() { const int num_task_blocks = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; const size_t cuda_best_leaf_split_info_buffer_size = static_cast(num_task_blocks) * static_cast(num_leaves_); - AllocateCUDAMemoryOuter(&cuda_leaf_best_split_info_, cuda_best_leaf_split_info_buffer_size, __FILE__, __LINE__); - - InitCUDAMemoryFromHostMemory(&cuda_task_feature_index_, cpu_task_feature_index_.data(), cpu_task_feature_index_.size()); - InitCUDAMemoryFromHostMemory(&cuda_task_reverse_, cpu_task_reverse_.data(), cpu_task_reverse_.size()); - InitCUDAMemoryFromHostMemory(&cuda_task_skip_default_bin_, cpu_task_skip_default_bin_.data(), cpu_task_skip_default_bin_.size()); - InitCUDAMemoryFromHostMemory(&cuda_task_na_as_missing_, cpu_task_na_as_missing_.data(), cpu_task_na_as_missing_.size()); - InitCUDAMemoryFromHostMemory(&cuda_task_out_default_left_, cpu_task_out_default_left_.data(), cpu_task_out_default_left_.size()); + AllocateCUDAMemoryOuter(&cuda_leaf_best_split_info_, + cuda_best_leaf_split_info_buffer_size, + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_task_feature_index_, + host_task_feature_index_.data(), + host_task_feature_index_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_task_reverse_, + host_task_reverse_.data(), + host_task_reverse_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_task_skip_default_bin_, + host_task_skip_default_bin_.data(), + host_task_skip_default_bin_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_task_na_as_missing_, + host_task_na_as_missing_.data(), + host_task_na_as_missing_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_task_out_default_left_, + host_task_out_default_left_.data(), + host_task_out_default_left_.size(), + __FILE__, + __LINE__); const size_t output_buffer_size = 2 * static_cast(num_tasks_); AllocateCUDAMemoryOuter(&cuda_best_split_info_, output_buffer_size, __FILE__, __LINE__); - AllocateCUDAMemory(7, &cuda_best_split_info_buffer_); + AllocateCUDAMemoryOuter(&cuda_best_split_info_buffer_, 7, __FILE__, __LINE__); cuda_streams_.resize(2); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); @@ -131,12 +169,19 @@ void CUDABestSplitFinder::Init() { void CUDABestSplitFinder::BeforeTrain() {} -void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplitsStruct* smaller_leaf_splits, - const CUDALeafSplitsStruct* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, - const data_size_t num_data_in_smaller_leaf, 
const data_size_t num_data_in_larger_leaf, - const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf) { - const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ && sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_); - const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ && sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0); +void CUDABestSplitFinder::FindBestSplitsForLeaf( + const CUDALeafSplitsStruct* smaller_leaf_splits, + const CUDALeafSplitsStruct* larger_leaf_splits, + const int smaller_leaf_index, + const int larger_leaf_index, + const data_size_t num_data_in_smaller_leaf, + const data_size_t num_data_in_larger_leaf, + const double sum_hessians_in_smaller_leaf, + const double sum_hessians_in_larger_leaf) { + const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ && + sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_); + const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ && + sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0); LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits, smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); @@ -145,11 +190,28 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(const CUDALeafSplitsStruct* smal global_timer.Stop("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); } -const CUDASplitInfo* CUDABestSplitFinder::FindBestFromAllSplits(const int cur_num_leaves, const int smaller_leaf_index, - const int larger_leaf_index, std::vector* leaf_best_split_feature, - std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index) { - LaunchFindBestFromAllSplitsKernel(cur_num_leaves, smaller_leaf_index, larger_leaf_index, - leaf_best_split_feature, leaf_best_split_threshold, leaf_best_split_default_left, best_leaf_index); +const CUDASplitInfo* CUDABestSplitFinder::FindBestFromAllSplits( + const int cur_num_leaves, + const int smaller_leaf_index, + const int larger_leaf_index, + int* smaller_leaf_best_split_feature, + uint32_t* smaller_leaf_best_split_threshold, + uint8_t* smaller_leaf_best_split_default_left, + int* larger_leaf_best_split_feature, + uint32_t* larger_leaf_best_split_threshold, + uint8_t* larger_leaf_best_split_default_left, + int* best_leaf_index) { + LaunchFindBestFromAllSplitsKernel( + cur_num_leaves, + smaller_leaf_index, + larger_leaf_index, + smaller_leaf_best_split_feature, + smaller_leaf_best_split_threshold, + smaller_leaf_best_split_default_left, + larger_leaf_best_split_feature, + larger_leaf_best_split_threshold, + larger_leaf_best_split_default_left, + best_leaf_index); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); return cuda_leaf_best_split_info_ + (*best_leaf_index); } diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 25bfd5223f3d..3c4d45e28982 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -185,7 +185,6 @@ __device__ void FindBestSplitsForLeafKernelInner( const uint32_t feature_num_bin, const uint8_t feature_mfb_offset, const uint32_t feature_default_bin, - const uint8_t feature_missing_type, const int inner_feature_index, // input config parameter values const double lambda_l1, @@ -379,8 +378,7 @@ __global__ void 
FindBestSplitsForLeafKernel( // input feature information const uint32_t* feature_hist_offsets, const uint8_t* feature_mfb_offsets, - const uint32_t* feature_default_bins, - const uint8_t* feature_missing_types, + const uint32_t* feature_default_bins, const uint32_t* feature_num_bins, // input task information const bool larger_only, @@ -424,7 +422,6 @@ __global__ void FindBestSplitsForLeafKernel( feature_num_bins[inner_feature_index], feature_mfb_offsets[inner_feature_index], feature_default_bins[inner_feature_index], - feature_missing_types[inner_feature_index], inner_feature_index, // input config parameter values lambda_l1, @@ -466,7 +463,6 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_feature_hist_offsets_, cuda_feature_mfb_offsets_, cuda_feature_default_bins_, - cuda_feature_missing_type_, cuda_feature_num_bins_, // input task information larger_only, @@ -497,7 +493,6 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_feature_hist_offsets_, cuda_feature_mfb_offsets_, cuda_feature_default_bins_, - cuda_feature_missing_type_, cuda_feature_num_bins_, // input task information true, @@ -541,7 +536,6 @@ __device__ void ReduceBestSplit(bool* found, double* gain, uint32_t* shared_read } __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const int larger_leaf_index, - const int* cuda_num_features, CUDASplitInfo* cuda_leaf_best_split_info, // input parameters const int* cuda_task_feature_index, @@ -670,8 +664,8 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( } void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( - const int cpu_smaller_leaf_index, - const int cpu_larger_leaf_index, + const int host_smaller_leaf_index, + const int host_larger_leaf_index, const bool is_smaller_leaf_valid, const bool is_larger_leaf_valid) { @@ -683,11 +677,10 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( num_tasks >>= 1; } const int num_blocks_per_leaf = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; - if (cpu_larger_leaf_index >= 0 && is_smaller_leaf_valid && is_larger_leaf_valid) { + if (host_larger_leaf_index >= 0 && is_smaller_leaf_valid && is_larger_leaf_valid) { SyncBestSplitForLeafKernel<<>>( - cpu_smaller_leaf_index, - cpu_larger_leaf_index, - cuda_num_features_, + host_smaller_leaf_index, + host_larger_leaf_index, cuda_leaf_best_split_info_, cuda_task_feature_index_, cuda_best_split_info_, @@ -699,8 +692,8 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( num_leaves_); if (num_blocks_per_leaf > 1) { SyncBestSplitForLeafKernelAllBlocks<<<1, 1, 0, cuda_streams_[0]>>>( - cpu_smaller_leaf_index, - cpu_larger_leaf_index, + host_smaller_leaf_index, + host_larger_leaf_index, num_blocks_per_leaf, num_leaves_, cuda_leaf_best_split_info_, @@ -708,9 +701,8 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); SyncBestSplitForLeafKernel<<>>( - cpu_smaller_leaf_index, - cpu_larger_leaf_index, - cuda_num_features_, + host_smaller_leaf_index, + host_larger_leaf_index, cuda_leaf_best_split_info_, cuda_task_feature_index_, cuda_best_split_info_, @@ -722,8 +714,8 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( num_leaves_); if (num_blocks_per_leaf > 1) { SyncBestSplitForLeafKernelAllBlocks<<<1, 1, 0, cuda_streams_[1]>>>( - cpu_smaller_leaf_index, - cpu_larger_leaf_index, + host_smaller_leaf_index, + host_larger_leaf_index, num_blocks_per_leaf, num_leaves_, cuda_leaf_best_split_info_, @@ -732,9 +724,8 @@ void 
CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( } else { const bool larger_only = (!is_smaller_leaf_valid && is_larger_leaf_valid); SyncBestSplitForLeafKernel<<>>( - cpu_smaller_leaf_index, - cpu_larger_leaf_index, - cuda_num_features_, + host_smaller_leaf_index, + host_larger_leaf_index, cuda_leaf_best_split_info_, cuda_task_feature_index_, cuda_best_split_info_, @@ -747,8 +738,8 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( if (num_blocks_per_leaf > 1) { SynchronizeCUDADeviceOuter(__FILE__, __LINE__); SyncBestSplitForLeafKernelAllBlocks<<<1, 1>>>( - cpu_smaller_leaf_index, - cpu_larger_leaf_index, + host_smaller_leaf_index, + host_larger_leaf_index, num_blocks_per_leaf, num_leaves_, cuda_leaf_best_split_info_, @@ -758,7 +749,6 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( } __global__ void FindBestFromAllSplitsKernel(const int cur_num_leaves, - int* out_best_leaf, const CUDASplitInfo* cuda_leaf_best_split_info, int* cuda_best_split_info_buffer) { __shared__ double thread_best_gain[NUM_THREADS_FIND_BEST_LEAF]; @@ -783,8 +773,6 @@ __global__ void FindBestFromAllSplitsKernel(const int cur_num_leaves, ReduceBestGainForLeaves(thread_best_gain, thread_best_leaf, cur_num_valid_threads); if (threadIdx_x == 0) { const int best_leaf_index = thread_best_leaf[0]; - *out_best_leaf = best_leaf_index; - //cuda_leaf_best_split_info[best_leaf_index].leaf_index = best_leaf_index; cuda_best_split_info_buffer[6] = thread_best_leaf[0]; } } @@ -812,26 +800,32 @@ __global__ void PrepareLeafBestSplitInfo(const int smaller_leaf_index, const int } void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int cur_num_leaves, - const int smaller_leaf_index, const int larger_leaf_index, std::vector* leaf_best_split_feature, - std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index) { - FindBestFromAllSplitsKernel<<<1, NUM_THREADS_FIND_BEST_LEAF, 0, cuda_streams_[1]>>>(cur_num_leaves, cuda_best_leaf_, + const int smaller_leaf_index, const int larger_leaf_index, + int* smaller_leaf_best_split_feature, + uint32_t* smaller_leaf_best_split_threshold, + uint8_t* smaller_leaf_best_split_default_left, + int* larger_leaf_best_split_feature, + uint32_t* larger_leaf_best_split_threshold, + uint8_t* larger_leaf_best_split_default_left, + int* best_leaf_index) { + FindBestFromAllSplitsKernel<<<1, NUM_THREADS_FIND_BEST_LEAF, 0, cuda_streams_[1]>>>(cur_num_leaves, cuda_leaf_best_split_info_, cuda_best_split_info_buffer_); PrepareLeafBestSplitInfo<<<6, 1, 0, cuda_streams_[0]>>>(smaller_leaf_index, larger_leaf_index, cuda_best_split_info_buffer_, cuda_leaf_best_split_info_); - std::vector cpu_leaf_best_split_info_buffer(7); + std::vector host_leaf_best_split_info_buffer(7); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CopyFromCUDADeviceToHost(cpu_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 7); - (*leaf_best_split_feature)[smaller_leaf_index] = cpu_leaf_best_split_info_buffer[0]; - (*leaf_best_split_threshold)[smaller_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[1]); - (*leaf_best_split_default_left)[smaller_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[2]); + CopyFromCUDADeviceToHost(host_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 7); + *smaller_leaf_best_split_feature = host_leaf_best_split_info_buffer[0]; + *smaller_leaf_best_split_threshold = static_cast(host_leaf_best_split_info_buffer[1]); + *smaller_leaf_best_split_default_left = 
static_cast(host_leaf_best_split_info_buffer[2]); if (larger_leaf_index >= 0) { - (*leaf_best_split_feature)[larger_leaf_index] = cpu_leaf_best_split_info_buffer[3]; - (*leaf_best_split_threshold)[larger_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[4]); - (*leaf_best_split_default_left)[larger_leaf_index] = static_cast(cpu_leaf_best_split_info_buffer[5]); + *larger_leaf_best_split_feature = host_leaf_best_split_info_buffer[3]; + *larger_leaf_best_split_threshold = static_cast(host_leaf_best_split_info_buffer[4]); + *larger_leaf_best_split_default_left = static_cast(host_leaf_best_split_info_buffer[5]); } - *best_leaf_index = cpu_leaf_best_split_info_buffer[6]; + *best_leaf_index = host_leaf_best_split_info_buffer[6]; } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 3c4a3ad9d719..e5e0e59f4766 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -28,28 +28,37 @@ namespace LightGBM { class CUDABestSplitFinder { public: - CUDABestSplitFinder(const hist_t* cuda_hist, const Dataset* train_data, - const std::vector& feature_hist_offsets, const int num_leaves, - const double lambda_l1, const double lambda_l2, const data_size_t min_data_in_leaf, - const double min_sum_hessian_in_leaf, const double min_gain_to_split, - const int* cuda_num_features); + CUDABestSplitFinder( + const hist_t* cuda_hist, + const Dataset* train_data, + const std::vector& feature_hist_offsets, + const Config* config); void Init(); void BeforeTrain(); - void FindBestSplitsForLeaf(const CUDALeafSplitsStruct* smaller_leaf_splits, const CUDALeafSplitsStruct* larger_leaf_splits, - const int smaller_leaf_index, const int larger_leaf_index, - const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, - const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf); - - const CUDASplitInfo* FindBestFromAllSplits(const int cur_num_leaves, const int smaller_leaf_index, - const int larger_leaf_index, std::vector* leaf_best_split_feature, - std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index); - - const int* cuda_best_leaf() const { return cuda_best_leaf_; } - - CUDASplitInfo* cuda_leaf_best_split_info() { return cuda_leaf_best_split_info_; } + void FindBestSplitsForLeaf( + const CUDALeafSplitsStruct* smaller_leaf_splits, + const CUDALeafSplitsStruct* larger_leaf_splits, + const int smaller_leaf_index, + const int larger_leaf_index, + const data_size_t num_data_in_smaller_leaf, + const data_size_t num_data_in_larger_leaf, + const double sum_hessians_in_smaller_leaf, + const double sum_hessians_in_larger_leaf); + + const CUDASplitInfo* FindBestFromAllSplits( + const int cur_num_leaves, + const int smaller_leaf_index, + const int larger_leaf_index, + int* smaller_leaf_best_split_feature, + uint32_t* smaller_leaf_best_split_threshold, + uint8_t* smaller_leaf_best_split_default_left, + int* larger_leaf_best_split_feature, + uint32_t* larger_leaf_best_split_threshold, + uint8_t* larger_leaf_best_split_default_left, + int* best_leaf_index); private: void LaunchFindBestSplitsForLeafKernel(const CUDALeafSplitsStruct* smaller_leaf_splits, @@ -57,14 +66,20 @@ class CUDABestSplitFinder { const bool is_smaller_leaf_valid, const bool is_larger_leaf_valid); void LaunchSyncBestSplitForLeafKernel( - const int cpu_smaller_leaf_index, - const int cpu_larger_leaf_index, + const int 
host_smaller_leaf_index, + const int host_larger_leaf_index, const bool is_smaller_leaf_valid, const bool is_larger_leaf_valid); void LaunchFindBestFromAllSplitsKernel(const int cur_num_leaves, const int smaller_leaf_index, - const int larger_leaf_index, std::vector* leaf_best_split_feature, - std::vector* leaf_best_split_threshold, std::vector* leaf_best_split_default_left, int* best_leaf_index); + const int larger_leaf_index, + int* smaller_leaf_best_split_feature, + uint32_t* smaller_leaf_best_split_threshold, + uint8_t* smaller_leaf_best_split_default_left, + int* larger_leaf_best_split_feature, + uint32_t* larger_leaf_best_split_threshold, + uint8_t* larger_leaf_best_split_default_left, + int* best_leaf_index); // Host memory const int num_features_; @@ -75,8 +90,7 @@ class CUDABestSplitFinder { std::vector feature_mfb_offsets_; std::vector feature_default_bins_; std::vector feature_num_bins_; - // None --> 0, Zero --> 1, NaN --> 2 - std::vector feature_missing_type_; + std::vector feature_missing_type_; const double lambda_l1_; const double lambda_l2_; const data_size_t min_data_in_leaf_; @@ -84,30 +98,24 @@ class CUDABestSplitFinder { const double min_gain_to_split_; std::vector cuda_streams_; // for best split find tasks - std::vector cpu_task_feature_index_; - std::vector cpu_task_reverse_; - std::vector cpu_task_skip_default_bin_; - std::vector cpu_task_na_as_missing_; - std::vector cpu_task_out_default_left_; + std::vector host_task_feature_index_; + std::vector host_task_reverse_; + std::vector host_task_skip_default_bin_; + std::vector host_task_na_as_missing_; + std::vector host_task_out_default_left_; int num_tasks_; // CUDA memory, held by this object // for per leaf best split information - int* cuda_best_leaf_; CUDASplitInfo* cuda_leaf_best_split_info_; // for best split information when finding best split CUDASplitInfo* cuda_best_split_info_; - int* cuda_num_total_bin_; - // TODO(shiyu1994): use prefix sum to accelerate best split finding - hist_t* prefix_sum_hist_left_; - hist_t* prefix_sum_hist_right_; // feature information uint32_t* cuda_feature_hist_offsets_; uint8_t* cuda_feature_mfb_offsets_; uint32_t* cuda_feature_default_bins_; - uint8_t* cuda_feature_missing_type_; uint32_t* cuda_feature_num_bins_; - // best split information buffer, to be copied to CPU + // best split information buffer, to be copied to host int* cuda_best_split_info_buffer_; // find best split task information int* cuda_task_feature_index_; @@ -118,7 +126,6 @@ class CUDABestSplitFinder { // CUDA memory, held by other object const hist_t* cuda_hist_; - const int* cuda_num_features_; }; } diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 64e0496f8ea5..675f5a35c61f 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -108,47 +108,68 @@ void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { } void CUDADataPartition::Split( + // input best split info const CUDASplitInfo* best_split_info, const int left_leaf_index, const int right_leaf_index, - // for leaf splits information update + const int leaf_best_split_feature, + const uint32_t leaf_best_split_threshold, + const uint8_t leaf_best_split_default_left, + const data_size_t num_data_in_leaf, + const data_size_t leaf_data_start, + // for leaf information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, - std::vector* cpu_leaf_num_data, - std::vector* cpu_leaf_data_start, 
- std::vector* cpu_leaf_sum_hessians, - const std::vector& cpu_leaf_best_split_feature, - const std::vector& cpu_leaf_best_split_threshold, - const std::vector& cpu_leaf_best_split_default_left, - int* smaller_leaf_index, int* larger_leaf_index) { + // gather information for CPU, used for launching kernels + data_size_t* left_leaf_num_data, + data_size_t* right_leaf_num_data, + data_size_t* left_leaf_start, + data_size_t* right_leaf_start, + double* left_leaf_sum_of_hessians, + double* right_leaf_sum_of_hessians) { global_timer.Start("GenDataToLeftBitVector"); global_timer.Start("SplitInner Copy CUDA To Host"); - const data_size_t num_data_in_leaf = cpu_leaf_num_data->at(left_leaf_index); - const int split_feature_index = cpu_leaf_best_split_feature[left_leaf_index]; - const uint32_t split_threshold = cpu_leaf_best_split_threshold[left_leaf_index]; - const uint8_t split_default_left = cpu_leaf_best_split_default_left[left_leaf_index]; - const data_size_t leaf_data_start = cpu_leaf_data_start->at(left_leaf_index); global_timer.Stop("SplitInner Copy CUDA To Host"); - GenDataToLeftBitVector(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start, left_leaf_index, right_leaf_index); + GenDataToLeftBitVector(num_data_in_leaf, + leaf_best_split_feature, + leaf_best_split_threshold, + leaf_best_split_default_left, + leaf_data_start, + left_leaf_index, + right_leaf_index); global_timer.Stop("GenDataToLeftBitVector"); global_timer.Start("SplitInner"); SplitInner(num_data_in_leaf, - best_split_info, - left_leaf_index, - right_leaf_index, - smaller_leaf_splits, - larger_leaf_splits, - cpu_leaf_num_data, cpu_leaf_data_start, cpu_leaf_sum_hessians, - smaller_leaf_index, larger_leaf_index); + best_split_info, + left_leaf_index, + right_leaf_index, + smaller_leaf_splits, + larger_leaf_splits, + left_leaf_num_data, + right_leaf_num_data, + left_leaf_start, + right_leaf_start, + left_leaf_sum_of_hessians, + right_leaf_sum_of_hessians); global_timer.Stop("SplitInner"); } -void CUDADataPartition::GenDataToLeftBitVector(const data_size_t num_data_in_leaf, - const int split_feature_index, const uint32_t split_threshold, - const uint8_t split_default_left, const data_size_t leaf_data_start, - const int left_leaf_index, const int right_leaf_index) { - LaunchGenDataToLeftBitVectorKernel(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start, left_leaf_index, right_leaf_index); +void CUDADataPartition::GenDataToLeftBitVector( + const data_size_t num_data_in_leaf, + const int split_feature_index, + const uint32_t split_threshold, + const uint8_t split_default_left, + const data_size_t leaf_data_start, + const int left_leaf_index, + const int right_leaf_index) { + LaunchGenDataToLeftBitVectorKernel(num_data_in_leaf, + split_feature_index, + split_threshold, + split_default_left, + leaf_data_start, + left_leaf_index, + right_leaf_index); } void CUDADataPartition::SplitInner( @@ -159,9 +180,12 @@ void CUDADataPartition::SplitInner( // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, - std::vector* cpu_leaf_num_data, std::vector* cpu_leaf_data_start, - std::vector* cpu_leaf_sum_hessians, - int* smaller_leaf_index, int* larger_leaf_index) { + data_size_t* left_leaf_num_data, + data_size_t* right_leaf_num_data, + data_size_t* left_leaf_start, + data_size_t* right_leaf_start, + double* left_leaf_sum_of_hessians, + double* right_leaf_sum_of_hessians) { LaunchSplitInnerKernel( 
num_data_in_leaf, best_split_info, @@ -169,13 +193,15 @@ void CUDADataPartition::SplitInner( right_leaf_index, smaller_leaf_splits, larger_leaf_splits, - cpu_leaf_num_data, cpu_leaf_data_start, cpu_leaf_sum_hessians, - smaller_leaf_index, larger_leaf_index); + left_leaf_num_data, + right_leaf_num_data, + left_leaf_start, + right_leaf_start, + left_leaf_sum_of_hessians, + right_leaf_sum_of_hessians); ++cur_num_leaves_; } -Tree* CUDADataPartition::GetCPUTree() {} - void CUDADataPartition::UpdateTrainScore(const double learning_rate, double* cuda_scores) { LaunchAddPredictionToScoreKernel(learning_rate, cuda_scores); } diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index c0b2e2e79428..7d91a54a7491 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -331,14 +331,13 @@ __device__ void PrepareOffset(const data_size_t num_data_in_leaf_ref, const uint } template -__global__ void UpdateDataIndexToLeafIndexKernel(const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, +__global__ void UpdateDataIndexToLeafIndexKernel( + const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, const uint32_t th, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t max_bin_ref, const uint32_t min_bin_ref, int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, const int default_leaf_index, const int missing_default_leaf_index) { - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; @@ -378,14 +377,14 @@ __global__ void UpdateDataIndexToLeafIndexKernel(const data_size_t cuda_leaf_dat } } -#define UpdateDataIndexToLeafIndex_ARGS leaf_data_start, \ - num_data_in_leaf, cuda_data_indices, th, column_data, \ +#define UpdateDataIndexToLeafIndex_ARGS \ + num_data_in_leaf, data_indices_in_leaf, th, column_data, \ t_zero_bin, max_bin_ref, min_bin_ref, cuda_data_index_to_leaf_index, left_leaf_index, right_leaf_index, \ default_leaf_index, missing_default_leaf_index template -void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel(const data_size_t leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, +void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel( + const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, const uint32_t th, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t max_bin_ref, const uint32_t min_bin_ref, @@ -530,8 +529,8 @@ void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel(const data_size_t // min_bin_ref < max_bin_ref template -__global__ void GenDataToLeftBitVectorKernel0(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, +__global__ void GenDataToLeftBitVectorKernel0(const int best_split_feature_ref, + const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, 
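// A minimal CUDA sketch of the pattern introduced by the hunks above, using
// hypothetical, simplified names (MarkLeftKernelSketch / LaunchMarkLeftSketch
// are illustrations, not the actual LightGBM kernels): instead of passing the
// (cuda_leaf_data_start, cuda_data_indices) pair into each kernel, the host
// offsets the global index buffer once and hands the kernel a per-leaf pointer.
__global__ void MarkLeftKernelSketch(const int* data_indices_in_leaf,
                                     const int num_data_in_leaf,
                                     const unsigned char* column_data,
                                     const unsigned int threshold,
                                     unsigned char* out_to_left) {
  const int local_index = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
  if (local_index < num_data_in_leaf) {
    // indices are already local to the leaf, so no extra offset arithmetic here
    const int global_index = data_indices_in_leaf[local_index];
    out_to_left[local_index] =
        (static_cast<unsigned int>(column_data[global_index]) <= threshold) ? 1 : 0;
  }
}

void LaunchMarkLeftSketch(const int* cuda_data_indices, const int leaf_data_start,
                          const int num_data_in_leaf, const unsigned char* column_data,
                          const unsigned int threshold, unsigned char* out_to_left) {
  // offset the global index buffer once on the host side, analogous to
  // cuda_data_indices_ + leaf_data_start in the refactored launch functions
  const int* data_indices_in_leaf = cuda_data_indices + leaf_data_start;
  const int block_size = 256;
  const int num_blocks = (num_data_in_leaf + block_size - 1) / block_size;
  MarkLeftKernelSketch<<<num_blocks, block_size>>>(
      data_indices_in_leaf, num_data_in_leaf, column_data, threshold, out_to_left);
}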
@@ -543,7 +542,6 @@ __global__ void GenDataToLeftBitVectorKernel0(const int best_split_feature_ref, const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; @@ -577,8 +575,8 @@ __global__ void GenDataToLeftBitVectorKernel0(const int best_split_feature_ref, // min_bin_ref == max_bin_ref template -__global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, +__global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, + const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, @@ -590,7 +588,6 @@ __global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, const int default_leaf_index, const int missing_default_leaf_index) { __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; - const data_size_t* data_indices_in_leaf = cuda_data_indices + cuda_leaf_data_start; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; @@ -629,7 +626,7 @@ __global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, } #define GenBitVector_ARGS \ - split_feature_index, leaf_data_start, num_data_in_leaf, cuda_data_indices_, \ + split_feature_index, num_data_in_leaf, data_indices_in_leaf, \ th, num_features_, \ column_data, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, \ split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, \ @@ -661,6 +658,7 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( const int default_leaf_index, const int missing_default_leaf_index) { const void* column_data_pointer = cuda_column_data_->GetColumnData(column_index); + const data_size_t* data_indices_in_leaf = cuda_data_indices_ + leaf_data_start; if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); @@ -784,6 +782,7 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( const int default_leaf_index, const int missing_default_leaf_index) { const void* column_data_pointer = cuda_column_data_->GetColumnData(column_index); + const data_size_t* data_indices_in_leaf = cuda_data_indices_ + leaf_data_start; if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); @@ -932,6 +931,8 @@ void 
CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num const bool max_bin_to_left = (max_bin <= th); + const data_size_t* data_indices_in_leaf = cuda_data_indices_ + leaf_data_start; + if (min_bin < max_bin) { if (bit_type == 8) { LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( @@ -1082,8 +1083,8 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num const void* column_data_pointer = cuda_column_data_->GetColumnData(column_index); if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(column_data_pointer); - LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, - cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, + LaunchUpdateDataIndexToLeafIndexKernel(num_data_in_leaf, + data_indices_in_leaf, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index, static_cast(missing_is_zero), static_cast(missing_is_na), @@ -1094,8 +1095,8 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num split_indices_block_size_data_partition_aligned); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(column_data_pointer); - LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, - cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, + LaunchUpdateDataIndexToLeafIndexKernel(num_data_in_leaf, + data_indices_in_leaf, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index, static_cast(missing_is_zero), static_cast(missing_is_na), @@ -1106,8 +1107,8 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num split_indices_block_size_data_partition_aligned); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(column_data_pointer); - LaunchUpdateDataIndexToLeafIndexKernel(leaf_data_start, num_data_in_leaf, - cuda_data_indices_, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, + LaunchUpdateDataIndexToLeafIndexKernel(num_data_in_leaf, + data_indices_in_leaf, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index, static_cast(missing_is_zero), static_cast(missing_is_na), @@ -1439,10 +1440,12 @@ void CUDADataPartition::LaunchSplitInnerKernel( // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, - std::vector* cpu_leaf_num_data, - std::vector* cpu_leaf_data_start, - std::vector* cpu_leaf_sum_hessians, - int* smaller_leaf_index, int* larger_leaf_index) { + data_size_t* left_leaf_num_data_ref, + data_size_t* right_leaf_num_data_ref, + data_size_t* left_leaf_start_ref, + data_size_t* right_leaf_start_ref, + double* left_leaf_sum_of_hessians_ref, + double* right_leaf_sum_of_hessians_ref) { const int min_num_blocks = num_data_in_leaf <= 100 ? 
1 : 80; const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; @@ -1485,7 +1488,6 @@ void CUDADataPartition::LaunchSplitInnerKernel( left_leaf_index, right_leaf_index, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_out_data_indices_in_leaf_, split_indices_block_size_data_partition_aligned); - //SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDADataPartition::SplitInnerKernel"); global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); @@ -1517,32 +1519,12 @@ void CUDADataPartition::LaunchSplitInnerKernel( left_leaf_num_data + right_leaf_num_data, cuda_out_data_indices_in_leaf_, cuda_data_indices_ + left_leaf_data_start); global_timer.Stop("CUDADataPartition::CopyDataIndicesKernel"); const data_size_t right_leaf_data_start = cpu_split_info_buffer[5]; - (*cpu_leaf_num_data)[left_leaf_index] = left_leaf_num_data; - (*cpu_leaf_data_start)[left_leaf_index] = left_leaf_data_start; - (*cpu_leaf_num_data)[right_leaf_index] = right_leaf_num_data; - (*cpu_leaf_data_start)[right_leaf_index] = right_leaf_data_start; - (*cpu_leaf_sum_hessians)[left_leaf_index] = cpu_sum_hessians_info[0]; - (*cpu_leaf_sum_hessians)[right_leaf_index] = cpu_sum_hessians_info[1]; - *smaller_leaf_index = cpu_split_info_buffer[6]; - *larger_leaf_index = cpu_split_info_buffer[7]; -} - -__global__ void PrefixSumKernel(uint32_t* cuda_elements) { - __shared__ uint32_t elements[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1]; - const unsigned int threadIdx_x = threadIdx.x; - const unsigned int global_read_index = blockIdx.x * blockDim.x * 2 + threadIdx_x; - elements[threadIdx_x] = cuda_elements[global_read_index]; - elements[threadIdx_x + blockDim.x] = cuda_elements[global_read_index + blockDim.x]; - __syncthreads(); - PrefixSum(elements, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); - __syncthreads(); - cuda_elements[global_read_index] = elements[threadIdx_x]; - cuda_elements[global_read_index + blockDim.x] = elements[threadIdx_x + blockDim.x]; -} - -void CUDADataPartition::LaunchPrefixSumKernel(uint32_t* cuda_elements) { - PrefixSumKernel<<<1, SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION / 2>>>(cuda_elements); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + *left_leaf_num_data_ref = left_leaf_num_data; + *left_leaf_start_ref = left_leaf_data_start; + *right_leaf_num_data_ref = right_leaf_num_data; + *right_leaf_start_ref = right_leaf_data_start; + *left_leaf_sum_of_hessians_ref = cpu_sum_hessians_info[0]; + *right_leaf_sum_of_hessians_ref = cpu_sum_hessians_info[1]; } __global__ void AddPredictionToScoreKernel(const double* cuda_leaf_output, @@ -1569,19 +1551,6 @@ void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double learning_r global_timer.Stop("CUDADataPartition::AddPredictionToScoreKernel"); } -__global__ void CopyColWiseDataKernel(const uint8_t* row_wise_data, - const data_size_t num_data, const int num_features, - uint8_t* col_wise_data) { - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (data_index < num_data) { - const data_size_t read_offset = data_index * num_features; - for (int feature_index = 0; feature_index < num_features; ++feature_index) { - const data_size_t write_pos = feature_index * num_data + data_index; - 
col_wise_data[write_pos] = row_wise_data[read_offset + feature_index]; - } - } -} - } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 27b8d1267f40..999c074d07d0 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -44,40 +44,48 @@ class CUDADataPartition { const CUDASplitInfo* best_split_info, const int left_leaf_index, const int right_leaf_index, + const int leaf_best_split_feature, + const uint32_t leaf_best_split_threshold, + const uint8_t leaf_best_split_default_left, + const data_size_t num_data_in_leaf, + const data_size_t leaf_data_start, // for leaf information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, // gather information for CPU, used for launching kernels - std::vector* leaf_num_data, - std::vector* leaf_data_start, - std::vector* leaf_sum_hessians, - const std::vector& leaf_best_split_feature, - const std::vector& leaf_best_split_threshold, - const std::vector& leaf_best_split_default_left, - int* smaller_leaf_index, - int* larger_leaf_index); - - Tree* GetCPUTree(); + data_size_t* left_leaf_num_data, + data_size_t* right_leaf_num_data, + data_size_t* left_leaf_start, + data_size_t* right_leaf_start, + double* left_leaf_sum_of_hessians, + double* right_leaf_sum_of_hessians); void UpdateTrainScore(const double learning_rate, double* cuda_scores); const data_size_t* cuda_data_indices() const { return cuda_data_indices_; } private: - void CalcBlockDim(const data_size_t num_data_in_leaf, + void CalcBlockDim( + const data_size_t num_data_in_leaf, int* grid_dim, int* block_dim); - void CalcBlockDimInCopy(const data_size_t num_data_in_leaf, + void CalcBlockDimInCopy( + const data_size_t num_data_in_leaf, int* grid_dim, int* block_dim); - void GenDataToLeftBitVector(const data_size_t num_data_in_leaf, - const int split_feature_index, const uint32_t split_threshold, - const uint8_t split_default_left, const data_size_t leaf_data_start, - const int left_leaf_index, const int right_leaf_index); + void GenDataToLeftBitVector( + const data_size_t num_data_in_leaf, + const int split_feature_index, + const uint32_t split_threshold, + const uint8_t split_default_left, + const data_size_t leaf_data_start, + const int left_leaf_index, + const int right_leaf_index); void SplitInner( + // input best split info const data_size_t num_data_in_leaf, const CUDASplitInfo* best_split_info, const int left_leaf_index, @@ -85,16 +93,19 @@ class CUDADataPartition { // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, - std::vector* cpu_leaf_num_data, - std::vector* cpu_leaf_data_start, - std::vector* cpu_leaf_sum_hessians, - int* smaller_leaf_index, - int* larger_leaf_index); + // gather information for CPU, used for launching kernels + data_size_t* left_leaf_num_data, + data_size_t* right_leaf_num_data, + data_size_t* left_leaf_start, + data_size_t* right_leaf_start, + double* left_leaf_sum_of_hessians, + double* right_leaf_sum_of_hessians); // kernel launch functions void LaunchFillDataIndicesBeforeTrain(); void LaunchSplitInnerKernel( + // input best split info const data_size_t num_data_in_leaf, const CUDASplitInfo* best_split_info, const int left_leaf_index, @@ -102,16 +113,22 @@ class CUDADataPartition { // for leaf splits information update CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* 
larger_leaf_splits, - std::vector* cpu_leaf_num_data, - std::vector* cpu_leaf_data_start, - std::vector* cpu_leaf_sum_hessians, - int* smaller_leaf_index, - int* larger_leaf_index); - - void LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, - const int split_feature_index, const uint32_t split_threshold, - const uint8_t split_default_left, const data_size_t leaf_data_start, - const int left_leaf_index, const int right_leaf_index); + // gather information for CPU, used for launching kernels + data_size_t* left_leaf_num_data, + data_size_t* right_leaf_num_data, + data_size_t* left_leaf_start, + data_size_t* right_leaf_start, + double* left_leaf_sum_of_hessians, + double* right_leaf_sum_of_hessians); + + void LaunchGenDataToLeftBitVectorKernel( + const data_size_t num_data_in_leaf, + const int split_feature_index, + const uint32_t split_threshold, + const uint8_t split_default_left, + const data_size_t leaf_data_start, + const int left_leaf_index, + const int right_leaf_index); template void LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( @@ -163,17 +180,27 @@ class CUDADataPartition { const int missing_default_leaf_index); template - void LaunchUpdateDataIndexToLeafIndexKernel(const data_size_t cuda_leaf_data_start, - const data_size_t num_data_in_leaf, const data_size_t* cuda_data_indices, - const uint32_t th, const BIN_TYPE* column_data, + void LaunchUpdateDataIndexToLeafIndexKernel( + const data_size_t num_data_in_leaf, + const data_size_t* data_indices_in_leaf, + const uint32_t th, + const BIN_TYPE* column_data, // values from feature - const uint32_t t_zero_bin, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index, - const bool missing_is_zero, const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, const bool max_to_left, - const int num_blocks, const int block_size); - - void LaunchPrefixSumKernel(uint32_t* cuda_elements); + const uint32_t t_zero_bin, + const uint32_t max_bin_ref, + const uint32_t min_bin_ref, + int* cuda_data_index_to_leaf_index, + const int left_leaf_index, + const int right_leaf_index, + const int default_leaf_index, + const int missing_default_leaf_index, + const bool missing_is_zero, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_to_left, + const int num_blocks, + const int block_size); void LaunchAddPredictionToScoreKernel(const double learning_rate, double* cuda_scores); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 9be270b14621..757adf2bcc1d 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -10,17 +10,22 @@ namespace LightGBM { -CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, - const int num_leaves, const int num_threads, +CUDAHistogramConstructor::CUDAHistogramConstructor( + const Dataset* train_data, + const int num_leaves, + const int num_threads, const std::vector& feature_hist_offsets, - const int min_data_in_leaf, const double min_sum_hessian_in_leaf): num_data_(train_data->num_data()), - num_features_(train_data->num_features()), num_leaves_(num_leaves), num_threads_(num_threads), + const int min_data_in_leaf, + const double min_sum_hessian_in_leaf): + num_data_(train_data->num_data()), + 
num_features_(train_data->num_features()), + num_leaves_(num_leaves), + num_threads_(num_threads), num_feature_groups_(train_data->num_feature_groups()), - min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf) { - train_data_ = train_data; + min_data_in_leaf_(min_data_in_leaf), + min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf) { int offset = 0; for (int group_id = 0; group_id < train_data->num_feature_groups(); ++group_id) { - feature_group_bin_offsets_.emplace_back(offset); offset += train_data->FeatureGroupNumBin(group_id); } need_fix_histogram_features_.clear(); @@ -28,10 +33,7 @@ CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, for (int feature_index = 0; feature_index < train_data->num_features(); ++feature_index) { const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); const uint32_t most_freq_bin = bin_mapper->GetMostFreqBin(); - if (most_freq_bin == 0) { - feature_mfb_offsets_.emplace_back(1); - } else { - feature_mfb_offsets_.emplace_back(0); + if (most_freq_bin != 0) { need_fix_histogram_features_.emplace_back(feature_index); uint32_t num_bin_ref = static_cast(bin_mapper->num_bin()) - 1; uint32_t num_bin_aligned = 1; @@ -44,12 +46,12 @@ CUDAHistogramConstructor::CUDAHistogramConstructor(const Dataset* train_data, feature_num_bins_.emplace_back(static_cast(bin_mapper->num_bin())); feature_most_freq_bins_.emplace_back(most_freq_bin); } - feature_group_bin_offsets_.emplace_back(offset); feature_hist_offsets_.clear(); for (size_t i = 0; i < feature_hist_offsets.size(); ++i) { feature_hist_offsets_.emplace_back(feature_hist_offsets[i]); } num_total_bin_ = offset; + cuda_row_data_.reset(nullptr); } void CUDAHistogramConstructor::BeforeTrain(const score_t* gradients, const score_t* hessians) { @@ -62,19 +64,6 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStat AllocateCUDAMemory(num_total_bin_ * 2 * num_leaves_, &cuda_hist_); SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_); - AllocateCUDAMemory(num_data_, &cuda_ordered_gradients_); - AllocateCUDAMemory(num_data_, &cuda_ordered_hessians_); - - InitCUDAMemoryFromHostMemory(&cuda_num_total_bin_, &num_total_bin_, 1); - - InitCUDAMemoryFromHostMemory(&cuda_num_feature_groups_, &num_feature_groups_, 1); - - InitCUDAMemoryFromHostMemory(&cuda_feature_group_bin_offsets_, - feature_group_bin_offsets_.data(), feature_group_bin_offsets_.size()); - - InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_offsets_, - feature_mfb_offsets_.data(), feature_mfb_offsets_.size()); - InitCUDAMemoryFromHostMemory(&cuda_feature_num_bins_, feature_num_bins_.data(), feature_num_bins_.size()); @@ -84,214 +73,23 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStat InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bins_, feature_most_freq_bins_.data(), feature_most_freq_bins_.size()); - InitCUDAValueFromConstant(&cuda_num_features_, num_features_); - - DivideCUDAFeatureGroups(train_data, share_state); + cuda_row_data_.reset(new CUDARowData(train_data, share_state)); + cuda_row_data_->Init(train_data, share_state); - InitCUDAData(share_state); - - cuda_streams_.resize(5); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[2])); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[3])); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[4])); + 
CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size()); InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, need_fix_histogram_features_num_bin_aligend_.data(), need_fix_histogram_features_num_bin_aligend_.size()); - - const int max_block_dim_y = NUM_THRADS_PER_BLOCK / max_num_column_per_partition_; - const int max_grid_dim_y = std::max(min_grid_dim_y_, - ((num_data_ + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + max_block_dim_y - 1) / max_block_dim_y); - AllocateCUDAMemory(num_total_bin_ * 2 * max_grid_dim_y, &block_cuda_hist_buffer_); -} - -void CUDAHistogramConstructor::InitCUDAData(TrainingShareStates* share_state) { - bit_type_ = 0; - size_t total_size = 0; - const uint8_t* data_ptr = nullptr; - data_ptr_bit_type_ = 0; - const uint8_t* cpu_data_ptr = share_state->GetRowWiseData(&bit_type_, &total_size, &is_sparse_, &data_ptr, &data_ptr_bit_type_); - Log::Warning("bit_type_ = %d, is_sparse_ = %d, data_ptr_bit_type_ = %d", bit_type_, static_cast(is_sparse_), data_ptr_bit_type_); - if (bit_type_ == 8) { - if (!is_sparse_) { - std::vector partitioned_data; - GetDenseDataPartitioned(cpu_data_ptr, &partitioned_data); - InitCUDAMemoryFromHostMemory(&cuda_data_uint8_t_, partitioned_data.data(), total_size); - } else { - std::vector> partitioned_data; - if (data_ptr_bit_type_ == 16) { - std::vector> partitioned_data_ptr; - std::vector partition_ptr; - const uint16_t* data_ptr_uint16_t = reinterpret_cast(data_ptr); - GetSparseDataPartitioned(cpu_data_ptr, data_ptr_uint16_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); - InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint16_t_, partition_ptr.data(), partition_ptr.size()); - AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint8_t_); - AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint16_t_); - for (size_t i = 0; i < partitioned_data.size(); ++i) { - const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; - const std::vector& data_for_this_partition = partitioned_data[i]; - CopyFromHostToCUDADevice(cuda_data_uint8_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); - CopyFromHostToCUDADevice(cuda_row_ptr_uint16_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); - } - } else if (data_ptr_bit_type_ == 32) { - const uint32_t* data_ptr_uint32_t = reinterpret_cast(data_ptr); - std::vector> partitioned_data_ptr; - std::vector partition_ptr; - GetSparseDataPartitioned(cpu_data_ptr, data_ptr_uint32_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); - InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint32_t_, partition_ptr.data(), partition_ptr.size()); - AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint8_t_); - AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint32_t_); - for (size_t i = 0; i < partitioned_data.size(); ++i) { - const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; - const std::vector& data_for_this_partition = partitioned_data[i]; - CopyFromHostToCUDADevice(cuda_data_uint8_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); - CopyFromHostToCUDADevice(cuda_row_ptr_uint32_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); - } - } else if (data_ptr_bit_type_ == 64) { 
- const uint64_t* data_ptr_uint64_t = reinterpret_cast(data_ptr); - std::vector> partitioned_data_ptr; - std::vector partition_ptr; - GetSparseDataPartitioned(cpu_data_ptr, data_ptr_uint64_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); - InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint64_t_, partition_ptr.data(), partition_ptr.size()); - AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint8_t_); - AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint64_t_); - for (size_t i = 0; i < partitioned_data.size(); ++i) { - const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; - const std::vector& data_for_this_partition = partitioned_data[i]; - CopyFromHostToCUDADevice(cuda_data_uint8_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); - CopyFromHostToCUDADevice(cuda_row_ptr_uint64_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); - } - } else { - Log::Fatal("Unknow data ptr bit type %d", data_ptr_bit_type_); - } - } - } else if (bit_type_ == 16) { - if (!is_sparse_) { - std::vector partitioned_data; - GetDenseDataPartitioned(reinterpret_cast(cpu_data_ptr), &partitioned_data); - InitCUDAMemoryFromHostMemory(&cuda_data_uint16_t_, partitioned_data.data(), total_size); - } else { - std::vector> partitioned_data; - if (data_ptr_bit_type_ == 16) { - std::vector> partitioned_data_ptr; - std::vector partition_ptr; - const uint16_t* data_ptr_uint16_t = reinterpret_cast(data_ptr); - GetSparseDataPartitioned(reinterpret_cast(cpu_data_ptr), data_ptr_uint16_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); - InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint16_t_, partition_ptr.data(), partition_ptr.size()); - AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint16_t_); - AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint16_t_); - for (size_t i = 0; i < partitioned_data.size(); ++i) { - const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; - const std::vector& data_for_this_partition = partitioned_data[i]; - CopyFromHostToCUDADevice(cuda_data_uint16_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); - CopyFromHostToCUDADevice(cuda_row_ptr_uint16_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); - } - } else if (data_ptr_bit_type_ == 32) { - std::vector> partitioned_data_ptr; - std::vector partition_ptr; - const uint32_t* data_ptr_uint32_t = reinterpret_cast(data_ptr); - GetSparseDataPartitioned(reinterpret_cast(cpu_data_ptr), data_ptr_uint32_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); - InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint32_t_, partition_ptr.data(), partition_ptr.size()); - AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint16_t_); - AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint32_t_); - for (size_t i = 0; i < partitioned_data.size(); ++i) { - const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; - const std::vector& data_for_this_partition = partitioned_data[i]; - CopyFromHostToCUDADevice(cuda_data_uint16_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); - CopyFromHostToCUDADevice(cuda_row_ptr_uint32_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); - } - } else if (data_ptr_bit_type_ == 64) { - std::vector> 
partitioned_data_ptr; - std::vector partition_ptr; - const uint64_t* data_ptr_uint64_t = reinterpret_cast(data_ptr); - GetSparseDataPartitioned(reinterpret_cast(cpu_data_ptr), data_ptr_uint64_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); - InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint64_t_, partition_ptr.data(), partition_ptr.size()); - AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint16_t_); - AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint64_t_); - for (size_t i = 0; i < partitioned_data.size(); ++i) { - const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; - const std::vector& data_for_this_partition = partitioned_data[i]; - CopyFromHostToCUDADevice(cuda_data_uint16_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); - CopyFromHostToCUDADevice(cuda_row_ptr_uint64_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); - } - } else { - Log::Fatal("Unknow data ptr bit type %d", data_ptr_bit_type_); - } - } - } else if (bit_type_ == 32) { - if (!is_sparse_) { - std::vector partitioned_data; - GetDenseDataPartitioned(reinterpret_cast(cpu_data_ptr), &partitioned_data); - InitCUDAMemoryFromHostMemory(&cuda_data_uint32_t_, partitioned_data.data(), total_size); - } else { - std::vector> partitioned_data; - if (data_ptr_bit_type_ == 16) { - const uint16_t* data_ptr_uint16_t = reinterpret_cast(data_ptr); - std::vector> partitioned_data_ptr; - std::vector partition_ptr; - GetSparseDataPartitioned(reinterpret_cast(cpu_data_ptr), data_ptr_uint16_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); - InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint16_t_, partition_ptr.data(), partition_ptr.size()); - AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint32_t_); - AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint16_t_); - for (size_t i = 0; i < partitioned_data.size(); ++i) { - const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; - const std::vector& data_for_this_partition = partitioned_data[i]; - CopyFromHostToCUDADevice(cuda_data_uint32_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); - CopyFromHostToCUDADevice(cuda_row_ptr_uint16_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); - } - } else if (data_ptr_bit_type_ == 32) { - const uint32_t* data_ptr_uint32_t = reinterpret_cast(data_ptr); - std::vector> partitioned_data_ptr; - std::vector partition_ptr; - GetSparseDataPartitioned(reinterpret_cast(cpu_data_ptr), data_ptr_uint32_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); - InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint32_t_, partition_ptr.data(), partition_ptr.size()); - AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint32_t_); - AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint32_t_); - for (size_t i = 0; i < partitioned_data.size(); ++i) { - const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; - const std::vector& data_for_this_partition = partitioned_data[i]; - CopyFromHostToCUDADevice(cuda_data_uint32_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); - CopyFromHostToCUDADevice(cuda_row_ptr_uint32_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); - } - } else if (data_ptr_bit_type_ == 64) { - const 
uint64_t* data_ptr_uint64_t = reinterpret_cast(data_ptr); - std::vector> partitioned_data_ptr; - std::vector partition_ptr; - GetSparseDataPartitioned(reinterpret_cast(cpu_data_ptr), data_ptr_uint64_t, &partitioned_data, &partitioned_data_ptr, &partition_ptr); - InitCUDAMemoryFromHostMemory(&cuda_partition_ptr_uint64_t_, partition_ptr.data(), partition_ptr.size()); - AllocateCUDAMemory(partition_ptr.back(), &cuda_data_uint32_t_); - AllocateCUDAMemory((num_data_ + 1) * partitioned_data_ptr.size(), &cuda_row_ptr_uint64_t_); - for (size_t i = 0; i < partitioned_data.size(); ++i) { - const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; - const std::vector& data_for_this_partition = partitioned_data[i]; - CopyFromHostToCUDADevice(cuda_data_uint32_t_ + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size()); - CopyFromHostToCUDADevice(cuda_row_ptr_uint64_t_ + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size()); - } - } else { - Log::Fatal("Unknow data ptr bit type %d", data_ptr_bit_type_); - } - } - } else { - Log::Fatal("Unknow bit type = %d", bit_type_); - } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -void CUDAHistogramConstructor::PushOneData(const uint32_t feature_bin_value, - const int feature_group_id, - const data_size_t data_index) { - const uint8_t feature_bin_value_uint8 = static_cast(feature_bin_value); - const size_t index = static_cast(data_index) * static_cast(num_feature_groups_) + - static_cast(feature_group_id); - data_[index] = feature_bin_value_uint8; } void CUDAHistogramConstructor::ConstructHistogramForLeaf( - const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const CUDALeafSplitsStruct* cuda_larger_leaf_splits, - const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, - const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf) { + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits, + const data_size_t num_data_in_smaller_leaf, + const data_size_t num_data_in_larger_leaf, + const double sum_hessians_in_smaller_leaf, + const double sum_hessians_in_larger_leaf) { if ((num_data_in_smaller_leaf <= min_data_in_leaf_ || sum_hessians_in_smaller_leaf <= min_sum_hessian_in_leaf_) && (num_data_in_larger_leaf <= min_data_in_leaf_ || sum_hessians_in_larger_leaf <= min_sum_hessian_in_leaf_)) { return; @@ -304,203 +102,16 @@ void CUDAHistogramConstructor::ConstructHistogramForLeaf( } void CUDAHistogramConstructor::CalcConstructHistogramKernelDim( - int* grid_dim_x, int* grid_dim_y, int* block_dim_x, int* block_dim_y, + int* grid_dim_x, + int* grid_dim_y, + int* block_dim_x, + int* block_dim_y, const data_size_t num_data_in_smaller_leaf) { - *block_dim_x = max_num_column_per_partition_; - *block_dim_y = NUM_THRADS_PER_BLOCK / max_num_column_per_partition_; - *grid_dim_x = num_feature_partitions_; + *block_dim_x = cuda_row_data_->max_num_column_per_partition(); + *block_dim_y = NUM_THRADS_PER_BLOCK / cuda_row_data_->max_num_column_per_partition(); + *grid_dim_x = cuda_row_data_->num_feature_partitions(); *grid_dim_y = std::max(min_grid_dim_y_, ((num_data_in_smaller_leaf + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + (*block_dim_y) - 1) / (*block_dim_y)); - //Log::Warning("block_dim_x = %d, block_dim_y = %d, grid_dim_x = %d, grid_dim_y = %d", *block_dim_x, *block_dim_y, *grid_dim_x, *grid_dim_y); -} - -void CUDAHistogramConstructor::DivideCUDAFeatureGroups(const Dataset* 
train_data, TrainingShareStates* share_state) { - const uint32_t max_num_bin_per_partition = SHRAE_HIST_SIZE / 2; - const std::vector& column_hist_offsets = share_state->column_hist_offsets(); - std::vector feature_group_num_feature_offsets; - int offsets = 0; - int prev_group_index = -1; - for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - const int feature_group_index = train_data->Feature2Group(feature_index); - if (prev_group_index == -1 || feature_group_index != prev_group_index) { - feature_group_num_feature_offsets.emplace_back(offsets); - } - ++offsets; - } - CHECK_EQ(offsets, num_features_); - feature_group_num_feature_offsets.emplace_back(offsets); - - uint32_t start_hist_offset = 0; - feature_partition_column_index_offsets_.clear(); - column_hist_offsets_.clear(); - column_hist_offsets_full_.clear(); - feature_partition_column_index_offsets_.emplace_back(0); - column_hist_offsets_full_.emplace_back(0); - const int num_feature_groups = train_data->num_feature_groups(); - int column_index = 0; - num_feature_partitions_ = 0; - for (int feature_group_index = 0; feature_group_index < num_feature_groups; ++feature_group_index) { - if (!train_data->IsMultiGroup(feature_group_index)) { - const uint32_t column_feature_hist_start = column_hist_offsets[column_index]; - const uint32_t column_feature_hist_end = column_hist_offsets[column_index + 1]; - const uint32_t num_bin_in_dense_group = column_feature_hist_end - column_feature_hist_start; - if (num_bin_in_dense_group > max_num_bin_per_partition) { - Log::Fatal("Too many bins in a dense feature group."); - } - const uint32_t cur_hist_num_bin = column_feature_hist_end - start_hist_offset; - if (cur_hist_num_bin > max_num_bin_per_partition) { - feature_partition_column_index_offsets_.emplace_back(column_index); - start_hist_offset = column_feature_hist_start; - column_hist_offsets_full_.emplace_back(start_hist_offset); - ++num_feature_partitions_; - } - column_hist_offsets_.emplace_back(column_hist_offsets[column_index] - start_hist_offset); - if (feature_group_index == num_feature_groups - 1) { - feature_partition_column_index_offsets_.emplace_back(column_index + 1); - column_hist_offsets_full_.emplace_back(column_hist_offsets.back()); - ++num_feature_partitions_; - } - ++column_index; - } else { - const int group_feature_index_start = feature_group_num_feature_offsets[feature_group_index]; - const int num_features_in_group = feature_group_num_feature_offsets[feature_group_index + 1] - group_feature_index_start; - for (int sub_feature_index = 0; sub_feature_index < num_features_in_group; ++sub_feature_index) { - const int feature_index = group_feature_index_start + sub_feature_index; - const uint32_t column_feature_hist_start = column_hist_offsets[column_index]; - const uint32_t column_feature_hist_end = column_hist_offsets[column_index + 1]; - const uint32_t cur_hist_num_bin = column_feature_hist_end - start_hist_offset; - if (cur_hist_num_bin > max_num_bin_per_partition) { - feature_partition_column_index_offsets_.emplace_back(column_index); - start_hist_offset = column_feature_hist_start; - column_hist_offsets_full_.emplace_back(start_hist_offset); - ++num_feature_partitions_; - } - column_hist_offsets_.emplace_back(column_hist_offsets[column_index] - start_hist_offset); - if (feature_group_index == num_feature_groups - 1 && sub_feature_index == num_features_in_group - 1) { - CHECK_EQ(feature_index, num_features_ - 1); - feature_partition_column_index_offsets_.emplace_back(column_index + 1); - 
column_hist_offsets_full_.emplace_back(column_hist_offsets.back()); - ++num_feature_partitions_; - } - ++column_index; - } - } - } - max_num_column_per_partition_ = 0; - for (size_t i = 0; i < feature_partition_column_index_offsets_.size() - 1; ++i) { - const int num_column = feature_partition_column_index_offsets_[i + 1] - feature_partition_column_index_offsets_[i]; - if (num_column > max_num_column_per_partition_) { - max_num_column_per_partition_ = num_column; - } - } - - /*Log::Warning("max_num_column_per_partition_ = %d", max_num_column_per_partition_); - for (size_t i = 0; i < feature_partition_column_index_offsets_.size(); ++i) { - Log::Warning("feature_partition_column_index_offsets_[%d] = %d", i, feature_partition_column_index_offsets_[i]); - } - for (size_t i = 0; i < column_hist_offsets_full_.size(); ++i) { - Log::Warning("column_hist_offsets_full_[%d] = %d", i, column_hist_offsets_full_[i]); - } - for (size_t i = 0; i < column_hist_offsets_.size(); ++i) { - Log::Warning("column_hist_offsets_[%d] = %d", i, column_hist_offsets_[i]); - }*/ - - InitCUDAMemoryFromHostMemory(&cuda_feature_partition_column_index_offsets_, - feature_partition_column_index_offsets_.data(), - feature_partition_column_index_offsets_.size()); - - InitCUDAMemoryFromHostMemory(&cuda_column_hist_offsets_, - column_hist_offsets_.data(), - column_hist_offsets_.size()); - - InitCUDAMemoryFromHostMemory(&cuda_column_hist_offsets_full_, - column_hist_offsets_full_.data(), - column_hist_offsets_full_.size()); -} - -template -void CUDAHistogramConstructor::GetDenseDataPartitioned(const BIN_TYPE* row_wise_data, std::vector* partitioned_data) { - const int num_total_columns = feature_partition_column_index_offsets_.back(); - partitioned_data->resize(static_cast(num_total_columns) * static_cast(num_data_), 0); - BIN_TYPE* out_data = partitioned_data->data(); - Threading::For(0, num_data_, 512, - [this, num_total_columns, row_wise_data, out_data] (int /*thread_index*/, data_size_t start, data_size_t end) { - for (size_t i = 0; i < feature_partition_column_index_offsets_.size() - 1; ++i) { - const int num_prev_columns = static_cast(feature_partition_column_index_offsets_[i]); - const data_size_t offset = num_data_ * num_prev_columns; - const int partition_column_start = feature_partition_column_index_offsets_[i]; - const int partition_column_end = feature_partition_column_index_offsets_[i + 1]; - const int num_columns_in_cur_partition = partition_column_end - partition_column_start; - for (data_size_t data_index = start; data_index < end; ++data_index) { - const data_size_t data_offset = offset + data_index * num_columns_in_cur_partition; - const data_size_t read_data_offset = data_index * num_total_columns; - for (int column_index = 0; column_index < num_columns_in_cur_partition; ++column_index) { - const int true_column_index = read_data_offset + column_index + partition_column_start; - const BIN_TYPE bin = row_wise_data[true_column_index]; - out_data[data_offset + column_index] = bin; - } - } - } - }); -} - -template -void CUDAHistogramConstructor::GetSparseDataPartitioned( - const BIN_TYPE* row_wise_data, - const DATA_PTR_TYPE* row_ptr, - std::vector>* partitioned_data, - std::vector>* partitioned_row_ptr, - std::vector* partition_ptr) { - const int num_partitions = static_cast(feature_partition_column_index_offsets_.size()) - 1; - partitioned_data->resize(num_partitions); - partitioned_row_ptr->resize(num_partitions); - std::vector thread_max_elements_per_row(num_threads_, 0); - Threading::For(0, num_partitions, 1, 
- [partitioned_data, partitioned_row_ptr, row_ptr, row_wise_data, &thread_max_elements_per_row, this] (int thread_index, int start, int end) { - for (int partition_index = start; partition_index < end; ++partition_index) { - std::vector& data_for_this_partition = partitioned_data->at(partition_index); - std::vector& row_ptr_for_this_partition = partitioned_row_ptr->at(partition_index); - const int partition_hist_start = column_hist_offsets_full_[partition_index]; - const int partition_hist_end = column_hist_offsets_full_[partition_index + 1]; - DATA_PTR_TYPE offset = 0; - row_ptr_for_this_partition.clear(); - data_for_this_partition.clear(); - row_ptr_for_this_partition.emplace_back(offset); - for (data_size_t data_index = 0; data_index < num_data_; ++data_index) { - const DATA_PTR_TYPE row_start = row_ptr[data_index]; - const DATA_PTR_TYPE row_end = row_ptr[data_index + 1]; - const BIN_TYPE* row_data_start = row_wise_data + row_start; - const BIN_TYPE* row_data_end = row_wise_data + row_end; - const size_t partition_start_in_row = std::lower_bound(row_data_start, row_data_end, partition_hist_start) - row_data_start; - const size_t partition_end_in_row = std::lower_bound(row_data_start, row_data_end, partition_hist_end) - row_data_start; - for (size_t pos = partition_start_in_row; pos < partition_end_in_row; ++pos) { - const BIN_TYPE bin = row_data_start[pos]; - CHECK_GE(bin, static_cast(partition_hist_start)); - data_for_this_partition.emplace_back(bin - partition_hist_start); - } - CHECK_GE(partition_end_in_row, partition_start_in_row); - const data_size_t num_elements_in_row = partition_end_in_row - partition_start_in_row; - offset += static_cast(num_elements_in_row); - row_ptr_for_this_partition.emplace_back(offset); - if (num_elements_in_row > thread_max_elements_per_row[thread_index]) { - thread_max_elements_per_row[thread_index] = num_elements_in_row; - } - } - } - }); - partition_ptr->clear(); - DATA_PTR_TYPE offset = 0; - partition_ptr->emplace_back(offset); - for (size_t i = 0; i < partitioned_row_ptr->size(); ++i) { - offset += partitioned_row_ptr->at(i).back(); - partition_ptr->emplace_back(offset); - } - max_num_column_per_partition_ = 0; - for (int thread_index = 0; thread_index < num_threads_; ++thread_index) { - if (thread_max_elements_per_row[thread_index] > max_num_column_per_partition_) { - max_num_column_per_partition_ = thread_max_elements_per_row[thread_index]; - } - } } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index cdd4921568a8..5ffcad280964 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -82,7 +82,6 @@ __global__ void CUDAConstructHistogramDenseKernel( const CUDALeafSplitsStruct* smaller_leaf_splits, const score_t* cuda_gradients, const score_t* cuda_hessians, - const int* num_feature_groups, const BIN_TYPE* data, const uint32_t* column_hist_offsets, const uint32_t* column_hist_offsets_full, @@ -142,7 +141,6 @@ __global__ void CUDAConstructHistogramSparseKernel( const CUDALeafSplitsStruct* smaller_leaf_splits, const score_t* cuda_gradients, const score_t* cuda_hessians, - const int* num_feature_groups, const BIN_TYPE* data, const DATA_PTR_TYPE* row_ptr, const DATA_PTR_TYPE* partition_ptr, @@ -206,147 +204,137 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_in_smaller_leaf); dim3 
grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); - if (is_sparse_) { - if (bit_type_ == 8) { - if (data_ptr_bit_type_ == 16) { - CUDAConstructHistogramSparseKernel<<>>( + if (cuda_row_data_->is_sparse()) { + if (cuda_row_data_->bit_type() == 8) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, - cuda_data_uint8_t_, - cuda_row_ptr_uint16_t_, - cuda_partition_ptr_uint16_t_, - cuda_column_hist_offsets_full_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); - } else if (data_ptr_bit_type_ == 32) { - CUDAConstructHistogramSparseKernel<<>>( + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, - cuda_data_uint8_t_, - cuda_row_ptr_uint32_t_, - cuda_partition_ptr_uint32_t_, - cuda_column_hist_offsets_full_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); - } else if (data_ptr_bit_type_ == 64) { - CUDAConstructHistogramSparseKernel<<>>( + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, - cuda_data_uint8_t_, - cuda_row_ptr_uint64_t_, - cuda_partition_ptr_uint64_t_, - cuda_column_hist_offsets_full_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); } - } else if (bit_type_ == 16) { - if (data_ptr_bit_type_ == 16) { - CUDAConstructHistogramSparseKernel<<>>( + } else if (cuda_row_data_->bit_type() == 16) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, - cuda_data_uint16_t_, - cuda_row_ptr_uint16_t_, - cuda_partition_ptr_uint16_t_, - cuda_column_hist_offsets_full_, + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); - } else if (data_ptr_bit_type_ == 32) { - CUDAConstructHistogramSparseKernel<<>>( + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, - cuda_data_uint16_t_, - cuda_row_ptr_uint32_t_, - cuda_partition_ptr_uint32_t_, - cuda_column_hist_offsets_full_, + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); - } else if (data_ptr_bit_type_ == 64) { - CUDAConstructHistogramSparseKernel<<>>( + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, - cuda_data_uint16_t_, - cuda_row_ptr_uint64_t_, - cuda_partition_ptr_uint64_t_, - cuda_column_hist_offsets_full_, + cuda_row_data_->cuda_data_uint16(), + 
cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); } - } else if (bit_type_ == 32) { - if (data_ptr_bit_type_ == 16) { - CUDAConstructHistogramSparseKernel<<>>( + } else if (cuda_row_data_->bit_type() == 32) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, - cuda_data_uint32_t_, - cuda_row_ptr_uint16_t_, - cuda_partition_ptr_uint16_t_, - cuda_column_hist_offsets_full_, + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); - } else if (data_ptr_bit_type_ == 32) { - CUDAConstructHistogramSparseKernel<<>>( + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, - cuda_data_uint32_t_, - cuda_row_ptr_uint32_t_, - cuda_partition_ptr_uint32_t_, - cuda_column_hist_offsets_full_, + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); - } else if (data_ptr_bit_type_ == 64) { - CUDAConstructHistogramSparseKernel<<>>( + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, - cuda_data_uint32_t_, - cuda_row_ptr_uint64_t_, - cuda_partition_ptr_uint64_t_, - cuda_column_hist_offsets_full_, + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); } } } else { - if (bit_type_ == 8) { - CUDAConstructHistogramDenseKernel<<>>( + if (cuda_row_data_->bit_type() == 8) { + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_data_uint8_t_, - cuda_column_hist_offsets_, - cuda_column_hist_offsets_full_, - cuda_feature_partition_column_index_offsets_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_column_hist_offsets(), + cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), num_data_); - } else if (bit_type_ == 16) { - CUDAConstructHistogramDenseKernel<<>>( + } else if (cuda_row_data_->bit_type() == 16) { + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_data_uint16_t_, - cuda_column_hist_offsets_, - cuda_column_hist_offsets_full_, - cuda_feature_partition_column_index_offsets_, + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_column_hist_offsets(), + cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), num_data_); - } else if (bit_type_ == 32) { - CUDAConstructHistogramDenseKernel<<>>( + } else if (cuda_row_data_->bit_type() == 32) { + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_num_feature_groups_, cuda_data_uint32_t_, - cuda_column_hist_offsets_, - cuda_column_hist_offsets_full_, - cuda_feature_partition_column_index_offsets_, + cuda_row_data_->cuda_data_uint32(), + 
cuda_row_data_->cuda_column_hist_offsets(), + cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), num_data_); } } } __global__ void SubtractHistogramKernel( - const int* cuda_num_total_bin, + const int num_total_bin, const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const CUDALeafSplitsStruct* cuda_larger_leaf_splits) { - const int cuda_num_total_bin_ref = *cuda_num_total_bin; const unsigned int global_thread_index = threadIdx.x + blockIdx.x * blockDim.x; const int cuda_larger_leaf_index_ref = cuda_larger_leaf_splits->leaf_index; if (cuda_larger_leaf_index_ref >= 0) { const hist_t* smaller_leaf_hist = cuda_smaller_leaf_splits->hist_in_leaf; hist_t* larger_leaf_hist = cuda_larger_leaf_splits->hist_in_leaf; - if (global_thread_index < 2 * cuda_num_total_bin_ref) { + if (global_thread_index < 2 * num_total_bin) { larger_leaf_hist[global_thread_index] -= smaller_leaf_hist[global_thread_index]; } } @@ -400,21 +388,19 @@ void CUDAHistogramConstructor::LaunchSubtractHistogramKernel( const int num_subtract_threads = 2 * num_total_bin_; const int num_subtract_blocks = (num_subtract_threads + SUBTRACT_BLOCK_SIZE - 1) / SUBTRACT_BLOCK_SIZE; global_timer.Start("CUDAHistogramConstructor::FixHistogramKernel"); - FixHistogramKernel<<>>( + FixHistogramKernel<<>>( cuda_feature_num_bins_, cuda_feature_hist_offsets_, cuda_feature_most_freq_bins_, cuda_need_fix_histogram_features_, cuda_need_fix_histogram_features_num_bin_aligned_, cuda_smaller_leaf_splits); - //SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDAHistogramConstructor::FixHistogramKernel"); global_timer.Start("CUDAHistogramConstructor::SubtractHistogramKernel"); - SubtractHistogramKernel<<>>( - cuda_num_total_bin_, + SubtractHistogramKernel<<>>( + num_total_bin_, cuda_smaller_leaf_splits, cuda_larger_leaf_splits); - //SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDAHistogramConstructor::SubtractHistogramKernel"); } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index a3cfecc9ef6e..395a614ce894 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -8,6 +8,7 @@ #ifdef USE_CUDA +#include #include #include @@ -31,16 +32,23 @@ namespace LightGBM { class CUDAHistogramConstructor { public: - CUDAHistogramConstructor(const Dataset* train_data, const int num_leaves, const int num_threads, + CUDAHistogramConstructor( + const Dataset* train_data, + const int num_leaves, + const int num_threads, const std::vector& feature_hist_offsets, - const int min_data_in_leaf, const double min_sum_hessian_in_leaf); + const int min_data_in_leaf, + const double min_sum_hessian_in_leaf); void Init(const Dataset* train_data, TrainingShareStates* share_state); void ConstructHistogramForLeaf( - const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const CUDALeafSplitsStruct* cuda_larger_leaf_splits, - const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, - const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf); + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits, + const data_size_t num_data_in_smaller_leaf, + const data_size_t num_data_in_larger_leaf, + const double sum_hessians_in_smaller_leaf, + const double sum_hessians_in_larger_leaf); void BeforeTrain(const score_t* gradients, const score_t* hessians); 
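Note on the SubtractHistogramKernel change above: the kernel now receives num_total_bin by value instead of dereferencing a device pointer, and the larger leaf's histogram is still obtained in place by subtracting the smaller leaf's histogram from the parent's. A minimal standalone sketch of that subtraction trick follows, assuming hist_t is double and each bin stores a gradient/hessian pair (hence 2 * num_total_bin entries); the names and the 1024-thread block size below are placeholders, not the exact code in this patch.

// Sketch only. larger_hist initially holds the parent histogram and is
// overwritten with (parent - smaller child), i.e. the larger child's histogram.
#include <cuda_runtime.h>

typedef double hist_t;

__global__ void SubtractHistogramSketch(const int num_total_bin,
                                        const hist_t* smaller_hist,
                                        hist_t* larger_hist) {
  const unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < 2U * static_cast<unsigned int>(num_total_bin)) {
    larger_hist[i] -= smaller_hist[i];
  }
}

// Host-side launch with a hypothetical block size; one thread per histogram entry.
void LaunchSubtractHistogramSketch(const int num_total_bin,
                                   const hist_t* smaller_hist,
                                   hist_t* larger_hist) {
  const int num_threads = 2 * num_total_bin;
  const int block_size = 1024;
  const int num_blocks = (num_threads + block_size - 1) / block_size;
  SubtractHistogramSketch<<<num_blocks, block_size>>>(num_total_bin, smaller_hist, larger_hist);
}

This is why only the leaf with fewer rows needs a full histogram construction pass; the sibling's histogram comes almost for free from the subtraction.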
@@ -50,11 +58,12 @@ class CUDAHistogramConstructor { hist_t* cuda_hist_pointer() { return cuda_hist_; } - const uint8_t* cuda_data() const { return cuda_data_uint8_t_; } - private: - - void CalcConstructHistogramKernelDim(int* grid_dim_x, int* grid_dim_y, int* block_dim_x, int* block_dim_y, + void CalcConstructHistogramKernelDim( + int* grid_dim_x, + int* grid_dim_y, + int* block_dim_x, + int* block_dim_y, const data_size_t num_data_in_smaller_leaf); void LaunchConstructHistogramKernel( @@ -65,83 +74,63 @@ class CUDAHistogramConstructor { const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const CUDALeafSplitsStruct* cuda_larger_leaf_splits); - void InitCUDAData(TrainingShareStates* share_state); - - void PushOneData(const uint32_t feature_bin_value, const int feature_group_id, const data_size_t data_index); - - void DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state); - - template - void GetDenseDataPartitioned(const BIN_TYPE* row_wise_data, std::vector* partitioned_data); - - template - void GetSparseDataPartitioned(const BIN_TYPE* row_wise_data, - const DATA_PTR_TYPE* row_ptr, - std::vector>* partitioned_data, - std::vector>* partitioned_row_ptr, - std::vector* partition_ptr); - // Host memory - // data on CPU, stored in row-wise style + + /*! \brief size of training data */ const data_size_t num_data_; + /*! \brief number of features in training data */ const int num_features_; + /*! \brief maximum number of leaves */ const int num_leaves_; + /*! \brief number of threads */ const int num_threads_; + /*! \brief total number of bins in histogram */ int num_total_bin_; + /*! \brief number of feature groups */ int num_feature_groups_; - std::vector data_; - std::vector feature_group_bin_offsets_; - std::vector feature_mfb_offsets_; + /*! \brief number of bins per feature */ std::vector feature_num_bins_; + /*! \brief offsets in histogram of all features */ std::vector feature_hist_offsets_; + /*! \brief most frequent bins in each feature */ std::vector feature_most_freq_bins_; + /*! \brief minimum number of data allowed per leaf */ const int min_data_in_leaf_; + /*! \brief minimum sum value of hessians allowed per leaf */ const double min_sum_hessian_in_leaf_; - std::vector feature_partition_column_index_offsets_; - std::vector column_hist_offsets_; - std::vector column_hist_offsets_full_; - bool is_sparse_; - int num_feature_partitions_; - int max_num_column_per_partition_; - uint8_t data_ptr_bit_type_; - uint8_t bit_type_; - const Dataset* train_data_; - std::vector cuda_streams_; + /*! \brief cuda stream for histogram construction */ + cudaStream_t cuda_stream_; + /*! \brief indices of feature whose histograms need to be fixed */ std::vector need_fix_histogram_features_; + /*! \brief aligned number of bins of the features whose histograms need to be fixed */ std::vector need_fix_histogram_features_num_bin_aligend_; - + /*! \brief minimum number of blocks allowed in the y dimension */ const int min_grid_dim_y_ = 160; + // CUDA memory, held by this object - uint32_t* cuda_feature_group_bin_offsets_; - uint8_t* cuda_feature_mfb_offsets_; + + /*! \brief CUDA row wise data */ + std::unique_ptr cuda_row_data_; + /*! \brief number of bins per feature */ uint32_t* cuda_feature_num_bins_; + /*! \brief offsets in histogram of all features */ uint32_t* cuda_feature_hist_offsets_; + /*! \brief most frequent bins in each feature */ uint32_t* cuda_feature_most_freq_bins_; + /*! 
\brief CUDA histograms */ hist_t* cuda_hist_; - hist_t* block_cuda_hist_buffer_; - int* cuda_num_total_bin_; - int* cuda_num_feature_groups_; - uint8_t* cuda_data_uint8_t_; - uint16_t* cuda_data_uint16_t_; - uint32_t* cuda_data_uint32_t_; - uint16_t* cuda_row_ptr_uint16_t_; - uint32_t* cuda_row_ptr_uint32_t_; - uint64_t* cuda_row_ptr_uint64_t_; - uint16_t* cuda_partition_ptr_uint16_t_; - uint32_t* cuda_partition_ptr_uint32_t_; - uint64_t* cuda_partition_ptr_uint64_t_; - int* cuda_num_features_; - score_t* cuda_ordered_gradients_; - score_t* cuda_ordered_hessians_; - int* cuda_feature_partition_column_index_offsets_; - uint32_t* cuda_column_hist_offsets_; - uint32_t* cuda_column_hist_offsets_full_; + /*! \brief indices of feature whose histograms need to be fixed */ int* cuda_need_fix_histogram_features_; + /*! \brief aligned number of bins of the features whose histograms need to be fixed */ uint32_t* cuda_need_fix_histogram_features_num_bin_aligned_; - // CUDA memory, held by other objects + + // CUDA memory, held by other object + + /*! \brief gradients on CUDA */ const score_t* cuda_gradients_; + /*! \brief hessians on CUDA */ const score_t* cuda_hessians_; }; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 34a4c0b92ead..26c747e03b4e 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -41,10 +41,7 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia cuda_histogram_constructor_->cuda_hist_pointer())); cuda_data_partition_->Init(); cuda_best_split_finder_.reset(new CUDABestSplitFinder(cuda_histogram_constructor_->cuda_hist(), - train_data_, this->share_state_->feature_hist_offsets(), this->config_->num_leaves, - this->config_->lambda_l1, this->config_->lambda_l2, this->config_->min_data_in_leaf, - this->config_->min_sum_hessian_in_leaf, this->config_->min_gain_to_split, - cuda_centralized_info_->cuda_num_features())); + train_data_, this->share_state_->feature_hist_offsets(), config_)); cuda_best_split_finder_->Init(); leaf_best_split_feature_.resize(config_->num_leaves, -1); @@ -137,14 +134,32 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, find_best_split_time += duration.count(); start = std::chrono::steady_clock::now(); global_timer.Start("NewCUDATreeLearner::FindBestFromAllSplits"); - const CUDASplitInfo* best_split_info = cuda_best_split_finder_->FindBestFromAllSplits( - tree->num_leaves(), - smaller_leaf_index_, - larger_leaf_index_, - &leaf_best_split_feature_, - &leaf_best_split_threshold_, - &leaf_best_split_default_left_, - &best_leaf_index_); + const CUDASplitInfo* best_split_info = nullptr; + if (larger_leaf_index_ >= 0) { + best_split_info = cuda_best_split_finder_->FindBestFromAllSplits( + tree->num_leaves(), + smaller_leaf_index_, + larger_leaf_index_, + &leaf_best_split_feature_[smaller_leaf_index_], + &leaf_best_split_threshold_[smaller_leaf_index_], + &leaf_best_split_default_left_[smaller_leaf_index_], + &leaf_best_split_feature_[larger_leaf_index_], + &leaf_best_split_threshold_[larger_leaf_index_], + &leaf_best_split_default_left_[larger_leaf_index_], + &best_leaf_index_); + } else { + best_split_info = cuda_best_split_finder_->FindBestFromAllSplits( + tree->num_leaves(), + smaller_leaf_index_, + larger_leaf_index_, + &leaf_best_split_feature_[smaller_leaf_index_], + &leaf_best_split_threshold_[smaller_leaf_index_], + &leaf_best_split_default_left_[smaller_leaf_index_], + nullptr, + nullptr, + 
nullptr, + &best_leaf_index_); + } global_timer.Stop("NewCUDATreeLearner::FindBestFromAllSplits"); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); @@ -158,26 +173,30 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, global_timer.Start("NewCUDATreeLearner::Split"); start = std::chrono::steady_clock::now(); int right_leaf_index = tree->Split(best_leaf_index_, - train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]), - train_data_->RealThreshold(leaf_best_split_feature_[best_leaf_index_], - leaf_best_split_threshold_[best_leaf_index_]), - train_data_->FeatureBinMapper(leaf_best_split_feature_[best_leaf_index_])->missing_type(), - best_split_info); - - cuda_data_partition_->Split( - best_split_info, - best_leaf_index_, - right_leaf_index, - cuda_smaller_leaf_splits_->GetCUDAStructRef(), - cuda_larger_leaf_splits_->GetCUDAStructRef(), - &leaf_num_data_, - &leaf_data_start_, - &leaf_sum_hessians_, - leaf_best_split_feature_, - leaf_best_split_threshold_, - leaf_best_split_default_left_, - &smaller_leaf_index_, - &larger_leaf_index_); + train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]), + train_data_->RealThreshold(leaf_best_split_feature_[best_leaf_index_], + leaf_best_split_threshold_[best_leaf_index_]), + train_data_->FeatureBinMapper(leaf_best_split_feature_[best_leaf_index_])->missing_type(), + best_split_info); + + cuda_data_partition_->Split(best_split_info, + best_leaf_index_, + right_leaf_index, + leaf_best_split_feature_[best_leaf_index_], + leaf_best_split_threshold_[best_leaf_index_], + leaf_best_split_default_left_[best_leaf_index_], + leaf_num_data_[best_leaf_index_], + leaf_data_start_[best_leaf_index_], + cuda_smaller_leaf_splits_->GetCUDAStructRef(), + cuda_larger_leaf_splits_->GetCUDAStructRef(), + &leaf_num_data_[best_leaf_index_], + &leaf_num_data_[right_leaf_index], + &leaf_data_start_[best_leaf_index_], + &leaf_data_start_[right_leaf_index], + &leaf_sum_hessians_[best_leaf_index_], + &leaf_sum_hessians_[right_leaf_index]); + smaller_leaf_index_ = (leaf_num_data_[best_leaf_index_] <= leaf_num_data_[right_leaf_index] ? best_leaf_index_ : right_leaf_index); + larger_leaf_index_ = (smaller_leaf_index_ == best_leaf_index_ ? 
right_leaf_index : best_leaf_index_); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); global_timer.Stop("NewCUDATreeLearner::Split"); From 572e2b08536db03f35cb7dd07e29cf7e55cce537 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 26 Jul 2021 11:35:57 +0000 Subject: [PATCH 044/166] clean src/treelearner/cuda --- CMakeLists.txt | 15 - include/LightGBM/cuda/cuda_algorithms.hpp | 12 +- .../cuda/cuda_best_split_finder.cu | 3 +- .../cuda/cuda_best_split_finder.hpp | 1 - .../cuda/cuda_binary_objective.cpp | 31 - src/treelearner/cuda/cuda_binary_objective.cu | 75 --- .../cuda/cuda_binary_objective.hpp | 47 -- .../cuda/cuda_centralized_info.cpp | 38 -- .../cuda/cuda_centralized_info.hpp | 75 --- src/treelearner/cuda/cuda_data_partition.cpp | 50 +- src/treelearner/cuda/cuda_data_partition.cu | 2 +- src/treelearner/cuda/cuda_data_partition.hpp | 7 +- .../cuda/cuda_histogram_constructor.cpp | 24 +- .../cuda/cuda_histogram_constructor.hpp | 2 +- src/treelearner/cuda/cuda_leaf_splits.cpp | 15 +- src/treelearner/cuda/cuda_leaf_splits.cu | 7 +- src/treelearner/cuda/cuda_leaf_splits.hpp | 8 +- src/treelearner/cuda/cuda_objective.cpp | 17 - src/treelearner/cuda/cuda_objective.hpp | 7 +- .../cuda/cuda_ranking_objective.cpp | 97 --- .../cuda/cuda_ranking_objective.cu | 582 ------------------ .../cuda/cuda_ranking_objective.hpp | 77 --- .../cuda/cuda_regression_objective.cpp | 31 - .../cuda/cuda_regression_objective.cu | 69 --- .../cuda/cuda_regression_objective.hpp | 46 -- src/treelearner/cuda/cuda_score_updater.cpp | 30 - src/treelearner/cuda/cuda_score_updater.cu | 40 -- src/treelearner/cuda/cuda_score_updater.hpp | 6 +- src/treelearner/cuda/cuda_tree_predictor.hpp | 2 +- .../cuda/new_cuda_tree_learner.cpp | 8 +- .../cuda/new_cuda_tree_learner.hpp | 7 - src/treelearner/cuda/new_cuda_utils.cpp | 24 - src/treelearner/cuda/new_cuda_utils.cu | 11 - src/treelearner/cuda/new_cuda_utils.hpp | 102 --- 34 files changed, 71 insertions(+), 1497 deletions(-) delete mode 100644 src/treelearner/cuda/cuda_binary_objective.cpp delete mode 100644 src/treelearner/cuda/cuda_binary_objective.cu delete mode 100644 src/treelearner/cuda/cuda_binary_objective.hpp delete mode 100644 src/treelearner/cuda/cuda_centralized_info.cpp delete mode 100644 src/treelearner/cuda/cuda_centralized_info.hpp delete mode 100644 src/treelearner/cuda/cuda_objective.cpp delete mode 100644 src/treelearner/cuda/cuda_ranking_objective.cpp delete mode 100644 src/treelearner/cuda/cuda_ranking_objective.cu delete mode 100644 src/treelearner/cuda/cuda_ranking_objective.hpp delete mode 100644 src/treelearner/cuda/cuda_regression_objective.cpp delete mode 100644 src/treelearner/cuda/cuda_regression_objective.cu delete mode 100644 src/treelearner/cuda/cuda_regression_objective.hpp delete mode 100644 src/treelearner/cuda/cuda_score_updater.cpp delete mode 100644 src/treelearner/cuda/cuda_score_updater.cu delete mode 100644 src/treelearner/cuda/new_cuda_utils.cpp delete mode 100644 src/treelearner/cuda/new_cuda_utils.cu delete mode 100644 src/treelearner/cuda/new_cuda_utils.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ba4baac8d40..e7a692af2573 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -230,9 +230,6 @@ if(USE_CUDA) add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}") endforeach() - add_library(new_cuda_utils OBJECT src/treelearner/cuda/new_cuda_utils.cu) - set_target_properties(new_cuda_utils PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - add_library(cuda_leaf_splits OBJECT 
src/treelearner/cuda/cuda_leaf_splits.cu) set_target_properties(cuda_leaf_splits PROPERTIES CUDA_SEPARABLE_COMPILATION ON) @@ -244,18 +241,6 @@ if(USE_CUDA) add_library(cuda_best_split_finder OBJECT src/treelearner/cuda/cuda_best_split_finder.cu) set_target_properties(cuda_best_split_finder PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - - add_library(cuda_binary_objective OBJECT src/treelearner/cuda/cuda_binary_objective.cu) - set_target_properties(cuda_binary_objective PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - - add_library(cuda_regression_objective OBJECT src/treelearner/cuda/cuda_regression_objective.cu) - set_target_properties(cuda_regression_objective PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - - add_library(cuda_ranking_objective OBJECT src/treelearner/cuda/cuda_ranking_objective.cu) - set_target_properties(cuda_ranking_objective PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - - add_library(cuda_score_updater OBJECT src/treelearner/cuda/cuda_score_updater.cu) - set_target_properties(cuda_score_updater PROPERTIES CUDA_SEPARABLE_COMPILATION ON) endif(USE_CUDA) if(USE_HDFS) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 53ab11ba315e..497f2aeeb8e5 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -15,14 +15,14 @@ namespace LightGBM { -template -__device__ ReduceSum(T* values, size_t n); +template +__device__ void ReduceSum(T* values, size_t n); -template -__device__ ReduceMax(T* values, size_t n); +template +__device__ void ReduceMax(T* values, size_t n); -template -__device__ PrefixSum(T* values, size_t n); +template +__device__ void PrefixSum(T* values, size_t n); } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 3c4d45e28982..5dba41a1bf06 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -772,7 +772,6 @@ __global__ void FindBestFromAllSplitsKernel(const int cur_num_leaves, __syncthreads(); ReduceBestGainForLeaves(thread_best_gain, thread_best_leaf, cur_num_valid_threads); if (threadIdx_x == 0) { - const int best_leaf_index = thread_best_leaf[0]; cuda_best_split_info_buffer[6] = thread_best_leaf[0]; } } @@ -816,7 +815,7 @@ void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int cur_num_le cuda_leaf_best_split_info_); std::vector host_leaf_best_split_info_buffer(7); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CopyFromCUDADeviceToHost(host_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 7); + CopyFromCUDADeviceToHostOuter(host_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 7, __FILE__, __LINE__); *smaller_leaf_best_split_feature = host_leaf_best_split_info_buffer[0]; *smaller_leaf_best_split_threshold = static_cast(host_leaf_best_split_info_buffer[1]); *smaller_leaf_best_split_default_left = static_cast(host_leaf_best_split_info_buffer[2]); diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index e5e0e59f4766..e87bf3288052 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -9,7 +9,6 @@ #ifdef USE_CUDA -#include "new_cuda_utils.hpp" #include "cuda_leaf_splits.hpp" #include diff --git a/src/treelearner/cuda/cuda_binary_objective.cpp b/src/treelearner/cuda/cuda_binary_objective.cpp deleted file mode 100644 index aaedf907e454..000000000000 --- 
a/src/treelearner/cuda/cuda_binary_objective.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifdef USE_CUDA - -#include "cuda_binary_objective.hpp" - -namespace LightGBM { - -CUDABinaryObjective::CUDABinaryObjective(const data_size_t num_data, const label_t* cuda_labels, const double sigmoid): -CUDAObjective(num_data), cuda_labels_(cuda_labels), sigmoid_(sigmoid) {} - -void CUDABinaryObjective::Init() { - AllocateCUDAMemory(1, &cuda_init_score_); - SetCUDAMemory(cuda_init_score_, 0, 1); -} - -void CUDABinaryObjective::GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - LaunchGetGradientsKernel(cuda_scores, cuda_out_gradients, cuda_out_hessians); -} - -void CUDABinaryObjective::CalcInitScore() { - LaunchCalcInitScoreKernel(); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_binary_objective.cu b/src/treelearner/cuda/cuda_binary_objective.cu deleted file mode 100644 index a47e66ec4455..000000000000 --- a/src/treelearner/cuda/cuda_binary_objective.cu +++ /dev/null @@ -1,75 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifdef USE_CUDA - -#include "cuda_binary_objective.hpp" - -namespace LightGBM { - -__global__ void CalcInitScoreKernel_1_Binary(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_init_score) { - __shared__ label_t shared_label[CALC_INIT_SCORE_BLOCK_SIZE_BINARY]; - const unsigned int tid = threadIdx.x; - const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY; - shared_label[tid] = 0.0f; - __syncthreads(); - for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY; ++j) { - if (i + j < num_data) { - shared_label[tid] += cuda_labels[i + j]; - } - } - __syncthreads(); - for (unsigned int s = 1; s < blockDim.x; s *= 2) { - if (tid % (2 * s) == 0 && (tid + s) < CALC_INIT_SCORE_BLOCK_SIZE_BINARY) { - shared_label[tid] += shared_label[tid + s]; - } - __syncthreads(); - } - if (tid == 0) { - atomicAdd_system(out_cuda_init_score, shared_label[0]); - } -} - -__global__ void CalcInitScoreKernel_2_Binary(double* out_cuda_init_score, const data_size_t num_data, const double sigmoid) { - const double suml = *out_cuda_init_score; - const double sumw = static_cast(num_data); - const double pavg = suml / sumw; - const double init_score = log(pavg / (1.0f - pavg)) / sigmoid; - *out_cuda_init_score = init_score; -} - -void CUDABinaryObjective::LaunchCalcInitScoreKernel() { - const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE_BINARY * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY; - const int num_blocks = (num_data_ + num_data_per_block - 1) / num_data_per_block; - CalcInitScoreKernel_1_Binary<<>>(cuda_labels_, num_data_, cuda_init_score_); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CalcInitScoreKernel_2_Binary<<<1, 1>>>(cuda_init_score_, num_data_, sigmoid_); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -__global__ void GetGradientsKernel_Binary(const double* cuda_scores, const label_t* cuda_labels, - const double sigmoid, const data_size_t num_data, - score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + 
threadIdx.x); - if (data_index < num_data) { - const label_t cuda_label = static_cast(cuda_labels[data_index]); - const int label = cuda_label == 0 ? -1 : 1; - const double response = -label * sigmoid / (1.0f + std::exp(label * sigmoid * cuda_scores[data_index])); - const double abs_response = fabs(response); - cuda_out_gradients[data_index] = static_cast(response); - cuda_out_hessians[data_index] = static_cast(abs_response * (sigmoid - abs_response)); - } -} - -void CUDABinaryObjective::LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_BINARY - 1) / GET_GRADIENTS_BLOCK_SIZE_BINARY; - GetGradientsKernel_Binary<<>>(cuda_scores, cuda_labels_, sigmoid_, num_data_, - cuda_out_gradients, cuda_out_hessians); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_binary_objective.hpp b/src/treelearner/cuda/cuda_binary_objective.hpp deleted file mode 100644 index eaab85492210..000000000000 --- a/src/treelearner/cuda/cuda_binary_objective.hpp +++ /dev/null @@ -1,47 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifndef LIGHTGBM_NEW_CUDA_BINARY_OBJECTIVE_HPP_ -#define LIGHTGBM_NEW_CUDA_BINARY_OBJECTIVE_HPP_ - -#ifdef USE_CUDA - -#define GET_GRADIENTS_BLOCK_SIZE_BINARY (1024) -#define CALC_INIT_SCORE_BLOCK_SIZE_BINARY (1024) -#define NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY (6) - -#include "cuda_objective.hpp" - -namespace LightGBM { - -class CUDABinaryObjective : public CUDAObjective { - public: - CUDABinaryObjective(const data_size_t num_data, const label_t* cuda_label, const double sigmoid); - - void Init() override; - - void CalcInitScore() override; - - const double* cuda_init_score() const override { - return cuda_init_score_; - } - - void GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) override; - - private: - void LaunchCalcInitScoreKernel(); - - void LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians); - - const label_t* cuda_labels_; - double* cuda_init_score_; - const double sigmoid_; -}; - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_NEW_CUDA_BINARY_OBJECTIVE_HPP_ diff --git a/src/treelearner/cuda/cuda_centralized_info.cpp b/src/treelearner/cuda/cuda_centralized_info.cpp deleted file mode 100644 index 97f0f5e5beca..000000000000 --- a/src/treelearner/cuda/cuda_centralized_info.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
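For reference, the per-row math in the GetGradientsKernel_Binary being deleted above is the standard sigmoid binary-logloss gradient/hessian with labels mapped to {-1, +1} and a configurable sigmoid slope. A host-side sketch of the same update is given below; the function name is a placeholder and not part of this patch.

#include <cmath>
#include <cstddef>

// Sketch only: CPU version of the deleted device kernel's per-row update.
// labels are the raw 0/1 labels; sigmoid is the slope parameter of the objective.
void BinaryLoglossGradientsSketch(const double* scores, const float* labels,
                                  const double sigmoid, const std::size_t num_data,
                                  float* out_gradients, float* out_hessians) {
  for (std::size_t i = 0; i < num_data; ++i) {
    const int label = labels[i] == 0 ? -1 : 1;
    const double response = -label * sigmoid / (1.0 + std::exp(label * sigmoid * scores[i]));
    const double abs_response = std::fabs(response);
    out_gradients[i] = static_cast<float>(response);
    out_hessians[i] = static_cast<float>(abs_response * (sigmoid - abs_response));
  }
}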
- */ - -#ifdef USE_CUDA - -#include "cuda_centralized_info.hpp" - -namespace LightGBM { - -CUDACentralizedInfo::CUDACentralizedInfo(const data_size_t num_data, const int num_leaves, const int num_features): -num_data_(num_data), num_leaves_(num_leaves), num_features_(num_features) {} - -void CUDACentralizedInfo::Init(const score_t* labels, const Dataset* train_data) { - InitCUDAMemoryFromHostMemory(&cuda_num_data_, &num_data_, 1); - InitCUDAMemoryFromHostMemory(&cuda_num_leaves_, &num_leaves_, 1); - InitCUDAMemoryFromHostMemory(&cuda_num_features_, &num_features_, 1); - - InitCUDAMemoryFromHostMemory(&cuda_labels_, labels, num_data_); - - if (train_data->metadata().query_boundaries() != nullptr) { - InitCUDAMemoryFromHostMemory( - &cuda_query_boundaries_, - train_data->metadata().query_boundaries(), - static_cast(train_data->metadata().num_queries() + 1)); - } -} - -void CUDACentralizedInfo::BeforeTrain(const score_t* gradients, const score_t* hessians) { - cuda_gradients_ = gradients; - cuda_hessians_ = hessians; -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_centralized_info.hpp b/src/treelearner/cuda/cuda_centralized_info.hpp deleted file mode 100644 index ad212dc9a14e..000000000000 --- a/src/treelearner/cuda/cuda_centralized_info.hpp +++ /dev/null @@ -1,75 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifndef LIGHTGBM_CUDA_CENTRALIZED_INFO_HPP_ -#define LIGHTGBM_CUDA_CENTRALIZED_INFO_HPP_ - -#ifdef USE_CUDA - -#include -#include -#include -#include "new_cuda_utils.hpp" - -namespace LightGBM { - -// maintina centralized information for tree training -// these information are shared by various cuda objects in tree training -class CUDACentralizedInfo { - public: - CUDACentralizedInfo(const data_size_t num_data, const int num_leaves, const int num_features); - - void Init(const label_t* labels, const Dataset* train_data); - - void BeforeTrain(const score_t* gradients, const score_t* hessians); - - const data_size_t* cuda_num_data() const { return cuda_num_data_; } - - const int* cuda_num_leaves() const { return cuda_num_leaves_; } - - const int* cuda_num_features() const { return cuda_num_features_; } - - const score_t* cuda_gradients() const { return cuda_gradients_; } - - const score_t* cuda_hessians() const { return cuda_hessians_; } - - const label_t* cuda_labels() const { return cuda_labels_; } - - const data_size_t* cuda_query_boundaries() { return cuda_query_boundaries_; } - - void Test() { - data_size_t test_num_data = 0; - int test_num_leaves = 0; - int test_num_features = 0; - - CopyFromCUDADeviceToHost(&test_num_data, cuda_num_data_, 1); - CopyFromCUDADeviceToHost(&test_num_leaves, cuda_num_leaves_, 1); - CopyFromCUDADeviceToHost(&test_num_features, cuda_num_features_, 1); - Log::Warning("CUDACentralizedInfo::Test test_num_data = %d", test_num_data); - Log::Warning("CUDACentralizedInfo::Test test_num_leaves = %d", test_num_leaves); - Log::Warning("CUDACentralizedInfo::Test test_num_features = %d", test_num_features); - } - - private: - // Host memory - const data_size_t num_data_; - const int num_leaves_; - const int num_features_; - - // CUDA memory, held by this object - data_size_t* cuda_num_data_; - int* cuda_num_leaves_; - int* cuda_num_features_; - const score_t* cuda_gradients_; - const score_t* cuda_hessians_; - label_t* cuda_labels_; - data_size_t* cuda_query_boundaries_; -}; - -} // 
namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_CUDA_CENTRALIZED_INFO_HPP_ diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 675f5a35c61f..b18746be414b 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -15,7 +15,6 @@ CUDADataPartition::CUDADataPartition( const int num_total_bin, const int num_leaves, const int num_threads, - const data_size_t* cuda_num_data, hist_t* cuda_hist): num_data_(train_data->num_data()), @@ -24,8 +23,6 @@ CUDADataPartition::CUDADataPartition( num_leaves_(num_leaves), num_threads_(num_threads), cuda_hist_(cuda_hist) { - - cuda_num_data_ = cuda_num_data; max_num_split_indices_blocks_ = (num_data_ + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION; cur_num_leaves_ = 1; @@ -49,24 +46,24 @@ CUDADataPartition::CUDADataPartition( void CUDADataPartition::Init() { // allocate CUDA memory - AllocateCUDAMemory(static_cast(num_data_), &cuda_data_indices_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_data_start_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_data_end_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_num_data_); + AllocateCUDAMemoryOuter(&cuda_data_indices_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_leaf_data_start_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_leaf_data_end_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_leaf_num_data_, static_cast(num_leaves_), __FILE__, __LINE__); // leave some space for alignment - AllocateCUDAMemory(static_cast(num_data_) + 1024 * 8, &cuda_data_to_left_); - AllocateCUDAMemory(static_cast(num_data_), &cuda_data_index_to_leaf_index_); - AllocateCUDAMemory(static_cast(max_num_split_indices_blocks_) + 1, &cuda_block_data_to_left_offset_); - AllocateCUDAMemory(static_cast(max_num_split_indices_blocks_) + 1, &cuda_block_data_to_right_offset_); - SetCUDAMemory(cuda_block_data_to_left_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1); - SetCUDAMemory(cuda_block_data_to_right_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1); - AllocateCUDAMemory(static_cast(num_data_), &cuda_out_data_indices_in_leaf_); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_hist_pool_); - CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1); + AllocateCUDAMemoryOuter(&cuda_data_to_left_, static_cast(num_data_) + 1024 * 8, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_data_index_to_leaf_index_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_block_data_to_left_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_block_data_to_right_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_block_data_to_left_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_block_data_to_right_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_out_data_indices_in_leaf_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_hist_pool_, static_cast(num_leaves_), __FILE__, __LINE__); + CopyFromHostToCUDADeviceOuter(cuda_hist_pool_, &cuda_hist_, 1, __FILE__, __LINE__); - AllocateCUDAMemory(12, &cuda_split_info_buffer_); + 
AllocateCUDAMemoryOuter(&cuda_split_info_buffer_, 12, __FILE__, __LINE__); - AllocateCUDAMemory(static_cast(num_leaves_), &cuda_leaf_output_); + AllocateCUDAMemoryOuter(&cuda_leaf_output_, static_cast(num_leaves_), __FILE__, __LINE__); cuda_streams_.resize(4); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); @@ -86,22 +83,23 @@ void CUDADataPartition::Init() { offset += feature_num_bins_[i]; feature_num_bin_offsets.emplace_back(offset); } - InitCUDAMemoryFromHostMemory(&cuda_feature_num_bin_offsets_, feature_num_bin_offsets.data(), feature_num_bin_offsets.size()); - InitCUDAMemoryFromHostMemory(&cuda_bin_upper_bounds_, flatten_bin_upper_bounds.data(), flatten_bin_upper_bounds.size()); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_num_bin_offsets_, feature_num_bin_offsets.data(), feature_num_bin_offsets.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_bin_upper_bounds_, flatten_bin_upper_bounds.data(), flatten_bin_upper_bounds.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_num_data_, &num_data_, 1, __FILE__, __LINE__); } void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { if (data_indices == nullptr) { // no bagging LaunchFillDataIndicesBeforeTrain(); - SetCUDAMemory(cuda_leaf_num_data_, 0, static_cast(num_leaves_)); - SetCUDAMemory(cuda_leaf_data_start_, 0, static_cast(num_leaves_)); - SetCUDAMemory(cuda_leaf_data_end_, 0, static_cast(num_leaves_)); + SetCUDAMemoryOuter(cuda_leaf_num_data_, 0, static_cast(num_leaves_), __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_leaf_data_start_, 0, static_cast(num_leaves_), __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_leaf_data_end_, 0, static_cast(num_leaves_), __FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CopyFromCUDADeviceToCUDADevice(cuda_leaf_num_data_, cuda_num_data_, 1); - CopyFromCUDADeviceToCUDADevice(cuda_leaf_data_end_, cuda_num_data_, 1); + CopyFromCUDADeviceToCUDADeviceOuter(cuda_leaf_num_data_, cuda_num_data_, 1, __FILE__, __LINE__); + CopyFromCUDADeviceToCUDADeviceOuter(cuda_leaf_data_end_, cuda_num_data_, 1, __FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1); + CopyFromHostToCUDADeviceOuter(cuda_hist_pool_, &cuda_hist_, 1, __FILE__, __LINE__); } else { Log::Fatal("bagging is not supported by GPU"); } diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 7d91a54a7491..a90947a17c9e 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -1508,7 +1508,7 @@ void CUDADataPartition::LaunchSplitInnerKernel( std::vector cpu_split_info_buffer(12); const double* cpu_sum_hessians_info = reinterpret_cast(cpu_split_info_buffer.data() + 8); global_timer.Start("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); - CopyFromCUDADeviceToHostAsync(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 12, cuda_streams_[0]); + CopyFromCUDADeviceToHostAsyncOuter(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 12, cuda_streams_[0], __FILE__, __LINE__); global_timer.Stop("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); const data_size_t left_leaf_num_data = cpu_split_info_buffer[1]; diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 999c074d07d0..db600e53df76 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ 
b/src/treelearner/cuda/cuda_data_partition.hpp @@ -12,7 +12,7 @@ #include #include #include -#include "new_cuda_utils.hpp" + #include "cuda_leaf_splits.hpp" #include @@ -32,7 +32,6 @@ class CUDADataPartition { const int num_total_bin, const int num_leaves, const int num_threads, - const data_size_t* cuda_num_data, hist_t* cuda_hist); void Init(); @@ -277,13 +276,13 @@ class CUDADataPartition { double* cuda_bin_upper_bounds_; /*! \brief the bin offsets of features, used to access cuda_bin_upper_bounds_ */ int* cuda_feature_num_bin_offsets_; + /*! \brief number of data in training set, for intialization of cuda_leaf_num_data_ and cuda_leaf_data_end_ */ + data_size_t* cuda_num_data_; // CUDA memory, held by other object // dataset information - /*! \brief number of data in training set, for intialization of cuda_leaf_num_data_ and cuda_leaf_data_end_ */ - const data_size_t* cuda_num_data_; /*! \brief beginning of histograms, for initialization of cuda_hist_pool_ */ hist_t* cuda_hist_; }; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 757adf2bcc1d..87dc92b51660 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -57,30 +57,30 @@ CUDAHistogramConstructor::CUDAHistogramConstructor( void CUDAHistogramConstructor::BeforeTrain(const score_t* gradients, const score_t* hessians) { cuda_gradients_ = gradients; cuda_hessians_ = hessians; - SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_); + SetCUDAMemoryOuter(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); } void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStates* share_state) { - AllocateCUDAMemory(num_total_bin_ * 2 * num_leaves_, &cuda_hist_); - SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_); + AllocateCUDAMemoryOuter(&cuda_hist_, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_feature_num_bins_, - feature_num_bins_.data(), feature_num_bins_.size()); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_num_bins_, + feature_num_bins_.data(), feature_num_bins_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_feature_hist_offsets_, - feature_hist_offsets_.data(), feature_hist_offsets_.size()); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_hist_offsets_, + feature_hist_offsets_.data(), feature_hist_offsets_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bins_, - feature_most_freq_bins_.data(), feature_most_freq_bins_.size()); + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_most_freq_bins_, + feature_most_freq_bins_.data(), feature_most_freq_bins_.size(), __FILE__, __LINE__); cuda_row_data_.reset(new CUDARowData(train_data, share_state)); cuda_row_data_->Init(train_data, share_state); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); - InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size()); - InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, need_fix_histogram_features_num_bin_aligend_.data(), - need_fix_histogram_features_num_bin_aligend_.size()); + InitCUDAMemoryFromHostMemoryOuter(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size(), __FILE__, __LINE__); + 
InitCUDAMemoryFromHostMemoryOuter(&cuda_need_fix_histogram_features_num_bin_aligned_, need_fix_histogram_features_num_bin_aligend_.data(), + need_fix_histogram_features_num_bin_aligend_.size(), __FILE__, __LINE__); } void CUDAHistogramConstructor::ConstructHistogramForLeaf( diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 395a614ce894..fc30731a92c7 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -14,7 +14,7 @@ #include -#include "new_cuda_utils.hpp" + #include "cuda_leaf_splits.hpp" #include diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index 13a512278d51..5cdb08afbb98 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -10,10 +10,9 @@ namespace LightGBM { -CUDALeafSplits::CUDALeafSplits(const data_size_t num_data, const int leaf_index, - const int* cuda_num_data): num_data_(num_data), leaf_index_(leaf_index) { +CUDALeafSplits::CUDALeafSplits(const data_size_t num_data, const int leaf_index): +num_data_(num_data), leaf_index_(leaf_index) { cuda_struct_ = nullptr; - cuda_num_data_ = cuda_num_data; } void CUDALeafSplits::Init() { @@ -21,8 +20,8 @@ void CUDALeafSplits::Init() { // allocate more memory for sum reduction in CUDA // only the first element records the final sum - AllocateCUDAMemory(num_blocks_init_from_gradients_, &cuda_sum_of_gradients_buffer_); - AllocateCUDAMemory(num_blocks_init_from_gradients_, &cuda_sum_of_hessians_buffer_); + AllocateCUDAMemoryOuter(&cuda_sum_of_gradients_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_sum_of_hessians_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); AllocateCUDAMemoryOuter(&cuda_struct_, 1, __FILE__, __LINE__); @@ -42,11 +41,11 @@ void CUDALeafSplits::InitValues( double* root_sum_hessians) { cuda_gradients_ = cuda_gradients; cuda_hessians_ = cuda_hessians; - SetCUDAMemory(cuda_sum_of_gradients_buffer_, 0, num_blocks_init_from_gradients_); - SetCUDAMemory(cuda_sum_of_hessians_buffer_, 0, num_blocks_init_from_gradients_); + SetCUDAMemoryOuter(cuda_sum_of_gradients_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_sum_of_hessians_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); LaunchInitValuesKernal(cuda_data_indices_in_leaf, cuda_hist_in_leaf); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CopyFromCUDADeviceToHostAsync(root_sum_hessians, cuda_sum_of_hessians_buffer_, 1, cuda_streams_[1]); + CopyFromCUDADeviceToHostAsyncOuter(root_sum_hessians, cuda_sum_of_hessians_buffer_, 1, cuda_streams_[1], __FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index c29176ad7363..1072dee37b04 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -11,17 +11,16 @@ namespace LightGBM { __global__ void CUDAInitValuesKernel1(const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t* cuda_num_data, double* cuda_sum_of_gradients, double* cuda_sum_of_hessians) { + const data_size_t num_data, double* cuda_sum_of_gradients, double* cuda_sum_of_hessians) { __shared__ score_t shared_gradients[NUM_THRADS_PER_BLOCK_LEAF_SPLITS]; __shared__ score_t shared_hessians[NUM_THRADS_PER_BLOCK_LEAF_SPLITS]; const 
unsigned int tid = threadIdx.x; const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_LEAF_SPLITS; - const unsigned int num_data_ref = static_cast(*cuda_num_data); shared_gradients[tid] = 0.0f; shared_hessians[tid] = 0.0f; __syncthreads(); for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_LEAF_SPLITS; ++j) { - if (i + j < num_data_ref) { + if (i + j < num_data) { shared_gradients[tid] += cuda_gradients[i + j]; shared_hessians[tid] += cuda_hessians[i + j]; } @@ -85,7 +84,7 @@ void CUDALeafSplits::LaunchInitValuesKernal( const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf) { CUDAInitValuesKernel1<<>>( - cuda_gradients_, cuda_hessians_, cuda_num_data_, cuda_sum_of_gradients_buffer_, + cuda_gradients_, cuda_hessians_, num_data_, cuda_sum_of_gradients_buffer_, cuda_sum_of_hessians_buffer_); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); CUDAInitValuesKernel2<<<1, 1>>>( diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index 2e1b81e86fbe..a78e1eab55c3 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -8,10 +8,10 @@ #ifdef USE_CUDA +#include +#include #include #include -#include -#include "new_cuda_utils.hpp" #define INIT_SUM_BLOCK_SIZE_LEAF_SPLITS (6144) #define NUM_THRADS_PER_BLOCK_LEAF_SPLITS (1024) @@ -33,8 +33,7 @@ struct CUDALeafSplitsStruct { class CUDALeafSplits { public: - CUDALeafSplits(const data_size_t num_data, const int leaf_index, - const int* cuda_num_data); + CUDALeafSplits(const data_size_t num_data, const int leaf_index); CUDALeafSplits(); @@ -71,7 +70,6 @@ class CUDALeafSplits { // CUDA memory, held by other object const score_t* cuda_gradients_; const score_t* cuda_hessians_; - const int* cuda_num_data_; }; } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_objective.cpp b/src/treelearner/cuda/cuda_objective.cpp deleted file mode 100644 index 4996396a108e..000000000000 --- a/src/treelearner/cuda/cuda_objective.cpp +++ /dev/null @@ -1,17 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifdef USE_CUDA - -#include "cuda_objective.hpp" - -namespace LightGBM { - -CUDAObjective::CUDAObjective(const data_size_t num_data): num_data_(num_data) {} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_objective.hpp b/src/treelearner/cuda/cuda_objective.hpp index abba984919a1..38d2bce780e0 100644 --- a/src/treelearner/cuda/cuda_objective.hpp +++ b/src/treelearner/cuda/cuda_objective.hpp @@ -3,13 +3,13 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. */ - +/* #ifndef LIGHTGBM_NEW_CUDA_OBJECTIVE_HPP_ #define LIGHTGBM_NEW_CUDA_OBJECTIVE_HPP_ #ifdef USE_CUDA -#include "new_cuda_utils.hpp" + #include namespace LightGBM { @@ -35,4 +35,5 @@ class CUDAObjective { } // namespace LightGBM #endif // USE_CUDA -#endif // LIGHTGBM_NEW_CUDA_OBJECTIVE_HPP_ \ No newline at end of file +#endif // LIGHTGBM_NEW_CUDA_OBJECTIVE_HPP_ +*/ \ No newline at end of file diff --git a/src/treelearner/cuda/cuda_ranking_objective.cpp b/src/treelearner/cuda/cuda_ranking_objective.cpp deleted file mode 100644 index eb158118df15..000000000000 --- a/src/treelearner/cuda/cuda_ranking_objective.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. 
See LICENSE file in the project root for - * license information. - */ - -#ifdef USE_CUDA - -#include "cuda_ranking_objective.hpp" - -namespace LightGBM { - -CUDARankingObjective::CUDARankingObjective( - const data_size_t num_data, - const label_t* cuda_labels, - const data_size_t* cuda_query_boundaries, - const data_size_t* cpu_query_boundaries, - const int num_queries, - const bool norm, - const double sigmoid, - const int truncation_level, - const label_t* labels, - const int num_threads): -CUDAObjective(num_data), -cuda_labels_(cuda_labels), -cuda_query_boundaries_(cuda_query_boundaries), -num_queries_(num_queries), -norm_(norm), -sigmoid_(sigmoid), -truncation_level_(truncation_level), -num_threads_(num_threads) { - std::vector thread_max_label(num_threads, 0.0f); - Threading::For(0, num_data_, 512, - [labels, &thread_max_label, this] (int thread_index, data_size_t start, data_size_t end) { - if (start < num_data_) { - thread_max_label[thread_index] = labels[start]; - } - for (data_size_t data_index = start + 1; data_index < end; ++data_index) { - const label_t label = labels[data_index]; - if (label > thread_max_label[thread_index]) { - thread_max_label[thread_index] = label; - } - } - }); - max_label_ = thread_max_label[0]; - for (int thread_index = 1; thread_index < num_threads_; ++thread_index) { - max_label_ = std::max(max_label_, thread_max_label[thread_index]); - } - - std::vector thread_max_num_items_in_query(num_threads_); - Threading::For(0, num_queries_, 1, - [cpu_query_boundaries, &thread_max_num_items_in_query] (int thread_index, data_size_t start, data_size_t end) { - for (data_size_t query_index = start; query_index < end; ++query_index) { - const data_size_t query_item_count = cpu_query_boundaries[query_index + 1] - cpu_query_boundaries[query_index]; - if (query_item_count > thread_max_num_items_in_query[thread_index]) { - thread_max_num_items_in_query[thread_index] = query_item_count; - } - } - }); - data_size_t max_items_in_query = 0; - for (int thread_index = 0; thread_index < num_threads_; ++thread_index) { - if (thread_max_num_items_in_query[thread_index] > max_items_in_query) { - max_items_in_query = thread_max_num_items_in_query[thread_index]; - } - } - max_items_in_query_aligned_ = 1; - --max_items_in_query; - while (max_items_in_query > 0) { - max_items_in_query >>= 1; - max_items_in_query_aligned_ <<= 1; - } - if (max_items_in_query_aligned_ > MAX_NUM_ITEM_IN_QUERY) { - Log::Warning("Too many items (%d) in a query.", max_items_in_query_aligned_); - } -} - -void CUDARankingObjective::Init() { - AllocateCUDAMemory(1, &cuda_init_score_); - SetCUDAMemory(cuda_init_score_, 0, 1); - AllocateCUDAMemory(num_data_, &cuda_lambdas_); - AllocateCUDAMemory(num_queries_, &cuda_inverse_max_dcgs_); - LaunchCalcInverseMaxDCGKernel(); -} - -void CUDARankingObjective::GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - LaunchGetGradientsKernel(cuda_scores, cuda_out_gradients, cuda_out_hessians); -} - -void CUDARankingObjective::CalcInitScore() {} - -void CUDARankingObjective::TestGlobalArgSort() const { - LaunchGlobalArgSort(); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_ranking_objective.cu b/src/treelearner/cuda/cuda_ranking_objective.cu deleted file mode 100644 index ea24b458f907..000000000000 --- a/src/treelearner/cuda/cuda_ranking_objective.cu +++ /dev/null @@ -1,582 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. 
- * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifdef USE_CUDA - -#include "cuda_ranking_objective.hpp" - -namespace LightGBM { - -__device__ void ArgSort(const score_t* scores, uint16_t* indices, const uint16_t num_items) { - uint16_t num_items_aligned = 1; - uint16_t num_items_ref = num_items - 1; - uint16_t depth = 1; - while (num_items_ref > 0) { - num_items_aligned <<= 1; - num_items_ref >>= 1; - ++depth; - } - for (uint16_t outer_depth = depth - 1; outer_depth >= 1; --outer_depth) { - const uint16_t outer_segment_length = 1 << (depth - outer_depth); - const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; - const bool ascending = (outer_segment_index % 2 > 0); - for (uint16_t inner_depth = outer_depth; inner_depth < depth; ++inner_depth) { - const uint16_t segment_length = 1 << (depth - inner_depth); - const uint16_t half_segment_length = segment_length >> 1; - const uint16_t half_segment_index = threadIdx.x / half_segment_length; - if (threadIdx.x < num_items_aligned) { - if (half_segment_index % 2 == 0) { - const uint16_t index_to_compare = threadIdx.x + half_segment_length; - if ((scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) == ascending) { - const uint16_t index = indices[threadIdx.x]; - indices[threadIdx.x] = indices[index_to_compare]; - indices[index_to_compare] = index; - } - } - } - __syncthreads(); - } - } -} - -__device__ void ArgSort_Partial(const score_t* scores, uint16_t* indices, const uint16_t num_items, const bool outer_decending) { - uint16_t num_items_aligned = 1; - uint16_t num_items_ref = num_items - 1; - uint16_t depth = 1; - while (num_items_ref > 0) { - num_items_aligned <<= 1; - num_items_ref >>= 1; - ++depth; - } - for (uint16_t outer_depth = depth - 1; outer_depth >= 1; --outer_depth) { - const uint16_t outer_segment_length = 1 << (depth - outer_depth); - const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; - const bool ascending = outer_decending ? 
(outer_segment_index % 2 > 0) : (outer_segment_index % 2 == 0); - for (uint16_t inner_depth = outer_depth; inner_depth < depth; ++inner_depth) { - const uint16_t segment_length = 1 << (depth - inner_depth); - const uint16_t half_segment_length = segment_length >> 1; - const uint16_t half_segment_index = threadIdx.x / half_segment_length; - if (threadIdx.x < num_items_aligned) { - if (half_segment_index % 2 == 0) { - const uint16_t index_to_compare = threadIdx.x + half_segment_length; - if ((scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) == ascending) { - const uint16_t index = indices[threadIdx.x]; - indices[threadIdx.x] = indices[index_to_compare]; - indices[index_to_compare] = index; - } - } - } - __syncthreads(); - } - } -} - -__device__ void ArgSort_2048(const score_t* scores, uint16_t* indices, const uint16_t num_items) { - const uint16_t depth = 11; - const uint16_t half_num_items_aligned = 1024; - ArgSort_Partial(scores, indices, half_num_items_aligned, true); - ArgSort_Partial(scores + half_num_items_aligned, indices + half_num_items_aligned, half_num_items_aligned, false); - const unsigned int index_to_compare = threadIdx.x + half_num_items_aligned; - if (scores[indices[index_to_compare]] > scores[indices[threadIdx.x]]) { - const uint16_t temp_index = indices[index_to_compare]; - indices[index_to_compare] = indices[threadIdx.x]; - indices[threadIdx.x] = temp_index; - } - __syncthreads(); - for (uint16_t inner_depth = 1; inner_depth < depth; ++inner_depth) { - const uint16_t segment_length = 1 << (depth - inner_depth); - const uint16_t half_segment_length = segment_length >> 1; - const uint16_t half_segment_index = threadIdx.x / half_segment_length; - if (threadIdx.x < half_num_items_aligned) { - if (half_segment_index % 2 == 0) { - const uint16_t index_to_compare = threadIdx.x + half_segment_length; - if (scores[indices[threadIdx.x]] < scores[indices[index_to_compare]]) { - const uint16_t index = indices[threadIdx.x]; - indices[threadIdx.x] = indices[index_to_compare]; - indices[index_to_compare] = index; - } - } - } - __syncthreads(); - } - const score_t* scores_ptr = scores + half_num_items_aligned; - uint16_t* indices_ptr = indices + half_num_items_aligned; - for (uint16_t inner_depth = 1; inner_depth < depth; ++inner_depth) { - const uint16_t segment_length = 1 << (depth - inner_depth); - const uint16_t half_segment_length = segment_length >> 1; - const uint16_t half_segment_index = threadIdx.x / half_segment_length; - if (threadIdx.x < half_num_items_aligned) { - if (half_segment_index % 2 == 0) { - const uint16_t index_to_compare = threadIdx.x + half_segment_length; - if (scores_ptr[indices_ptr[threadIdx.x]] < scores_ptr[indices_ptr[index_to_compare]]) { - const uint16_t index = indices_ptr[threadIdx.x]; - indices_ptr[threadIdx.x] = indices_ptr[index_to_compare]; - indices_ptr[index_to_compare] = index; - } - } - } - __syncthreads(); - } -} - -__global__ void ArgSortGlobal(const double* scores, uint16_t* indices, const uint16_t num_items) { - uint16_t num_items_aligned = 1; - uint16_t num_items_ref = num_items - 1; - uint16_t depth = 1; - while (num_items_ref > 0) { - num_items_aligned <<= 1; - num_items_ref >>= 1; - ++depth; - } - for (uint16_t outer_depth = depth - 1; outer_depth >= 1; --outer_depth) { - const uint16_t outer_segment_length = 1 << (depth - outer_depth); - const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; - const bool ascending = (outer_segment_index % 2 > 0); - for (uint16_t inner_depth = outer_depth; inner_depth < 
depth; ++inner_depth) { - const uint16_t segment_length = 1 << (depth - inner_depth); - const uint16_t half_segment_length = segment_length >> 1; - const uint16_t half_segment_index = threadIdx.x / half_segment_length; - if (threadIdx.x < num_items_aligned) { - if (half_segment_index % 2 == 0) { - const uint16_t index_to_compare = threadIdx.x + half_segment_length; - if (ascending) { - if (scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) { - const uint16_t index = indices[threadIdx.x]; - indices[threadIdx.x] = indices[index_to_compare]; - indices[index_to_compare] = index; - } - } else { - if (scores[indices[threadIdx.x]] < scores[indices[index_to_compare]]) { - const uint16_t index = indices[threadIdx.x]; - indices[threadIdx.x] = indices[index_to_compare]; - indices[index_to_compare] = index; - } - } - } - } - __syncthreads(); - } - } -} - -void CUDARankingObjective::LaunchGlobalArgSort() const { - std::vector scores{1.0f, -2.0f, 3.0f, 0.1f, -8.0f, 1.2f, -10000000.0f, -10000000.0f}; - std::vector indices{0, 1, 2, 3, 4, 5, 6, 7}; - double* cuda_scores = nullptr; - uint16_t* cuda_indices = nullptr; - InitCUDAMemoryFromHostMemory(&cuda_scores, scores.data(), scores.size()); - InitCUDAMemoryFromHostMemory(&cuda_indices, indices.data(), indices.size()); - ArgSortGlobal<<<1, 8>>>(cuda_scores, cuda_indices, indices.size()); - std::vector sorted_indices(indices.size()); - CopyFromCUDADeviceToHost(sorted_indices.data(), cuda_indices, sorted_indices.size()); - for (size_t i = 0; i < sorted_indices.size(); ++i) { - Log::Warning("sorted_indices[%d] = %d", i, sorted_indices[i]); - } -} - -__global__ void GetGradientsKernel_Ranking(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, - const data_size_t num_queries, const data_size_t* cuda_query_boundaries, const double* cuda_inverse_max_dcgs, - const bool norm, const double sigmoid, const int truncation_level, - score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - __shared__ score_t shared_scores[MAX_NUM_ITEM_IN_QUERY]; - __shared__ uint16_t shared_indices[MAX_NUM_ITEM_IN_QUERY]; - __shared__ score_t shared_lambdas[MAX_NUM_ITEM_IN_QUERY]; - __shared__ score_t shared_hessians[MAX_NUM_ITEM_IN_QUERY]; - const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; - const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); - const double min_score = -100000000000.0f; - for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { - const double inverse_max_dcg = cuda_inverse_max_dcgs[query_index]; - const data_size_t query_start = cuda_query_boundaries[query_index]; - const data_size_t query_end = cuda_query_boundaries[query_index + 1]; - const data_size_t query_item_count = query_end - query_start; - const double* cuda_scores_pointer = cuda_scores + query_start; - score_t* cuda_out_gradients_pointer = cuda_out_gradients + query_start; - score_t* cuda_out_hessians_pointer = cuda_out_hessians + query_start; - const label_t* cuda_label_pointer = cuda_labels + query_start; - if (threadIdx.x < query_item_count) { - shared_scores[threadIdx.x] = cuda_scores_pointer[threadIdx.x]; - shared_indices[threadIdx.x] = static_cast(threadIdx.x); - shared_lambdas[threadIdx.x] = 0.0f; - shared_hessians[threadIdx.x] = 0.0f; - } else { - shared_scores[threadIdx.x] = min_score; - shared_indices[threadIdx.x] = static_cast(threadIdx.x); - } - __syncthreads(); - ArgSort(shared_scores, shared_indices, 
static_cast(query_item_count)); - __syncthreads(); - // get best and worst score - const double best_score = shared_scores[shared_indices[0]]; - data_size_t worst_idx = query_item_count - 1; - if (worst_idx > 0 && shared_scores[shared_indices[worst_idx]] == min_score) { - worst_idx -= 1; - } - const double worst_score = shared_scores[shared_indices[worst_idx]]; - __shared__ double sum_lambdas; - if (threadIdx.x == 0) { - sum_lambdas = 0.0f; - } - __syncthreads(); - // start accumulate lambdas by pairs that contain at least one document above truncation level - const data_size_t num_items_i = min(query_item_count - 1, truncation_level); - const data_size_t num_j_per_i = query_item_count - 1; - const data_size_t num_pairs = num_items_i * num_j_per_i; - const data_size_t num_pairs_per_thread = (num_pairs + blockDim.x - 1) / blockDim.x; - const data_size_t thread_start = static_cast(threadIdx.x) * num_pairs_per_thread; - const data_size_t thread_end = min(thread_start + num_pairs_per_thread, num_pairs); - for (data_size_t pair_index = thread_start; pair_index < thread_end; ++pair_index) { - const data_size_t i = pair_index / num_j_per_i; - const data_size_t j = pair_index % num_j_per_i + 1; - if (j > i) { - // skip pairs with the same labels - if (cuda_label_pointer[shared_indices[i]] != cuda_label_pointer[shared_indices[j]] && shared_scores[shared_indices[j]] != min_score) { - data_size_t high_rank, low_rank; - if (cuda_label_pointer[shared_indices[i]] > cuda_label_pointer[shared_indices[j]]) { - high_rank = i; - low_rank = j; - } else { - high_rank = j; - low_rank = i; - } - const data_size_t high = shared_indices[high_rank]; - const int high_label = static_cast(cuda_label_pointer[high]); - const double high_score = shared_scores[high]; - const double high_label_gain = static_cast((1 << high_label) - 1); - const double high_discount = log2(2.0f + high_rank); - const data_size_t low = shared_indices[low_rank]; - const int low_label = static_cast(cuda_label_pointer[low]); - const double low_score = shared_scores[low]; - const double low_label_gain = static_cast((1 << low_label) - 1); - const double low_discount = log2(2.0f + low_rank); - - const double delta_score = high_score - low_score; - - // get dcg gap - const double dcg_gap = high_label_gain - low_label_gain; - // get discount of this pair - const double paired_discount = fabs(high_discount - low_discount); - // get delta NDCG - double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg; - // regular the delta_pair_NDCG by score distance - if (norm && best_score != worst_score) { - delta_pair_NDCG /= (0.01f + fabs(delta_score)); - } - // calculate lambda for this pair - double p_lambda = 1.0f / (1.0f + exp(sigmoid * delta_score)); - double p_hessian = p_lambda * (1.0f - p_lambda); - // update - p_lambda *= -sigmoid * delta_pair_NDCG; - p_hessian *= sigmoid * sigmoid * delta_pair_NDCG; - atomicAdd_block(shared_lambdas + low, -static_cast(p_lambda)); - atomicAdd_block(shared_hessians + low, static_cast(p_hessian)); - atomicAdd_block(shared_lambdas + high, static_cast(p_lambda)); - atomicAdd_block(shared_hessians + high, static_cast(p_hessian)); - // lambda is negative, so use minus to accumulate - atomicAdd_block(&sum_lambdas, -2 * p_lambda); - } - } - } - __syncthreads(); - if (norm && sum_lambdas > 0) { - double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas; - if (threadIdx.x < static_cast(query_item_count)) { - cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x] * norm_factor); - 
cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x] * norm_factor); - } - } else { - if (threadIdx.x < static_cast(query_item_count)) { - cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x]); - cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x]); - } - } - __syncthreads(); - } -} - -/*__device__ void ReduceSumRanking(double* array, const size_t size) { - //const unsigned int threadIdx_x = threadIdx.x; - for (int s = 1; s < size; s <<= 1) { - if (threadIdx.x; % (2 * s) == 0 && (threadIdx.x; + s) < size) { - array[threadIdx.x;] += array[threadIdx.x; + s]; - } - __syncthreads(); - } -}*/ - -__global__ void GetGradientsKernel_Ranking_2048(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, - const data_size_t num_queries, const data_size_t* cuda_query_boundaries, const double* cuda_inverse_max_dcgs, - const bool norm, const double sigmoid, const int truncation_level, - score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - __shared__ score_t shared_scores[MAX_NUM_ITEM_IN_QUERY]; - __shared__ uint16_t shared_indices[MAX_NUM_ITEM_IN_QUERY]; - __shared__ score_t shared_lambdas[MAX_NUM_ITEM_IN_QUERY]; - __shared__ score_t shared_hessians[MAX_NUM_ITEM_IN_QUERY]; - const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; - const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); - const double min_score = -100000000000.0f; - for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { - const double inverse_max_dcg = cuda_inverse_max_dcgs[query_index]; - const data_size_t query_start = cuda_query_boundaries[query_index]; - const data_size_t query_end = cuda_query_boundaries[query_index + 1]; - const data_size_t query_item_count = query_end - query_start; - const double* cuda_scores_pointer = cuda_scores + query_start; - score_t* cuda_out_gradients_pointer = cuda_out_gradients + query_start; - score_t* cuda_out_hessians_pointer = cuda_out_hessians + query_start; - const label_t* cuda_label_pointer = cuda_labels + query_start; - if (threadIdx.x < query_item_count) { - shared_scores[threadIdx.x] = cuda_scores_pointer[threadIdx.x]; - shared_indices[threadIdx.x] = static_cast(threadIdx.x); - shared_lambdas[threadIdx.x] = 0.0f; - shared_hessians[threadIdx.x] = 0.0f; - } else { - shared_scores[threadIdx.x] = min_score; - shared_indices[threadIdx.x] = static_cast(threadIdx.x); - } - if (query_item_count > 1024) { - const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; - if (threadIdx_x_plus_1024 < query_item_count) { - shared_scores[threadIdx_x_plus_1024] = cuda_scores_pointer[threadIdx_x_plus_1024]; - shared_indices[threadIdx_x_plus_1024] = static_cast(threadIdx_x_plus_1024); - shared_lambdas[threadIdx_x_plus_1024] = 0.0f; - shared_hessians[threadIdx_x_plus_1024] = 0.0f; - } else { - shared_scores[threadIdx_x_plus_1024] = min_score; - shared_indices[threadIdx_x_plus_1024] = static_cast(threadIdx_x_plus_1024); - } - } - __syncthreads(); - if (query_item_count > 1024) { - ArgSort_2048(shared_scores, shared_indices, static_cast(query_item_count)); - } else { - ArgSort(shared_scores, shared_indices, static_cast(query_item_count)); - } - __syncthreads(); - // get best and worst score - const double best_score = shared_scores[shared_indices[0]]; - data_size_t worst_idx = query_item_count - 1; - if (worst_idx > 0 && shared_scores[shared_indices[worst_idx]] == min_score) { - 
worst_idx -= 1; - } - const double worst_score = shared_scores[shared_indices[worst_idx]]; - __shared__ double sum_lambdas; - if (threadIdx.x == 0) { - sum_lambdas = 0.0f; - } - __syncthreads(); - // start accumulate lambdas by pairs that contain at least one document above truncation level - const data_size_t num_items_i = min(query_item_count - 1, truncation_level); - const data_size_t num_j_per_i = query_item_count - 1; - const data_size_t num_pairs = num_items_i * num_j_per_i; - const data_size_t num_pairs_per_thread = (num_pairs + blockDim.x - 1) / blockDim.x; - const data_size_t thread_start = static_cast(threadIdx.x) * num_pairs_per_thread; - const data_size_t thread_end = min(thread_start + num_pairs_per_thread, num_pairs); - double thread_sum_lambdas = 0.0f; - for (data_size_t pair_index = thread_start; pair_index < thread_end; ++pair_index) { - const data_size_t i = pair_index / num_j_per_i; - const data_size_t j = pair_index % num_j_per_i + 1; - if (j > i) { - // skip pairs with the same labels - if (cuda_label_pointer[shared_indices[i]] != cuda_label_pointer[shared_indices[j]] && shared_scores[shared_indices[j]] != min_score) { - data_size_t high_rank, low_rank; - if (cuda_label_pointer[shared_indices[i]] > cuda_label_pointer[shared_indices[j]]) { - high_rank = i; - low_rank = j; - } else { - high_rank = j; - low_rank = i; - } - const data_size_t high = shared_indices[high_rank]; - const int high_label = static_cast(cuda_label_pointer[high]); - const double high_score = shared_scores[high]; - const double high_label_gain = static_cast((1 << high_label) - 1); - const double high_discount = log2(2.0f + high_rank); - const data_size_t low = shared_indices[low_rank]; - const int low_label = static_cast(cuda_label_pointer[low]); - const double low_score = shared_scores[low]; - const double low_label_gain = static_cast((1 << low_label) - 1); - const double low_discount = log2(2.0f + low_rank); - - const double delta_score = high_score - low_score; - - // get dcg gap - const double dcg_gap = high_label_gain - low_label_gain; - // get discount of this pair - const double paired_discount = fabs(high_discount - low_discount); - // get delta NDCG - double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg; - // regular the delta_pair_NDCG by score distance - if (norm && best_score != worst_score) { - delta_pair_NDCG /= (0.01f + fabs(delta_score)); - } - // calculate lambda for this pair - double p_lambda = 1.0f / (1.0f + exp(sigmoid * delta_score)); - double p_hessian = p_lambda * (1.0f - p_lambda); - // update - p_lambda *= -sigmoid * delta_pair_NDCG; - p_hessian *= sigmoid * sigmoid * delta_pair_NDCG; - atomicAdd_block(shared_lambdas + low, -static_cast(p_lambda)); - atomicAdd_block(shared_hessians + low, static_cast(p_hessian)); - atomicAdd_block(shared_lambdas + high, static_cast(p_lambda)); - atomicAdd_block(shared_hessians + high, static_cast(p_hessian)); - // lambda is negative, so use minus to accumulate - thread_sum_lambdas -= 2 * p_lambda; - } - } - } - atomicAdd_block(&sum_lambdas, thread_sum_lambdas); - __syncthreads(); - if (norm && sum_lambdas > 0) { - double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas; - if (threadIdx.x < static_cast(query_item_count)) { - cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x] * norm_factor); - cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x] * norm_factor); - } - if (query_item_count > 1024) { - const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; - 
if (threadIdx_x_plus_1024 < static_cast(query_item_count)) { - cuda_out_gradients_pointer[threadIdx_x_plus_1024] = static_cast(shared_lambdas[threadIdx_x_plus_1024] * norm_factor); - cuda_out_hessians_pointer[threadIdx_x_plus_1024] = static_cast(shared_hessians[threadIdx_x_plus_1024] * norm_factor); - } - } - } else { - if (threadIdx.x < static_cast(query_item_count)) { - cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x]); - cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x]); - } - if (query_item_count > 1024) { - const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; - if (threadIdx_x_plus_1024 < static_cast(query_item_count)) { - cuda_out_gradients_pointer[threadIdx_x_plus_1024] = static_cast(shared_lambdas[threadIdx_x_plus_1024]); - cuda_out_hessians_pointer[threadIdx_x_plus_1024] = static_cast(shared_hessians[threadIdx_x_plus_1024]); - } - } - } - __syncthreads(); - } -} - -void CUDARankingObjective::LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; - if (max_items_in_query_aligned_ <= 1024) { - GetGradientsKernel_Ranking<<>>(cuda_scores, cuda_labels_, num_data_, - num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, - norm_, sigmoid_, truncation_level_, - cuda_out_gradients, cuda_out_hessians); - } else if (max_items_in_query_aligned_ <= 2048) { - GetGradientsKernel_Ranking_2048<<>>(cuda_scores, cuda_labels_, num_data_, - num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, - norm_, sigmoid_, truncation_level_, - cuda_out_gradients, cuda_out_hessians); - } else { - Log::Fatal("Too large max_items_in_query_aligned_ = %d", max_items_in_query_aligned_); - } -} - -__device__ void PrefixSumBankConflict(uint16_t* elements, unsigned int n) { - unsigned int offset = 1; - unsigned int threadIdx_x = threadIdx.x; - const uint16_t last_element = elements[n - 1]; - __syncthreads(); - for (int d = (n >> 1); d > 0; d >>= 1) { - if (threadIdx_x < d) { - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - elements[dst_pos] += elements[src_pos]; - } - offset <<= 1; - __syncthreads(); - } - if (threadIdx_x == 0) { - elements[n - 1] = 0; - } - __syncthreads(); - for (int d = 1; d < n; d <<= 1) { - offset >>= 1; - if (threadIdx_x < d) { - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const uint32_t src_val = elements[src_pos]; - elements[src_pos] = elements[dst_pos]; - elements[dst_pos] += src_val; - } - __syncthreads(); - } - if (threadIdx.x == 0) { - elements[n] = elements[n - 1] + last_element; - } - __syncthreads(); -} - -__global__ void CalcInverseMaxDCGKernel( - const data_size_t* cuda_query_boundaries, - const label_t* cuda_labels, - const int truncation_level, - const data_size_t num_queries, - double* cuda_inverse_max_dcgs) { - __shared__ uint32_t label_sum[MAX_RANK_LABEL]; - __shared__ uint16_t label_pos[MAX_RANK_LABEL + 1]; - const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; - const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); - for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { - const data_size_t query_start = cuda_query_boundaries[query_index]; - const data_size_t 
query_end = cuda_query_boundaries[query_index + 1]; - const data_size_t query_count = query_end - query_start; - if (threadIdx.x < MAX_RANK_LABEL) { - label_sum[threadIdx.x] = 0; - } - __syncthreads(); - const label_t* label_pointer = cuda_labels + query_start; - if (threadIdx.x < static_cast(query_count)) { - atomicAdd_system(label_sum + (MAX_RANK_LABEL - 1 - static_cast(label_pointer[threadIdx.x])), 1); - } - __syncthreads(); - if (threadIdx.x < MAX_RANK_LABEL) { - label_pos[threadIdx.x] = label_sum[threadIdx.x]; - } - __syncthreads(); - PrefixSumBankConflict(label_pos, MAX_RANK_LABEL); - __syncthreads(); - __shared__ double gain; - if (threadIdx.x == 0) { - gain = 0.0f; - } - __syncthreads(); - if (threadIdx.x < MAX_RANK_LABEL && label_sum[threadIdx.x] > 0) { - const uint16_t start_pos = label_pos[threadIdx.x]; - const uint16_t end_pos = min(label_pos[threadIdx.x + 1], truncation_level); - double label_gain = 0.0f; - for (uint16_t k = start_pos; k < end_pos; ++k) { - label_gain += ((1 << (MAX_RANK_LABEL - 1 - threadIdx.x)) - 1) / log(2.0f + k); - } - atomicAdd_system(&gain, label_gain); - } - __syncthreads(); - if (threadIdx.x == 0) { - if (gain > 0.0f) { - cuda_inverse_max_dcgs[query_index] = 1.0f / gain; - } else { - cuda_inverse_max_dcgs[query_index] = 0.0f; - } - } - __syncthreads(); - } -} - -void CUDARankingObjective::LaunchCalcInverseMaxDCGKernel() { - const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; - CalcInverseMaxDCGKernel<<>>( - cuda_query_boundaries_, - cuda_labels_, - truncation_level_, - num_queries_, - cuda_inverse_max_dcgs_); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_ranking_objective.hpp b/src/treelearner/cuda/cuda_ranking_objective.hpp deleted file mode 100644 index 88df461485a9..000000000000 --- a/src/treelearner/cuda/cuda_ranking_objective.hpp +++ /dev/null @@ -1,77 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
- */ - -#ifndef LIGHTGBM_NEW_CUDA_RANKING_OBJECTIVE_HPP_ -#define LIGHTGBM_NEW_CUDA_RANKING_OBJECTIVE_HPP_ - -#ifdef USE_CUDA - -#define MAX_NUM_ITEM_IN_QUERY (2048) -#define NUM_QUERY_PER_BLOCK (10) -#define MAX_RANK_LABEL (32) - -#include "cuda_objective.hpp" -#include - -namespace LightGBM { - -class CUDARankingObjective : public CUDAObjective { - public: - CUDARankingObjective( - const data_size_t num_data, - const label_t* cuda_label, - const data_size_t* cuda_query_boundaries, - const data_size_t* cpu_query_boundaries, - const int num_queries, - const bool norm, - const double sigmoid, - const int truncation_level, - const label_t* labels, - const int num_threads); - - void Init() override; - - void CalcInitScore() override; - - const double* cuda_init_score() const override { - return cuda_init_score_; - } - - void GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) override; - - void TestGlobalArgSort() const override; - - private: - - void LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians); - - void LaunchCalcInverseMaxDCGKernel(); - - void LaunchGlobalArgSort() const; - - // CUDA memory, held by this object - double* cuda_init_score_; - double* cuda_lambdas_; - double* cuda_inverse_max_dcgs_; - - // CUDA memory, held by other objects - const label_t* cuda_labels_; - const data_size_t* cuda_query_boundaries_; - - // Host memory - const int num_queries_; - const bool norm_; - const double sigmoid_; - const int truncation_level_; - label_t max_label_; - const int num_threads_; - int max_items_in_query_aligned_; -}; - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_NEW_CUDA_RANKING_OBJECTIVE_HPP_ diff --git a/src/treelearner/cuda/cuda_regression_objective.cpp b/src/treelearner/cuda/cuda_regression_objective.cpp deleted file mode 100644 index 8964e25a1328..000000000000 --- a/src/treelearner/cuda/cuda_regression_objective.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifdef USE_CUDA - -#include "cuda_regression_objective.hpp" - -namespace LightGBM { - -CUDARegressionObjective::CUDARegressionObjective(const data_size_t num_data, const label_t* cuda_labels): -CUDAObjective(num_data), cuda_labels_(cuda_labels) {} - -void CUDARegressionObjective::Init() { - AllocateCUDAMemory(1, &cuda_init_score_); - SetCUDAMemory(cuda_init_score_, 0, 1); -} - -void CUDARegressionObjective::GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - LaunchGetGradientsKernel(cuda_scores, cuda_out_gradients, cuda_out_hessians); -} - -void CUDARegressionObjective::CalcInitScore() { - LaunchCalcInitScoreKernel(); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_regression_objective.cu b/src/treelearner/cuda/cuda_regression_objective.cu deleted file mode 100644 index ef771fc48d7a..000000000000 --- a/src/treelearner/cuda/cuda_regression_objective.cu +++ /dev/null @@ -1,69 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
- */ - -#ifdef USE_CUDA - -#include "cuda_regression_objective.hpp" - -namespace LightGBM { - -__global__ void CalcInitScoreKernel_1_Regression(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_init_score) { - __shared__ label_t shared_label[CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION]; - const unsigned int tid = threadIdx.x; - const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION; - shared_label[tid] = 0.0f; - __syncthreads(); - for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION; ++j) { - if (i + j < num_data) { - shared_label[tid] += cuda_labels[i + j]; - } - } - __syncthreads(); - for (unsigned int s = 1; s < blockDim.x; s *= 2) { - if (tid % (2 * s) == 0 && (tid + s) < CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION) { - shared_label[tid] += shared_label[tid + s]; - } - __syncthreads(); - } - if (tid == 0) { - atomicAdd_system(out_cuda_init_score, shared_label[0]); - } -} - -__global__ void CalcInitScoreKernel_2_Regression(double* out_cuda_init_score, const data_size_t num_data) { - const double suml = *out_cuda_init_score; - const double sumw = static_cast(num_data); - const double init_score = suml / sumw; - *out_cuda_init_score = init_score; -} - -void CUDARegressionObjective::LaunchCalcInitScoreKernel() { - const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION; - const int num_blocks = (num_data_ + num_data_per_block - 1) / num_data_per_block; - CalcInitScoreKernel_1_Regression<<>>(cuda_labels_, num_data_, cuda_init_score_); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CalcInitScoreKernel_2_Regression<<<1, 1>>>(cuda_init_score_, num_data_); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -__global__ void GetGradientsKernel_Regression(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, - score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); - if (data_index < num_data) { - cuda_out_gradients[data_index] = static_cast(cuda_scores[data_index] - cuda_labels[data_index]); - cuda_out_hessians[data_index] = 1.0f; - } -} - -void CUDARegressionObjective::LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - GetGradientsKernel_Regression<<>>(cuda_scores, cuda_labels_, num_data_, - cuda_out_gradients, cuda_out_hessians); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_regression_objective.hpp b/src/treelearner/cuda/cuda_regression_objective.hpp deleted file mode 100644 index 6fcb29ed50a1..000000000000 --- a/src/treelearner/cuda/cuda_regression_objective.hpp +++ /dev/null @@ -1,46 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
- */ - -#ifndef LIGHTGBM_NEW_CUDA_REGRESSION_OBJECTIVE_HPP_ -#define LIGHTGBM_NEW_CUDA_REGRESSION_OBJECTIVE_HPP_ - -#ifdef USE_CUDA - -#define GET_GRADIENTS_BLOCK_SIZE_REGRESSION (1024) -#define CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION (1024) -#define NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION (6) - -#include "cuda_objective.hpp" - -namespace LightGBM { - -class CUDARegressionObjective : public CUDAObjective { - public: - CUDARegressionObjective(const data_size_t num_data, const label_t* cuda_label); - - void Init() override; - - void CalcInitScore() override; - - const double* cuda_init_score() const override { - return cuda_init_score_; - } - - void GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) override; - - private: - void LaunchCalcInitScoreKernel(); - - void LaunchGetGradientsKernel(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians); - - const label_t* cuda_labels_; - double* cuda_init_score_; -}; - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_NEW_CUDA_REGRESSION_OBJECTIVE_HPP_ diff --git a/src/treelearner/cuda/cuda_score_updater.cpp b/src/treelearner/cuda/cuda_score_updater.cpp deleted file mode 100644 index 18aa3b9beaf2..000000000000 --- a/src/treelearner/cuda/cuda_score_updater.cpp +++ /dev/null @@ -1,30 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifdef USE_CUDA - -#include "cuda_score_updater.hpp" - -namespace LightGBM { - -CUDAScoreUpdater::CUDAScoreUpdater(const data_size_t num_data): -num_data_(num_data) {} - -void CUDAScoreUpdater::Init() { - AllocateCUDAMemory(static_cast(num_data_), &cuda_scores_); -} - -void CUDAScoreUpdater::SetInitScore(const double* cuda_init_score) { - LaunchSetInitScoreKernel(cuda_init_score); -} - -void CUDAScoreUpdater::AddScore(const double* cuda_score_to_add) { - LaunchAddScoreKernel(cuda_score_to_add); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_score_updater.cu b/src/treelearner/cuda/cuda_score_updater.cu deleted file mode 100644 index 9013e8e4d5e6..000000000000 --- a/src/treelearner/cuda/cuda_score_updater.cu +++ /dev/null @@ -1,40 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
- */ - -#ifdef USE_CUDA - -#include "cuda_score_updater.hpp" - -namespace LightGBM { - -__global__ void SetInitScoreKernel(double* cuda_scores, const double* cuda_init_score, const data_size_t num_data) { - const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); - const double init_score = *cuda_init_score; - if (data_index < num_data) { - cuda_scores[data_index] = init_score; - } -} - -void CUDAScoreUpdater::LaunchSetInitScoreKernel(const double* cuda_init_score) { - const int num_blocks = (num_data_ + SET_INIT_SCORE_BLOCK_SIZE - 1) / SET_INIT_SCORE_BLOCK_SIZE; - SetInitScoreKernel<<>>(cuda_scores_, cuda_init_score, num_data_); -} - -__global__ void AddScoreKernel(double* cuda_scores, const double* cuda_scores_to_add, const data_size_t num_data) { - const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); - if (data_index < num_data) { - cuda_scores[data_index] += cuda_scores_to_add[data_index]; - } -} - -void CUDAScoreUpdater::LaunchAddScoreKernel(const double* cuda_scores_to_add) { - const int num_blocks = (num_data_ + SET_INIT_SCORE_BLOCK_SIZE - 1) / SET_INIT_SCORE_BLOCK_SIZE; - AddScoreKernel<<>>(cuda_scores_, cuda_scores_to_add, num_data_); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_score_updater.hpp b/src/treelearner/cuda/cuda_score_updater.hpp index 8af36a38d603..b3195afd5e0b 100644 --- a/src/treelearner/cuda/cuda_score_updater.hpp +++ b/src/treelearner/cuda/cuda_score_updater.hpp @@ -5,11 +5,11 @@ */ #ifndef LIGHTGBM_NEW_CUDA_SCORE_UPDATER_HPP_ #define LIGHTGBM_NEW_CUDA_SCORE_UPDATER_HPP_ - +/* #ifdef USE_CUDA #include -#include "new_cuda_utils.hpp" + #include @@ -42,5 +42,5 @@ class CUDAScoreUpdater { } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA*/ #endif // LIGHTGBM_NEW_CUDA_SCORE_UPDATER_HPP_ diff --git a/src/treelearner/cuda/cuda_tree_predictor.hpp b/src/treelearner/cuda/cuda_tree_predictor.hpp index cec5f6a4837a..be58b8cf353e 100644 --- a/src/treelearner/cuda/cuda_tree_predictor.hpp +++ b/src/treelearner/cuda/cuda_tree_predictor.hpp @@ -10,7 +10,7 @@ #include #include -#include "new_cuda_utils.hpp" +#include #include diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 26c747e03b4e..8a7e1f9b0618 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -23,12 +23,9 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia SerialTreeLearner::Init(train_data, is_constant_hessian); num_threads_ = OMP_NUM_THREADS(); CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); - const label_t* labels = train_data->metadata().label(); - cuda_centralized_info_.reset(new CUDACentralizedInfo(num_data_, this->config_->num_leaves, num_features_)); - cuda_centralized_info_->Init(labels, train_data_); - cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_, 0, cuda_centralized_info_->cuda_num_data())); + cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_, 0)); cuda_smaller_leaf_splits_->Init(); - cuda_larger_leaf_splits_.reset(new CUDALeafSplits(num_data_, -1, cuda_centralized_info_->cuda_num_data())); + cuda_larger_leaf_splits_.reset(new CUDALeafSplits(num_data_, -1)); cuda_larger_leaf_splits_->Init(); cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, this->config_->num_leaves, num_threads_, share_state_->feature_hist_offsets(), @@ -37,7 +34,6 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, 
bool is_constant_hessia cuda_data_partition_.reset(new CUDADataPartition( train_data_, share_state_->feature_hist_offsets().back(), this->config_->num_leaves, num_threads_, - cuda_centralized_info_->cuda_num_data(), cuda_histogram_constructor_->cuda_hist_pointer())); cuda_data_partition_->Init(); cuda_best_split_finder_.reset(new CUDABestSplitFinder(cuda_histogram_constructor_->cuda_hist(), diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index 3e3ae7e7d486..79a85e3e7ad8 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -13,11 +13,6 @@ #include "cuda_histogram_constructor.hpp" #include "cuda_data_partition.hpp" #include "cuda_best_split_finder.hpp" -#include "cuda_centralized_info.hpp" -#include "cuda_score_updater.hpp" -#include "cuda_binary_objective.hpp" -#include "cuda_regression_objective.hpp" -#include "cuda_ranking_objective.hpp" namespace LightGBM { @@ -55,8 +50,6 @@ class NewCUDATreeLearner: public SerialTreeLearner { int num_threads_; // CUDA components for tree training - // centralized information shared by other CUDA components - std::unique_ptr cuda_centralized_info_; // leaf splits information for smaller and larger leaves std::unique_ptr cuda_smaller_leaf_splits_; std::unique_ptr cuda_larger_leaf_splits_; diff --git a/src/treelearner/cuda/new_cuda_utils.cpp b/src/treelearner/cuda/new_cuda_utils.cpp deleted file mode 100644 index 62b0559e8377..000000000000 --- a/src/treelearner/cuda/new_cuda_utils.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifdef USE_CUDA - -#include "new_cuda_utils.hpp" - -namespace LightGBM { - -void SynchronizeCUDADevice() { - CUDASUCCESS_OR_FATAL(cudaDeviceSynchronize()); -} - -void PrintLastCUDAError() { - const char* error_name = cudaGetErrorName(cudaGetLastError()); - Log::Warning(error_name); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/new_cuda_utils.cu b/src/treelearner/cuda/new_cuda_utils.cu deleted file mode 100644 index 5b2b4a3ba9fa..000000000000 --- a/src/treelearner/cuda/new_cuda_utils.cu +++ /dev/null @@ -1,11 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#include "new_cuda_utils.hpp" - -namespace LightGBM { - -} // namespace LightGBM diff --git a/src/treelearner/cuda/new_cuda_utils.hpp b/src/treelearner/cuda/new_cuda_utils.hpp deleted file mode 100644 index dbbf77d9bf0a..000000000000 --- a/src/treelearner/cuda/new_cuda_utils.hpp +++ /dev/null @@ -1,102 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
- */ - -#ifndef LIGHTGBM_NEW_CUDA_UTILS_HPP_ -#define LIGHTGBM_NEW_CUDA_UTILS_HPP_ - -#ifdef USE_CUDA - -#include -#include -#include - -#include - -#define PREFIX_SUM_ARRAY_SIZE_NEW_CUDA_UTILS (1024) - -namespace LightGBM { - -template -void AllocateCUDAMemory(size_t size, T** out_ptr) { - void* tmp_ptr = nullptr; - CUDASUCCESS_OR_FATAL(cudaMalloc(&tmp_ptr, size * sizeof(T))); - *out_ptr = reinterpret_cast(tmp_ptr); -} - -template -void CopyFromHostToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size) { - void* void_dst_ptr = reinterpret_cast(dst_ptr); - const void* void_src_ptr = reinterpret_cast(src_ptr); - size_t size_in_bytes = size * sizeof(T); - CUDASUCCESS_OR_FATAL(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyHostToDevice)); -} - -template -void CopyFromHostToCUDADeviceAsync(T* dst_ptr, const T* src_ptr, size_t size, cudaStream_t stream) { - void* void_dst_ptr = reinterpret_cast(dst_ptr); - const void* void_src_ptr = reinterpret_cast(src_ptr); - size_t size_in_bytes = size * sizeof(T); - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyHostToDevice, stream)); -} - -template -void InitCUDAMemoryFromHostMemory(T** dst_ptr, const T* src_ptr, size_t size) { - AllocateCUDAMemory(size, dst_ptr); - CopyFromHostToCUDADevice(*dst_ptr, src_ptr, size); -} - -template -void InitCUDAValueFromConstant(T** dst_ptr, const T value) { - AllocateCUDAMemory(1, dst_ptr); - CopyFromHostToCUDADevice(*dst_ptr, &value, 1); -} - -template -void CopyFromCUDADeviceToHost(T* dst_ptr, const T* src_ptr, size_t size) { - void* void_dst_ptr = reinterpret_cast(dst_ptr); - const void* void_src_ptr = reinterpret_cast(src_ptr); - size_t size_in_bytes = size * sizeof(T); - CUDASUCCESS_OR_FATAL(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost)); -} - -template -void CopyFromCUDADeviceToHostAsync(T* dst_ptr, const T* src_ptr, size_t size, cudaStream_t stream) { - void* void_dst_ptr = reinterpret_cast(dst_ptr); - const void* void_src_ptr = reinterpret_cast(src_ptr); - size_t size_in_bytes = size * sizeof(T); - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost, stream)); -} - -template -void CopyFromCUDADeviceToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size) { - void* void_dst_ptr = reinterpret_cast(dst_ptr); - const void* void_src_ptr = reinterpret_cast(src_ptr); - size_t size_in_bytes = size * sizeof(T); - CUDASUCCESS_OR_FATAL(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice)); -} - -template -void CopyFromCUDADeviceToCUDADeviceAsync(T* dst_ptr, const T* src_ptr, size_t size) { - void* void_dst_ptr = reinterpret_cast(dst_ptr); - const void* void_src_ptr = reinterpret_cast(src_ptr); - size_t size_in_bytes = size * sizeof(T); - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice)); -} - -void SynchronizeCUDADevice(); - -template -void SetCUDAMemory(T* dst_ptr, int value, size_t size) { - CUDASUCCESS_OR_FATAL(cudaMemset(reinterpret_cast(dst_ptr), value, size * sizeof(T))); -} - -void PrintLastCUDAError(); - -} // namespace LightGBM - -#endif // USE_CUDA - -#endif // LIGHTGBM_NEW_CUDA_UTILS_HPP_ From fe58d4c0991f9d333d5456ab0e492c898246975b Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 27 Jul 2021 08:23:54 +0000 Subject: [PATCH 045/166] gather shared cuda device functions --- CMakeLists.txt | 14 +- include/LightGBM/bin.h | 4 +- include/LightGBM/cuda/cuda_algorithms.hpp | 16 +- 
include/LightGBM/train_share_states.h | 8 +- src/cuda/cuda_algorithms.cu | 92 +++++ src/io/cuda/cuda_row_data.cpp | 331 ++++++++++++++++++ src/io/multi_val_dense_bin.hpp | 16 +- src/io/multi_val_sparse_bin.hpp | 40 +-- src/objective/cuda/cuda_binary_objective.cpp | 11 - src/objective/cuda/cuda_binary_objective.cu | 3 - .../cuda/cuda_best_split_finder.cu | 114 +----- .../cuda/cuda_best_split_finder.hpp | 3 +- src/treelearner/cuda/cuda_data_partition.cu | 293 +--------------- src/treelearner/cuda/cuda_data_partition.hpp | 3 +- .../cuda/cuda_histogram_constructor.cu | 60 +--- .../cuda/cuda_histogram_constructor.hpp | 1 + src/treelearner/cuda/cuda_objective.hpp | 39 --- src/treelearner/cuda/cuda_score_updater.hpp | 46 --- 18 files changed, 505 insertions(+), 589 deletions(-) create mode 100644 src/cuda/cuda_algorithms.cu create mode 100644 src/io/cuda/cuda_row_data.cpp delete mode 100644 src/treelearner/cuda/cuda_objective.hpp delete mode 100644 src/treelearner/cuda/cuda_score_updater.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e7a692af2573..89494c9616bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -229,18 +229,6 @@ if(USE_CUDA) add_histogram("${hsize}" "-fulldata_sp_const" "True" "1" "${FULLDATA_DEFINES}") add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}") endforeach() - - add_library(cuda_leaf_splits OBJECT src/treelearner/cuda/cuda_leaf_splits.cu) - set_target_properties(cuda_leaf_splits PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - - add_library(cuda_data_partition OBJECT src/treelearner/cuda/cuda_data_partition.cu) - set_target_properties(cuda_data_partition PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - - add_library(cuda_histogram_constructor OBJECT src/treelearner/cuda/cuda_histogram_constructor.cu) - set_target_properties(cuda_histogram_constructor PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - - add_library(cuda_best_split_finder OBJECT src/treelearner/cuda/cuda_best_split_finder.cu) - set_target_properties(cuda_best_split_finder PROPERTIES CUDA_SEPARABLE_COMPILATION ON) endif(USE_CUDA) if(USE_HDFS) @@ -453,11 +441,13 @@ endif() if(USE_CUDA) set_target_properties(lightgbm PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) + set_target_properties(lightgbm PROPERTIES CUDA_SEPARABLE_COMPILATION ON) TARGET_LINK_LIBRARIES( lightgbm ${histograms} ) set_target_properties(_lightgbm PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) + set_target_properties(_lightgbm PROPERTIES CUDA_SEPARABLE_COMPILATION ON) TARGET_LINK_LIBRARIES( _lightgbm ${histograms} diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 631173b56f88..ebaa32eff975 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -466,10 +466,10 @@ class MultiValBin { virtual MultiValBin* Clone() = 0; - virtual const uint8_t* GetRowWiseData(uint8_t* bit_type, + virtual const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const = 0; }; diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 497f2aeeb8e5..442fd768a9cc 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -1,7 +1,8 @@ /*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ + #ifndef LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_ #define LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_ @@ -11,19 +12,32 @@ #include #include +#include #include +#define NUM_BANKS_DATA_PARTITION (16) +#define LOG_NUM_BANKS_DATA_PARTITION (4) + +#define CONFLICT_FREE_INDEX(n) \ + ((n) + ((n) >> LOG_NUM_BANKS_DATA_PARTITION)) \ + namespace LightGBM { template __device__ void ReduceSum(T* values, size_t n); +template +__device__ void ReduceSumConflictFree(T* values, size_t n); + template __device__ void ReduceMax(T* values, size_t n); template __device__ void PrefixSum(T* values, size_t n); +template +__device__ void PrefixSumConflictFree(T* values, size_t n); + } // namespace LightGBM #endif // USE_CUDA diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h index ff303b86c32c..466575b00fef 100644 --- a/include/LightGBM/train_share_states.h +++ b/include/LightGBM/train_share_states.h @@ -125,11 +125,11 @@ class MultiValBinWrapper { is_subrow_copied_ = is_subrow_copied; } - const uint8_t* GetRowWiseData( + const void* GetRowWiseData( uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const { if (multi_val_bin_ == nullptr) { *bit_type = 0; @@ -229,10 +229,10 @@ struct TrainingShareStates { } } - const uint8_t* GetRowWiseData(uint8_t* bit_type, + const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) { if (multi_val_bin_wrapper_ != nullptr) { return multi_val_bin_wrapper_->GetRowWiseData(bit_type, total_size, is_sparse, out_data_ptr, data_ptr_bit_type); diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu new file mode 100644 index 000000000000..745b474259ef --- /dev/null +++ b/src/cuda/cuda_algorithms.cu @@ -0,0 +1,92 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#include + +namespace LightGBM { + +#define ReduceSumInner(values, n) \ + const unsigned int thread_index = threadIdx.x; \ + for (size_t s = 1; s < n; s <<= 1) { \ + if (thread_index % (s << 1) == 0 && (thread_index + s) < n) { \ + values[thread_index] += values[thread_index + s]; \ + } \ + __syncthreads(); \ + } + + +#define ReduceSumConflictFreeInner(values, n) \ + const unsigned int thread_index = threadIdx.x; \ + for (size_t s = 1; s < n; s <<= 1) { \ + if (thread_index % (s << 1) == 0 && (thread_index + s) < n) { \ + values[CONFLICT_FREE_INDEX(thread_index)] += values[CONFLICT_FREE_INDEX(thread_index + s)]; \ + } \ + __syncthreads(); \ + } \ + + +#define PrefixSumInner(elements, n, type) \ + size_t offset = 1; \ + unsigned int threadIdx_x = threadIdx.x; \ + const size_t conflict_free_n_minus_1 = CONFLICT_FREE_INDEX(n - 1); \ + const type last_element = elements[conflict_free_n_minus_1]; \ + __syncthreads(); \ + for (int d = (n >> 1); d > 0; d >>= 1) { \ + if (threadIdx_x < d) { \ + const size_t src_pos = offset * (2 * threadIdx_x + 1) - 1; \ + const size_t dst_pos = offset * (2 * threadIdx_x + 2) - 1; \ + elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; \ + } \ + offset <<= 1; \ + __syncthreads(); \ + } \ + if (threadIdx_x == 0) { \ + elements[conflict_free_n_minus_1] = 0; \ + } \ + __syncthreads(); \ + for (int d = 1; d < n; d <<= 1) { \ + offset >>= 1; \ + if (threadIdx_x < d) { \ + const size_t dst_pos = offset * (2 * threadIdx_x + 2) - 1; \ + const size_t src_pos = offset * (2 * threadIdx_x + 1) - 1; \ + const size_t conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); \ + const size_t conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); \ + const type src_val = elements[conflict_free_src_pos]; \ + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; \ + elements[conflict_free_dst_pos] += src_val; \ + } \ + __syncthreads(); \ + } \ + if (threadIdx_x == 0) { \ + elements[CONFLICT_FREE_INDEX(n)] = elements[conflict_free_n_minus_1] + last_element; \ + } \ + + +template <> +__device__ void ReduceSumConflictFree(uint16_t* values, size_t n) { + ReduceSumConflictFreeInner(values, n); +} + +template <> +__device__ void PrefixSumConflictFree(uint16_t* values, size_t n) { + PrefixSumInner(values, n, uint16_t); +} + +template <> +__device__ void PrefixSumConflictFree(uint32_t* values, size_t n) { + PrefixSumInner(values, n, uint32_t); +} + +template <> +__device__ void PrefixSumConflictFree(hist_t* values, size_t n) { + PrefixSumInner(values, n, hist_t); +} + +template <> +__device__ void ReduceSum(hist_t* values, size_t n) { + ReduceSumInner(values, n); +} + +} // namespace LightGBM diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp new file mode 100644 index 000000000000..d0a2ce697090 --- /dev/null +++ b/src/io/cuda/cuda_row_data.cpp @@ -0,0 +1,331 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#include + +namespace LightGBM { + +CUDARowData::CUDARowData(const Dataset* train_data, + const TrainingShareStates* train_share_state) { + num_threads_ = OMP_NUM_THREADS(); + num_data_ = train_data->num_data(); + num_total_bin_ = static_cast(train_share_state->feature_hist_offsets().back()); + num_feature_group_ = train_data->num_feature_groups(); + num_feature_ = train_data->num_features(); +} + +void CUDARowData::Init(const Dataset* train_data, TrainingShareStates* train_share_state) { + DivideCUDAFeatureGroups(train_data, train_share_state); + bit_type_ = 0; + size_t total_size = 0; + const void* host_row_ptr = nullptr; + row_ptr_bit_type_ = 0; + const void* host_data = train_share_state->GetRowWiseData(&bit_type_, &total_size, &is_sparse_, &host_row_ptr, &row_ptr_bit_type_); + Log::Warning("bit_type_ = %d, is_sparse_ = %d, row_ptr_bit_type_ = %d", bit_type_, static_cast(is_sparse_), row_ptr_bit_type_); + if (bit_type_ == 8) { + if (!is_sparse_) { + std::vector partitioned_data; + GetDenseDataPartitioned(reinterpret_cast(host_data), &partitioned_data); + InitCUDAMemoryFromHostMemoryOuter(&cuda_data_uint8_t_, partitioned_data.data(), total_size, __FILE__, __LINE__); + } else { + if (row_ptr_bit_type_ == 16) { + InitSparseData( + reinterpret_cast(host_data), + reinterpret_cast(host_row_ptr), + &cuda_data_uint8_t_, + &cuda_row_ptr_uint16_t_, + &cuda_partition_ptr_uint16_t_); + } else if (row_ptr_bit_type_ == 32) { + InitSparseData( + reinterpret_cast(host_data), + reinterpret_cast(host_row_ptr), + &cuda_data_uint8_t_, + &cuda_row_ptr_uint32_t_, + &cuda_partition_ptr_uint32_t_); + } else if (row_ptr_bit_type_ == 64) { + InitSparseData( + reinterpret_cast(host_data), + reinterpret_cast(host_row_ptr), + &cuda_data_uint8_t_, + &cuda_row_ptr_uint64_t_, + &cuda_partition_ptr_uint64_t_); + } else { + Log::Fatal("Unknow data ptr bit type %d", row_ptr_bit_type_); + } + } + } else if (bit_type_ == 16) { + if (!is_sparse_) { + std::vector partitioned_data; + GetDenseDataPartitioned(reinterpret_cast(host_data), &partitioned_data); + InitCUDAMemoryFromHostMemoryOuter(&cuda_data_uint16_t_, partitioned_data.data(), total_size, __FILE__, __LINE__); + } else { + if (row_ptr_bit_type_ == 16) { + InitSparseData( + reinterpret_cast(host_data), + reinterpret_cast(host_row_ptr), + &cuda_data_uint16_t_, + &cuda_row_ptr_uint16_t_, + &cuda_partition_ptr_uint16_t_); + } else if (row_ptr_bit_type_ == 32) { + InitSparseData( + reinterpret_cast(host_data), + reinterpret_cast(host_row_ptr), + &cuda_data_uint16_t_, + &cuda_row_ptr_uint32_t_, + &cuda_partition_ptr_uint32_t_); + } else if (row_ptr_bit_type_ == 64) { + InitSparseData( + reinterpret_cast(host_data), + reinterpret_cast(host_row_ptr), + &cuda_data_uint16_t_, + &cuda_row_ptr_uint64_t_, + &cuda_partition_ptr_uint64_t_); + } else { + Log::Fatal("Unknow data ptr bit type %d", row_ptr_bit_type_); + } + } + } else if (bit_type_ == 32) { + if (!is_sparse_) { + std::vector partitioned_data; + GetDenseDataPartitioned(reinterpret_cast(host_data), &partitioned_data); + InitCUDAMemoryFromHostMemoryOuter(&cuda_data_uint32_t_, partitioned_data.data(), total_size, __FILE__, __LINE__); + } else { + if (row_ptr_bit_type_ == 16) { + InitSparseData( + reinterpret_cast(host_data), + reinterpret_cast(host_row_ptr), + &cuda_data_uint32_t_, + &cuda_row_ptr_uint16_t_, + &cuda_partition_ptr_uint16_t_); + } else if (row_ptr_bit_type_ == 32) { + InitSparseData( + reinterpret_cast(host_data), + reinterpret_cast(host_row_ptr), + &cuda_data_uint32_t_, + &cuda_row_ptr_uint32_t_, + 
&cuda_partition_ptr_uint32_t_); + } else if (row_ptr_bit_type_ == 64) { + InitSparseData( + reinterpret_cast(host_data), + reinterpret_cast(host_row_ptr), + &cuda_data_uint32_t_, + &cuda_row_ptr_uint64_t_, + &cuda_partition_ptr_uint64_t_); + } else { + Log::Fatal("Unknow data ptr bit type %d", row_ptr_bit_type_); + } + } + } else { + Log::Fatal("Unknow bit type = %d", bit_type_); + } + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state) { + const uint32_t max_num_bin_per_partition = SHRAE_HIST_SIZE / 2; + const std::vector& column_hist_offsets = share_state->column_hist_offsets(); + std::vector feature_group_num_feature_offsets; + int offsets = 0; + int prev_group_index = -1; + for (int feature_index = 0; feature_index < num_feature_; ++feature_index) { + const int feature_group_index = train_data->Feature2Group(feature_index); + if (prev_group_index == -1 || feature_group_index != prev_group_index) { + feature_group_num_feature_offsets.emplace_back(offsets); + } + ++offsets; + } + CHECK_EQ(offsets, num_feature_); + feature_group_num_feature_offsets.emplace_back(offsets); + + uint32_t start_hist_offset = 0; + feature_partition_column_index_offsets_.clear(); + column_hist_offsets_.clear(); + partition_hist_offsets_.clear(); + feature_partition_column_index_offsets_.emplace_back(0); + partition_hist_offsets_.emplace_back(0); + const int num_feature_groups = train_data->num_feature_groups(); + int column_index = 0; + num_feature_partitions_ = 0; + for (int feature_group_index = 0; feature_group_index < num_feature_groups; ++feature_group_index) { + if (!train_data->IsMultiGroup(feature_group_index)) { + const uint32_t column_feature_hist_start = column_hist_offsets[column_index]; + const uint32_t column_feature_hist_end = column_hist_offsets[column_index + 1]; + const uint32_t num_bin_in_dense_group = column_feature_hist_end - column_feature_hist_start; + if (num_bin_in_dense_group > max_num_bin_per_partition) { + Log::Fatal("Too many bins in a dense feature group."); + } + const uint32_t cur_hist_num_bin = column_feature_hist_end - start_hist_offset; + if (cur_hist_num_bin > max_num_bin_per_partition) { + feature_partition_column_index_offsets_.emplace_back(column_index); + start_hist_offset = column_feature_hist_start; + partition_hist_offsets_.emplace_back(start_hist_offset); + ++num_feature_partitions_; + } + column_hist_offsets_.emplace_back(column_hist_offsets[column_index] - start_hist_offset); + if (feature_group_index == num_feature_groups - 1) { + feature_partition_column_index_offsets_.emplace_back(column_index + 1); + partition_hist_offsets_.emplace_back(column_hist_offsets.back()); + ++num_feature_partitions_; + } + ++column_index; + } else { + const int group_feature_index_start = feature_group_num_feature_offsets[feature_group_index]; + const int num_feature_in_group = feature_group_num_feature_offsets[feature_group_index + 1] - group_feature_index_start; + for (int sub_feature_index = 0; sub_feature_index < num_feature_in_group; ++sub_feature_index) { + const int feature_index = group_feature_index_start + sub_feature_index; + const uint32_t column_feature_hist_start = column_hist_offsets[column_index]; + const uint32_t column_feature_hist_end = column_hist_offsets[column_index + 1]; + const uint32_t cur_hist_num_bin = column_feature_hist_end - start_hist_offset; + if (cur_hist_num_bin > max_num_bin_per_partition) { + 
feature_partition_column_index_offsets_.emplace_back(column_index); + start_hist_offset = column_feature_hist_start; + partition_hist_offsets_.emplace_back(start_hist_offset); + ++num_feature_partitions_; + } + column_hist_offsets_.emplace_back(column_hist_offsets[column_index] - start_hist_offset); + if (feature_group_index == num_feature_groups - 1 && sub_feature_index == num_feature_in_group - 1) { + CHECK_EQ(feature_index, num_feature_ - 1); + feature_partition_column_index_offsets_.emplace_back(column_index + 1); + partition_hist_offsets_.emplace_back(column_hist_offsets.back()); + ++num_feature_partitions_; + } + ++column_index; + } + } + } + max_num_column_per_partition_ = 0; + for (size_t i = 0; i < feature_partition_column_index_offsets_.size() - 1; ++i) { + const int num_column = feature_partition_column_index_offsets_[i + 1] - feature_partition_column_index_offsets_[i]; + if (num_column > max_num_column_per_partition_) { + max_num_column_per_partition_ = num_column; + } + } + + InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_partition_column_index_offsets_, + feature_partition_column_index_offsets_.data(), + feature_partition_column_index_offsets_.size(), + __FILE__, + __LINE__); + + InitCUDAMemoryFromHostMemoryOuter(&cuda_column_hist_offsets_, + column_hist_offsets_.data(), + column_hist_offsets_.size(), + __FILE__, + __LINE__); + + InitCUDAMemoryFromHostMemoryOuter(&cuda_partition_hist_offsets_, + partition_hist_offsets_.data(), + partition_hist_offsets_.size(), + __FILE__, + __LINE__); +} + +template +void CUDARowData::GetDenseDataPartitioned(const BIN_TYPE* row_wise_data, std::vector* partitioned_data) { + const int num_total_columns = feature_partition_column_index_offsets_.back(); + partitioned_data->resize(static_cast(num_total_columns) * static_cast(num_data_), 0); + BIN_TYPE* out_data = partitioned_data->data(); + Threading::For(0, num_data_, 512, + [this, num_total_columns, row_wise_data, out_data] (int /*thread_index*/, data_size_t start, data_size_t end) { + for (size_t i = 0; i < feature_partition_column_index_offsets_.size() - 1; ++i) { + const int num_prev_columns = static_cast(feature_partition_column_index_offsets_[i]); + const data_size_t offset = num_data_ * num_prev_columns; + const int partition_column_start = feature_partition_column_index_offsets_[i]; + const int partition_column_end = feature_partition_column_index_offsets_[i + 1]; + const int num_columns_in_cur_partition = partition_column_end - partition_column_start; + for (data_size_t data_index = start; data_index < end; ++data_index) { + const data_size_t data_offset = offset + data_index * num_columns_in_cur_partition; + const data_size_t read_data_offset = data_index * num_total_columns; + for (int column_index = 0; column_index < num_columns_in_cur_partition; ++column_index) { + const int true_column_index = read_data_offset + column_index + partition_column_start; + const BIN_TYPE bin = row_wise_data[true_column_index]; + out_data[data_offset + column_index] = bin; + } + } + } + }); +} + +template +void CUDARowData::GetSparseDataPartitioned( + const BIN_TYPE* row_wise_data, + const DATA_PTR_TYPE* row_ptr, + std::vector>* partitioned_data, + std::vector>* partitioned_row_ptr, + std::vector* partition_ptr) { + const int num_partitions = static_cast(feature_partition_column_index_offsets_.size()) - 1; + partitioned_data->resize(num_partitions); + partitioned_row_ptr->resize(num_partitions); + std::vector thread_max_elements_per_row(num_threads_, 0); + Threading::For(0, num_partitions, 1, + 
[partitioned_data, partitioned_row_ptr, row_ptr, row_wise_data, &thread_max_elements_per_row, this] (int thread_index, int start, int end) { + for (int partition_index = start; partition_index < end; ++partition_index) { + std::vector& data_for_this_partition = partitioned_data->at(partition_index); + std::vector& row_ptr_for_this_partition = partitioned_row_ptr->at(partition_index); + const int partition_hist_start = partition_hist_offsets_[partition_index]; + const int partition_hist_end = partition_hist_offsets_[partition_index + 1]; + DATA_PTR_TYPE offset = 0; + row_ptr_for_this_partition.clear(); + data_for_this_partition.clear(); + row_ptr_for_this_partition.emplace_back(offset); + for (data_size_t data_index = 0; data_index < num_data_; ++data_index) { + const DATA_PTR_TYPE row_start = row_ptr[data_index]; + const DATA_PTR_TYPE row_end = row_ptr[data_index + 1]; + const BIN_TYPE* row_data_start = row_wise_data + row_start; + const BIN_TYPE* row_data_end = row_wise_data + row_end; + const size_t partition_start_in_row = std::lower_bound(row_data_start, row_data_end, partition_hist_start) - row_data_start; + const size_t partition_end_in_row = std::lower_bound(row_data_start, row_data_end, partition_hist_end) - row_data_start; + for (size_t pos = partition_start_in_row; pos < partition_end_in_row; ++pos) { + const BIN_TYPE bin = row_data_start[pos]; + CHECK_GE(bin, static_cast(partition_hist_start)); + data_for_this_partition.emplace_back(bin - partition_hist_start); + } + CHECK_GE(partition_end_in_row, partition_start_in_row); + const data_size_t num_elements_in_row = partition_end_in_row - partition_start_in_row; + offset += static_cast(num_elements_in_row); + row_ptr_for_this_partition.emplace_back(offset); + if (num_elements_in_row > thread_max_elements_per_row[thread_index]) { + thread_max_elements_per_row[thread_index] = num_elements_in_row; + } + } + } + }); + partition_ptr->clear(); + DATA_PTR_TYPE offset = 0; + partition_ptr->emplace_back(offset); + for (size_t i = 0; i < partitioned_row_ptr->size(); ++i) { + offset += partitioned_row_ptr->at(i).back(); + partition_ptr->emplace_back(offset); + } + max_num_column_per_partition_ = 0; + for (int thread_index = 0; thread_index < num_threads_; ++thread_index) { + if (thread_max_elements_per_row[thread_index] > max_num_column_per_partition_) { + max_num_column_per_partition_ = thread_max_elements_per_row[thread_index]; + } + } +} + +template +void CUDARowData::InitSparseData(const BIN_TYPE* host_data, + const ROW_PTR_TYPE* host_row_ptr, + BIN_TYPE** cuda_data, + ROW_PTR_TYPE** cuda_row_ptr, + ROW_PTR_TYPE** cuda_partition_ptr) { + std::vector> partitioned_data; + std::vector> partitioned_data_ptr; + std::vector partition_ptr; + GetSparseDataPartitioned(host_data, host_row_ptr, &partitioned_data, &partitioned_data_ptr, &partition_ptr); + InitCUDAMemoryFromHostMemoryOuter(cuda_partition_ptr, partition_ptr.data(), partition_ptr.size(), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(cuda_data, partition_ptr.back(), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(cuda_row_ptr, (num_data_ + 1) * partitioned_data_ptr.size(), __FILE__, __LINE__); + for (size_t i = 0; i < partitioned_data.size(); ++i) { + const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; + const std::vector& data_for_this_partition = partitioned_data[i]; + CopyFromHostToCUDADeviceOuter((*cuda_data) + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size(), __FILE__, __LINE__); + CopyFromHostToCUDADeviceOuter((*cuda_row_ptr) + 
i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size(), __FILE__, __LINE__); + } +} + +} // namespace LightGBM diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index 13339cce0b23..d70689f825d4 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -210,10 +210,10 @@ class MultiValDenseBin : public MultiValBin { MultiValDenseBin* Clone() override; - const uint8_t* GetRowWiseData(uint8_t* bit_type, + const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const override; private: @@ -235,10 +235,10 @@ MultiValDenseBin* MultiValDenseBin::Clone() { } template <> -const uint8_t* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, +const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = data_.data(); *bit_type = 8; @@ -251,10 +251,10 @@ const uint8_t* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, } template <> -const uint8_t* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, +const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const { const uint16_t* data_ptr = data_.data(); const uint8_t* to_return = reinterpret_cast(data_ptr); @@ -268,10 +268,10 @@ const uint8_t* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, } template <> -const uint8_t* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, +const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const { const uint32_t* data_ptr = data_.data(); const uint8_t* to_return = reinterpret_cast(data_ptr); diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index a4e151a5830f..909ffade3634 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -290,10 +290,10 @@ class MultiValSparseBin : public MultiValBin { MultiValSparseBin* Clone() override; - const uint8_t* GetRowWiseData(uint8_t* bit_type, + const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const override; private: @@ -323,11 +323,11 @@ MultiValSparseBin* MultiValSparseBin::Clone() { } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData( +const void* MultiValSparseBin::GetRowWiseData( uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = data_.data(); *bit_type = 8; @@ -339,11 +339,11 @@ const uint8_t* MultiValSparseBin::GetRowWiseData( } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData( +const void* MultiValSparseBin::GetRowWiseData( uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = reinterpret_cast(data_.data()); *bit_type = 16; @@ -355,11 +355,11 @@ const uint8_t* MultiValSparseBin::GetRowWiseData( } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData( +const void* 
MultiValSparseBin::GetRowWiseData( uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = reinterpret_cast(data_.data()); *bit_type = 32; @@ -371,11 +371,11 @@ const uint8_t* MultiValSparseBin::GetRowWiseData( } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData( +const void* MultiValSparseBin::GetRowWiseData( uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = data_.data(); *bit_type = 8; @@ -387,11 +387,11 @@ const uint8_t* MultiValSparseBin::GetRowWiseData( } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData( +const void* MultiValSparseBin::GetRowWiseData( uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = reinterpret_cast(data_.data()); *bit_type = 16; @@ -403,11 +403,11 @@ const uint8_t* MultiValSparseBin::GetRowWiseData( } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData( +const void* MultiValSparseBin::GetRowWiseData( uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = reinterpret_cast(data_.data()); *bit_type = 32; @@ -419,11 +419,11 @@ const uint8_t* MultiValSparseBin::GetRowWiseData( } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData( +const void* MultiValSparseBin::GetRowWiseData( uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = data_.data(); *bit_type = 8; @@ -435,11 +435,11 @@ const uint8_t* MultiValSparseBin::GetRowWiseData( } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData( +const void* MultiValSparseBin::GetRowWiseData( uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = reinterpret_cast(data_.data()); *bit_type = 16; @@ -451,11 +451,11 @@ const uint8_t* MultiValSparseBin::GetRowWiseData( } template <> -const uint8_t* MultiValSparseBin::GetRowWiseData( +const void* MultiValSparseBin::GetRowWiseData( uint8_t* bit_type, size_t* total_size, bool* is_sparse, - const uint8_t** out_data_ptr, + const void** out_data_ptr, uint8_t* data_ptr_bit_type) const { const uint8_t* to_return = reinterpret_cast(data_.data()); *bit_type = 32; diff --git a/src/objective/cuda/cuda_binary_objective.cpp b/src/objective/cuda/cuda_binary_objective.cpp index 96fe8593e361..2210e93278ba 100644 --- a/src/objective/cuda/cuda_binary_objective.cpp +++ b/src/objective/cuda/cuda_binary_objective.cpp @@ -29,16 +29,6 @@ void CUDABinaryLogloss::Init(const Metadata& metadata, data_size_t num_data) { void CUDABinaryLogloss::GetGradients(const double* scores, score_t* gradients, score_t* hessians) const { LaunchGetGradientsKernel(scores, gradients, hessians); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - /*std::vector host_gradients(num_data_, 0.0f); - std::vector host_hessians(num_data_, 0.0f); - std::vector host_scores(num_data_, 0.0f); - CopyFromCUDADeviceToHostOuter(host_gradients.data(), gradients, static_cast(num_data_), __FILE__, __LINE__); - 
CopyFromCUDADeviceToHostOuter(host_hessians.data(), hessians, static_cast(num_data_), __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(host_scores.data(), scores, static_cast(num_data_), __FILE__, __LINE__); - - for (size_t i = 0; i < 100; ++i) { - Log::Warning("===================================== host_gradients[%d] = %f, host_hessians[%d] = %f, host_score[%d] = %f =====================================", i, host_gradients[i], i, host_hessians[i], i, host_scores[i]); - }*/ } double CUDABinaryLogloss::BoostFromScore(int) const { @@ -46,7 +36,6 @@ double CUDABinaryLogloss::BoostFromScore(int) const { SynchronizeCUDADeviceOuter(__FILE__, __LINE__); double boost_from_score = 0.0f; CopyFromCUDADeviceToHostOuter(&boost_from_score, cuda_boost_from_score_, 1, __FILE__, __LINE__); - Log::Warning("boost_from_score = %f", boost_from_score); return boost_from_score; } diff --git a/src/objective/cuda/cuda_binary_objective.cu b/src/objective/cuda/cuda_binary_objective.cu index f418dc5c83bc..cfef05dacb05 100644 --- a/src/objective/cuda/cuda_binary_objective.cu +++ b/src/objective/cuda/cuda_binary_objective.cu @@ -36,9 +36,6 @@ __global__ void BoostFromScoreKernel_1_BinaryLogloss(const label_t* cuda_labels, __global__ void BoostFromScoreKernel_2_BinaryLogloss(double* out_cuda_init_score, const data_size_t num_data, const double sigmoid) { const double suml = *out_cuda_init_score; const double sumw = static_cast(num_data); - if (threadIdx.x == 0 && blockIdx.x == 0) { - printf("******************************************* suml = %f sumw = %f *******************************************\n", suml, sumw); - } const double pavg = suml / sumw; const double init_score = log(pavg / (1.0f - pavg)) / sigmoid; *out_cuda_init_score = init_score; diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 5dba41a1bf06..90e51ce84200 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -10,93 +10,15 @@ namespace LightGBM { -#define K_MIN_SCORE (-1000000.0) - -#define K_EPSILON (1e-15f) - -#define CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(n) \ - ((n) + ((n) >> LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER)) \ - -__device__ void PrefixSumHist(hist_t* elements, unsigned int n) { - unsigned int offset = 1; - unsigned int threadIdx_x = threadIdx.x; - const unsigned int conflict_free_n_minus_1 = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(n - 1); - const hist_t last_element = elements[conflict_free_n_minus_1]; - __syncthreads(); - for (int d = (n >> 1); d > 0; d >>= 1) { - if (threadIdx_x < d) { - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - elements[CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(dst_pos)] += elements[CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(src_pos)]; - } - offset <<= 1; - __syncthreads(); - } - if (threadIdx_x == 0) { - elements[conflict_free_n_minus_1] = 0; - } - __syncthreads(); - for (int d = 1; d < n; d <<= 1) { - offset >>= 1; - if (threadIdx_x < d) { - const unsigned int dst_pos = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(offset * (2 * threadIdx_x + 2) - 1); - const unsigned int src_pos = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(offset * (2 * threadIdx_x + 1) - 1); - const hist_t src_val = elements[src_pos]; - elements[src_pos] = elements[dst_pos]; - elements[dst_pos] += src_val; - } - __syncthreads(); - } - if (threadIdx_x == 0) { - elements[CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(n)] = elements[conflict_free_n_minus_1] + 
last_element; - } - __syncthreads(); -} - -__device__ void PrefixSumHistCnt(data_size_t* elements, unsigned int n) { - unsigned int offset = 1; - unsigned int threadIdx_x = threadIdx.x; - const unsigned int conflict_free_n_minus_1 = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(n - 1); - const data_size_t last_element = elements[conflict_free_n_minus_1]; - __syncthreads(); - for (int d = (n >> 1); d > 0; d >>= 1) { - if (threadIdx_x < d) { - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - elements[CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(dst_pos)] += elements[CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(src_pos)]; - } - offset <<= 1; - __syncthreads(); - } - if (threadIdx_x == 0) { - elements[conflict_free_n_minus_1] = 0; - } - __syncthreads(); - for (int d = 1; d < n; d <<= 1) { - offset >>= 1; - if (threadIdx_x < d) { - const unsigned int dst_pos = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(offset * (2 * threadIdx_x + 2) - 1); - const unsigned int src_pos = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(offset * (2 * threadIdx_x + 1) - 1); - const data_size_t src_val = elements[src_pos]; - elements[src_pos] = elements[dst_pos]; - elements[dst_pos] += src_val; - } - __syncthreads(); - } - if (threadIdx_x == 0) { - elements[CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(n)] = elements[conflict_free_n_minus_1] + last_element; - } -} - __device__ void ReduceBestGain(double* gain, hist_t* sum_gradients, hist_t* sum_hessians, /*data_size_t* num_data,*/ uint8_t* found, uint32_t* threshold_value) { const unsigned int tid = threadIdx.x; - const unsigned int conflict_free_tid_plus_1 = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(tid + 1); + const unsigned int conflict_free_tid_plus_1 = CONFLICT_FREE_INDEX(tid + 1); for (unsigned int s = 1; s < MAX_NUM_BIN_IN_FEATURE; s *= 2) { if (tid % (2 * s) == 0 && (tid + s) < MAX_NUM_BIN_IN_FEATURE) { const uint32_t tid_s = tid + s; - const uint32_t conflict_free_tid_s_plus_1 = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(tid_s + 1); + const uint32_t conflict_free_tid_s_plus_1 = CONFLICT_FREE_INDEX(tid_s + 1); if ((found[tid_s] && !found[tid]) || (found[tid_s] && found[tid] && gain[tid_s] > gain[tid])) { gain[tid] = gain[tid_s]; sum_gradients[conflict_free_tid_plus_1] = sum_gradients[conflict_free_tid_s_plus_1]; @@ -211,8 +133,8 @@ __device__ void FindBestSplitsForLeafKernelInner( cuda_best_split_info->is_valid = false; - __shared__ hist_t local_grad_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; - __shared__ hist_t local_hess_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER]; + __shared__ hist_t local_grad_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION]; + __shared__ hist_t local_hess_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION]; __shared__ double local_gain[MAX_NUM_BIN_IN_FEATURE]; __shared__ uint8_t threshold_found[MAX_NUM_BIN_IN_FEATURE]; __shared__ uint32_t threshold_value[MAX_NUM_BIN_IN_FEATURE]; @@ -222,7 +144,7 @@ __device__ void FindBestSplitsForLeafKernelInner( const uint32_t feature_num_bin_minus_offset = feature_num_bin - feature_mfb_offset; const bool skip_split = (skip_default_bin && (feature_num_bin_minus_offset - 1 - threadIdx_x + feature_mfb_offset == static_cast(feature_default_bin))); const unsigned int bin_offset = threadIdx_x << 1; - const unsigned int conflict_free_threadIdx_x = 
CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(threadIdx_x); + const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); if (!reverse) { if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { local_grad_hist[conflict_free_threadIdx_x] = feature_hist_ptr[bin_offset]; @@ -235,7 +157,7 @@ __device__ void FindBestSplitsForLeafKernelInner( } else { if (threadIdx_x < feature_num_bin_minus_offset) { const unsigned int write_index = feature_num_bin_minus_offset - 1 - threadIdx_x; - const unsigned int conflict_free_write_index = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(write_index); + const unsigned int conflict_free_write_index = CONFLICT_FREE_INDEX(write_index); if (!skip_sum) { local_grad_hist[conflict_free_write_index] = feature_hist_ptr[bin_offset]; const hist_t hess = feature_hist_ptr[bin_offset + 1]; @@ -251,14 +173,14 @@ __device__ void FindBestSplitsForLeafKernelInner( } __syncthreads(); if (threadIdx_x == 0) { - local_hess_hist[conflict_free_threadIdx_x] += K_EPSILON; + local_hess_hist[conflict_free_threadIdx_x] += kEpsilon; } - local_gain[threadIdx_x] = K_MIN_SCORE; + local_gain[threadIdx_x] = kMinScore; __syncthreads(); - PrefixSumHist(local_grad_hist, MAX_NUM_BIN_IN_FEATURE); - PrefixSumHist(local_hess_hist, MAX_NUM_BIN_IN_FEATURE); + PrefixSumConflictFree(local_grad_hist, MAX_NUM_BIN_IN_FEATURE); + PrefixSumConflictFree(local_hess_hist, MAX_NUM_BIN_IN_FEATURE); __syncthreads(); - const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX_BEST_SPLIT_FINDER(threadIdx_x + 1); + const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); if (reverse) { if (threadIdx_x >= static_cast(na_as_missing) && threadIdx_x <= feature_num_bin - 2 && !skip_split) { const double sum_right_gradient = local_grad_hist[conflict_free_threadIdx_x_plus_1]; @@ -326,10 +248,10 @@ __device__ void FindBestSplitsForLeafKernelInner( cuda_best_split_info->default_left = assume_out_default_left; if (reverse) { const double sum_right_gradient = local_grad_hist[1]; - const double sum_right_hessian = local_hess_hist[1] - K_EPSILON; + const double sum_right_hessian = local_hess_hist[1] - kEpsilon; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); const double sum_left_gradient = sum_gradients - sum_right_gradient; - const double sum_left_hessian = sum_hessians - sum_right_hessian - K_EPSILON; + const double sum_left_hessian = sum_hessians - sum_right_hessian - kEpsilon; const data_size_t left_count = num_data - right_count; const double left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian, lambda_l1, use_l1, lambda_l2); @@ -349,10 +271,10 @@ __device__ void FindBestSplitsForLeafKernelInner( sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); } else { const double sum_left_gradient = local_grad_hist[1]; - const double sum_left_hessian = local_hess_hist[1] - K_EPSILON; + const double sum_left_hessian = local_hess_hist[1] - kEpsilon; const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); const double sum_right_gradient = sum_gradients - sum_left_gradient; - const double sum_right_hessian = sum_hessians - sum_left_hessian - K_EPSILON; + const double sum_right_hessian = sum_hessians - sum_left_hessian - kEpsilon; const data_size_t right_count = num_data - left_count; const double left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian, lambda_l1, use_l1, lambda_l2); @@ -411,7 +333,7 @@ __global__ void 
FindBestSplitsForLeafKernel( const bool assume_out_default_left = task_out_default_left[task_index]; const double parent_gain = is_larger ? larger_leaf_splits->gain : smaller_leaf_splits->gain; const double sum_gradients = is_larger ? larger_leaf_splits->sum_of_gradients : smaller_leaf_splits->sum_of_gradients; - const double sum_hessians = (is_larger ? larger_leaf_splits->sum_of_hessians : smaller_leaf_splits->sum_of_hessians) + 2 * K_EPSILON; + const double sum_hessians = (is_larger ? larger_leaf_splits->sum_of_hessians : smaller_leaf_splits->sum_of_hessians) + 2 * kEpsilon; const double num_data = is_larger ? larger_leaf_splits->num_data_in_leaf : smaller_leaf_splits->num_data_in_leaf; const unsigned int output_offset = is_larger ? (task_index + num_tasks) : task_index; CUDASplitInfo* out = cuda_best_split_info + output_offset; @@ -592,7 +514,7 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i cuda_split_info->right_value = best_split_info->right_value; cuda_split_info->is_valid = true; } else { - cuda_split_info->gain = K_MIN_SCORE; + cuda_split_info->gain = kMinScore; cuda_split_info->is_valid = false; } } @@ -754,7 +676,7 @@ __global__ void FindBestFromAllSplitsKernel(const int cur_num_leaves, __shared__ double thread_best_gain[NUM_THREADS_FIND_BEST_LEAF]; __shared__ int thread_best_leaf[NUM_THREADS_FIND_BEST_LEAF]; const unsigned int threadIdx_x = threadIdx.x; - thread_best_gain[threadIdx_x] = K_MIN_SCORE; + thread_best_gain[threadIdx_x] = kMinScore; thread_best_leaf[threadIdx_x] = -1; const int num_leaves_per_thread = (cur_num_leaves + NUM_THREADS_FIND_BEST_LEAF - 1) / NUM_THREADS_FIND_BEST_LEAF; const int cur_num_valid_threads = (cur_num_leaves + num_leaves_per_thread - 1) / num_leaves_per_thread; diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index e87bf3288052..0f48b0ef260e 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -12,6 +12,7 @@ #include "cuda_leaf_splits.hpp" #include +#include #include #include @@ -19,8 +20,6 @@ #define MAX_NUM_BIN_IN_FEATURE (256) #define NUM_THREADS_FIND_BEST_LEAF (256) -#define LOG_NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER (4) -#define NUM_BANKS_DATA_PARTITION_BEST_SPLIT_FINDER (16) #define NUM_TASKS_PER_SYNC_BLOCK (1024) namespace LightGBM { diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index a90947a17c9e..d0b4b966637a 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -11,287 +11,6 @@ namespace LightGBM { -#define CONFLICT_FREE_INDEX(n) \ - ((n) + ((n) >> LOG_NUM_BANKS_DATA_PARTITION)) \ - -__device__ void PrefixSum(uint32_t* elements, unsigned int n) { - unsigned int offset = 1; - unsigned int threadIdx_x = threadIdx.x; - const unsigned int conflict_free_n_minus_1 = CONFLICT_FREE_INDEX(n - 1); - const uint32_t last_element = elements[conflict_free_n_minus_1]; - __syncthreads(); - for (int d = (n >> 1); d > 0; d >>= 1) { - if (threadIdx_x < d) { - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; - } - offset <<= 1; - __syncthreads(); - } - if (threadIdx_x == 0) { - elements[conflict_free_n_minus_1] = 0; - } - __syncthreads(); - for (int d = 1; d < n; d <<= 1) { - offset >>= 1; - if (threadIdx_x < d) { - 
const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); - const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); - const uint32_t src_val = elements[conflict_free_src_pos]; - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; - elements[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - } - if (threadIdx_x == 0) { - elements[CONFLICT_FREE_INDEX(n)] = elements[conflict_free_n_minus_1] + last_element; - } -} - -__device__ void PrefixSum_1024(uint32_t* elements, unsigned int n) { - unsigned int threadIdx_x = threadIdx.x; - const unsigned int conflict_free_n_minus_1 = CONFLICT_FREE_INDEX(n - 1); - const uint32_t last_element = elements[conflict_free_n_minus_1]; - __syncthreads(); - - if (threadIdx_x < 512) { - const unsigned int src_pos = (2 * threadIdx_x + 1) - 1; - const unsigned int dst_pos = (2 * threadIdx_x + 2) - 1; - elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; - } - __syncthreads(); - - if (threadIdx_x < 256) { - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 1) - 1; - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 1) - 1; - elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; - } - __syncthreads(); - - if (threadIdx_x < 128) { - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 2) - 1; - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 2) - 1; - elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; - } - __syncthreads(); - - if (threadIdx_x < 64) { - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 3) - 1; - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 3) - 1; - elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; - } - __syncthreads(); - - if (threadIdx_x < 32) { - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 4) - 1; - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 4) - 1; - elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; - } - __syncthreads(); - - if (threadIdx_x < 16) { - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 5) - 1; - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 5) - 1; - elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; - } - __syncthreads(); - - if (threadIdx_x < 8) { - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 6) - 1; - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 6) - 1; - elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; - } - __syncthreads(); - - if (threadIdx_x < 4) { - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 7) - 1; - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 7) - 1; - elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; - } - __syncthreads(); - - if (threadIdx_x < 2) { - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 8) - 1; - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 8) - 1; - elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; - } - __syncthreads(); - - if (threadIdx_x == 0) { - const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(1023); - const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(511); - elements[conflict_free_dst_pos] += elements[conflict_free_src_pos]; - elements[conflict_free_n_minus_1] = 0; - const uint32_t 
src_val = elements[conflict_free_src_pos]; - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; - elements[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - - if (threadIdx_x < 2) { - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 8) - 1; - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 8) - 1; - const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); - const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); - const uint32_t src_val = elements[conflict_free_src_pos]; - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; - elements[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - - if (threadIdx_x < 4) { - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 7) - 1; - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 7) - 1; - const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); - const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); - const uint32_t src_val = elements[conflict_free_src_pos]; - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; - elements[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - - if (threadIdx_x < 8) { - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 6) - 1; - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 6) - 1; - const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); - const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); - const uint32_t src_val = elements[conflict_free_src_pos]; - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; - elements[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - - if (threadIdx_x < 16) { - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 5) - 1; - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 5) - 1; - const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); - const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); - const uint32_t src_val = elements[conflict_free_src_pos]; - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; - elements[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - - if (threadIdx_x < 32) { - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 4) - 1; - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 4) - 1; - const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); - const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); - const uint32_t src_val = elements[conflict_free_src_pos]; - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; - elements[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - - if (threadIdx_x < 64) { - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 3) - 1; - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 3) - 1; - const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); - const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); - const uint32_t src_val = elements[conflict_free_src_pos]; - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; - elements[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - - if (threadIdx_x < 128) { - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 2) - 1; - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 2) - 1; - const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); - const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); - const uint32_t src_val = 
elements[conflict_free_src_pos]; - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; - elements[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - - if (threadIdx_x < 256) { - const unsigned int dst_pos = ((2 * threadIdx_x + 2) << 1) - 1; - const unsigned int src_pos = ((2 * threadIdx_x + 1) << 1) - 1; - const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); - const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); - const uint32_t src_val = elements[conflict_free_src_pos]; - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; - elements[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - - if (threadIdx_x < 512) { - const unsigned int dst_pos = (2 * threadIdx_x + 2) - 1; - const unsigned int src_pos = (2 * threadIdx_x + 1) - 1; - const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); - const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); - const uint32_t src_val = elements[conflict_free_src_pos]; - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; - elements[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - - if (threadIdx_x == 0) { - elements[CONFLICT_FREE_INDEX(n)] = elements[conflict_free_n_minus_1] + last_element; - } -} - -__device__ void PrefixSum(uint16_t* elements, unsigned int n) { - unsigned int offset = 1; - unsigned int threadIdx_x = threadIdx.x; - const unsigned int conflict_free_n_minus_1 = CONFLICT_FREE_INDEX(n - 1); - const uint16_t last_element = elements[conflict_free_n_minus_1]; - __syncthreads(); - for (int d = (n >> 1); d > 0; d >>= 1) { - if (threadIdx_x < d) { - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; - } - offset <<= 1; - __syncthreads(); - } - if (threadIdx_x == 0) { - elements[conflict_free_n_minus_1] = 0; - } - __syncthreads(); - for (int d = 1; d < n; d <<= 1) { - offset >>= 1; - if (threadIdx_x < d) { - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); - const unsigned int conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); - const uint16_t src_val = elements[conflict_free_src_pos]; - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; - elements[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - } - if (threadIdx_x == 0) { - elements[CONFLICT_FREE_INDEX(n)] = elements[conflict_free_n_minus_1] + last_element; - } -} - -__device__ void ReduceSum(uint16_t* array, const size_t size) { - const unsigned int threadIdx_x = threadIdx.x; - for (int s = 1; s < size; s <<= 1) { - if (threadIdx_x % (2 * s) == 0 && (threadIdx_x + s) < size) { - array[CONFLICT_FREE_INDEX(threadIdx_x)] += array[CONFLICT_FREE_INDEX(threadIdx_x + s)]; - } - __syncthreads(); - } -} - -__device__ void ReduceSum(double* array, const size_t size) { - const unsigned int threadIdx_x = threadIdx.x; - for (int s = 1; s < size; s <<= 1) { - if (threadIdx_x % (2 * s) == 0 && (threadIdx_x + s) < size) { - array[threadIdx_x] += array[threadIdx_x + s]; - } - __syncthreads(); - } -} - __global__ void FillDataIndicesBeforeTrainKernel(const data_size_t* cuda_num_data, data_size_t* data_indices, int* cuda_data_index_to_leaf_index) { const data_size_t num_data_ref = *cuda_num_data; @@ -314,7 +33,7 @@ __device__ 
void PrepareOffset(const data_size_t num_data_in_leaf_ref, const uint const unsigned int threadIdx_x = threadIdx.x; const unsigned int blockDim_x = blockDim.x; __syncthreads(); - ReduceSum(thread_to_left_offset_cnt, split_indices_block_size_data_partition); + ReduceSumConflictFree(thread_to_left_offset_cnt, static_cast(split_indices_block_size_data_partition)); __syncthreads(); if (threadIdx_x == 0) { const data_size_t num_data_in_block = (blockIdx.x + 1) * blockDim_x <= num_data_in_leaf_ref ? static_cast(blockDim_x) : @@ -1165,8 +884,8 @@ __global__ void AggregateBlockOffsetKernel0( block_to_right_offset[conflict_free_threadIdx_x] = 0; } __syncthreads(); - PrefixSum_1024(block_to_left_offset, blockDim_x); - PrefixSum_1024(block_to_right_offset, blockDim_x); + PrefixSumConflictFree(block_to_left_offset, blockDim_x); + PrefixSumConflictFree(block_to_right_offset, blockDim_x); __syncthreads(); const uint32_t to_left_total_count = block_to_left_offset[CONFLICT_FREE_INDEX(blockDim_x)]; const uint32_t to_left_thread_block_offset = block_to_left_offset[conflict_free_threadIdx_x]; @@ -1209,8 +928,8 @@ __global__ void AggregateBlockOffsetKernel1( block_to_right_offset[conflict_free_threadIdx_x] = 0; } __syncthreads(); - PrefixSum(block_to_left_offset, num_blocks_aligned); - PrefixSum(block_to_right_offset, num_blocks_aligned); + PrefixSumConflictFree(block_to_left_offset, num_blocks_aligned); + PrefixSumConflictFree(block_to_right_offset, num_blocks_aligned); __syncthreads(); const uint32_t to_left_total_count = block_to_left_offset[CONFLICT_FREE_INDEX(num_blocks_aligned)]; if (threadIdx_x < static_cast(num_blocks)) { @@ -1396,7 +1115,7 @@ __global__ void SplitInnerKernel(const int left_leaf_index, const int right_leaf thread_to_right_pos[0] = 0; } __syncthreads(); - PrefixSum(thread_to_left_pos, split_indices_block_size_data_partition); + PrefixSumConflictFree(thread_to_left_pos, split_indices_block_size_data_partition); __syncthreads(); if (threadIdx_x > 0) { thread_to_right_pos[threadIdx_x] = (threadIdx_x - thread_to_left_pos[conflict_free_threadIdx_x_plus_1]); diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index db600e53df76..01d705b796ab 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -8,6 +8,7 @@ #ifdef USE_CUDA +#include #include #include #include @@ -19,8 +20,6 @@ // TODO(shiyu1994): adjust these values according to different CUDA and GPU versions #define FILL_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) #define SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION (512) -#define NUM_BANKS_DATA_PARTITION (32) -#define LOG_NUM_BANKS_DATA_PARTITION (5) #define AGGREGATE_BLOCK_SIZE_DATA_PARTITION (1024) namespace LightGBM { diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 5ffcad280964..85ade30827ff 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -9,44 +9,7 @@ #include "cuda_histogram_constructor.hpp" namespace LightGBM { - -__device__ void PrefixSum(hist_t* elements, unsigned int n) { - unsigned int offset = 1; - unsigned int threadIdx_x = threadIdx.x; - const unsigned int conflict_free_n_minus_1 = (n - 1); - const hist_t last_element = elements[conflict_free_n_minus_1]; - __syncthreads(); - for (int d = (n >> 1); d > 0; d >>= 1) { - if (threadIdx_x < d) { - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int 
dst_pos = offset * (2 * threadIdx_x + 2) - 1; - elements[(dst_pos)] += elements[(src_pos)]; - } - offset <<= 1; - __syncthreads(); - } - if (threadIdx_x == 0) { - elements[conflict_free_n_minus_1] = 0; - } - __syncthreads(); - for (int d = 1; d < n; d <<= 1) { - offset >>= 1; - if (threadIdx_x < d) { - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int conflict_free_dst_pos = (dst_pos); - const unsigned int conflict_free_src_pos = (src_pos); - const hist_t src_val = elements[conflict_free_src_pos]; - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; - elements[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - } - if (threadIdx_x == 0) { - elements[(n)] = elements[conflict_free_n_minus_1] + last_element; - } -} - +/* __device__ void ReduceSumHistogramConstructor(hist_t* array, const size_t size) { const unsigned int threadIdx_x = threadIdx.x; const size_t atomic_size = size / 4; @@ -60,22 +23,7 @@ __device__ void ReduceSumHistogramConstructor(hist_t* array, const size_t size) atomicAdd_block(array, array[threadIdx_x]); } __syncthreads(); -} - -__device__ void ReduceSumHistogramConstructorMerge(hist_t* array, const size_t size) { - const unsigned int threadIdx_x = (threadIdx.x % USED_HISTOGRAM_BUFFER_NUM); - const size_t atomic_size = size / 4; - for (int s = 1; s < atomic_size; s <<= 1) { - if (threadIdx_x % (2 * s) == 0 && (threadIdx_x + s) < size) { - array[threadIdx_x] += array[threadIdx_x + s]; - } - __syncthreads(); - } - if (threadIdx_x > 0 && threadIdx_x % atomic_size == 0) { - atomicAdd_block(array, array[threadIdx_x]); - } - __syncthreads(); -} +}*/ template __global__ void CUDAConstructHistogramDenseKernel( @@ -373,8 +321,8 @@ __global__ void FixHistogramKernel( hist_hessians[threadIdx_x] = 0.0f; } __syncthreads(); - ReduceSumHistogramConstructor(hist_gradients, num_bin_aligned); - ReduceSumHistogramConstructor(hist_hessians, num_bin_aligned); + ReduceSum(hist_gradients, num_bin_aligned); + ReduceSum(hist_hessians, num_bin_aligned); __syncthreads(); if (threadIdx_x == most_freq_bin) { feature_hist[hist_pos] = leaf_sum_gradients - hist_gradients[0]; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index fc30731a92c7..b6afc98dd50a 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -8,6 +8,7 @@ #ifdef USE_CUDA +#include #include #include #include diff --git a/src/treelearner/cuda/cuda_objective.hpp b/src/treelearner/cuda/cuda_objective.hpp deleted file mode 100644 index 38d2bce780e0..000000000000 --- a/src/treelearner/cuda/cuda_objective.hpp +++ /dev/null @@ -1,39 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
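// ---------------------------------------------------------------------------
// A minimal, self-contained sketch of the bank-conflict-free scan pattern that
// the PrefixSumConflictFree / ReduceSumConflictFree calls above rely on.  The
// 32-bank / log2 = 5 constants match the NUM_BANKS_DATA_PARTITION defines
// removed from cuda_data_partition.hpp; the padding macro, the toy 8-element
// input and the single 4-thread block are illustrative assumptions, not the
// tree learner's exact code (its PrefixSumInner additionally stores the
// inclusive total at index n).
#include <cstdio>
#include <cuda_runtime.h>

#define NUM_BANKS 32
#define LOG_NUM_BANKS 5
// pad every NUM_BANKS-th element so the strided accesses of the up-sweep and
// down-sweep phases land in distinct shared-memory banks
#define CONFLICT_FREE_INDEX(n) ((n) + ((n) >> LOG_NUM_BANKS))

template <typename T, unsigned int N>  // N must be a power of two
__global__ void ExclusiveScanKernel(const T* in, T* out) {
  __shared__ T buf[CONFLICT_FREE_INDEX(N) + 1];
  const unsigned int tid = threadIdx.x;
  // each of the N / 2 threads owns two elements
  buf[CONFLICT_FREE_INDEX(2 * tid)] = in[2 * tid];
  buf[CONFLICT_FREE_INDEX(2 * tid + 1)] = in[2 * tid + 1];
  unsigned int offset = 1;
  for (unsigned int d = N >> 1; d > 0; d >>= 1) {  // up-sweep (reduce) phase
    __syncthreads();
    if (tid < d) {
      const unsigned int src = offset * (2 * tid + 1) - 1;
      const unsigned int dst = offset * (2 * tid + 2) - 1;
      buf[CONFLICT_FREE_INDEX(dst)] += buf[CONFLICT_FREE_INDEX(src)];
    }
    offset <<= 1;
  }
  if (tid == 0) buf[CONFLICT_FREE_INDEX(N - 1)] = 0;  // clear the root
  for (unsigned int d = 1; d < N; d <<= 1) {  // down-sweep phase
    offset >>= 1;
    __syncthreads();
    if (tid < d) {
      const unsigned int src = offset * (2 * tid + 1) - 1;
      const unsigned int dst = offset * (2 * tid + 2) - 1;
      const T t = buf[CONFLICT_FREE_INDEX(src)];
      buf[CONFLICT_FREE_INDEX(src)] = buf[CONFLICT_FREE_INDEX(dst)];
      buf[CONFLICT_FREE_INDEX(dst)] += t;
    }
  }
  __syncthreads();
  out[2 * tid] = buf[CONFLICT_FREE_INDEX(2 * tid)];
  out[2 * tid + 1] = buf[CONFLICT_FREE_INDEX(2 * tid + 1)];
}

int main() {
  const unsigned int kN = 8;  // padding is a no-op at this toy size; it matters
                              // for the 512/1024-element block scans used above
  int host_in[kN] = {3, 1, 7, 0, 4, 1, 6, 3}, host_out[kN];
  int *dev_in, *dev_out;
  cudaMalloc(&dev_in, kN * sizeof(int));
  cudaMalloc(&dev_out, kN * sizeof(int));
  cudaMemcpy(dev_in, host_in, sizeof(host_in), cudaMemcpyHostToDevice);
  ExclusiveScanKernel<int, kN><<<1, kN / 2>>>(dev_in, dev_out);
  cudaMemcpy(host_out, dev_out, sizeof(host_out), cudaMemcpyDeviceToHost);
  for (unsigned int i = 0; i < kN; ++i) printf("%d ", host_out[i]);  // 0 3 4 11 11 15 16 22
  printf("\n");
  cudaFree(dev_in);
  cudaFree(dev_out);
  return 0;
}
// ---------------------------------------------------------------------------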
- */ -/* -#ifndef LIGHTGBM_NEW_CUDA_OBJECTIVE_HPP_ -#define LIGHTGBM_NEW_CUDA_OBJECTIVE_HPP_ - -#ifdef USE_CUDA - - -#include - -namespace LightGBM { - -class CUDAObjective { - public: - CUDAObjective(const data_size_t num_data); - - virtual void Init() = 0; - - virtual void CalcInitScore() = 0; - - virtual void GetGradients(const double* cuda_scores, score_t* cuda_out_gradients, score_t* cuda_out_hessians) = 0; - - virtual const double* cuda_init_score() const = 0; - - virtual void TestGlobalArgSort() const {} - - protected: - const data_size_t num_data_; -}; - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_NEW_CUDA_OBJECTIVE_HPP_ -*/ \ No newline at end of file diff --git a/src/treelearner/cuda/cuda_score_updater.hpp b/src/treelearner/cuda/cuda_score_updater.hpp deleted file mode 100644 index b3195afd5e0b..000000000000 --- a/src/treelearner/cuda/cuda_score_updater.hpp +++ /dev/null @@ -1,46 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ -#ifndef LIGHTGBM_NEW_CUDA_SCORE_UPDATER_HPP_ -#define LIGHTGBM_NEW_CUDA_SCORE_UPDATER_HPP_ -/* -#ifdef USE_CUDA - -#include - - -#include - -#define SET_INIT_SCORE_BLOCK_SIZE (1024) - -namespace LightGBM { - -class CUDAScoreUpdater { - public: - CUDAScoreUpdater(const data_size_t num_data); - - void Init(); - - void SetInitScore(const double* cuda_init_score); - - void AddScore(const double* cuda_score_to_add); - - const double* cuda_scores() const { return cuda_scores_; } - - double* cuda_score_ref() { return cuda_scores_; } - - private: - void LaunchSetInitScoreKernel(const double* cuda_init_score); - - void LaunchAddScoreKernel(const double* cuda_scores_to_add); - - const data_size_t num_data_; - double* cuda_scores_; -}; - -} // namespace LightGBM - -#endif // USE_CUDA*/ -#endif // LIGHTGBM_NEW_CUDA_SCORE_UPDATER_HPP_ From dc461dcd95975e1910fe7152be7c276651f4d56f Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 27 Jul 2021 09:19:32 +0000 Subject: [PATCH 046/166] put shared CUDA functions into header file --- include/LightGBM/cuda/cuda_algorithms.hpp | 68 ++++++++++++++- src/cuda/cuda_algorithms.cu | 82 ------------------- .../cuda/cuda_best_split_finder.cu | 3 +- .../cuda/cuda_best_split_finder.hpp | 1 - src/treelearner/cuda/cuda_data_partition.cu | 1 + src/treelearner/cuda/cuda_data_partition.hpp | 1 - .../cuda/cuda_histogram_constructor.cu | 16 +--- .../cuda/cuda_histogram_constructor.hpp | 1 - src/treelearner/cuda/cuda_leaf_splits.cpp | 11 +-- src/treelearner/cuda/cuda_leaf_splits.hpp | 4 +- .../cuda/new_cuda_tree_learner.cpp | 4 +- 11 files changed, 75 insertions(+), 117 deletions(-) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 442fd768a9cc..7c9b4fd35bbf 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -23,11 +23,71 @@ namespace LightGBM { +#define ReduceSumInner(values, n) \ + const unsigned int thread_index = threadIdx.x; \ + for (size_t s = 1; s < n; s <<= 1) { \ + if (thread_index % (s << 1) == 0 && (thread_index + s) < n) { \ + values[thread_index] += values[thread_index + s]; \ + } \ + __syncthreads(); \ + } + + +#define ReduceSumConflictFreeInner(values, n) \ + const unsigned int thread_index = threadIdx.x; \ + for (size_t s = 1; s < n; s <<= 1) { \ + if (thread_index % (s << 1) == 0 && (thread_index + s) < n) { \ + values[CONFLICT_FREE_INDEX(thread_index)] += 
values[CONFLICT_FREE_INDEX(thread_index + s)]; \ + } \ + __syncthreads(); \ + } \ + + +#define PrefixSumInner(elements, n, type) \ + size_t offset = 1; \ + unsigned int threadIdx_x = threadIdx.x; \ + const size_t conflict_free_n_minus_1 = CONFLICT_FREE_INDEX(n - 1); \ + const type last_element = elements[conflict_free_n_minus_1]; \ + __syncthreads(); \ + for (int d = (n >> 1); d > 0; d >>= 1) { \ + if (threadIdx_x < d) { \ + const size_t src_pos = offset * (2 * threadIdx_x + 1) - 1; \ + const size_t dst_pos = offset * (2 * threadIdx_x + 2) - 1; \ + elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; \ + } \ + offset <<= 1; \ + __syncthreads(); \ + } \ + if (threadIdx_x == 0) { \ + elements[conflict_free_n_minus_1] = 0; \ + } \ + __syncthreads(); \ + for (int d = 1; d < n; d <<= 1) { \ + offset >>= 1; \ + if (threadIdx_x < d) { \ + const size_t dst_pos = offset * (2 * threadIdx_x + 2) - 1; \ + const size_t src_pos = offset * (2 * threadIdx_x + 1) - 1; \ + const size_t conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); \ + const size_t conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); \ + const type src_val = elements[conflict_free_src_pos]; \ + elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; \ + elements[conflict_free_dst_pos] += src_val; \ + } \ + __syncthreads(); \ + } \ + if (threadIdx_x == 0) { \ + elements[CONFLICT_FREE_INDEX(n)] = elements[conflict_free_n_minus_1] + last_element; \ + } \ + template -__device__ void ReduceSum(T* values, size_t n); +__device__ void ReduceSum(T* values, size_t n) { + ReduceSumInner(values, n); +} template -__device__ void ReduceSumConflictFree(T* values, size_t n); +__device__ void ReduceSumConflictFree(T* values, size_t n) { + ReduceSumConflictFreeInner(values, n); +} template __device__ void ReduceMax(T* values, size_t n); @@ -36,7 +96,9 @@ template __device__ void PrefixSum(T* values, size_t n); template -__device__ void PrefixSumConflictFree(T* values, size_t n); +__device__ void PrefixSumConflictFree(T* values, size_t n) { + PrefixSumInner(values, n, T); +} } // namespace LightGBM diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 745b474259ef..7b168c10fa17 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -7,86 +7,4 @@ namespace LightGBM { -#define ReduceSumInner(values, n) \ - const unsigned int thread_index = threadIdx.x; \ - for (size_t s = 1; s < n; s <<= 1) { \ - if (thread_index % (s << 1) == 0 && (thread_index + s) < n) { \ - values[thread_index] += values[thread_index + s]; \ - } \ - __syncthreads(); \ - } - - -#define ReduceSumConflictFreeInner(values, n) \ - const unsigned int thread_index = threadIdx.x; \ - for (size_t s = 1; s < n; s <<= 1) { \ - if (thread_index % (s << 1) == 0 && (thread_index + s) < n) { \ - values[CONFLICT_FREE_INDEX(thread_index)] += values[CONFLICT_FREE_INDEX(thread_index + s)]; \ - } \ - __syncthreads(); \ - } \ - - -#define PrefixSumInner(elements, n, type) \ - size_t offset = 1; \ - unsigned int threadIdx_x = threadIdx.x; \ - const size_t conflict_free_n_minus_1 = CONFLICT_FREE_INDEX(n - 1); \ - const type last_element = elements[conflict_free_n_minus_1]; \ - __syncthreads(); \ - for (int d = (n >> 1); d > 0; d >>= 1) { \ - if (threadIdx_x < d) { \ - const size_t src_pos = offset * (2 * threadIdx_x + 1) - 1; \ - const size_t dst_pos = offset * (2 * threadIdx_x + 2) - 1; \ - elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; \ - } \ - offset <<= 1; \ - __syncthreads(); \ - 
} \ - if (threadIdx_x == 0) { \ - elements[conflict_free_n_minus_1] = 0; \ - } \ - __syncthreads(); \ - for (int d = 1; d < n; d <<= 1) { \ - offset >>= 1; \ - if (threadIdx_x < d) { \ - const size_t dst_pos = offset * (2 * threadIdx_x + 2) - 1; \ - const size_t src_pos = offset * (2 * threadIdx_x + 1) - 1; \ - const size_t conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); \ - const size_t conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); \ - const type src_val = elements[conflict_free_src_pos]; \ - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; \ - elements[conflict_free_dst_pos] += src_val; \ - } \ - __syncthreads(); \ - } \ - if (threadIdx_x == 0) { \ - elements[CONFLICT_FREE_INDEX(n)] = elements[conflict_free_n_minus_1] + last_element; \ - } \ - - -template <> -__device__ void ReduceSumConflictFree(uint16_t* values, size_t n) { - ReduceSumConflictFreeInner(values, n); -} - -template <> -__device__ void PrefixSumConflictFree(uint16_t* values, size_t n) { - PrefixSumInner(values, n, uint16_t); -} - -template <> -__device__ void PrefixSumConflictFree(uint32_t* values, size_t n) { - PrefixSumInner(values, n, uint32_t); -} - -template <> -__device__ void PrefixSumConflictFree(hist_t* values, size_t n) { - PrefixSumInner(values, n, hist_t); -} - -template <> -__device__ void ReduceSum(hist_t* values, size_t n) { - ReduceSumInner(values, n); -} - } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 90e51ce84200..30c00bccf13a 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -6,12 +6,13 @@ #ifdef USE_CUDA +#include #include "cuda_best_split_finder.hpp" namespace LightGBM { __device__ void ReduceBestGain(double* gain, hist_t* sum_gradients, - hist_t* sum_hessians, /*data_size_t* num_data,*/ uint8_t* found, + hist_t* sum_hessians, uint8_t* found, uint32_t* threshold_value) { const unsigned int tid = threadIdx.x; const unsigned int conflict_free_tid_plus_1 = CONFLICT_FREE_INDEX(tid + 1); diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 0f48b0ef260e..b5f6a052c46a 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -12,7 +12,6 @@ #include "cuda_leaf_splits.hpp" #include -#include #include #include diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index d0b4b966637a..309b0d57666b 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -6,6 +6,7 @@ #ifdef USE_CUDA +#include #include "cuda_data_partition.hpp" #include diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 01d705b796ab..db22feb61d28 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -8,7 +8,6 @@ #ifdef USE_CUDA -#include #include #include #include diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 85ade30827ff..7e58325127ef 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -6,24 +6,10 @@ #ifdef USE_CUDA +#include #include "cuda_histogram_constructor.hpp" namespace LightGBM { -/* -__device__ void ReduceSumHistogramConstructor(hist_t* array, const size_t size) { - 
const unsigned int threadIdx_x = threadIdx.x; - const size_t atomic_size = size / 4; - for (int s = 1; s < atomic_size; s <<= 1) { - if (threadIdx_x % (2 * s) == 0 && (threadIdx_x + s) < size) { - array[threadIdx_x] += array[threadIdx_x + s]; - } - __syncthreads(); - } - if (threadIdx_x > 0 && threadIdx_x % atomic_size == 0) { - atomicAdd_block(array, array[threadIdx_x]); - } - __syncthreads(); -}*/ template __global__ void CUDAConstructHistogramDenseKernel( diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index b6afc98dd50a..fc30731a92c7 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -8,7 +8,6 @@ #ifdef USE_CUDA -#include #include #include #include diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index 5cdb08afbb98..dfb9a74e68d2 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -10,8 +10,8 @@ namespace LightGBM { -CUDALeafSplits::CUDALeafSplits(const data_size_t num_data, const int leaf_index): -num_data_(num_data), leaf_index_(leaf_index) { +CUDALeafSplits::CUDALeafSplits(const data_size_t num_data): +num_data_(num_data) { cuda_struct_ = nullptr; } @@ -24,10 +24,6 @@ void CUDALeafSplits::Init() { AllocateCUDAMemoryOuter(&cuda_sum_of_hessians_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); AllocateCUDAMemoryOuter(&cuda_struct_, 1, __FILE__, __LINE__); - - cuda_streams_.resize(2); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); } void CUDALeafSplits::InitValues() { @@ -44,8 +40,7 @@ void CUDALeafSplits::InitValues( SetCUDAMemoryOuter(cuda_sum_of_gradients_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); SetCUDAMemoryOuter(cuda_sum_of_hessians_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); LaunchInitValuesKernal(cuda_data_indices_in_leaf, cuda_hist_in_leaf); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CopyFromCUDADeviceToHostAsyncOuter(root_sum_hessians, cuda_sum_of_hessians_buffer_, 1, cuda_streams_[1], __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(root_sum_hessians, cuda_sum_of_hessians_buffer_, 1, __FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index a78e1eab55c3..39d239303422 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -33,7 +33,7 @@ struct CUDALeafSplitsStruct { class CUDALeafSplits { public: - CUDALeafSplits(const data_size_t num_data, const int leaf_index); + CUDALeafSplits(const data_size_t num_data); CUDALeafSplits(); @@ -58,9 +58,7 @@ class CUDALeafSplits { // Host memory const int num_data_; - const int leaf_index_; int num_blocks_init_from_gradients_; - std::vector cuda_streams_; // CUDA memory, held by this object CUDALeafSplitsStruct* cuda_struct_; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 8a7e1f9b0618..b25fa81ea22a 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -23,9 +23,9 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia SerialTreeLearner::Init(train_data, is_constant_hessian); num_threads_ = OMP_NUM_THREADS(); 
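// ---------------------------------------------------------------------------
// The cuda_leaf_splits change above drops the two dedicated streams and the
// asynchronous copy of the root hessian sum in favour of a plain blocking
// copy.  A standalone illustration of the two patterns using the raw CUDA
// runtime API (not the LightGBM CopyFromCUDADeviceToHost* wrappers); buffer
// size and contents are arbitrary.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  constexpr int kNum = 1024;
  double host_buf[kNum];
  double* dev_buf = nullptr;
  cudaMalloc(&dev_buf, kNum * sizeof(double));
  cudaMemset(dev_buf, 0, kNum * sizeof(double));

  // Variant 1 (removed by the patch): asynchronous copy on a dedicated stream.
  // With pageable host memory a device-to-host copy is effectively synchronous
  // anyway, so the extra stream buys little for a one-element result.
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMemcpyAsync(host_buf, dev_buf, kNum * sizeof(double),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // host_buf is only safe to read after this
  cudaStreamDestroy(stream);

  // Variant 2 (what the patch switches to): blocking copy on the default stream.
  cudaMemcpy(host_buf, dev_buf, kNum * sizeof(double), cudaMemcpyDeviceToHost);

  printf("first element: %f\n", host_buf[0]);
  cudaFree(dev_buf);
  return 0;
}
// ---------------------------------------------------------------------------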
CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); - cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_, 0)); + cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_)); cuda_smaller_leaf_splits_->Init(); - cuda_larger_leaf_splits_.reset(new CUDALeafSplits(num_data_, -1)); + cuda_larger_leaf_splits_.reset(new CUDALeafSplits(num_data_)); cuda_larger_leaf_splits_->Init(); cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, this->config_->num_leaves, num_threads_, share_state_->feature_hist_offsets(), From ba565c1877275a3ed3d7479fc0c85eca4a09afbd Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 27 Jul 2021 11:19:31 +0000 Subject: [PATCH 047/166] change smaller leaf from <= back to < for consistent result with CPU --- src/treelearner/cuda/new_cuda_tree_learner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index b25fa81ea22a..0c0edd3fc4c8 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -191,7 +191,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, &leaf_data_start_[right_leaf_index], &leaf_sum_hessians_[best_leaf_index_], &leaf_sum_hessians_[right_leaf_index]); - smaller_leaf_index_ = (leaf_num_data_[best_leaf_index_] <= leaf_num_data_[right_leaf_index] ? best_leaf_index_ : right_leaf_index); + smaller_leaf_index_ = (leaf_num_data_[best_leaf_index_] < leaf_num_data_[right_leaf_index] ? best_leaf_index_ : right_leaf_index); larger_leaf_index_ = (smaller_leaf_index_ == best_leaf_index_ ? right_leaf_index : best_leaf_index_); end = std::chrono::steady_clock::now(); duration = static_cast>(end - start); From a781ef56912fdddc85e5a3885170d828d283fe2c Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 3 Aug 2021 08:21:58 +0000 Subject: [PATCH 048/166] add tree predictor --- CMakeLists.txt | 2 + include/LightGBM/boosting.h | 3 + include/LightGBM/cuda/cuda_metadata.hpp | 2 + include/LightGBM/cuda/cuda_tree.hpp | 14 +- src/application/cuda/cuda_predictor.cpp | 303 ++++++++++ src/application/cuda/cuda_predictor.cu | 115 ++++ src/application/cuda/cuda_predictor.hpp | 60 ++ src/application/predictor.hpp | 8 +- src/boosting/gbdt.cpp | 12 +- src/boosting/gbdt.h | 2 + src/c_api.cpp | 21 +- src/io/cuda/cuda_column_data.cpp | 42 +- src/io/cuda/cuda_row_data.cpp | 2 + src/io/cuda/cuda_tree.cpp | 43 +- src/io/cuda/cuda_tree.cu | 38 +- src/io/dataset.cpp | 41 +- src/objective/cuda/cuda_binary_objective.hpp | 3 +- src/objective/cuda/cuda_rank_objective.cpp | 58 ++ src/objective/cuda/cuda_rank_objective.cu | 517 ++++++++++++++++++ src/objective/cuda/cuda_rank_objective.hpp | 54 ++ .../cuda/cuda_regression_objective.cpp | 42 ++ .../cuda/cuda_regression_objective.cu | 68 +++ .../cuda/cuda_regression_objective.hpp | 49 ++ src/objective/objective_function.cpp | 6 + src/objective/rank_objective.hpp | 2 +- .../cuda/cuda_best_split_finder.cu | 9 +- .../cuda/new_cuda_tree_learner.cpp | 1 + src/treelearner/serial_tree_learner.cpp | 7 - 28 files changed, 1444 insertions(+), 80 deletions(-) create mode 100644 src/application/cuda/cuda_predictor.cpp create mode 100644 src/application/cuda/cuda_predictor.cu create mode 100644 src/application/cuda/cuda_predictor.hpp create mode 100644 src/objective/cuda/cuda_rank_objective.cpp create mode 100644 src/objective/cuda/cuda_rank_objective.cu create mode 100644 src/objective/cuda/cuda_rank_objective.hpp create mode 100644 
src/objective/cuda/cuda_regression_objective.cpp create mode 100644 src/objective/cuda/cuda_regression_objective.cu create mode 100644 src/objective/cuda/cuda_regression_objective.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 89494c9616bd..9a64ceb968ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -350,6 +350,8 @@ if(USE_CUDA) src/objective/cuda/*.cu src/boosting/cuda/*.cpp src/boosting/cuda/*.cu + src/application/cuda/*.cpp + src/application/cuda/*.cu endif(USE_CUDA) ) diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h index ddbcdbc18e44..bea12658b28e 100644 --- a/include/LightGBM/boosting.h +++ b/include/LightGBM/boosting.h @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -314,6 +315,8 @@ class LIGHTGBM_EXPORT Boosting { static Boosting* CreateBoosting(const std::string& type, const char* filename); virtual bool IsLinear() const { return false; } + + virtual const std::vector>& models() const { return std::vector>(); } }; class GBDTBase : public Boosting { diff --git a/include/LightGBM/cuda/cuda_metadata.hpp b/include/LightGBM/cuda/cuda_metadata.hpp index d4118e69856c..894147046b0c 100644 --- a/include/LightGBM/cuda/cuda_metadata.hpp +++ b/include/LightGBM/cuda/cuda_metadata.hpp @@ -29,6 +29,8 @@ class CUDAMetadata { const label_t* cuda_weights() const { return cuda_weights_; } + const data_size_t* cuda_query_boundaries() const { return cuda_query_boundaries_; } + private: label_t* cuda_label_; label_t* cuda_weights_; diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index 4607d7c7e8be..679cf02a7db5 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -13,6 +13,16 @@ namespace LightGBM { +__device__ void SetDecisionTypeCUDA(int8_t* decision_type, bool input, int8_t mask); + +__device__ void SetMissingTypeCUDA(int8_t* decision_type, int8_t input); + +__device__ bool GetDecisionTypeCUDA(int8_t decision_type, int8_t mask); + +__device__ int8_t GetMissingTypeCUDA(int8_t decision_type); + +__device__ bool IsZeroCUDA(double fval); + class CUDATree : public Tree { public: /*! @@ -72,6 +82,8 @@ class CUDATree : public Tree { inline void Shrinkage(double rate) override; + void ToHost(); + private: void InitCUDAMemory(); @@ -104,7 +116,7 @@ class CUDATree : public Tree { data_size_t* cuda_leaf_count_; double* cuda_leaf_weight_; data_size_t* cuda_internal_count_; - double* cuda_split_gain_; + float* cuda_split_gain_; cudaStream_t cuda_stream_; diff --git a/src/application/cuda/cuda_predictor.cpp b/src/application/cuda/cuda_predictor.cpp new file mode 100644 index 000000000000..676dcac4e2ea --- /dev/null +++ b/src/application/cuda/cuda_predictor.cpp @@ -0,0 +1,303 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
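// ---------------------------------------------------------------------------
// The CUDAPredictor implementation that follows stores the parsed rows in a
// CSR-style layout: flat feature_index / feature_value arrays plus a row_ptr
// array so that row i owns entries [row_ptr[i], row_ptr[i+1]).  Its
// GetPredictRowPtr builds row_ptr in parallel from the per-row value counts;
// this is a serial host-side equivalent (the counts below are made up).
#include <cstdio>
#include <vector>

int main() {
  // number of non-default feature values parsed for each of four rows
  std::vector<int> values_per_row = {3, 0, 5, 2};
  std::vector<int> row_ptr(values_per_row.size() + 1, 0);
  for (size_t i = 0; i < values_per_row.size(); ++i) {
    row_ptr[i + 1] = row_ptr[i] + values_per_row[i];  // exclusive prefix sum
  }
  for (int offset : row_ptr) printf("%d ", offset);  // prints: 0 3 3 8 10
  printf("\n");
  return 0;
}
// ---------------------------------------------------------------------------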
+ */ + +#include "cuda_predictor.hpp" +#include + +namespace LightGBM { + +CUDAPredictor::CUDAPredictor(Boosting* boosting, int start_iteration, int num_iteration, bool is_raw_score, + bool predict_leaf_index, bool predict_contrib, bool early_stop, + int early_stop_freq, double early_stop_margin): + Predictor(boosting, start_iteration, num_iteration, is_raw_score, predict_leaf_index, predict_contrib, early_stop, early_stop_freq, early_stop_margin) { + auto start = std::chrono::steady_clock::now(); + InitCUDAModel(); + auto end = std::chrono::steady_clock::now(); + auto duration = static_cast>(end - start); + Log::Warning("init model time = %f", duration.count()); +} + +CUDAPredictor::~CUDAPredictor() {} + +void CUDAPredictor::Predict(const char* data_filename, const char* result_filename, bool header, bool disable_shape_check) { + auto start = std::chrono::steady_clock::now(); + const data_size_t num_data = ReadDataToCUDADevice(data_filename, header, disable_shape_check); + auto end = std::chrono::steady_clock::now(); + auto duration = static_cast>(end - start); + Log::Warning("read data to cuda device time = %f", duration.count()); + result_buffer_.resize(num_data, 0.0f); + // TODO(shiyu1994): free memory when prediction is finished + AllocateCUDAMemoryOuter(&cuda_data_, static_cast(num_data * num_feature_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_result_buffer_, static_cast(num_data), __FILE__, __LINE__); + start = std::chrono::steady_clock::now(); + LaunchPredictKernel(num_data); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + end = std::chrono::steady_clock::now(); + duration = static_cast>(end - start); + Log::Warning("duration = %f", duration.count()); + start = std::chrono::steady_clock::now(); + CopyFromCUDADeviceToHostOuter(result_buffer_.data(), cuda_result_buffer_, static_cast(num_data), __FILE__, __LINE__); + end = std::chrono::steady_clock::now(); + duration = static_cast>(end - start); + Log::Warning("copy result time = %f", duration.count()); + auto writer = VirtualFileWriter::Make(result_filename); + if (!writer->Init()) { + Log::Fatal("Prediction results file %s cannot be found", result_filename); + } + start = std::chrono::steady_clock::now(); + for (data_size_t i = 0; i < static_cast(result_buffer_.size()); ++i) { + std::string result = Common::Join({result_buffer_[i]}, "\t"); + writer->Write(result.c_str(), result.size()); + writer->Write("\n", 1); + } + end = std::chrono::steady_clock::now(); + duration = static_cast>(end - start); + Log::Warning("write result time = %f", duration.count()); +} + +int CUDAPredictor::ReadDataToCUDADevice(const char* data_filename, const bool header, const bool disable_shape_check) { + auto label_idx = header ? 
-1 : boosting_->LabelIdx(); + auto parser = std::unique_ptr(Parser::CreateParser(data_filename, header, boosting_->MaxFeatureIdx() + 1, label_idx)); + if (parser == nullptr) { + Log::Fatal("Could not recognize the data format of data file %s", data_filename); + } + if (!header && !disable_shape_check && parser->NumFeatures() != boosting_->MaxFeatureIdx() + 1) { + Log::Fatal("The number of features in data (%d) is not the same as it was in training data (%d).\n" \ + "You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.", + parser->NumFeatures(), boosting_->MaxFeatureIdx() + 1); + } + TextReader predict_data_reader(data_filename, header); + std::vector feature_remapper(parser->NumFeatures(), -1); + bool need_adjust = false; + if (header) { + std::string first_line = predict_data_reader.first_line(); + std::vector header_words = Common::Split(first_line.c_str(), "\t,"); + std::unordered_map header_mapper; + for (int i = 0; i < static_cast(header_words.size()); ++i) { + if (header_mapper.count(header_words[i]) > 0) { + Log::Fatal("Feature (%s) appears more than one time.", header_words[i].c_str()); + } + header_mapper[header_words[i]] = i; + } + const auto& fnames = boosting_->FeatureNames(); + for (int i = 0; i < static_cast(fnames.size()); ++i) { + if (header_mapper.count(fnames[i]) <= 0) { + Log::Warning("Feature (%s) is missed in data file. If it is weight/query/group/ignore_column, you can ignore this warning.", fnames[i].c_str()); + } else { + feature_remapper[header_mapper.at(fnames[i])] = i; + } + } + for (int i = 0; i < static_cast(feature_remapper.size()); ++i) { + if (feature_remapper[i] >= 0 && i != feature_remapper[i]) { + need_adjust = true; + break; + } + } + } + // function for parse data + std::function>*)> parser_fun; + double tmp_label; + parser_fun = [&parser, &feature_remapper, &tmp_label, need_adjust] + (const char* buffer, std::vector>* feature) { + parser->ParseOneLine(buffer, feature, &tmp_label); + if (need_adjust) { + int i = 0, j = static_cast(feature->size()); + while (i < j) { + if (feature_remapper[(*feature)[i].first] >= 0) { + (*feature)[i].first = feature_remapper[(*feature)[i].first]; + ++i; + } else { + // move the non-used features to the end of the feature vector + std::swap((*feature)[i], (*feature)[--j]); + } + } + feature->resize(i); + } + }; + const int num_threads = OMP_NUM_THREADS(); + std::vector> feature_index_buffer(num_threads); + std::vector> feature_value_buffer(num_threads); + std::vector> feature_value_num_buffer(num_threads); + predict_feature_index_.clear(); + predict_feature_value_.clear(); + predict_row_ptr_.clear(); + predict_row_ptr_.emplace_back(0); + auto start = std::chrono::steady_clock::now(); + std::function&)> + process_fun = [&parser_fun, this, &feature_index_buffer, &feature_value_buffer, &feature_value_num_buffer, num_threads]( + data_size_t /*start_index*/, const std::vector& lines) { + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + feature_index_buffer[thread_index].clear(); + feature_value_buffer[thread_index].clear(); + feature_value_num_buffer[thread_index].clear(); + } + std::vector thread_value_num_offset(num_threads + 1, 0); + std::vector thread_line_num_offset(num_threads + 1, 0); + Threading::For(0, static_cast(lines.size()), 512, + [parser_fun, &lines, &feature_index_buffer, &feature_value_buffer, &feature_value_num_buffer, &thread_value_num_offset, &thread_line_num_offset] + (int thread_index, data_size_t start, data_size_t end) 
{ + std::vector> oneline_features; + data_size_t num_values = 0; + for (data_size_t i = start; i < end; ++i) { + oneline_features.clear(); + // parser + parser_fun(lines[i].c_str(), &oneline_features); + for (const auto& pair : oneline_features) { + feature_index_buffer[thread_index].emplace_back(pair.first); + feature_value_buffer[thread_index].emplace_back(pair.second); + } + feature_value_num_buffer[thread_index].emplace_back(static_cast(oneline_features.size())); + num_values += static_cast(oneline_features.size()); + } + thread_value_num_offset[thread_index + 1] = num_values; + thread_line_num_offset[thread_index + 1] = end; + }); + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + thread_value_num_offset[thread_index + 1] += thread_value_num_offset[thread_index]; + } + const size_t old_num_value_size = predict_feature_index_.size(); + CHECK_EQ(old_num_value_size, predict_feature_value_.size()); + const size_t old_num_line_size = predict_row_ptr_.size(); + predict_feature_index_.resize(old_num_value_size + static_cast(thread_value_num_offset.back()), 0); + predict_feature_value_.resize(old_num_value_size + static_cast(thread_value_num_offset.back()), 0.0f); + predict_row_ptr_.resize(predict_row_ptr_.size() + lines.size(), 0); + int* predict_feature_index_ptr = predict_feature_index_.data() + old_num_value_size; + double* predict_feature_value_ptr = predict_feature_value_.data() + old_num_value_size; + data_size_t* predict_row_ptr_ptr = predict_row_ptr_.data() + old_num_line_size; + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + OMP_LOOP_EX_BEGIN(); + int* predict_feature_index_thread_ptr = predict_feature_index_ptr + thread_value_num_offset[thread_index]; + double* predict_feature_value_thread_ptr = predict_feature_value_ptr + thread_value_num_offset[thread_index]; + data_size_t* predict_row_ptr_thread_ptr = predict_row_ptr_ptr + thread_line_num_offset[thread_index]; + for (size_t i = 0; i < feature_index_buffer[thread_index].size(); ++i) { + predict_feature_index_thread_ptr[i] = feature_index_buffer[thread_index][i]; + predict_feature_value_thread_ptr[i] = feature_value_buffer[thread_index][i]; + } + for (size_t i = 0; i < feature_value_num_buffer[thread_index].size(); ++i) { + predict_row_ptr_thread_ptr[i] = feature_value_num_buffer[thread_index][i]; + } + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + }; + predict_data_reader.ReadAllAndProcessParallel(process_fun); + auto end = std::chrono::steady_clock::now(); + auto duration = static_cast>(end - start); + Log::Warning("read data to cpu duration = %f", duration.count()); + const data_size_t num_data = static_cast(predict_row_ptr_.size()) - 1; + GetPredictRowPtr(); + start = std::chrono::steady_clock::now(); + InitCUDAMemoryFromHostMemoryOuter(&cuda_predict_feature_value_, + predict_feature_value_.data(), + predict_feature_value_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_predict_feature_index_, + predict_feature_index_.data(), + predict_feature_index_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_predict_row_ptr_, + predict_row_ptr_.data(), + predict_row_ptr_.size(), + __FILE__, + __LINE__); + end = std::chrono::steady_clock::now(); + duration = static_cast>(end - start); + Log::Warning("read data to gpu duration = %f", duration.count()); + return num_data; +} + +void CUDAPredictor::GetPredictRowPtr() { + const int num_threads = 
OMP_NUM_THREADS(); + std::vector thread_offset(num_threads + 1, 0); + const data_size_t len = static_cast(predict_row_ptr_.size()); + Threading::For(0, len, 512, + [this, &thread_offset] (int thread_index, data_size_t start, data_size_t end) { + int num_value_in_thread = 0; + for (data_size_t i = start; i < end; ++i) { + num_value_in_thread += predict_row_ptr_[i]; + } + thread_offset[thread_index + 1] = num_value_in_thread; + }); + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + thread_offset[thread_index + 1] += thread_offset[thread_index]; + } + Threading::For(0, len, 512, + [this, &thread_offset] (int thread_index, data_size_t start, data_size_t end) { + int offset = thread_offset[thread_index]; + for (data_size_t i = start; i < end; ++i) { + const data_size_t num_feature_values = predict_row_ptr_[i]; + predict_row_ptr_[i] += offset; + offset += num_feature_values; + } + CHECK_EQ(offset, thread_offset[thread_index + 1]); + }); +} + +void CUDAPredictor::InitCUDAModel() { + const std::vector>& models = boosting_->models(); + const int num_trees = static_cast(models.size()); + num_trees_ = num_trees; + std::vector tree_num_leaves(num_trees, 0); + std::vector tree_left_child(num_trees, nullptr); + std::vector tree_right_child(num_trees, nullptr); + std::vector tree_leaf_value(num_trees, nullptr); + std::vector tree_threshold(num_trees, nullptr); + std::vector tree_decision_type(num_trees, nullptr); + std::vector tree_split_feature_index(num_trees, nullptr); + const int num_threads = OMP_NUM_THREADS(); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (num_trees >= 1024) + for (int tree_index = 0; tree_index < num_trees; ++tree_index) { + tree_num_leaves[tree_index] = models[tree_index]->num_leaves(); + CHECK(models[tree_index]->is_cuda_tree()); + const CUDATree* cuda_tree = reinterpret_cast(models[tree_index].get()); + tree_left_child[tree_index] = cuda_tree->cuda_left_child(); + tree_right_child[tree_index] = cuda_tree->cuda_right_child(); + tree_leaf_value[tree_index] = cuda_tree->cuda_leaf_value(); + tree_threshold[tree_index] = cuda_tree->cuda_threshold(); + tree_decision_type[tree_index] = cuda_tree->cuda_decision_type(); + tree_split_feature_index[tree_index] = cuda_tree->cuda_split_feature(); + } + InitCUDAMemoryFromHostMemoryOuter(&cuda_tree_num_leaves_, + tree_num_leaves.data(), + tree_num_leaves.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_left_child_, + tree_left_child.data(), + tree_left_child.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_right_child_, + tree_right_child.data(), + tree_right_child.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_leaf_value_, + tree_leaf_value.data(), + tree_leaf_value.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_threshold_, + tree_threshold.data(), + tree_threshold.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_decision_type_, + tree_decision_type.data(), + tree_decision_type.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_split_feature_index_, + tree_split_feature_index.data(), + tree_split_feature_index.size(), + __FILE__, + __LINE__); +} + +} // namespace LightGBM diff --git a/src/application/cuda/cuda_predictor.cu b/src/application/cuda/cuda_predictor.cu new file mode 100644 index 000000000000..68901a0950f6 --- /dev/null +++ b/src/application/cuda/cuda_predictor.cu @@ -0,0 +1,115 @@ +/*! 
+ * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include "cuda_predictor.hpp" + +namespace LightGBM { + +__global__ void PredictKernel(const data_size_t num_data, + const int num_feature, + const int* feature_index, + const double* feature_value, + const data_size_t* row_ptr, + const int* num_leaves, + const int** left_child, + const int** right_child, + const double** threshold, + const int8_t** decision_type, + const double** leaf_value, + const int** split_feature_index, + const int num_trees, + double* data, + double* cuda_result_buffer) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const unsigned int thread_index = threadIdx.x; + double* data_pointer = nullptr; + if (data_index < num_data) { + const data_size_t offset = row_ptr[data_index]; + data_pointer = data + offset; + for (int i = 0; i < num_feature; ++i) { + data_pointer[i] = 0.0f; + } + const data_size_t num_value = row_ptr[data_index + 1] - offset; + const int* data_feature_index = feature_index + offset; + const double* data_feature_value = feature_value + offset; + for (int value_index = 0; value_index < num_value; ++value_index) { + data_pointer[data_feature_index[value_index]] = data_feature_value[value_index]; + } + } + __shared__ double shared_tree_threshold[CUDA_PREDICTOR_MAX_TREE_SIZE]; + __shared__ int shared_tree_left_child[CUDA_PREDICTOR_MAX_TREE_SIZE]; + __shared__ int shared_tree_right_child[CUDA_PREDICTOR_MAX_TREE_SIZE]; + __shared__ int8_t shared_tree_decision_type[CUDA_PREDICTOR_MAX_TREE_SIZE]; + __shared__ double shared_tree_leaf_value[CUDA_PREDICTOR_MAX_TREE_SIZE]; + __shared__ int shared_tree_split_feature_index[CUDA_PREDICTOR_MAX_TREE_SIZE]; + for (int tree_index = 0; tree_index < num_trees; ++tree_index) { + const int tree_num_leaves = num_leaves[tree_index]; + const int* tree_left_child = left_child[tree_index]; + const int* tree_right_child = right_child[tree_index]; + const double* tree_threshold = threshold[tree_index]; + const double* tree_leaf_value = leaf_value[tree_index]; + const int8_t* tree_decision_type = decision_type[tree_index]; + const int* tree_split_feature_index = split_feature_index[tree_index]; + for (int leaf_index = static_cast(thread_index); leaf_index < tree_num_leaves; leaf_index += static_cast(blockDim.x)) { + shared_tree_threshold[leaf_index] = tree_threshold[leaf_index]; + shared_tree_left_child[leaf_index] = tree_left_child[leaf_index]; + shared_tree_right_child[leaf_index] = tree_right_child[leaf_index]; + shared_tree_leaf_value[leaf_index] = tree_leaf_value[leaf_index]; + shared_tree_decision_type[leaf_index] = tree_decision_type[leaf_index]; + shared_tree_split_feature_index[leaf_index] = tree_split_feature_index[leaf_index]; + } + __syncthreads(); + if (data_index < num_data) { + int node = 0; + while (node >= 0) { + const double node_threshold = shared_tree_threshold[node]; + const int node_split_feature_index = shared_tree_split_feature_index[node]; + const int8_t node_decision_type = shared_tree_decision_type[node]; + double value = data_pointer[node_split_feature_index]; + uint8_t missing_type = GetMissingTypeCUDA(node_decision_type); + if (isnan(value) && missing_type != MissingType::NaN) { + value = 0.0f; + } + if ((missing_type == MissingType::Zero && IsZeroCUDA(value)) || + (missing_type == MissingType::NaN && isnan(value))) { + if (GetDecisionTypeCUDA(node_decision_type, kDefaultLeftMask)) { + node 
= shared_tree_left_child[node]; + } else { + node = shared_tree_right_child[node]; + } + } else { + if (value <= node_threshold) { + node = shared_tree_left_child[node]; + } else { + node = shared_tree_right_child[node]; + } + } + } + cuda_result_buffer[data_index] += shared_tree_leaf_value[~node]; + } + } +} + +void CUDAPredictor::LaunchPredictKernel(const data_size_t num_data) { + const int num_blocks = (num_data + CUAA_PREDICTOR_PREDICT_BLOCK_SIZE - 1) / CUAA_PREDICTOR_PREDICT_BLOCK_SIZE; + PredictKernel<<>>( + num_data, + num_feature_, + cuda_predict_feature_index_, + cuda_predict_feature_value_, + cuda_predict_row_ptr_, + cuda_tree_num_leaves_, + cuda_left_child_, + cuda_right_child_, + cuda_threshold_, + cuda_decision_type_, + cuda_leaf_value_, + cuda_split_feature_index_, + num_trees_, + cuda_data_, + cuda_result_buffer_); +} + +} // namespace LightGBM diff --git a/src/application/cuda/cuda_predictor.hpp b/src/application/cuda/cuda_predictor.hpp new file mode 100644 index 000000000000..dd7033ff560e --- /dev/null +++ b/src/application/cuda/cuda_predictor.hpp @@ -0,0 +1,60 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ +#ifndef LIGHTGBM_APPLICATION_CUDA_CUDA_PREDICTOR_HPP_ +#define LIGHTGBM_APPLICATION_CUDA_CUDA_PREDICTOR_HPP_ + +#include +#include +#include + +#include "../predictor.hpp" + +#define CUDA_PREDICTOR_MAX_TREE_SIZE (1024) +#define CUAA_PREDICTOR_PREDICT_BLOCK_SIZE (1024) + +namespace LightGBM { + +class CUDAPredictor : public Predictor { + public: + CUDAPredictor(Boosting* boosting, int start_iteration, int num_iteration, bool is_raw_score, + bool predict_leaf_index, bool predict_contrib, bool early_stop, + int early_stop_freq, double early_stop_margin); + + ~CUDAPredictor(); + + virtual void Predict(const char* data_filename, const char* result_filename, bool header, bool disable_shape_check) override; + private: + void InitCUDAModel(); + + data_size_t ReadDataToCUDADevice(const char* data_filename, const bool header, const bool diable_shape_check); + + void LaunchPredictKernel(const data_size_t num_data); + + void GetPredictRowPtr(); + + std::vector predict_feature_index_; + std::vector predict_feature_value_; + std::vector predict_row_ptr_; + std::vector result_buffer_; + int num_trees_; + + int* cuda_predict_feature_index_; + double* cuda_predict_feature_value_; + data_size_t* cuda_predict_row_ptr_; + double* cuda_result_buffer_; + double* cuda_data_; + + int* cuda_tree_num_leaves_; + const int** cuda_left_child_; + const int** cuda_right_child_; + const double** cuda_threshold_; + const int8_t** cuda_decision_type_; + const double** cuda_leaf_value_; + const int** cuda_split_feature_index_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_APPLICATION_CUDA_CUDA_PREDICTOR_HPP_ diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp index 472a4ab1414e..808bd2bbe85a 100644 --- a/src/application/predictor.hpp +++ b/src/application/predictor.hpp @@ -160,7 +160,7 @@ class Predictor { * \param data_filename Filename of data * \param result_filename Filename of output result */ - void Predict(const char* data_filename, const char* result_filename, bool header, bool disable_shape_check) { + virtual void Predict(const char* data_filename, const char* result_filename, bool header, bool disable_shape_check) { auto writer = VirtualFileWriter::Make(result_filename); if (!writer->Init()) { Log::Fatal("Prediction results file %s 
cannot be found", result_filename); @@ -249,10 +249,14 @@ class Predictor { writer->Write("\n", 1); } }; + auto start = std::chrono::steady_clock::now(); predict_data_reader.ReadAllAndProcessParallel(process_fun); + auto end = std::chrono::steady_clock::now(); + auto duration = static_cast>(end - start); + Log::Warning("duration cpu = %f", duration.count()); } - private: + protected: void CopyToPredictBuffer(double* pred_buf, const std::vector>& features) { for (const auto &feature : features) { if (feature.first < num_feature_) { diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index ea9413a17e37..23c776531187 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -525,10 +525,14 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { } std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const { - std::vector tmp_score(num_data, 0.0f); - CopyFromCUDADeviceToHostOuter(tmp_score.data(), score, static_cast(num_data), __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - return metric->Eval(tmp_score.data(), objective_function_); + if (config_->device_type == std::string("cuda")) { + std::vector tmp_score(num_data, 0.0f); + CopyFromCUDADeviceToHostOuter(tmp_score.data(), score, static_cast(num_data), __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + return metric->Eval(tmp_score.data(), objective_function_); + } else { + return metric->Eval(score, objective_function_); + } } std::string GBDT::OutputMetric(int iter) { diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 8da261389cf1..9611a2cf1a7d 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -396,6 +396,8 @@ class GBDT : public GBDTBase { bool IsLinear() const override { return linear_tree_; } + const std::vector>& models() const override { return models_; } + protected: virtual bool GetIsConstHessian(const ObjectiveFunction* objective_function) { if (objective_function != nullptr) { diff --git a/src/c_api.cpp b/src/c_api.cpp index b0f828f5072f..4ac496108e03 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -26,6 +26,7 @@ #include #include +#include "application/cuda/cuda_predictor.hpp" #include "application/predictor.hpp" #include #include @@ -416,8 +417,13 @@ class Booster { is_raw_score = false; } - return Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, + if (config.device_type == "cuda") { + return CUDAPredictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, + config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); + } else { + return Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); + } } void Predict(int start_iteration, int num_iteration, int predict_type, int nrow, int ncol, @@ -706,10 +712,17 @@ class Booster { } else { is_raw_score = false; } - Predictor predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, - config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); + std::unique_ptr predictor; + if (config.device_type == std::string("cuda")) { + predictor.reset(new CUDAPredictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, + config.pred_early_stop, config.pred_early_stop_freq, 
config.pred_early_stop_margin)); + } else { + Log::Warning("predict with cpu"); + predictor.reset(new Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, + config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin)); + } bool bool_data_has_header = data_has_header > 0 ? true : false; - predictor.Predict(data_filename, result_filename, bool_data_has_header, config.predict_disable_shape_check); + predictor->Predict(data_filename, result_filename, bool_data_has_header, config.predict_disable_shape_check); } void GetPredictAt(int data_idx, double* out_result, int64_t* out_len) const { diff --git a/src/io/cuda/cuda_column_data.cpp b/src/io/cuda/cuda_column_data.cpp index 34a3631d1271..39b02ed6916a 100644 --- a/src/io/cuda/cuda_column_data.cpp +++ b/src/io/cuda/cuda_column_data.cpp @@ -25,28 +25,28 @@ void CUDAColumnData::InitOneColumnData(const void* in_column_data, BinIterator* expanded_column_data[i] = static_cast((in_column_data_reintrepreted[i >> 1] >> ((i & 1) << 2)) & 0xf); } InitCUDAMemoryFromHostMemoryOuter(&cuda_column_data, - expanded_column_data.data(), - static_cast(num_data_), - __FILE__, - __LINE__); + expanded_column_data.data(), + static_cast(num_data_), + __FILE__, + __LINE__); } else { InitCUDAMemoryFromHostMemoryOuter(&cuda_column_data, - reinterpret_cast(in_column_data), - static_cast(num_data_), - __FILE__, - __LINE__); + reinterpret_cast(in_column_data), + static_cast(num_data_), + __FILE__, + __LINE__); } } else { // need to iterate bin iterator std::vector expanded_column_data(num_data_, 0); for (data_size_t i = 0; i < num_data_; ++i) { - expanded_column_data[i] = static_cast(bin_iterator->Get(i)); + expanded_column_data[i] = static_cast(bin_iterator->RawGet(i)); } InitCUDAMemoryFromHostMemoryOuter(&cuda_column_data, - reinterpret_cast(in_column_data), - static_cast(num_data_), - __FILE__, - __LINE__); + expanded_column_data.data(), + static_cast(num_data_), + __FILE__, + __LINE__); } *out_column_data_pointer = reinterpret_cast(cuda_column_data); } @@ -84,30 +84,30 @@ void CUDAColumnData::Init(const int num_columns, // is dense column if (bit_type == 4) { column_bit_type_[column_index] = 8; - InitOneColumnData(column_data[column_index], column_bin_iterator[column_index], &data_by_column_[column_index]); + InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); } else if (bit_type == 8) { - InitOneColumnData(column_data[column_index], column_bin_iterator[column_index], &data_by_column_[column_index]); + InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); } else if (bit_type == 16) { - InitOneColumnData(column_data[column_index], column_bin_iterator[column_index], &data_by_column_[column_index]); + InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); } else if (bit_type == 32) { - InitOneColumnData(column_data[column_index], column_bin_iterator[column_index], &data_by_column_[column_index]); + InitOneColumnData(column_data[column_index], nullptr, &data_by_column_[column_index]); } else { Log::Fatal("Unknow column bit type %d", bit_type); } } else { // is sparse column if (bit_type == 8) { - InitOneColumnData(column_data[column_index], column_bin_iterator[column_index], &data_by_column_[column_index]); + InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); } else if (bit_type == 16) { - InitOneColumnData(column_data[column_index], 
column_bin_iterator[column_index], &data_by_column_[column_index]); + InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); } else if (bit_type == 32) { - InitOneColumnData(column_data[column_index], column_bin_iterator[column_index], &data_by_column_[column_index]); + InitOneColumnData(nullptr, column_bin_iterator[column_index], &data_by_column_[column_index]); } else { Log::Fatal("Unknow column bit type %d", bit_type); } } - feature_to_column_ = feature_to_column; } + feature_to_column_ = feature_to_column; InitCUDAMemoryFromHostMemoryOuter(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index d0a2ce697090..6fecde8f149a 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -209,6 +209,8 @@ void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingSha __FILE__, __LINE__); + Log::Warning("num_columns_ = %d", column_index); + Log::Warning("column_hist_offsets_.size() = %d", column_hist_offsets_.size()); InitCUDAMemoryFromHostMemoryOuter(&cuda_column_hist_offsets_, column_hist_offsets_.data(), column_hist_offsets_.size(), diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index 4e290b235945..f222600c7a31 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -84,10 +84,13 @@ void CUDATree::InitCUDAMemory() { static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_split_gain_, + AllocateCUDAMemoryOuter(&cuda_split_gain_, static_cast(max_leaves_), __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_leaf_parent_, 0, 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_leaf_value_, 0.0f, 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_leaf_weight_, 0.0f, 1, __FILE__, __LINE__); SetCUDAMemoryOuter(cuda_leaf_parent_, -1, 1, __FILE__, __LINE__); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); } @@ -166,4 +169,42 @@ inline void CUDATree::Shrinkage(double rate) { LaunchShrinkageKernel(rate); } +void CUDATree::ToHost() { + left_child_.resize(max_leaves_ - 1); + right_child_.resize(max_leaves_ - 1); + split_feature_inner_.resize(max_leaves_ - 1); + split_feature_.resize(max_leaves_ - 1); + threshold_in_bin_.resize(max_leaves_ - 1); + threshold_.resize(max_leaves_ - 1); + decision_type_.resize(max_leaves_ - 1, 0); + split_gain_.resize(max_leaves_ - 1); + leaf_parent_.resize(max_leaves_); + leaf_value_.resize(max_leaves_); + leaf_weight_.resize(max_leaves_); + leaf_count_.resize(max_leaves_); + internal_value_.resize(max_leaves_ - 1); + internal_weight_.resize(max_leaves_ - 1); + internal_count_.resize(max_leaves_ - 1); + leaf_depth_.resize(max_leaves_); + + const size_t num_leaves_size = static_cast(num_leaves_); + CopyFromCUDADeviceToHostOuter(left_child_.data(), cuda_left_child_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(right_child_.data(), cuda_right_child_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(split_feature_inner_.data(), cuda_split_feature_inner_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(split_feature_.data(), cuda_split_feature_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(threshold_in_bin_.data(), cuda_threshold_in_bin_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(threshold_.data(), cuda_threshold_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(decision_type_.data(), 
cuda_decision_type_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(split_gain_.data(), cuda_split_gain_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(leaf_parent_.data(), cuda_leaf_parent_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(leaf_value_.data(), cuda_leaf_value_, num_leaves_size, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(leaf_weight_.data(), cuda_leaf_weight_, num_leaves_size, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(leaf_count_.data(), cuda_leaf_count_, num_leaves_size, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(internal_value_.data(), cuda_internal_value_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(internal_weight_.data(), cuda_internal_weight_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(internal_count_.data(), cuda_internal_count_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(leaf_depth_.data(), cuda_leaf_depth_, num_leaves_size, __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + } // namespace LightGBM diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index bdebf2f3a0ba..c600411ec19a 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -7,7 +7,7 @@ namespace LightGBM { -__device__ void SetDecisionType(int8_t* decision_type, bool input, int8_t mask) { +__device__ void SetDecisionTypeCUDA(int8_t* decision_type, bool input, int8_t mask) { if (input) { (*decision_type) |= mask; } else { @@ -15,11 +15,23 @@ __device__ void SetDecisionType(int8_t* decision_type, bool input, int8_t mask) } } -__device__ void SetMissingType(int8_t* decision_type, int8_t input) { +__device__ void SetMissingTypeCUDA(int8_t* decision_type, int8_t input) { (*decision_type) &= 3; (*decision_type) |= (input << 2); } +__device__ bool GetDecisionTypeCUDA(int8_t decision_type, int8_t mask) { + return (decision_type & mask) > 0; +} + +__device__ int8_t GetMissingTypeCUDA(int8_t decision_type) { + return (decision_type >> 2) & 3; +} + +__device__ bool IsZeroCUDA(double fval) { + return (fval >= -kZeroThreshold && fval <= kZeroThreshold); +} + __global__ void SplitKernel(// split information const int leaf_index, const int real_feature_index, @@ -34,7 +46,7 @@ __global__ void SplitKernel(// split information int* right_child, int* split_feature_inner, int* split_feature, - double* split_gain, + float* split_gain, double* internal_weight, double* internal_value, data_size_t* internal_count, @@ -62,7 +74,7 @@ __global__ void SplitKernel(// split information } else if (thread_index == 2) { split_feature[new_node_index] = real_feature_index; } else if (thread_index == 3) { - split_gain[new_node_index] = cuda_split_info->gain; + split_gain[new_node_index] = static_cast(cuda_split_info->gain); } else if (thread_index == 4) { // add two new leaves left_child[new_node_index] = ~leaf_index; @@ -79,13 +91,13 @@ __global__ void SplitKernel(// split information leaf_weight[leaf_index] = cuda_split_info->left_sum_hessians; } else if (thread_index == 9) { internal_value[new_node_index] = leaf_value[leaf_index]; - leaf_value[leaf_index] = std::isnan(cuda_split_info->left_value) ? 0.0f : cuda_split_info->left_value; + leaf_value[leaf_index] = isnan(cuda_split_info->left_value) ? 
0.0f : cuda_split_info->left_value; } else if (thread_index == 10) { internal_count[new_node_index] = cuda_split_info->left_count + cuda_split_info->right_count; } else if (thread_index == 11) { leaf_count[leaf_index] = cuda_split_info->left_count; } else if (thread_index == 12) { - leaf_value[num_leaves] = std::isnan(cuda_split_info->right_value) ? 0.0f : cuda_split_info->right_value; + leaf_value[num_leaves] = isnan(cuda_split_info->right_value) ? 0.0f : cuda_split_info->right_value; } else if (thread_index == 13) { leaf_weight[num_leaves] = cuda_split_info->right_sum_hessians; } else if (thread_index == 14) { @@ -96,9 +108,9 @@ __global__ void SplitKernel(// split information leaf_depth[leaf_index]++; } else if (thread_index == 16) { decision_type[new_node_index] = 0; - SetDecisionType(&decision_type[new_node_index], false, kCategoricalMask); - SetDecisionType(&decision_type[new_node_index], cuda_split_info->default_left, kDefaultLeftMask); - SetMissingType(&decision_type[new_node_index], static_cast(missing_type)); + SetDecisionTypeCUDA(&decision_type[new_node_index], false, kCategoricalMask); + SetDecisionTypeCUDA(&decision_type[new_node_index], cuda_split_info->default_left, kDefaultLeftMask); + SetMissingTypeCUDA(&decision_type[new_node_index], static_cast(missing_type)); } else if (thread_index == 17) { threshold_in_bin[new_node_index] = cuda_split_info->threshold; } else if (thread_index == 18) { @@ -164,7 +176,6 @@ __global__ void AddPredictionToScoreKernel( const data_size_t data_index = USE_INDICES ? cuda_used_indices[inner_data_index] : inner_data_index; if (data_index < num_data) { int node = 0; - int num_iter = 0; while (node >= 0) { const int split_feature_inner = cuda_split_feature_inner[node]; const int column = cuda_feature_to_column[split_feature_inner]; @@ -204,10 +215,6 @@ __global__ void AddPredictionToScoreKernel( node = cuda_right_child[node]; } } - ++num_iter; - if (num_iter >= 1000) { - printf("error num_iter = %d, node = %d, ~node = %d\n", num_iter, node, ~node); - } } score[data_index] += cuda_leaf_value[~node]; } @@ -219,9 +226,6 @@ void CUDATree::LaunchAddPredictionToScoreKernel( data_size_t num_data, double* score) const { const CUDAColumnData* cuda_column_data = data->cuda_column_data(); - if (cuda_column_data == nullptr) { - Log::Warning("error cuda_column_data is nullptr"); - } const int num_blocks = (num_data + num_threads_per_block_add_prediction_to_score_ - 1) / num_threads_per_block_add_prediction_to_score_; if (used_data_indices == nullptr) { AddPredictionToScoreKernel<<>>( diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 9e32a0ea5bfb..aa21cfa319d6 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1515,6 +1515,7 @@ void Dataset::CreateCUDAColumnData() { std::vector feature_missing_is_na(num_features_, 0); std::vector feature_mfb_is_zero(num_features_, 0); std::vector feature_mfb_is_na(num_features_, 0); + Log::Warning("num_groups_ = %d", num_groups_); for (int feature_group_index = 0; feature_group_index < num_groups_; ++feature_group_index) { if (feature_groups_[feature_group_index]->is_multi_val_) { for (int sub_feature_index = 0; sub_feature_index < feature_groups_[feature_group_index]->num_feature_; ++sub_feature_index) { @@ -1562,26 +1563,28 @@ void Dataset::CreateCUDAColumnData() { column_data.emplace_back(one_column_data); column_bin_iterator.emplace_back(bin_iterator); column_bit_type.emplace_back(bit_type); - feature_to_column[feature_index] = num_columns; + for (int sub_feature_index = 0; sub_feature_index < 
feature_groups_[feature_group_index]->num_feature_; ++sub_feature_index) { + feature_to_column[feature_index] = num_columns; + const BinMapper* feature_bin_mapper = FeatureBinMapper(feature_index); + feature_max_bins[feature_index] = feature_max_bin(feature_index); + feature_min_bins[feature_index] = feature_min_bin(feature_index); + const uint32_t most_freq_bin = feature_bin_mapper->GetMostFreqBin(); + feature_offsets[feature_index] = static_cast(most_freq_bin == 0); + feature_most_freq_bins[feature_index] = most_freq_bin; + feature_default_bin[feature_index] = feature_bin_mapper->GetDefaultBin(); + if (feature_bin_mapper->missing_type() == MissingType::Zero) { + feature_missing_is_zero.emplace_back(1); + feature_missing_is_na.emplace_back(0); + } else if (feature_bin_mapper->missing_type() == MissingType::NaN) { + feature_missing_is_zero.emplace_back(0); + feature_missing_is_na.emplace_back(1); + } else { + feature_missing_is_zero.emplace_back(0); + feature_missing_is_na.emplace_back(0); + } + ++feature_index; + } ++num_columns; - const BinMapper* feature_bin_mapper = FeatureBinMapper(feature_index); - feature_max_bins[feature_index] = feature_max_bin(feature_index); - feature_min_bins[feature_index] = feature_min_bin(feature_index); - const uint32_t most_freq_bin = feature_bin_mapper->GetMostFreqBin(); - feature_offsets[feature_index] = static_cast(most_freq_bin == 0); - feature_most_freq_bins[feature_index] = most_freq_bin; - feature_default_bin[feature_index] = feature_bin_mapper->GetDefaultBin(); - if (feature_bin_mapper->missing_type() == MissingType::Zero) { - feature_missing_is_zero.emplace_back(1); - feature_missing_is_na.emplace_back(0); - } else if (feature_bin_mapper->missing_type() == MissingType::NaN) { - feature_missing_is_zero.emplace_back(0); - feature_missing_is_na.emplace_back(1); - } else { - feature_missing_is_zero.emplace_back(0); - feature_missing_is_na.emplace_back(0); - } - ++feature_index; } } cuda_column_data_->Init(num_columns, diff --git a/src/objective/cuda/cuda_binary_objective.hpp b/src/objective/cuda/cuda_binary_objective.hpp index d67f70eb3a1d..43826f4ba36d 100644 --- a/src/objective/cuda/cuda_binary_objective.hpp +++ b/src/objective/cuda/cuda_binary_objective.hpp @@ -40,10 +40,11 @@ class CUDABinaryLogloss : public CUDAObjectiveInterface, public BinaryLogloss { // CUDA memory, held by other objects const label_t* cuda_label_; + // TODO(shiyu1994): add weighted gradients const label_t* cuda_weights_; // CUDA memory, held by this object - mutable double* cuda_boost_from_score_; + double* cuda_boost_from_score_; }; } // namespace LightGBM diff --git a/src/objective/cuda/cuda_rank_objective.cpp b/src/objective/cuda/cuda_rank_objective.cpp new file mode 100644 index 000000000000..7a9342c3e2f9 --- /dev/null +++ b/src/objective/cuda/cuda_rank_objective.cpp @@ -0,0 +1,58 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifdef USE_CUDA + +#include "cuda_rank_objective.hpp" + +namespace LightGBM { + +CUDALambdarankNDCG::CUDALambdarankNDCG(const Config& config): +LambdarankNDCG(config) {} + +void CUDALambdarankNDCG::Init(const Metadata& metadata, data_size_t num_data) { + const int num_threads = OMP_NUM_THREADS(); + LambdarankNDCG::Init(metadata, num_data); + + std::vector thread_max_num_items_in_query(num_threads); + Threading::For(0, num_queries_, 1, + [this, &thread_max_num_items_in_query] (int thread_index, data_size_t start, data_size_t end) { + for (data_size_t query_index = start; query_index < end; ++query_index) { + const data_size_t query_item_count = query_boundaries_[query_index + 1] - query_boundaries_[query_index]; + if (query_item_count > thread_max_num_items_in_query[thread_index]) { + thread_max_num_items_in_query[thread_index] = query_item_count; + } + } + }); + data_size_t max_items_in_query = 0; + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + if (thread_max_num_items_in_query[thread_index] > max_items_in_query) { + max_items_in_query = thread_max_num_items_in_query[thread_index]; + } + } + max_items_in_query_aligned_ = 1; + --max_items_in_query; + while (max_items_in_query > 0) { + max_items_in_query >>= 1; + max_items_in_query_aligned_ <<= 1; + } + if (max_items_in_query_aligned_ > MAX_NUM_ITEM_IN_QUERY) { + Log::Warning("Too many items (%d) in a query.", max_items_in_query_aligned_); + } + cuda_labels_ = metadata.cuda_metadata()->cuda_label(); + cuda_query_boundaries_ = metadata.cuda_metadata()->cuda_query_boundaries(); + AllocateCUDAMemoryOuter(&cuda_lambdas_, num_data_, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_inverse_max_dcgs_, num_queries_, __FILE__, __LINE__); + LaunchCalcInverseMaxDCGKernel(); +} + +void CUDALambdarankNDCG::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { + LaunchGetGradientsKernel(score, gradients, hessians); +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_rank_objective.cu b/src/objective/cuda/cuda_rank_objective.cu new file mode 100644 index 000000000000..637c1838b81b --- /dev/null +++ b/src/objective/cuda/cuda_rank_objective.cu @@ -0,0 +1,517 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifdef USE_CUDA + +#include "cuda_rank_objective.hpp" + +namespace LightGBM { + +__device__ void ArgSort(const score_t* scores, uint16_t* indices, const uint16_t num_items) { + uint16_t num_items_aligned = 1; + uint16_t num_items_ref = num_items - 1; + uint16_t depth = 1; + while (num_items_ref > 0) { + num_items_aligned <<= 1; + num_items_ref >>= 1; + ++depth; + } + for (uint16_t outer_depth = depth - 1; outer_depth >= 1; --outer_depth) { + const uint16_t outer_segment_length = 1 << (depth - outer_depth); + const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; + const bool ascending = (outer_segment_index % 2 > 0); + for (uint16_t inner_depth = outer_depth; inner_depth < depth; ++inner_depth) { + const uint16_t segment_length = 1 << (depth - inner_depth); + const uint16_t half_segment_length = segment_length >> 1; + const uint16_t half_segment_index = threadIdx.x / half_segment_length; + if (threadIdx.x < num_items_aligned) { + if (half_segment_index % 2 == 0) { + const uint16_t index_to_compare = threadIdx.x + half_segment_length; + if ((scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) == ascending) { + const uint16_t index = indices[threadIdx.x]; + indices[threadIdx.x] = indices[index_to_compare]; + indices[index_to_compare] = index; + } + } + } + __syncthreads(); + } + } +} + +__device__ void ArgSort_Partial(const score_t* scores, uint16_t* indices, const uint16_t num_items, const bool outer_decending) { + uint16_t num_items_aligned = 1; + uint16_t num_items_ref = num_items - 1; + uint16_t depth = 1; + while (num_items_ref > 0) { + num_items_aligned <<= 1; + num_items_ref >>= 1; + ++depth; + } + for (uint16_t outer_depth = depth - 1; outer_depth >= 1; --outer_depth) { + const uint16_t outer_segment_length = 1 << (depth - outer_depth); + const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; + const bool ascending = outer_decending ? 
(outer_segment_index % 2 > 0) : (outer_segment_index % 2 == 0); + for (uint16_t inner_depth = outer_depth; inner_depth < depth; ++inner_depth) { + const uint16_t segment_length = 1 << (depth - inner_depth); + const uint16_t half_segment_length = segment_length >> 1; + const uint16_t half_segment_index = threadIdx.x / half_segment_length; + if (threadIdx.x < num_items_aligned) { + if (half_segment_index % 2 == 0) { + const uint16_t index_to_compare = threadIdx.x + half_segment_length; + if ((scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) == ascending) { + const uint16_t index = indices[threadIdx.x]; + indices[threadIdx.x] = indices[index_to_compare]; + indices[index_to_compare] = index; + } + } + } + __syncthreads(); + } + } +} + +__device__ void ArgSort_2048(const score_t* scores, uint16_t* indices, const uint16_t num_items) { + const uint16_t depth = 11; + const uint16_t half_num_items_aligned = 1024; + ArgSort_Partial(scores, indices, half_num_items_aligned, true); + ArgSort_Partial(scores + half_num_items_aligned, indices + half_num_items_aligned, half_num_items_aligned, false); + const unsigned int index_to_compare = threadIdx.x + half_num_items_aligned; + if (scores[indices[index_to_compare]] > scores[indices[threadIdx.x]]) { + const uint16_t temp_index = indices[index_to_compare]; + indices[index_to_compare] = indices[threadIdx.x]; + indices[threadIdx.x] = temp_index; + } + __syncthreads(); + for (uint16_t inner_depth = 1; inner_depth < depth; ++inner_depth) { + const uint16_t segment_length = 1 << (depth - inner_depth); + const uint16_t half_segment_length = segment_length >> 1; + const uint16_t half_segment_index = threadIdx.x / half_segment_length; + if (threadIdx.x < half_num_items_aligned) { + if (half_segment_index % 2 == 0) { + const uint16_t index_to_compare = threadIdx.x + half_segment_length; + if (scores[indices[threadIdx.x]] < scores[indices[index_to_compare]]) { + const uint16_t index = indices[threadIdx.x]; + indices[threadIdx.x] = indices[index_to_compare]; + indices[index_to_compare] = index; + } + } + } + __syncthreads(); + } + const score_t* scores_ptr = scores + half_num_items_aligned; + uint16_t* indices_ptr = indices + half_num_items_aligned; + for (uint16_t inner_depth = 1; inner_depth < depth; ++inner_depth) { + const uint16_t segment_length = 1 << (depth - inner_depth); + const uint16_t half_segment_length = segment_length >> 1; + const uint16_t half_segment_index = threadIdx.x / half_segment_length; + if (threadIdx.x < half_num_items_aligned) { + if (half_segment_index % 2 == 0) { + const uint16_t index_to_compare = threadIdx.x + half_segment_length; + if (scores_ptr[indices_ptr[threadIdx.x]] < scores_ptr[indices_ptr[index_to_compare]]) { + const uint16_t index = indices_ptr[threadIdx.x]; + indices_ptr[threadIdx.x] = indices_ptr[index_to_compare]; + indices_ptr[index_to_compare] = index; + } + } + } + __syncthreads(); + } +} + +__global__ void GetGradientsKernel_Ranking(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, + const data_size_t num_queries, const data_size_t* cuda_query_boundaries, const double* cuda_inverse_max_dcgs, + const bool norm, const double sigmoid, const int truncation_level, + score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + __shared__ score_t shared_scores[MAX_NUM_ITEM_IN_QUERY]; + __shared__ uint16_t shared_indices[MAX_NUM_ITEM_IN_QUERY]; + __shared__ score_t shared_lambdas[MAX_NUM_ITEM_IN_QUERY]; + __shared__ score_t shared_hessians[MAX_NUM_ITEM_IN_QUERY]; + const 
data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; + const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); + const double min_score = kMinScore; + for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { + const double inverse_max_dcg = cuda_inverse_max_dcgs[query_index]; + const data_size_t query_start = cuda_query_boundaries[query_index]; + const data_size_t query_end = cuda_query_boundaries[query_index + 1]; + const data_size_t query_item_count = query_end - query_start; + const double* cuda_scores_pointer = cuda_scores + query_start; + score_t* cuda_out_gradients_pointer = cuda_out_gradients + query_start; + score_t* cuda_out_hessians_pointer = cuda_out_hessians + query_start; + const label_t* cuda_label_pointer = cuda_labels + query_start; + if (threadIdx.x < query_item_count) { + shared_scores[threadIdx.x] = cuda_scores_pointer[threadIdx.x]; + shared_indices[threadIdx.x] = static_cast(threadIdx.x); + shared_lambdas[threadIdx.x] = 0.0f; + shared_hessians[threadIdx.x] = 0.0f; + } else { + shared_scores[threadIdx.x] = min_score; + shared_indices[threadIdx.x] = static_cast(threadIdx.x); + } + __syncthreads(); + ArgSort(shared_scores, shared_indices, static_cast(query_item_count)); + __syncthreads(); + // get best and worst score + const double best_score = shared_scores[shared_indices[0]]; + data_size_t worst_idx = query_item_count - 1; + if (worst_idx > 0 && shared_scores[shared_indices[worst_idx]] == min_score) { + worst_idx -= 1; + } + const double worst_score = shared_scores[shared_indices[worst_idx]]; + __shared__ double sum_lambdas; + if (threadIdx.x == 0) { + sum_lambdas = 0.0f; + } + __syncthreads(); + // start accumulate lambdas by pairs that contain at least one document above truncation level + const data_size_t num_items_i = min(query_item_count - 1, truncation_level); + const data_size_t num_j_per_i = query_item_count - 1; + const data_size_t num_pairs = num_items_i * num_j_per_i; + const data_size_t num_pairs_per_thread = (num_pairs + blockDim.x - 1) / blockDim.x; + const data_size_t thread_start = static_cast(threadIdx.x) * num_pairs_per_thread; + const data_size_t thread_end = min(thread_start + num_pairs_per_thread, num_pairs); + for (data_size_t pair_index = thread_start; pair_index < thread_end; ++pair_index) { + const data_size_t i = pair_index / num_j_per_i; + const data_size_t j = pair_index % num_j_per_i + 1; + if (j > i) { + // skip pairs with the same labels + if (cuda_label_pointer[shared_indices[i]] != cuda_label_pointer[shared_indices[j]] && shared_scores[shared_indices[j]] != min_score) { + data_size_t high_rank, low_rank; + if (cuda_label_pointer[shared_indices[i]] > cuda_label_pointer[shared_indices[j]]) { + high_rank = i; + low_rank = j; + } else { + high_rank = j; + low_rank = i; + } + const data_size_t high = shared_indices[high_rank]; + const int high_label = static_cast(cuda_label_pointer[high]); + const double high_score = shared_scores[high]; + const double high_label_gain = static_cast((1 << high_label) - 1); + const double high_discount = log2(2.0f + high_rank); + const data_size_t low = shared_indices[low_rank]; + const int low_label = static_cast(cuda_label_pointer[low]); + const double low_score = shared_scores[low]; + const double low_label_gain = static_cast((1 << low_label) - 1); + const double low_discount = log2(2.0f + low_rank); + + const double delta_score = high_score - low_score; + + // get dcg gap + const double dcg_gap = 
high_label_gain - low_label_gain; + // get discount of this pair + const double paired_discount = fabs(high_discount - low_discount); + // get delta NDCG + double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg; + // regular the delta_pair_NDCG by score distance + if (norm && best_score != worst_score) { + delta_pair_NDCG /= (0.01f + fabs(delta_score)); + } + // calculate lambda for this pair + double p_lambda = 1.0f / (1.0f + exp(sigmoid * delta_score)); + double p_hessian = p_lambda * (1.0f - p_lambda); + // update + p_lambda *= -sigmoid * delta_pair_NDCG; + p_hessian *= sigmoid * sigmoid * delta_pair_NDCG; + atomicAdd_block(shared_lambdas + low, -static_cast(p_lambda)); + atomicAdd_block(shared_hessians + low, static_cast(p_hessian)); + atomicAdd_block(shared_lambdas + high, static_cast(p_lambda)); + atomicAdd_block(shared_hessians + high, static_cast(p_hessian)); + // lambda is negative, so use minus to accumulate + atomicAdd_block(&sum_lambdas, -2 * p_lambda); + } + } + } + __syncthreads(); + if (norm && sum_lambdas > 0) { + double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas; + if (threadIdx.x < static_cast(query_item_count)) { + cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x] * norm_factor); + cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x] * norm_factor); + } + } else { + if (threadIdx.x < static_cast(query_item_count)) { + cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x]); + cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x]); + } + } + __syncthreads(); + } +} + +__global__ void GetGradientsKernel_Ranking_2048(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, + const data_size_t num_queries, const data_size_t* cuda_query_boundaries, const double* cuda_inverse_max_dcgs, + const bool norm, const double sigmoid, const int truncation_level, + score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + __shared__ score_t shared_scores[MAX_NUM_ITEM_IN_QUERY]; + __shared__ uint16_t shared_indices[MAX_NUM_ITEM_IN_QUERY]; + __shared__ score_t shared_lambdas[MAX_NUM_ITEM_IN_QUERY]; + __shared__ score_t shared_hessians[MAX_NUM_ITEM_IN_QUERY]; + const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; + const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); + const double min_score = kMinScore; + for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { + const double inverse_max_dcg = cuda_inverse_max_dcgs[query_index]; + const data_size_t query_start = cuda_query_boundaries[query_index]; + const data_size_t query_end = cuda_query_boundaries[query_index + 1]; + const data_size_t query_item_count = query_end - query_start; + const double* cuda_scores_pointer = cuda_scores + query_start; + score_t* cuda_out_gradients_pointer = cuda_out_gradients + query_start; + score_t* cuda_out_hessians_pointer = cuda_out_hessians + query_start; + const label_t* cuda_label_pointer = cuda_labels + query_start; + if (threadIdx.x < query_item_count) { + shared_scores[threadIdx.x] = cuda_scores_pointer[threadIdx.x]; + shared_indices[threadIdx.x] = static_cast(threadIdx.x); + shared_lambdas[threadIdx.x] = 0.0f; + shared_hessians[threadIdx.x] = 0.0f; + } else { + shared_scores[threadIdx.x] = min_score; + shared_indices[threadIdx.x] = static_cast(threadIdx.x); + } + if (query_item_count > 1024) { + const unsigned int 
threadIdx_x_plus_1024 = threadIdx.x + 1024; + if (threadIdx_x_plus_1024 < query_item_count) { + shared_scores[threadIdx_x_plus_1024] = cuda_scores_pointer[threadIdx_x_plus_1024]; + shared_indices[threadIdx_x_plus_1024] = static_cast(threadIdx_x_plus_1024); + shared_lambdas[threadIdx_x_plus_1024] = 0.0f; + shared_hessians[threadIdx_x_plus_1024] = 0.0f; + } else { + shared_scores[threadIdx_x_plus_1024] = min_score; + shared_indices[threadIdx_x_plus_1024] = static_cast(threadIdx_x_plus_1024); + } + } + __syncthreads(); + if (query_item_count > 1024) { + ArgSort_2048(shared_scores, shared_indices, static_cast(query_item_count)); + } else { + ArgSort(shared_scores, shared_indices, static_cast(query_item_count)); + } + __syncthreads(); + // get best and worst score + const double best_score = shared_scores[shared_indices[0]]; + data_size_t worst_idx = query_item_count - 1; + if (worst_idx > 0 && shared_scores[shared_indices[worst_idx]] == min_score) { + worst_idx -= 1; + } + const double worst_score = shared_scores[shared_indices[worst_idx]]; + __shared__ double sum_lambdas; + if (threadIdx.x == 0) { + sum_lambdas = 0.0f; + } + __syncthreads(); + // start accumulate lambdas by pairs that contain at least one document above truncation level + const data_size_t num_items_i = min(query_item_count - 1, truncation_level); + const data_size_t num_j_per_i = query_item_count - 1; + const data_size_t num_pairs = num_items_i * num_j_per_i; + const data_size_t num_pairs_per_thread = (num_pairs + blockDim.x - 1) / blockDim.x; + const data_size_t thread_start = static_cast(threadIdx.x) * num_pairs_per_thread; + const data_size_t thread_end = min(thread_start + num_pairs_per_thread, num_pairs); + double thread_sum_lambdas = 0.0f; + for (data_size_t pair_index = thread_start; pair_index < thread_end; ++pair_index) { + const data_size_t i = pair_index / num_j_per_i; + const data_size_t j = pair_index % num_j_per_i + 1; + if (j > i) { + // skip pairs with the same labels + if (cuda_label_pointer[shared_indices[i]] != cuda_label_pointer[shared_indices[j]] && shared_scores[shared_indices[j]] != min_score) { + data_size_t high_rank, low_rank; + if (cuda_label_pointer[shared_indices[i]] > cuda_label_pointer[shared_indices[j]]) { + high_rank = i; + low_rank = j; + } else { + high_rank = j; + low_rank = i; + } + const data_size_t high = shared_indices[high_rank]; + const int high_label = static_cast(cuda_label_pointer[high]); + const double high_score = shared_scores[high]; + const double high_label_gain = static_cast((1 << high_label) - 1); + const double high_discount = log2(2.0f + high_rank); + const data_size_t low = shared_indices[low_rank]; + const int low_label = static_cast(cuda_label_pointer[low]); + const double low_score = shared_scores[low]; + const double low_label_gain = static_cast((1 << low_label) - 1); + const double low_discount = log2(2.0f + low_rank); + + const double delta_score = high_score - low_score; + + // get dcg gap + const double dcg_gap = high_label_gain - low_label_gain; + // get discount of this pair + const double paired_discount = fabs(high_discount - low_discount); + // get delta NDCG + double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg; + // regular the delta_pair_NDCG by score distance + if (norm && best_score != worst_score) { + delta_pair_NDCG /= (0.01f + fabs(delta_score)); + } + // calculate lambda for this pair + double p_lambda = 1.0f / (1.0f + exp(sigmoid * delta_score)); + double p_hessian = p_lambda * (1.0f - p_lambda); + // update + p_lambda *= -sigmoid 
* delta_pair_NDCG; + p_hessian *= sigmoid * sigmoid * delta_pair_NDCG; + atomicAdd_block(shared_lambdas + low, -static_cast(p_lambda)); + atomicAdd_block(shared_hessians + low, static_cast(p_hessian)); + atomicAdd_block(shared_lambdas + high, static_cast(p_lambda)); + atomicAdd_block(shared_hessians + high, static_cast(p_hessian)); + // lambda is negative, so use minus to accumulate + thread_sum_lambdas -= 2 * p_lambda; + } + } + } + atomicAdd_block(&sum_lambdas, thread_sum_lambdas); + __syncthreads(); + if (norm && sum_lambdas > 0) { + double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas; + if (threadIdx.x < static_cast(query_item_count)) { + cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x] * norm_factor); + cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x] * norm_factor); + } + if (query_item_count > 1024) { + const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; + if (threadIdx_x_plus_1024 < static_cast(query_item_count)) { + cuda_out_gradients_pointer[threadIdx_x_plus_1024] = static_cast(shared_lambdas[threadIdx_x_plus_1024] * norm_factor); + cuda_out_hessians_pointer[threadIdx_x_plus_1024] = static_cast(shared_hessians[threadIdx_x_plus_1024] * norm_factor); + } + } + } else { + if (threadIdx.x < static_cast(query_item_count)) { + cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x]); + cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x]); + } + if (query_item_count > 1024) { + const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; + if (threadIdx_x_plus_1024 < static_cast(query_item_count)) { + cuda_out_gradients_pointer[threadIdx_x_plus_1024] = static_cast(shared_lambdas[threadIdx_x_plus_1024]); + cuda_out_hessians_pointer[threadIdx_x_plus_1024] = static_cast(shared_hessians[threadIdx_x_plus_1024]); + } + } + } + __syncthreads(); + } +} + +void CUDALambdarankNDCG::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; + if (max_items_in_query_aligned_ <= 1024) { + GetGradientsKernel_Ranking<<>>(score, cuda_labels_, num_data_, + num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, + norm_, sigmoid_, truncation_level_, + gradients, hessians); + } else if (max_items_in_query_aligned_ <= 2048) { + GetGradientsKernel_Ranking_2048<<>>(score, cuda_labels_, num_data_, + num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, + norm_, sigmoid_, truncation_level_, + gradients, hessians); + } else { + Log::Fatal("Too large max_items_in_query_aligned_ = %d", max_items_in_query_aligned_); + } +} + +__device__ void PrefixSumBankConflict(uint16_t* elements, unsigned int n) { + unsigned int offset = 1; + unsigned int threadIdx_x = threadIdx.x; + const uint16_t last_element = elements[n - 1]; + __syncthreads(); + for (int d = (n >> 1); d > 0; d >>= 1) { + if (threadIdx_x < d) { + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + elements[dst_pos] += elements[src_pos]; + } + offset <<= 1; + __syncthreads(); + } + if (threadIdx_x == 0) { + elements[n - 1] = 0; + } + __syncthreads(); + for (int d = 1; d < n; d <<= 1) { + offset >>= 1; + if (threadIdx_x < d) { + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const uint32_t src_val = 
elements[src_pos]; + elements[src_pos] = elements[dst_pos]; + elements[dst_pos] += src_val; + } + __syncthreads(); + } + if (threadIdx.x == 0) { + elements[n] = elements[n - 1] + last_element; + } + __syncthreads(); +} + +__global__ void CalcInverseMaxDCGKernel( + const data_size_t* cuda_query_boundaries, + const label_t* cuda_labels, + const int truncation_level, + const data_size_t num_queries, + double* cuda_inverse_max_dcgs) { + __shared__ uint32_t label_sum[MAX_RANK_LABEL]; + __shared__ uint16_t label_pos[MAX_RANK_LABEL + 1]; + const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; + const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); + for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { + const data_size_t query_start = cuda_query_boundaries[query_index]; + const data_size_t query_end = cuda_query_boundaries[query_index + 1]; + const data_size_t query_count = query_end - query_start; + if (threadIdx.x < MAX_RANK_LABEL) { + label_sum[threadIdx.x] = 0; + } + __syncthreads(); + const label_t* label_pointer = cuda_labels + query_start; + if (threadIdx.x < static_cast(query_count)) { + atomicAdd_system(label_sum + (MAX_RANK_LABEL - 1 - static_cast(label_pointer[threadIdx.x])), 1); + } + __syncthreads(); + if (threadIdx.x < MAX_RANK_LABEL) { + label_pos[threadIdx.x] = label_sum[threadIdx.x]; + } + __syncthreads(); + PrefixSumBankConflict(label_pos, MAX_RANK_LABEL); + __syncthreads(); + __shared__ double gain; + if (threadIdx.x == 0) { + gain = 0.0f; + } + __syncthreads(); + if (threadIdx.x < MAX_RANK_LABEL && label_sum[threadIdx.x] > 0) { + const uint16_t start_pos = label_pos[threadIdx.x]; + const uint16_t end_pos = min(label_pos[threadIdx.x + 1], truncation_level); + double label_gain = 0.0f; + for (uint16_t k = start_pos; k < end_pos; ++k) { + label_gain += ((1 << (MAX_RANK_LABEL - 1 - threadIdx.x)) - 1) / log(2.0f + k); + } + atomicAdd_system(&gain, label_gain); + } + __syncthreads(); + if (threadIdx.x == 0) { + if (gain > 0.0f) { + cuda_inverse_max_dcgs[query_index] = 1.0f / gain; + } else { + cuda_inverse_max_dcgs[query_index] = 0.0f; + } + } + __syncthreads(); + } +} + +void CUDALambdarankNDCG::LaunchCalcInverseMaxDCGKernel() { + const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; + CalcInverseMaxDCGKernel<<>>( + cuda_query_boundaries_, + cuda_labels_, + truncation_level_, + num_queries_, + cuda_inverse_max_dcgs_); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_rank_objective.hpp b/src/objective/cuda/cuda_rank_objective.hpp new file mode 100644 index 000000000000..94b08e724ebe --- /dev/null +++ b/src/objective/cuda/cuda_rank_objective.hpp @@ -0,0 +1,54 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifndef LIGHTGBM_NEW_CUDA_RANKING_OBJECTIVE_HPP_ +#define LIGHTGBM_NEW_CUDA_RANKING_OBJECTIVE_HPP_ + +#ifdef USE_CUDA + +#define MAX_NUM_ITEM_IN_QUERY (2048) +#define NUM_QUERY_PER_BLOCK (10) +#define MAX_RANK_LABEL (32) + +#include "cuda_objective_function.hpp" +#include "../rank_objective.hpp" +#include + +namespace LightGBM { + +class CUDALambdarankNDCG : public CUDAObjectiveInterface, public LambdarankNDCG { + public: + explicit CUDALambdarankNDCG(const Config& config); + + explicit CUDALambdarankNDCG(const std::vector& strs); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; + + private: + + void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; + + void LaunchCalcInverseMaxDCGKernel(); + + // CUDA memory, held by this object + double* cuda_lambdas_; + double* cuda_inverse_max_dcgs_; + + // CUDA memory, held by other objects + const label_t* cuda_labels_; + const data_size_t* cuda_query_boundaries_; + + // Host memory + label_t max_label_; + int max_items_in_query_aligned_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_NEW_CUDA_RANKING_OBJECTIVE_HPP_ diff --git a/src/objective/cuda/cuda_regression_objective.cpp b/src/objective/cuda/cuda_regression_objective.cpp new file mode 100644 index 000000000000..11def19749db --- /dev/null +++ b/src/objective/cuda/cuda_regression_objective.cpp @@ -0,0 +1,42 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#ifdef USE_CUDA + +#include "cuda_regression_objective.hpp" + +namespace LightGBM { + +CUDARegressionL2loss::CUDARegressionL2loss(const Config& config): +RegressionL2loss(config) {} + +CUDARegressionL2loss::CUDARegressionL2loss(const std::vector& strs): +RegressionL2loss(strs) {} + +CUDARegressionL2loss::~CUDARegressionL2loss() {} + +void CUDARegressionL2loss::Init(const Metadata& metadata, data_size_t num_data) { + RegressionL2loss::Init(metadata, num_data); + cuda_labels_ = metadata.cuda_metadata()->cuda_label(); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); + AllocateCUDAMemoryOuter(&cuda_boost_from_score_, 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_boost_from_score_, 0, 1, __FILE__, __LINE__); +} + +void CUDARegressionL2loss::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { + LaunchGetGradientsKernel(score, gradients, hessians); +} + +double CUDARegressionL2loss::BoostFromScore(int) const { + LaunchCalcInitScoreKernel(); + double boost_from_score = 0.0f; + CopyFromCUDADeviceToHostOuter(&boost_from_score, cuda_boost_from_score_, 1, __FILE__, __LINE__); + return boost_from_score; +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_regression_objective.cu b/src/objective/cuda/cuda_regression_objective.cu new file mode 100644 index 000000000000..5ccf13f50009 --- /dev/null +++ b/src/objective/cuda/cuda_regression_objective.cu @@ -0,0 +1,68 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifdef USE_CUDA + +#include "cuda_regression_objective.hpp" + +namespace LightGBM { + +__global__ void CalcInitScoreKernel_1_Regression(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_boost_from_score) { + __shared__ label_t shared_label[CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION]; + const unsigned int tid = threadIdx.x; + const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION; + shared_label[tid] = 0.0f; + __syncthreads(); + for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION; ++j) { + if (i + j < num_data) { + shared_label[tid] += cuda_labels[i + j]; + } + } + __syncthreads(); + for (unsigned int s = 1; s < blockDim.x; s *= 2) { + if (tid % (2 * s) == 0 && (tid + s) < CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION) { + shared_label[tid] += shared_label[tid + s]; + } + __syncthreads(); + } + if (tid == 0) { + atomicAdd_system(out_cuda_boost_from_score, shared_label[0]); + } +} + +__global__ void CalcInitScoreKernel_2_Regression(double* out_cuda_boost_from_score, const data_size_t num_data) { + const double suml = *out_cuda_boost_from_score; + const double sumw = static_cast(num_data); + const double init_score = suml / sumw; + *out_cuda_boost_from_score = init_score; +} + +void CUDARegressionL2loss::LaunchCalcInitScoreKernel() const { + const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION; + const int num_blocks = (num_data_ + num_data_per_block - 1) / num_data_per_block; + CalcInitScoreKernel_1_Regression<<>>(cuda_labels_, num_data_, cuda_boost_from_score_); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + CalcInitScoreKernel_2_Regression<<<1, 1>>>(cuda_boost_from_score_, num_data_); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +__global__ void GetGradientsKernel_Regression(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, + score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + if (data_index < num_data) { + cuda_out_gradients[data_index] = static_cast(cuda_scores[data_index] - cuda_labels[data_index]); + cuda_out_hessians[data_index] = 1.0f; + } +} + +void CUDARegressionL2loss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + GetGradientsKernel_Regression<<>>(score, cuda_labels_, num_data_, gradients, hessians); +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_regression_objective.hpp b/src/objective/cuda/cuda_regression_objective.hpp new file mode 100644 index 000000000000..d033f3e102d3 --- /dev/null +++ b/src/objective/cuda/cuda_regression_objective.hpp @@ -0,0 +1,49 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifndef LIGHTGBM_NEW_CUDA_REGRESSION_OBJECTIVE_HPP_ +#define LIGHTGBM_NEW_CUDA_REGRESSION_OBJECTIVE_HPP_ + +#ifdef USE_CUDA + +#define GET_GRADIENTS_BLOCK_SIZE_REGRESSION (1024) +#define CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION (1024) +#define NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION (6) + +#include "cuda_objective_function.hpp" +#include "../regression_objective.hpp" + +namespace LightGBM { + +class CUDARegressionL2loss : public CUDAObjectiveInterface, public RegressionL2loss { + public: + explicit CUDARegressionL2loss(const Config& config); + + explicit CUDARegressionL2loss(const std::vector& strs); + + ~CUDARegressionL2loss(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; + + double BoostFromScore(int) const override; + + private: + void LaunchCalcInitScoreKernel() const; + + void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; + + const label_t* cuda_labels_; + // TODO(shiyu1994): add weighted gradients + const label_t* cuda_weights_; + double* cuda_boost_from_score_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_NEW_CUDA_REGRESSION_OBJECTIVE_HPP_ diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index d0de0c371ece..c03feb9a2b88 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -11,6 +11,8 @@ #include "xentropy_objective.hpp" #include "cuda/cuda_binary_objective.hpp" +#include "cuda/cuda_regression_objective.hpp" +#include "cuda/cuda_rank_objective.hpp" namespace LightGBM { @@ -18,6 +20,10 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& if (config.device_type == std::string("cuda")) { if (type == std::string("binary")) { return new CUDABinaryLogloss(config); + } else if (type == std::string("regression")) { + return new CUDARegressionL2loss(config); + } else if (type == std::string("lambdarank")) { + return new CUDALambdarankNDCG(config); } } else { if (type == std::string("regression")) { diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 9bd7b7d99cf6..c088a0ff2147 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -255,7 +255,7 @@ class LambdarankNDCG : public RankingObjective { const char* GetName() const override { return "lambdarank"; } - private: + protected: /*! \brief Simgoid param */ double sigmoid_; /*! 
\brief Normalize the lambdas or not */ diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 30c00bccf13a..da6e1519d37d 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -672,7 +672,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( } __global__ void FindBestFromAllSplitsKernel(const int cur_num_leaves, - const CUDASplitInfo* cuda_leaf_best_split_info, + CUDASplitInfo* cuda_leaf_best_split_info, int* cuda_best_split_info_buffer) { __shared__ double thread_best_gain[NUM_THREADS_FIND_BEST_LEAF]; __shared__ int thread_best_leaf[NUM_THREADS_FIND_BEST_LEAF]; @@ -695,7 +695,12 @@ __global__ void FindBestFromAllSplitsKernel(const int cur_num_leaves, __syncthreads(); ReduceBestGainForLeaves(thread_best_gain, thread_best_leaf, cur_num_valid_threads); if (threadIdx_x == 0) { - cuda_best_split_info_buffer[6] = thread_best_leaf[0]; + const int best_leaf_index = thread_best_leaf[0]; + cuda_best_split_info_buffer[6] = best_leaf_index; + if (best_leaf_index != -1) { + cuda_leaf_best_split_info[best_leaf_index].is_valid = false; + cuda_leaf_best_split_info[cur_num_leaves].is_valid = false; + } } } diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 0c0edd3fc4c8..6ca28afc1de7 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -207,6 +207,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, Log::Warning("find best split time %f", find_best_split_time); Log::Warning("find best split time from all leaves %f", find_best_split_from_all_leaves_time); Log::Warning("split data indices time %f", split_data_indices_time); + tree->ToHost(); return tree.release(); } diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index ff3375d22c07..9a30e37a03dc 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -359,9 +359,6 @@ void SerialTreeLearner::ConstructHistograms( smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), ptr_smaller_leaf_hist_data); - for (int i = 0; i < 100; ++i) { - Log::Warning("bin %d grad %f hess %f", i, ptr_smaller_leaf_hist_data[2 * i], ptr_smaller_leaf_hist_data[2 * i + 1]); - } if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf hist_t* ptr_larger_leaf_hist_data = @@ -576,8 +573,6 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, } *left_leaf = best_leaf; auto next_leaf_id = tree->NextLeafId(); - Log::Warning("best_split_info.feature = %d, best_split_info.threshold = %d", - best_split_info.feature, best_split_info.threshold); // update before tree split constraints_->BeforeSplit(best_leaf, next_leaf_id, best_split_info.monotone_type); @@ -596,8 +591,6 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, best_split_info.left_count = data_partition_->leaf_count(*left_leaf); best_split_info.right_count = data_partition_->leaf_count(next_leaf_id); } - Log::Warning("data_partition_->leaf_count(*left_leaf) = %d, data_partition_->leaf_count(next_leaf_id) = %d", - data_partition_->leaf_count(*left_leaf), data_partition_->leaf_count(next_leaf_id)); // split tree, will return right leaf *right_leaf = tree->Split( best_leaf, inner_feature_index, 
best_split_info.feature, From c8a6fabb4560871bc1fa310f216f7f070a475451 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 3 Aug 2021 09:47:51 +0000 Subject: [PATCH 049/166] remove useless cuda_tree_predictor --- include/LightGBM/boosting.h | 2 +- src/treelearner/cuda/cuda_tree_predictor.cpp | 36 ------------ src/treelearner/cuda/cuda_tree_predictor.cu | 0 src/treelearner/cuda/cuda_tree_predictor.hpp | 62 -------------------- 4 files changed, 1 insertion(+), 99 deletions(-) delete mode 100644 src/treelearner/cuda/cuda_tree_predictor.cpp delete mode 100644 src/treelearner/cuda/cuda_tree_predictor.cu delete mode 100644 src/treelearner/cuda/cuda_tree_predictor.hpp diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h index bea12658b28e..fe9adf8e13ac 100644 --- a/include/LightGBM/boosting.h +++ b/include/LightGBM/boosting.h @@ -316,7 +316,7 @@ class LIGHTGBM_EXPORT Boosting { virtual bool IsLinear() const { return false; } - virtual const std::vector>& models() const { return std::vector>(); } + virtual const std::vector>& models() const = 0; }; class GBDTBase : public Boosting { diff --git a/src/treelearner/cuda/cuda_tree_predictor.cpp b/src/treelearner/cuda/cuda_tree_predictor.cpp deleted file mode 100644 index 701dc640ff9f..000000000000 --- a/src/treelearner/cuda/cuda_tree_predictor.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifdef USE_CUDA - -#include "cuda_tree_predictor.hpp" - -namespace LightGBM { - -CUDATreePredictor::CUDATreePredictor(const Config* config, - const int* tree_split_leaf_index, - const int* tree_inner_feature_index, - const uint32_t* tree_threshold, - const double* tree_threshold_real, - const double* tree_left_output, - const double* tree_right_output, - const data_size_t* tree_left_count, - const data_size_t* tree_right_count, - const double* tree_left_sum_hessian, - const double* tree_right_sum_hessian, - const double* tree_gain, - const uint8_t* tree_default_left, - const double* leaf_output): -tree_split_leaf_index_(tree_split_leaf_index), -tree_inner_feature_index_(tree_inner_feature_index), -tree_threshold_(tree_threshold) - { - -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_tree_predictor.cu b/src/treelearner/cuda/cuda_tree_predictor.cu deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/src/treelearner/cuda/cuda_tree_predictor.hpp b/src/treelearner/cuda/cuda_tree_predictor.hpp deleted file mode 100644 index be58b8cf353e..000000000000 --- a/src/treelearner/cuda/cuda_tree_predictor.hpp +++ /dev/null @@ -1,62 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
- */ -#ifndef LIGHTGBM_CUDA_TREE_PREDICTOR_HPP_ -#define LIGHTGBM_CUDA_TREE_PREDICTOR_HPP_ - -#ifdef USE_CUDA - -#include -#include -#include - -#include - -namespace LightGBM { - -class CUDATreePredictor { - public: - CUDATreePredictor(const Config* config, - const int* tree_split_leaf_index, - const int* tree_inner_feature_index, - const uint32_t* tree_threshold, - const double* tree_threshold_real, - const double* tree_left_output, - const double* tree_right_output, - const data_size_t* tree_left_count, - const data_size_t* tree_right_count, - const double* tree_left_sum_hessian, - const double* tree_right_sum_hessian, - const double* tree_gain, - const uint8_t* tree_default_left, - const double* leaf_output); - - void Predict(const double* data, double* out_score) const; - - private: - void BuildTree(); - - void LaunchPredictKernel(const double* data, double* out_score) const; - - // CUDA memory, held by other objects - const int* tree_split_leaf_index_; - const int* tree_inner_feature_index_; - const uint32_t* tree_threshold_; - const double* tree_threshold_real_; - const double* tree_left_output_; - const double* tree_right_output_; - const data_size_t* tree_left_count_; - const data_size_t* tree_right_count_; - const double* tree_left_sum_hessian_; - const double* tree_right_sum_hessian_; - const double* tree_gain_; - const uint8_t* tree_default_left_; - const double* leaf_output_; -}; - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_CUDA_TREE_PREDICTOR_HPP_ From a7504dc6b49414023fbf4dd6d06cf36f7f0961fe Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 4 Aug 2021 14:46:18 +0000 Subject: [PATCH 050/166] predict on CUDA with pipeline --- include/LightGBM/boosting.h | 4 + .../cuda/cuda_objective_function.hpp | 5 +- include/LightGBM/cuda/cuda_tree.hpp | 4 + include/LightGBM/cuda/cuda_utils.h | 4 + include/LightGBM/objective_function.h | 4 + include/LightGBM/tree.h | 2 +- src/application/cuda/cuda_predictor.cpp | 301 ++++++++---------- src/application/cuda/cuda_predictor.cu | 74 +++-- src/application/cuda/cuda_predictor.hpp | 39 ++- src/application/predictor.hpp | 20 +- src/boosting/cuda/cuda_score_updater.cu | 3 - src/boosting/gbdt.h | 6 + src/c_api.cpp | 28 +- src/io/cuda/cuda_tree.cpp | 19 +- src/io/cuda/cuda_tree.cu | 13 + src/objective/cuda/cuda_binary_objective.cpp | 4 + src/objective/cuda/cuda_binary_objective.cu | 12 + src/objective/cuda/cuda_binary_objective.hpp | 12 +- .../cuda/cuda_objective_function.cpp | 2 +- src/objective/cuda/cuda_rank_objective.hpp | 2 +- .../cuda/cuda_regression_objective.cpp | 4 + .../cuda/cuda_regression_objective.cu | 19 ++ .../cuda/cuda_regression_objective.hpp | 12 +- 23 files changed, 342 insertions(+), 251 deletions(-) rename {src/objective => include/LightGBM}/cuda/cuda_objective_function.hpp (76%) diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h index fe9adf8e13ac..8cb416cd8169 100644 --- a/include/LightGBM/boosting.h +++ b/include/LightGBM/boosting.h @@ -317,6 +317,10 @@ class LIGHTGBM_EXPORT Boosting { virtual bool IsLinear() const { return false; } virtual const std::vector>& models() const = 0; + + virtual int num_tree_per_iteration() const = 0; + + virtual std::function GetCUDAConvertOutputFunc() const = 0; }; class GBDTBase : public Boosting { diff --git a/src/objective/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp similarity index 76% rename from src/objective/cuda/cuda_objective_function.hpp rename to include/LightGBM/cuda/cuda_objective_function.hpp index 
1888ba9e4fcd..830ec29f98c8 100644 --- a/src/objective/cuda/cuda_objective_function.hpp +++ b/include/LightGBM/cuda/cuda_objective_function.hpp @@ -16,7 +16,10 @@ namespace LightGBM { class CUDAObjectiveInterface { - + public: + virtual void ConvertOutputCUDA(const data_size_t /*num_data*/, const double* /*input*/, double* /*output*/) const { + Log::Warning("in naive convert output"); + }; }; } // namespace LightGBM diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index 679cf02a7db5..f1c41222b3d1 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -82,6 +82,8 @@ class CUDATree : public Tree { inline void Shrinkage(double rate) override; + inline void AddBias(double val) override; + void ToHost(); private: @@ -101,6 +103,8 @@ class CUDATree : public Tree { void LaunchShrinkageKernel(const double rate); + void LaunchAddBiasKernel(const double val); + int* cuda_left_child_; int* cuda_right_child_; int* cuda_split_feature_inner_; diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 7bb6a14d1df5..5ea95a417727 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -94,6 +94,10 @@ void CopyFromCUDADeviceToCUDADeviceAsyncOuter(T* dst_ptr, const T* src_ptr, size void SynchronizeCUDADeviceOuter(const char* file, const int line); +void SynchronizeCUDADeviceOuter(cudaStream_t cuda_stream, const char* file, const int line) { + CUDASUCCESS_OR_FATAL_OUTER(cudaStreamSynchronize(cuda_stream)); +} + template void SetCUDAMemoryOuter(T* dst_ptr, int value, size_t size, const char* file, const int line) { CUDASUCCESS_OR_FATAL_OUTER(cudaMemset(reinterpret_cast(dst_ptr), value, size * sizeof(T))); diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index 5ea838dece23..7fc9123cc49a 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -88,6 +88,10 @@ class ObjectiveFunction { * \brief Load objective function from string object */ LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& str); + + virtual std::function GetCUDAConvertOutputFunc() const { + return [] (data_size_t, const double*, double*) {}; + } }; } // namespace LightGBM diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 4a80f9823c0b..6ad25ed4b980 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -209,7 +209,7 @@ class Tree { inline double shrinkage() const { return shrinkage_; } - inline void AddBias(double val) { + virtual inline void AddBias(double val) { #pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048) for (int i = 0; i < num_leaves_ - 1; ++i) { leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] + val); diff --git a/src/application/cuda/cuda_predictor.cpp b/src/application/cuda/cuda_predictor.cpp index 676dcac4e2ea..b563501c745b 100644 --- a/src/application/cuda/cuda_predictor.cpp +++ b/src/application/cuda/cuda_predictor.cpp @@ -11,53 +11,19 @@ namespace LightGBM { CUDAPredictor::CUDAPredictor(Boosting* boosting, int start_iteration, int num_iteration, bool is_raw_score, bool predict_leaf_index, bool predict_contrib, bool early_stop, int early_stop_freq, double early_stop_margin): - Predictor(boosting, start_iteration, num_iteration, is_raw_score, predict_leaf_index, predict_contrib, early_stop, early_stop_freq, early_stop_margin) { - auto start = std::chrono::steady_clock::now(); - InitCUDAModel(); - auto end = 
std::chrono::steady_clock::now(); - auto duration = static_cast>(end - start); - Log::Warning("init model time = %f", duration.count()); + Predictor(boosting, start_iteration, num_iteration, is_raw_score, predict_leaf_index, predict_contrib, early_stop, early_stop_freq, early_stop_margin), + is_raw_score_(is_raw_score), predict_leaf_index_(predict_leaf_index), predict_contrib_(predict_contrib) { + InitCUDAModel(start_iteration, num_iteration); + num_pred_in_one_row_ = static_cast(boosting_->NumPredictOneRow(start_iteration, num_iteration, predict_leaf_index, predict_contrib)); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); } CUDAPredictor::~CUDAPredictor() {} void CUDAPredictor::Predict(const char* data_filename, const char* result_filename, bool header, bool disable_shape_check) { - auto start = std::chrono::steady_clock::now(); - const data_size_t num_data = ReadDataToCUDADevice(data_filename, header, disable_shape_check); - auto end = std::chrono::steady_clock::now(); - auto duration = static_cast>(end - start); - Log::Warning("read data to cuda device time = %f", duration.count()); - result_buffer_.resize(num_data, 0.0f); - // TODO(shiyu1994): free memory when prediction is finished - AllocateCUDAMemoryOuter(&cuda_data_, static_cast(num_data * num_feature_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_result_buffer_, static_cast(num_data), __FILE__, __LINE__); - start = std::chrono::steady_clock::now(); - LaunchPredictKernel(num_data); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - end = std::chrono::steady_clock::now(); - duration = static_cast>(end - start); - Log::Warning("duration = %f", duration.count()); - start = std::chrono::steady_clock::now(); - CopyFromCUDADeviceToHostOuter(result_buffer_.data(), cuda_result_buffer_, static_cast(num_data), __FILE__, __LINE__); - end = std::chrono::steady_clock::now(); - duration = static_cast>(end - start); - Log::Warning("copy result time = %f", duration.count()); - auto writer = VirtualFileWriter::Make(result_filename); - if (!writer->Init()) { - Log::Fatal("Prediction results file %s cannot be found", result_filename); - } - start = std::chrono::steady_clock::now(); - for (data_size_t i = 0; i < static_cast(result_buffer_.size()); ++i) { - std::string result = Common::Join({result_buffer_[i]}, "\t"); - writer->Write(result.c_str(), result.size()); - writer->Write("\n", 1); + if (predict_leaf_index_) { + CHECK_EQ(num_pred_in_one_row_, static_cast(num_iteration_)); } - end = std::chrono::steady_clock::now(); - duration = static_cast>(end - start); - Log::Warning("write result time = %f", duration.count()); -} - -int CUDAPredictor::ReadDataToCUDADevice(const char* data_filename, const bool header, const bool disable_shape_check) { auto label_idx = header ? 
-1 : boosting_->LabelIdx(); auto parser = std::unique_ptr(Parser::CreateParser(data_filename, header, boosting_->MaxFeatureIdx() + 1, label_idx)); if (parser == nullptr) { @@ -116,146 +82,145 @@ int CUDAPredictor::ReadDataToCUDADevice(const char* data_filename, const bool he feature->resize(i); } }; - const int num_threads = OMP_NUM_THREADS(); - std::vector> feature_index_buffer(num_threads); - std::vector> feature_value_buffer(num_threads); - std::vector> feature_value_num_buffer(num_threads); - predict_feature_index_.clear(); - predict_feature_value_.clear(); - predict_row_ptr_.clear(); - predict_row_ptr_.emplace_back(0); - auto start = std::chrono::steady_clock::now(); - std::function&)> - process_fun = [&parser_fun, this, &feature_index_buffer, &feature_value_buffer, &feature_value_num_buffer, num_threads]( - data_size_t /*start_index*/, const std::vector& lines) { - for (int thread_index = 0; thread_index < num_threads; ++thread_index) { - feature_index_buffer[thread_index].clear(); - feature_value_buffer[thread_index].clear(); - feature_value_num_buffer[thread_index].clear(); - } - std::vector thread_value_num_offset(num_threads + 1, 0); - std::vector thread_line_num_offset(num_threads + 1, 0); - Threading::For(0, static_cast(lines.size()), 512, - [parser_fun, &lines, &feature_index_buffer, &feature_value_buffer, &feature_value_num_buffer, &thread_value_num_offset, &thread_line_num_offset] - (int thread_index, data_size_t start, data_size_t end) { - std::vector> oneline_features; - data_size_t num_values = 0; - for (data_size_t i = start; i < end; ++i) { + auto writer = VirtualFileWriter::Make(result_filename); + if (!writer->Init()) { + Log::Fatal("Prediction results file %s cannot be found", result_filename); + } + PredictWithParserFun(parser_fun, &predict_data_reader, writer.get()); +} + +void CUDAPredictor::PredictWithParserFun(std::function>*)> parser_fun, + TextReader* predict_data_reader, + VirtualFileWriter* writer) { + // use lager buffer size to reduce the time spent in copying from Host to CUDA + const data_size_t buffer_size = 50000; + AllocateCUDAMemoryOuter(&cuda_data_, static_cast(buffer_size) * static_cast(num_feature_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_result_buffer_, static_cast(buffer_size) * static_cast(num_pred_in_one_row_), __FILE__, __LINE__); + std::vector buffer(buffer_size * num_feature_, 0.0f); + std::vector result_buffer(buffer_size * num_pred_in_one_row_, 0.0f); + auto process_fun = [&parser_fun, &writer, &buffer, &result_buffer, buffer_size, this] + (data_size_t /*start_index*/, const std::vector& lines) { + std::vector> oneline_features; + std::vector result_to_write(lines.size()); + const data_size_t num_lines = static_cast(lines.size()); + const int num_blocks = (num_lines + buffer_size - 1) / buffer_size; + for (int block_index = 0; block_index < num_blocks; ++block_index) { + const data_size_t block_start = block_index * buffer_size; + const data_size_t block_end = std::min(block_start + buffer_size, num_lines); + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) firstprivate(oneline_features) + for (data_size_t i = block_start; i < block_end; ++i) { + OMP_LOOP_EX_BEGIN(); oneline_features.clear(); // parser parser_fun(lines[i].c_str(), &oneline_features); + // predict + const data_size_t index_in_block = i - block_start; + double* one_row_data = buffer.data() + index_in_block * num_feature_; + for (int feature_index = 0; feature_index < num_feature_; ++feature_index) { + one_row_data[feature_index] = 0.0f; + } for 
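// Sketch (not part of this patch): how one parsed row is densified into the host
// staging buffer before the batched copy to cuda_data_: zero-fill the row, then
// scatter the (feature_index, value) pairs. DensifyRow is a hypothetical helper name.
#include <cstddef>
#include <utility>
#include <vector>

void DensifyRow(const std::vector<std::pair<int, double>>& sparse_row,
                int num_feature, int index_in_block, std::vector<double>* buffer) {
  double* one_row = buffer->data() + static_cast<std::size_t>(index_in_block) * num_feature;
  for (int j = 0; j < num_feature; ++j) one_row[j] = 0.0;        // clear stale values
  for (const auto& p : sparse_row) one_row[p.first] = p.second;  // scatter non-zeros
}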
(const auto& pair : oneline_features) { - feature_index_buffer[thread_index].emplace_back(pair.first); - feature_value_buffer[thread_index].emplace_back(pair.second); + one_row_data[pair.first] = pair.second; } - feature_value_num_buffer[thread_index].emplace_back(static_cast(oneline_features.size())); - num_values += static_cast(oneline_features.size()); - } - thread_value_num_offset[thread_index + 1] = num_values; - thread_line_num_offset[thread_index + 1] = end; - }); - for (int thread_index = 0; thread_index < num_threads; ++thread_index) { - thread_value_num_offset[thread_index + 1] += thread_value_num_offset[thread_index]; - } - const size_t old_num_value_size = predict_feature_index_.size(); - CHECK_EQ(old_num_value_size, predict_feature_value_.size()); - const size_t old_num_line_size = predict_row_ptr_.size(); - predict_feature_index_.resize(old_num_value_size + static_cast(thread_value_num_offset.back()), 0); - predict_feature_value_.resize(old_num_value_size + static_cast(thread_value_num_offset.back()), 0.0f); - predict_row_ptr_.resize(predict_row_ptr_.size() + lines.size(), 0); - int* predict_feature_index_ptr = predict_feature_index_.data() + old_num_value_size; - double* predict_feature_value_ptr = predict_feature_value_.data() + old_num_value_size; - data_size_t* predict_row_ptr_ptr = predict_row_ptr_.data() + old_num_line_size; - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) num_threads(num_threads) - for (int thread_index = 0; thread_index < num_threads; ++thread_index) { - OMP_LOOP_EX_BEGIN(); - int* predict_feature_index_thread_ptr = predict_feature_index_ptr + thread_value_num_offset[thread_index]; - double* predict_feature_value_thread_ptr = predict_feature_value_ptr + thread_value_num_offset[thread_index]; - data_size_t* predict_row_ptr_thread_ptr = predict_row_ptr_ptr + thread_line_num_offset[thread_index]; - for (size_t i = 0; i < feature_index_buffer[thread_index].size(); ++i) { - predict_feature_index_thread_ptr[i] = feature_index_buffer[thread_index][i]; - predict_feature_value_thread_ptr[i] = feature_value_buffer[thread_index][i]; + OMP_LOOP_EX_END(); } - for (size_t i = 0; i < feature_value_num_buffer[thread_index].size(); ++i) { - predict_row_ptr_thread_ptr[i] = feature_value_num_buffer[thread_index][i]; + OMP_THROW_EX(); + SynchronizeCUDADeviceOuter(cuda_stream_, __FILE__, __LINE__); + CopyFromHostToCUDADeviceAsyncOuter(cuda_data_, buffer.data(), static_cast(buffer_size * num_feature_), cuda_stream_, __FILE__, __LINE__); + LaunchPredictKernelAsync(buffer_size, false); + CopyFromCUDADeviceToHostAsyncOuter(result_buffer.data(), + cuda_result_buffer_, + static_cast(buffer_size) * static_cast(num_pred_in_one_row_), + cuda_stream_, + __FILE__, + __LINE__); + #pragma omp parallel for schedule(static) + for (data_size_t i = block_start; i < block_end; ++i) { + OMP_LOOP_EX_BEGIN(); + const data_size_t index_in_block = i - block_start; + const double* begin = result_buffer.data() + index_in_block * num_pred_in_one_row_; + const double* end = begin + num_pred_in_one_row_; + result_to_write[i] = Common::Join(std::vector(begin, end), "\t"); + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); + OMP_THROW_EX(); + } + for (data_size_t i = 0; i < static_cast(result_to_write.size()); ++i) { + writer->Write(result_to_write[i].c_str(), result_to_write[i].size()); + writer->Write("\n", 1); } - OMP_THROW_EX(); }; - predict_data_reader.ReadAllAndProcessParallel(process_fun); - auto end = std::chrono::steady_clock::now(); - auto duration = static_cast>(end - start); - 
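// Sketch (not part of this patch): the per-block device round trip used above, written
// out with raw CUDA runtime calls. A single stream serializes host-to-device copy,
// kernel launch, and device-to-host copy, so the host only waits once per block.
// predict_kernel and the buffer names are hypothetical placeholders.
#include <cuda_runtime.h>

__global__ void predict_kernel(const double* data, double* result, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) result[i] = data[i];  // placeholder body
}

void PredictOneBlock(const double* host_in, double* host_out, double* dev_in,
                     double* dev_out, int n, cudaStream_t stream) {
  cudaMemcpyAsync(dev_in, host_in, n * sizeof(double), cudaMemcpyHostToDevice, stream);
  predict_kernel<<<(n + 255) / 256, 256, 0, stream>>>(dev_in, dev_out, n);
  cudaMemcpyAsync(host_out, dev_out, n * sizeof(double), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // host buffer is valid after this point
}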
Log::Warning("read data to cpu duration = %f", duration.count()); - const data_size_t num_data = static_cast(predict_row_ptr_.size()) - 1; - GetPredictRowPtr(); - start = std::chrono::steady_clock::now(); - InitCUDAMemoryFromHostMemoryOuter(&cuda_predict_feature_value_, - predict_feature_value_.data(), - predict_feature_value_.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_predict_feature_index_, - predict_feature_index_.data(), - predict_feature_index_.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_predict_row_ptr_, - predict_row_ptr_.data(), - predict_row_ptr_.size(), - __FILE__, - __LINE__); - end = std::chrono::steady_clock::now(); - duration = static_cast>(end - start); - Log::Warning("read data to gpu duration = %f", duration.count()); - return num_data; + predict_data_reader->ReadAllAndProcessParallel(process_fun); } -void CUDAPredictor::GetPredictRowPtr() { - const int num_threads = OMP_NUM_THREADS(); - std::vector thread_offset(num_threads + 1, 0); - const data_size_t len = static_cast(predict_row_ptr_.size()); - Threading::For(0, len, 512, - [this, &thread_offset] (int thread_index, data_size_t start, data_size_t end) { - int num_value_in_thread = 0; - for (data_size_t i = start; i < end; ++i) { - num_value_in_thread += predict_row_ptr_[i]; - } - thread_offset[thread_index + 1] = num_value_in_thread; - }); - for (int thread_index = 0; thread_index < num_threads; ++thread_index) { - thread_offset[thread_index + 1] += thread_offset[thread_index]; +void CUDAPredictor::Predict(const data_size_t num_data, + const int64_t num_pred_in_one_row, + const std::function>(int row_idx)>& get_row_fun, + double* out_result) { + const data_size_t buffer_size = 50000; + CHECK_EQ(num_pred_in_one_row_, num_pred_in_one_row); + if (predict_leaf_index_) { + CHECK_EQ(num_pred_in_one_row_, static_cast(num_iteration_)); + } + AllocateCUDAMemoryOuter(&cuda_data_, static_cast(buffer_size) * static_cast(num_feature_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_result_buffer_, static_cast(buffer_size) * static_cast(num_pred_in_one_row_), __FILE__, __LINE__); + std::vector buffer(buffer_size * num_feature_, 0.0f); + const int num_blocks = (num_data + buffer_size - 1) / buffer_size; + data_size_t block_offset = 0; + for (int block_index = 0; block_index < num_blocks; ++block_index) { + Threading::For(0, buffer_size, 512, + [block_offset, get_row_fun, &buffer, this] (int /*thread_index*/, data_size_t start, data_size_t end) { + std::vector> oneline_feature; + for (data_size_t i = start; i < end; ++i) { + oneline_feature = get_row_fun(i + block_offset); + double* one_row_data = buffer.data() + i * num_feature_; + for (int feature_index = 0; feature_index < num_feature_; ++feature_index) { + one_row_data[feature_index] = 0.0f; + } + for (const auto& pair : oneline_feature) { + one_row_data[pair.first] = pair.second; + } + } + }); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + CopyFromHostToCUDADeviceAsyncOuter(cuda_data_, buffer.data(), static_cast(buffer_size * num_feature_), cuda_stream_, __FILE__, __LINE__); + LaunchPredictKernelAsync(buffer_size, false); + CopyFromCUDADeviceToHostAsyncOuter(out_result + static_cast(block_offset) * static_cast(num_pred_in_one_row_), + cuda_result_buffer_, + static_cast(buffer_size) * static_cast(num_pred_in_one_row_), + cuda_stream_, + __FILE__, + __LINE__); + block_offset += buffer_size; } - Threading::For(0, len, 512, - [this, &thread_offset] (int thread_index, data_size_t start, data_size_t end) { - int 
offset = thread_offset[thread_index]; - for (data_size_t i = start; i < end; ++i) { - const data_size_t num_feature_values = predict_row_ptr_[i]; - predict_row_ptr_[i] += offset; - offset += num_feature_values; - } - CHECK_EQ(offset, thread_offset[thread_index + 1]); - }); } -void CUDAPredictor::InitCUDAModel() { +void CUDAPredictor::InitCUDAModel(const int start_iteration, const int num_iteration) { const std::vector>& models = boosting_->models(); - const int num_trees = static_cast(models.size()); - num_trees_ = num_trees; - std::vector tree_num_leaves(num_trees, 0); - std::vector tree_left_child(num_trees, nullptr); - std::vector tree_right_child(num_trees, nullptr); - std::vector tree_leaf_value(num_trees, nullptr); - std::vector tree_threshold(num_trees, nullptr); - std::vector tree_decision_type(num_trees, nullptr); - std::vector tree_split_feature_index(num_trees, nullptr); + cuda_convert_output_function_ = boosting_->GetCUDAConvertOutputFunc(); + const int num_tree_per_iteration = boosting_->num_tree_per_iteration(); + num_iteration_ = static_cast(models.size()) / num_tree_per_iteration; + start_iteration_ = std::max(start_iteration, 0); + start_iteration_ = std::min(start_iteration_, num_iteration_); + if (num_iteration > 0) { + num_iteration_ = std::min(num_iteration, num_iteration_ - start_iteration_); + } else { + num_iteration_ = num_iteration_ - start_iteration_; + } + std::vector tree_num_leaves(num_iteration_, 0); + std::vector tree_left_child(num_iteration_, nullptr); + std::vector tree_right_child(num_iteration_, nullptr); + std::vector tree_leaf_value(num_iteration_, nullptr); + std::vector tree_threshold(num_iteration_, nullptr); + std::vector tree_decision_type(num_iteration_, nullptr); + std::vector tree_split_feature_index(num_iteration_, nullptr); const int num_threads = OMP_NUM_THREADS(); - #pragma omp parallel for schedule(static) num_threads(num_threads) if (num_trees >= 1024) - for (int tree_index = 0; tree_index < num_trees; ++tree_index) { - tree_num_leaves[tree_index] = models[tree_index]->num_leaves(); + #pragma omp parallel for schedule(static) num_threads(num_threads) if (num_iteration_ >= 1024) + for (int tree_index = 0; tree_index < num_iteration_; ++tree_index) { CHECK(models[tree_index]->is_cuda_tree()); - const CUDATree* cuda_tree = reinterpret_cast(models[tree_index].get()); + const CUDATree* cuda_tree = reinterpret_cast(models[tree_index + start_iteration_].get()); + tree_num_leaves[tree_index] = cuda_tree->num_leaves(); tree_left_child[tree_index] = cuda_tree->cuda_left_child(); tree_right_child[tree_index] = cuda_tree->cuda_right_child(); tree_leaf_value[tree_index] = cuda_tree->cuda_leaf_value(); @@ -289,10 +254,10 @@ void CUDAPredictor::InitCUDAModel() { __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_decision_type_, - tree_decision_type.data(), - tree_decision_type.size(), - __FILE__, - __LINE__); + tree_decision_type.data(), + tree_decision_type.size(), + __FILE__, + __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_split_feature_index_, tree_split_feature_index.data(), tree_split_feature_index.size(), diff --git a/src/application/cuda/cuda_predictor.cu b/src/application/cuda/cuda_predictor.cu index 68901a0950f6..24a2f8c94846 100644 --- a/src/application/cuda/cuda_predictor.cu +++ b/src/application/cuda/cuda_predictor.cu @@ -7,11 +7,9 @@ namespace LightGBM { +template __global__ void PredictKernel(const data_size_t num_data, const int num_feature, - const int* feature_index, - const double* feature_value, - const data_size_t* 
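// Sketch (not part of this patch): the effective iteration window computed in
// InitCUDAModel above. With 100 stored iterations, start_iteration = 90 and a
// requested num_iteration = 20, only the last 10 iterations are used.
#include <algorithm>
#include <cassert>

void ClampIterationRange(int stored_iterations, int start_iteration, int num_iteration,
                         int* start_out, int* num_out) {
  const int start = std::min(std::max(start_iteration, 0), stored_iterations);
  const int num = num_iteration > 0 ? std::min(num_iteration, stored_iterations - start)
                                    : stored_iterations - start;
  *start_out = start;
  *num_out = num;
}

int main() {
  int start = 0, num = 0;
  ClampIterationRange(100, 90, 20, &start, &num);
  assert(start == 90 && num == 10);
  return 0;
}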
row_ptr, const int* num_leaves, const int** left_child, const int** right_child, @@ -26,16 +24,9 @@ __global__ void PredictKernel(const data_size_t num_data, const unsigned int thread_index = threadIdx.x; double* data_pointer = nullptr; if (data_index < num_data) { - const data_size_t offset = row_ptr[data_index]; - data_pointer = data + offset; - for (int i = 0; i < num_feature; ++i) { - data_pointer[i] = 0.0f; - } - const data_size_t num_value = row_ptr[data_index + 1] - offset; - const int* data_feature_index = feature_index + offset; - const double* data_feature_value = feature_value + offset; - for (int value_index = 0; value_index < num_value; ++value_index) { - data_pointer[data_feature_index[value_index]] = data_feature_value[value_index]; + data_pointer = data + data_index * num_feature; + if (!PREDICT_LEAF_INDEX) { + cuda_result_buffer[data_index] = 0.0f; } } __shared__ double shared_tree_threshold[CUDA_PREDICTOR_MAX_TREE_SIZE]; @@ -87,29 +78,50 @@ __global__ void PredictKernel(const data_size_t num_data, } } } - cuda_result_buffer[data_index] += shared_tree_leaf_value[~node]; + if (PREDICT_LEAF_INDEX) { + cuda_result_buffer[data_index * num_trees + tree_index] = ~node; + } else { + cuda_result_buffer[data_index] += shared_tree_leaf_value[~node]; + } } + __syncthreads(); } } -void CUDAPredictor::LaunchPredictKernel(const data_size_t num_data) { +#define PREDICT_KERNEL_ARGS \ + num_data, \ + num_feature_, \ + cuda_tree_num_leaves_, \ + cuda_left_child_, \ + cuda_right_child_, \ + cuda_threshold_, \ + cuda_decision_type_, \ + cuda_leaf_value_, \ + cuda_split_feature_index_, \ + num_iteration_, \ + cuda_data_, \ + cuda_result_buffer_ + +void CUDAPredictor::LaunchPredictKernelAsync(const data_size_t num_data, const bool is_csr) { const int num_blocks = (num_data + CUAA_PREDICTOR_PREDICT_BLOCK_SIZE - 1) / CUAA_PREDICTOR_PREDICT_BLOCK_SIZE; - PredictKernel<<>>( - num_data, - num_feature_, - cuda_predict_feature_index_, - cuda_predict_feature_value_, - cuda_predict_row_ptr_, - cuda_tree_num_leaves_, - cuda_left_child_, - cuda_right_child_, - cuda_threshold_, - cuda_decision_type_, - cuda_leaf_value_, - cuda_split_feature_index_, - num_trees_, - cuda_data_, - cuda_result_buffer_); + if (is_csr) { + if (predict_leaf_index_) { + PredictKernel<<>>(PREDICT_KERNEL_ARGS); + } else { + PredictKernel<<>>(PREDICT_KERNEL_ARGS); + } + } else { + if (predict_leaf_index_) { + PredictKernel<<>>(PREDICT_KERNEL_ARGS); + } else { + PredictKernel<<>>(PREDICT_KERNEL_ARGS); + } + } + if (!is_raw_score_ && !predict_leaf_index_) { + cuda_convert_output_function_(num_data, cuda_result_buffer_, cuda_result_buffer_); + } } +#undef PREDICT_KERNEL_ARGS + } // namespace LightGBM diff --git a/src/application/cuda/cuda_predictor.hpp b/src/application/cuda/cuda_predictor.hpp index dd7033ff560e..1148f59c64fa 100644 --- a/src/application/cuda/cuda_predictor.hpp +++ b/src/application/cuda/cuda_predictor.hpp @@ -5,8 +5,10 @@ #ifndef LIGHTGBM_APPLICATION_CUDA_CUDA_PREDICTOR_HPP_ #define LIGHTGBM_APPLICATION_CUDA_CUDA_PREDICTOR_HPP_ +#include #include #include +#include #include #include "../predictor.hpp" @@ -25,24 +27,25 @@ class CUDAPredictor : public Predictor { ~CUDAPredictor(); virtual void Predict(const char* data_filename, const char* result_filename, bool header, bool disable_shape_check) override; - private: - void InitCUDAModel(); - data_size_t ReadDataToCUDADevice(const char* data_filename, const bool header, const bool diable_shape_check); + virtual void Predict(const data_size_t num_data, + const int64_t 
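// Sketch (not part of this patch): the compile-time dispatch used by
// LaunchPredictKernelAsync above. Two runtime booleans pick one of four template
// instantiations, so the per-row loop carries no runtime branching on the predict
// mode. demo_kernel is a hypothetical placeholder, not the real PredictKernel.
#include <cuda_runtime.h>

template <bool IS_CSR, bool PREDICT_LEAF_INDEX>
__global__ void demo_kernel(double* out, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = PREDICT_LEAF_INDEX ? 1.0 : (IS_CSR ? 2.0 : 3.0);
}

void LaunchDemo(double* out, int n, bool is_csr, bool predict_leaf, cudaStream_t stream) {
  const int blocks = (n + 255) / 256;
  if (is_csr) {
    if (predict_leaf) demo_kernel<true, true><<<blocks, 256, 0, stream>>>(out, n);
    else              demo_kernel<true, false><<<blocks, 256, 0, stream>>>(out, n);
  } else {
    if (predict_leaf) demo_kernel<false, true><<<blocks, 256, 0, stream>>>(out, n);
    else              demo_kernel<false, false><<<blocks, 256, 0, stream>>>(out, n);
  }
}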
num_pred_in_one_row, + const std::function>(int row_idx)>& get_row_fun, + double* out_result) override; + + private: + void InitCUDAModel(const int start_iteration, const int num_iteration); - void LaunchPredictKernel(const data_size_t num_data); + void LaunchPredictKernelAsync(const data_size_t num_data, const bool is_csr); - void GetPredictRowPtr(); + void PredictWithParserFun(std::function>*)> parser_fun, + TextReader* predict_data_reader, + VirtualFileWriter* writer); - std::vector predict_feature_index_; - std::vector predict_feature_value_; - std::vector predict_row_ptr_; - std::vector result_buffer_; - int num_trees_; + std::function>*)> GetParserFun(const char* data_filename, + const bool header, + const bool disable_shape_check); - int* cuda_predict_feature_index_; - double* cuda_predict_feature_value_; - data_size_t* cuda_predict_row_ptr_; double* cuda_result_buffer_; double* cuda_data_; @@ -53,6 +56,16 @@ class CUDAPredictor : public Predictor { const int8_t** cuda_decision_type_; const double** cuda_leaf_value_; const int** cuda_split_feature_index_; + + cudaStream_t cuda_stream_; + + int start_iteration_; + int num_iteration_; + int64_t num_pred_in_one_row_; + const bool is_raw_score_; + const bool predict_leaf_index_; + const bool predict_contrib_; + std::function cuda_convert_output_function_; }; } // namespace LightGBM diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp index 808bd2bbe85a..36013f0dd700 100644 --- a/src/application/predictor.hpp +++ b/src/application/predictor.hpp @@ -249,11 +249,23 @@ class Predictor { writer->Write("\n", 1); } }; - auto start = std::chrono::steady_clock::now(); predict_data_reader.ReadAllAndProcessParallel(process_fun); - auto end = std::chrono::steady_clock::now(); - auto duration = static_cast>(end - start); - Log::Warning("duration cpu = %f", duration.count()); + } + + virtual void Predict(const data_size_t num_data, + const int64_t num_pred_in_one_row, + const std::function>(int row_idx)>& get_row_fun, + double* out_result) { + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) + for (int i = 0; i < num_data; ++i) { + OMP_LOOP_EX_BEGIN(); + auto one_row = get_row_fun(i); + auto pred_wrt_ptr = out_result + static_cast(num_pred_in_one_row) * i; + predict_fun_(one_row, pred_wrt_ptr); + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); } protected: diff --git a/src/boosting/cuda/cuda_score_updater.cu b/src/boosting/cuda/cuda_score_updater.cu index 72eca4b635d7..caa5c12d658a 100644 --- a/src/boosting/cuda/cuda_score_updater.cu +++ b/src/boosting/cuda/cuda_score_updater.cu @@ -20,9 +20,6 @@ __global__ void AddScoreConstantKernel( void CUDAScoreUpdater::LaunchAddScoreConstantKernel(const double val, const size_t offset) { const int num_blocks = (num_data_ + num_threads_per_block_) / num_threads_per_block_; - double cuda_score = 0.0f; - CopyFromCUDADeviceToHostOuter(&cuda_score, cuda_score_, 1, __FILE__, __LINE__); - Log::Warning("adding constant to cuda score updater, num_blocks = %d, num_data_ = %d, cuda_score_ = %f", num_blocks, num_data_, cuda_score); Log::Warning("adding init score = %f", val); AddScoreConstantKernel<<>>(val, offset, num_data_, cuda_score_); } diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 9611a2cf1a7d..756dbe7dd18d 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -398,6 +398,12 @@ class GBDT : public GBDTBase { const std::vector>& models() const override { return models_; } + int num_tree_per_iteration() const override { return num_tree_per_iteration_; } + + virtual 
std::function GetCUDAConvertOutputFunc() const { + return objective_function_->GetCUDAConvertOutputFunc(); + } + protected: virtual bool GetIsConstHessian(const ObjectiveFunction* objective_function) { if (objective_function != nullptr) { diff --git a/src/c_api.cpp b/src/c_api.cpp index 4ac496108e03..e2a480ddaab4 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -399,7 +399,7 @@ class Booster { *out_len = single_row_predictor->num_pred_in_one_row; } - Predictor CreatePredictor(int start_iteration, int num_iteration, int predict_type, int ncol, const Config& config) const { + Predictor* CreatePredictor(int start_iteration, int num_iteration, int predict_type, int ncol, const Config& config) const { if (!config.predict_disable_shape_check && ncol != boosting_->MaxFeatureIdx() + 1) { Log::Fatal("The number of features in data (%d) is not the same as it was in training data (%d).\n" \ "You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.", ncol, boosting_->MaxFeatureIdx() + 1); @@ -418,10 +418,10 @@ class Booster { } if (config.device_type == "cuda") { - return CUDAPredictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, + return new CUDAPredictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); } else { - return Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, + return new Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); } } @@ -431,7 +431,7 @@ class Booster { const Config& config, double* out_result, int64_t* out_len) const { SHARED_LOCK(mutex_); - auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config); + auto predictor = std::unique_ptr(CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config)); bool is_predict_leaf = false; bool predict_contrib = false; if (predict_type == C_API_PREDICT_LEAF_INDEX) { @@ -440,17 +440,7 @@ class Booster { predict_contrib = true; } int64_t num_pred_in_one_row = boosting_->NumPredictOneRow(start_iteration, num_iteration, is_predict_leaf, predict_contrib); - auto pred_fun = predictor.GetPredictFunction(); - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) - for (int i = 0; i < nrow; ++i) { - OMP_LOOP_EX_BEGIN(); - auto one_row = get_row_fun(i); - auto pred_wrt_ptr = out_result + static_cast(num_pred_in_one_row) * i; - pred_fun(one_row, pred_wrt_ptr); - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); + predictor->Predict(nrow, num_pred_in_one_row, get_row_fun, out_result); *out_len = num_pred_in_one_row * nrow; } @@ -460,8 +450,8 @@ class Booster { std::vector>>* agg_ptr, int32_t** out_indices, void** out_data, int data_type, bool* is_data_float32_ptr, int num_matrices) const { - auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config); - auto pred_sparse_fun = predictor.GetPredictSparseFunction(); + auto predictor = std::unique_ptr(CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config)); + auto pred_sparse_fun = predictor->GetPredictSparseFunction(); std::vector>>& agg = *agg_ptr; OMP_INIT_EX(); #pragma omp parallel for schedule(static) @@ -595,8 +585,8 @@ class Booster { SHARED_LOCK(mutex_); // Get the number of 
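// Sketch (not part of this patch): why CreatePredictor now returns a raw pointer that
// callers wrap in std::unique_ptr. Returning by value would slice a CUDAPredictor back
// to the Predictor base and lose its virtual Predict overrides; heap allocation keeps
// the dynamic type. DemoPredictor / DemoCUDAPredictor are hypothetical stand-ins.
#include <memory>
#include <string>

struct DemoPredictor {
  virtual ~DemoPredictor() = default;
  virtual const char* Backend() const { return "cpu"; }
};

struct DemoCUDAPredictor : DemoPredictor {
  const char* Backend() const override { return "cuda"; }
};

std::unique_ptr<DemoPredictor> MakePredictor(const std::string& device_type) {
  if (device_type == "cuda") {
    return std::unique_ptr<DemoPredictor>(new DemoCUDAPredictor());
  }
  return std::unique_ptr<DemoPredictor>(new DemoPredictor());
}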
trees per iteration (for multiclass scenario we output multiple sparse matrices) int num_matrices = boosting_->NumModelPerIteration(); - auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config); - auto pred_sparse_fun = predictor.GetPredictSparseFunction(); + auto predictor = std::unique_ptr(CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config)); + auto pred_sparse_fun = predictor->GetPredictSparseFunction(); bool is_col_ptr_int32 = false; bool is_data_float32 = false; int num_output_cols = ncol + 1; diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index f222600c7a31..aa4f6989a114 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -85,9 +85,9 @@ void CUDATree::InitCUDAMemory() { __FILE__, __LINE__); AllocateCUDAMemoryOuter(&cuda_split_gain_, - static_cast(max_leaves_), - __FILE__, - __LINE__); + static_cast(max_leaves_), + __FILE__, + __LINE__); SetCUDAMemoryOuter(cuda_leaf_parent_, 0, 1, __FILE__, __LINE__); SetCUDAMemoryOuter(cuda_leaf_value_, 0.0f, 1, __FILE__, __LINE__); SetCUDAMemoryOuter(cuda_leaf_weight_, 0.0f, 1, __FILE__, __LINE__); @@ -127,10 +127,10 @@ void CUDATree::InitCUDA() { __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_decision_type_, - decision_type_.data(), - decision_type_.size(), - __FILE__, - __LINE__); + decision_type_.data(), + decision_type_.size(), + __FILE__, + __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_leaf_value_, leaf_value_.data(), leaf_value_.size(), @@ -169,6 +169,11 @@ inline void CUDATree::Shrinkage(double rate) { LaunchShrinkageKernel(rate); } +inline void CUDATree::AddBias(double val) { + Tree::AddBias(val); + LaunchAddBiasKernel(val); +} + void CUDATree::ToHost() { left_child_.resize(max_leaves_ - 1); right_child_.resize(max_leaves_ - 1); diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index c600411ec19a..dea410e205d5 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -288,4 +288,17 @@ void CUDATree::LaunchShrinkageKernel(const double rate) { ShrinkageKernel<<>>(rate, cuda_leaf_value_, num_leaves_); } +__global__ void AddBiasKernel(const double val, double* cuda_leaf_value, const int num_leaves) { + const int leaf_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (leaf_index < num_leaves) { + cuda_leaf_value[leaf_index] += val; + } +} + +void CUDATree::LaunchAddBiasKernel(const double val) { + const int num_threads_per_block = 1024; + const int num_blocks = (num_leaves_ + num_threads_per_block - 1) / num_threads_per_block; + AddBiasKernel<<>>(val, cuda_leaf_value_, num_leaves_); +} + } // namespace LightGBM diff --git a/src/objective/cuda/cuda_binary_objective.cpp b/src/objective/cuda/cuda_binary_objective.cpp index 2210e93278ba..3fa313af789d 100644 --- a/src/objective/cuda/cuda_binary_objective.cpp +++ b/src/objective/cuda/cuda_binary_objective.cpp @@ -39,6 +39,10 @@ double CUDABinaryLogloss::BoostFromScore(int) const { return boost_from_score; } +void CUDABinaryLogloss::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { + LaunchConvertOutputCUDAKernel(num_data, input, output); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/objective/cuda/cuda_binary_objective.cu b/src/objective/cuda/cuda_binary_objective.cu index cfef05dacb05..d49e2260f0c8 100644 --- a/src/objective/cuda/cuda_binary_objective.cu +++ b/src/objective/cuda/cuda_binary_objective.cu @@ -75,6 +75,18 @@ void CUDABinaryLogloss::LaunchGetGradientsKernel(const 
double* scores, score_t* hessians); } +__global__ void ConvertOutputCUDAKernel(const double sigmoid, const data_size_t num_data, const double* input, double* output) { + const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (data_index < num_data) { + output[data_index] = 1.0f / (1.0f + exp(-sigmoid * input[data_index])); + } +} + +void CUDABinaryLogloss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { + const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_BINARY - 1) / GET_GRADIENTS_BLOCK_SIZE_BINARY; + ConvertOutputCUDAKernel<<>>(sigmoid_, num_data, input, output); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/objective/cuda/cuda_binary_objective.hpp b/src/objective/cuda/cuda_binary_objective.hpp index 43826f4ba36d..4d75790ec630 100644 --- a/src/objective/cuda/cuda_binary_objective.hpp +++ b/src/objective/cuda/cuda_binary_objective.hpp @@ -13,7 +13,7 @@ #define CALC_INIT_SCORE_BLOCK_SIZE_BINARY (1024) #define NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY (6) -#include "cuda_objective_function.hpp" +#include #include "../binary_objective.hpp" namespace LightGBM { @@ -33,11 +33,21 @@ class CUDABinaryLogloss : public CUDAObjectiveInterface, public BinaryLogloss { double BoostFromScore(int) const override; + void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; + + std::function GetCUDAConvertOutputFunc() const override { + return [this] (data_size_t num_data, const double* input, double* output) { + ConvertOutputCUDA(num_data, input, output); + }; + } + private: void LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const; void LaunchBoostFromScoreKernel() const; + void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; + // CUDA memory, held by other objects const label_t* cuda_label_; // TODO(shiyu1994): add weighted gradients diff --git a/src/objective/cuda/cuda_objective_function.cpp b/src/objective/cuda/cuda_objective_function.cpp index 713e96550506..733586f7d441 100644 --- a/src/objective/cuda/cuda_objective_function.cpp +++ b/src/objective/cuda/cuda_objective_function.cpp @@ -4,7 +4,7 @@ * license information. 
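// Sketch (not part of this patch): a host-side reference of the binary convert-output
// step above, handy for spot-checking the kernel on small inputs. It applies the same
// scaled sigmoid, p = 1 / (1 + exp(-sigmoid * raw_score)), element by element.
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<double> ConvertBinaryOutputCPU(const std::vector<double>& raw, double sigmoid) {
  std::vector<double> out(raw.size());
  for (std::size_t i = 0; i < raw.size(); ++i) {
    out[i] = 1.0 / (1.0 + std::exp(-sigmoid * raw[i]));
  }
  return out;
}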
*/ -#include "cuda_objective_function.hpp" +#include #include "cuda_binary_objective.hpp" namespace LightGBM { diff --git a/src/objective/cuda/cuda_rank_objective.hpp b/src/objective/cuda/cuda_rank_objective.hpp index 94b08e724ebe..a36977adae3a 100644 --- a/src/objective/cuda/cuda_rank_objective.hpp +++ b/src/objective/cuda/cuda_rank_objective.hpp @@ -13,7 +13,7 @@ #define NUM_QUERY_PER_BLOCK (10) #define MAX_RANK_LABEL (32) -#include "cuda_objective_function.hpp" +#include #include "../rank_objective.hpp" #include diff --git a/src/objective/cuda/cuda_regression_objective.cpp b/src/objective/cuda/cuda_regression_objective.cpp index 11def19749db..0c3a5333c016 100644 --- a/src/objective/cuda/cuda_regression_objective.cpp +++ b/src/objective/cuda/cuda_regression_objective.cpp @@ -37,6 +37,10 @@ double CUDARegressionL2loss::BoostFromScore(int) const { return boost_from_score; } +void CUDARegressionL2loss::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { + LaunchConvertOutputCUDAKernel(num_data, input, output); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/objective/cuda/cuda_regression_objective.cu b/src/objective/cuda/cuda_regression_objective.cu index 5ccf13f50009..002610a7bae1 100644 --- a/src/objective/cuda/cuda_regression_objective.cu +++ b/src/objective/cuda/cuda_regression_objective.cu @@ -52,6 +52,7 @@ void CUDARegressionL2loss::LaunchCalcInitScoreKernel() const { __global__ void GetGradientsKernel_Regression(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + // TODO(shiyu1994): consider sqrt_ if (data_index < num_data) { cuda_out_gradients[data_index] = static_cast(cuda_scores[data_index] - cuda_labels[data_index]); cuda_out_hessians[data_index] = 1.0f; @@ -63,6 +64,24 @@ void CUDARegressionL2loss::LaunchGetGradientsKernel(const double* score, score_t GetGradientsKernel_Regression<<>>(score, cuda_labels_, num_data_, gradients, hessians); } +// TODO(shiyu1994): try to use global kernels as class methods +__global__ void ConvertOutputCUDAKernel(const bool sqrt, const data_size_t num_data, const double* input, double* output) { + const int data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (data_index < num_data) { + if (sqrt) { + const double sign = input[0] >= 0.0f ? 
1 : -1; + output[0] = sign * input[0] * input[0]; + } else { + output[0] = input[0]; + } + } +} + +void CUDARegressionL2loss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { + const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + ConvertOutputCUDAKernel<<>>(sqrt_, num_data, input, output); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/objective/cuda/cuda_regression_objective.hpp b/src/objective/cuda/cuda_regression_objective.hpp index d033f3e102d3..c1131706bfd7 100644 --- a/src/objective/cuda/cuda_regression_objective.hpp +++ b/src/objective/cuda/cuda_regression_objective.hpp @@ -13,7 +13,7 @@ #define CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION (1024) #define NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION (6) -#include "cuda_objective_function.hpp" +#include #include "../regression_objective.hpp" namespace LightGBM { @@ -32,11 +32,21 @@ class CUDARegressionL2loss : public CUDAObjectiveInterface, public RegressionL2l double BoostFromScore(int) const override; + void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; + + std::function GetCUDAConvertOutputFunc() const override { + return [this] (data_size_t num_data, const double* input, double* output) { + ConvertOutputCUDA(num_data, input, output); + }; + } + private: void LaunchCalcInitScoreKernel() const; void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; + void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; + const label_t* cuda_labels_; // TODO(shiyu1994): add weighted gradients const label_t* cuda_weights_; From 896d47b28279c5e1530877533330fbc648f50e19 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 9 Aug 2021 05:55:39 +0000 Subject: [PATCH 051/166] add global sort algorithms --- include/LightGBM/cuda/cuda_algorithms.hpp | 13 + include/LightGBM/cuda/cuda_utils.h | 4 +- src/application/cuda/cuda_predictor.cpp | 39 +- src/boosting/cuda/cuda_score_updater.cu | 2 +- src/boosting/gbdt.cpp | 5 +- src/c_api.cpp | 1 - src/cuda/cuda_algorithms.cu | 436 ++++++++++++++++++ src/cuda/cuda_utils.cpp | 4 + src/io/cuda/cuda_metadata.cpp | 36 +- src/objective/cuda/cuda_binary_objective.cpp | 10 +- src/objective/cuda/cuda_binary_objective.cu | 77 +++- src/objective/cuda/cuda_binary_objective.hpp | 6 +- .../cuda/cuda_multiclass_objective.cpp | 63 +++ .../cuda/cuda_multiclass_objective.cu | 108 +++++ .../cuda/cuda_multiclass_objective.hpp | 62 +++ src/objective/cuda/cuda_rank_objective.cpp | 22 + src/objective/cuda/cuda_rank_objective.cu | 202 ++++++-- src/objective/cuda/cuda_rank_objective.hpp | 18 + src/objective/multiclass_objective.hpp | 12 +- src/objective/objective_function.cpp | 3 + src/objective/rank_objective.hpp | 6 +- 21 files changed, 1051 insertions(+), 78 deletions(-) create mode 100644 src/objective/cuda/cuda_multiclass_objective.cpp create mode 100644 src/objective/cuda/cuda_multiclass_objective.cu create mode 100644 src/objective/cuda/cuda_multiclass_objective.hpp diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 7c9b4fd35bbf..14819d6b0686 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -13,6 +13,7 @@ #include #include +#include #include #define NUM_BANKS_DATA_PARTITION (16) @@ -100,6 +101,18 @@ __device__ void PrefixSumConflictFree(T* values, size_t 
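// Sketch (not part of this patch): a per-element form of the regression convert-output
// step. The kernel above reads and writes element 0 only; indexing with data_index, as
// below, applies the inverse of the sqrt transform (y = sign(x) * x * x) to every
// prediction. This is an assumed correction, not code from the patch.
#include <cuda_runtime.h>

__global__ void ConvertRegressionOutputSketch(const bool sqrt_transform, const int num_data,
                                              const double* input, double* output) {
  const int data_index = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
  if (data_index < num_data) {
    if (sqrt_transform) {
      const double sign = input[data_index] >= 0.0 ? 1.0 : -1.0;
      output[data_index] = sign * input[data_index] * input[data_index];
    } else {
      output[data_index] = input[data_index];
    }
  }
}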
n) { PrefixSumInner(values, n, T); } +template +void CUDAQuickSort(T* values, const size_t n); + +template +void CUDAMergeSort(T* values, const size_t n); + +template +__device__ void BitonicArgSort(const VAL_T* values, INDEX_T* indices, const size_t len); + +template +void BitonicSortGlobal(VAL_T* values, const size_t len); + } // namespace LightGBM #endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 5ea95a417727..3337373827d5 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -94,9 +94,7 @@ void CopyFromCUDADeviceToCUDADeviceAsyncOuter(T* dst_ptr, const T* src_ptr, size void SynchronizeCUDADeviceOuter(const char* file, const int line); -void SynchronizeCUDADeviceOuter(cudaStream_t cuda_stream, const char* file, const int line) { - CUDASUCCESS_OR_FATAL_OUTER(cudaStreamSynchronize(cuda_stream)); -} +void SynchronizeCUDADeviceOuter(cudaStream_t cuda_stream, const char* file, const int line); template void SetCUDAMemoryOuter(T* dst_ptr, int value, size_t size, const char* file, const int line) { diff --git a/src/application/cuda/cuda_predictor.cpp b/src/application/cuda/cuda_predictor.cpp index b563501c745b..aecd5cc440c8 100644 --- a/src/application/cuda/cuda_predictor.cpp +++ b/src/application/cuda/cuda_predictor.cpp @@ -13,6 +13,9 @@ CUDAPredictor::CUDAPredictor(Boosting* boosting, int start_iteration, int num_it int early_stop_freq, double early_stop_margin): Predictor(boosting, start_iteration, num_iteration, is_raw_score, predict_leaf_index, predict_contrib, early_stop, early_stop_freq, early_stop_margin), is_raw_score_(is_raw_score), predict_leaf_index_(predict_leaf_index), predict_contrib_(predict_contrib) { + if (predict_contrib_) { + Log::Fatal("pred_contrib=True is not supported by CUDA version yet."); + } InitCUDAModel(start_iteration, num_iteration); num_pred_in_one_row_ = static_cast(boosting_->NumPredictOneRow(start_iteration, num_iteration, predict_leaf_index, predict_contrib)); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); @@ -93,6 +96,7 @@ void CUDAPredictor::PredictWithParserFun(std::function* predict_data_reader, VirtualFileWriter* writer) { // use lager buffer size to reduce the time spent in copying from Host to CUDA + // TODO(shiyu1994): optimize the pipeline and asynchronization behavior const data_size_t buffer_size = 50000; AllocateCUDAMemoryOuter(&cuda_data_, static_cast(buffer_size) * static_cast(num_feature_), __FILE__, __LINE__); AllocateCUDAMemoryOuter(&cuda_result_buffer_, static_cast(buffer_size) * static_cast(num_pred_in_one_row_), __FILE__, __LINE__); @@ -126,25 +130,28 @@ void CUDAPredictor::PredictWithParserFun(std::function(cuda_data_, buffer.data(), static_cast(buffer_size * num_feature_), cuda_stream_, __FILE__, __LINE__); LaunchPredictKernelAsync(buffer_size, false); CopyFromCUDADeviceToHostAsyncOuter(result_buffer.data(), - cuda_result_buffer_, - static_cast(buffer_size) * static_cast(num_pred_in_one_row_), - cuda_stream_, - __FILE__, - __LINE__); - #pragma omp parallel for schedule(static) - for (data_size_t i = block_start; i < block_end; ++i) { - OMP_LOOP_EX_BEGIN(); - const data_size_t index_in_block = i - block_start; - const double* begin = result_buffer.data() + index_in_block * num_pred_in_one_row_; - const double* end = begin + num_pred_in_one_row_; - result_to_write[i] = Common::Join(std::vector(begin, end), "\t"); - OMP_LOOP_EX_END(); + cuda_result_buffer_, + static_cast(buffer_size) * static_cast(num_pred_in_one_row_), + 
cuda_stream_, + __FILE__, + __LINE__); + SynchronizeCUDADeviceOuter(cuda_stream_, __FILE__, __LINE__); + { + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) + for (data_size_t i = block_start; i < block_end; ++i) { + OMP_LOOP_EX_BEGIN(); + const data_size_t index_in_block = i - block_start; + const double* begin = result_buffer.data() + index_in_block * num_pred_in_one_row_; + const double* end = begin + num_pred_in_one_row_; + result_to_write[i] = Common::Join(std::vector(begin, end), "\t"); + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); } - OMP_THROW_EX(); } for (data_size_t i = 0; i < static_cast(result_to_write.size()); ++i) { writer->Write(result_to_write[i].c_str(), result_to_write[i].size()); @@ -183,7 +190,6 @@ void CUDAPredictor::Predict(const data_size_t num_data, } } }); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); CopyFromHostToCUDADeviceAsyncOuter(cuda_data_, buffer.data(), static_cast(buffer_size * num_feature_), cuda_stream_, __FILE__, __LINE__); LaunchPredictKernelAsync(buffer_size, false); CopyFromCUDADeviceToHostAsyncOuter(out_result + static_cast(block_offset) * static_cast(num_pred_in_one_row_), @@ -192,6 +198,7 @@ void CUDAPredictor::Predict(const data_size_t num_data, cuda_stream_, __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); block_offset += buffer_size; } } diff --git a/src/boosting/cuda/cuda_score_updater.cu b/src/boosting/cuda/cuda_score_updater.cu index caa5c12d658a..009a1873bd2d 100644 --- a/src/boosting/cuda/cuda_score_updater.cu +++ b/src/boosting/cuda/cuda_score_updater.cu @@ -14,7 +14,7 @@ __global__ void AddScoreConstantKernel( double* score) { const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (data_index < num_data) { - score[data_index] += val; + score[data_index + offset] += val; } } diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 23c776531187..cb76ef445c05 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -441,6 +441,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } new_tree->AsConstantTree(output); // updates scores + // TODO(shiyu1994): check here, default score has been added in BoostFromAverage ? 
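// Sketch (not part of this patch): why AddScoreConstantKernel needs the offset applied
// above. In LightGBM the training score buffer typically stores one block of num_data
// scores per tree-per-iteration (class) back to back, so adding a per-class constant
// must start at that class's offset. Plain C++ equivalent of the corrected indexing:
#include <cstddef>

void AddScoreConstant(double val, std::size_t offset, int num_data, double* score) {
  for (int i = 0; i < num_data; ++i) {
    score[i + offset] += val;  // offset selects the class block inside the flat buffer
  }
}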
train_score_updater_->AddScore(output, cur_tree_id); for (auto& score_updater : valid_score_updater_) { score_updater->AddScore(output, cur_tree_id); @@ -526,8 +527,8 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const { if (config_->device_type == std::string("cuda")) { - std::vector tmp_score(num_data, 0.0f); - CopyFromCUDADeviceToHostOuter(tmp_score.data(), score, static_cast(num_data), __FILE__, __LINE__); + std::vector tmp_score(num_data * num_class_, 0.0f); + CopyFromCUDADeviceToHostOuter(tmp_score.data(), score, static_cast(num_data * num_class_), __FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); return metric->Eval(tmp_score.data(), objective_function_); } else { diff --git a/src/c_api.cpp b/src/c_api.cpp index e2a480ddaab4..ad336929eeb1 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -707,7 +707,6 @@ class Booster { predictor.reset(new CUDAPredictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin)); } else { - Log::Warning("predict with cpu"); predictor.reset(new Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin)); } diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 7b168c10fa17..a259aaf14d2c 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -7,4 +7,440 @@ namespace LightGBM { +#define QUICKSORT_MAX_DEPTH (12) +#define BITONIC_SORT_NUM_ELEMENTS (1024) +#define BITONIC_SORT_DEPTH (11) + +template +__global__ void BitonicSort(T* values, const int low, const int high) { + const int thread_index = static_cast(threadIdx.x); + T* values_pointer = values + low; + const int num_data = high - low; + __shared__ T shared_values[BITONIC_SORT_NUM_ELEMENTS]; + if (thread_index < num_data) { + shared_values[thread_index] = values_pointer[thread_index]; + } + __syncthreads(); + for (int depth = BITONIC_SORT_DEPTH - 1; depth >= 1; --depth) { + const int segment_length = 1 << (BITONIC_SORT_DEPTH - depth); + const int segment_index = thread_index / segment_length; + const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); + for (int inner_depth = depth; inner_depth < BITONIC_SORT_DEPTH; ++inner_depth) { + const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); + const int inner_segment_index_half = thread_index / inner_segment_length_half; + if (inner_segment_index_half % 2 == 0) { + const int index_to_compare = thread_index + inner_segment_length_half; + if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { + const T tmp = shared_values[thread_index]; + shared_values[thread_index] = shared_values[index_to_compare]; + shared_values[index_to_compare] = tmp; + } + } + __syncthreads(); + } + } + if (thread_index < num_data) { + values_pointer[thread_index] = shared_values[thread_index]; + } +} + +template +__global__ void BitonicSortForMergeSort(T* values, const int num_total_data) { + const int thread_index = static_cast(threadIdx.x); + const int low = static_cast(blockIdx.x * BITONIC_SORT_NUM_ELEMENTS); + T* values_pointer = values + low; + const int num_data = min(BITONIC_SORT_NUM_ELEMENTS, num_total_data - low); + __shared__ T shared_values[BITONIC_SORT_NUM_ELEMENTS]; + if (thread_index < num_data) { + shared_values[thread_index] = values_pointer[thread_index]; + } + __syncthreads(); + for (int depth = BITONIC_SORT_DEPTH - 1; depth >= 1; --depth) { + const int segment_length = 1 << (BITONIC_SORT_DEPTH - depth); + const int segment_index = thread_index / segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + for (int inner_depth = depth; inner_depth < BITONIC_SORT_DEPTH; ++inner_depth) { + const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); + const int inner_segment_index_half = thread_index / inner_segment_length_half; + if (inner_segment_index_half % 2 == 0) { + const int index_to_compare = thread_index + inner_segment_length_half; + if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { + const T tmp = shared_values[thread_index]; + shared_values[thread_index] = shared_values[index_to_compare]; + shared_values[index_to_compare] = tmp; + } + } + __syncthreads(); + } + } + if (thread_index < num_data) { + values_pointer[thread_index] = shared_values[thread_index]; + } +} + +template +__global__ void CUDAQuickSortHelper(T* values, const int low, const int high, const int depth) { + if (high - low <= BITONIC_SORT_NUM_ELEMENTS) { + cudaStream_t cuda_stream; + cudaStreamCreateWithFlags(&cuda_stream, cudaStreamNonBlocking); + BitonicSort<<<1, BITONIC_SORT_NUM_ELEMENTS, 0, cuda_stream>>>(values, low, high); + cudaStreamDestroy(cuda_stream); + return; + } + int i = low - 1; + int j = high - 1; + int p = i; + int q = j; + const T pivot = values[high - 1]; + while (i < j) { + if (ASCENDING) { + while (values[++i] < pivot); + } else { + while (values[++i] > pivot); + } + if (ASCENDING) { + while (j > low && values[--j] > pivot); + } else { + while (j > low && values[--j] < pivot); + } + if (i < j) { + const T tmp = values[j]; + values[j] = values[i]; + values[i] = tmp; + if (values[i] == pivot) { + ++p; + const T tmp = values[i]; + values[i] = values[p]; + values[p] = tmp; + } + if (values[j] == pivot) { + --q; + const T tmp = values[j]; + values[j] = values[q]; + values[q] = tmp; + } + } + } + values[high - 1] = values[i]; + values[i] = pivot; + j = i - 1; + i = i + 1; + for (int k = low; k <= p; ++k, --j) { + const T tmp = values[k]; 
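// Sketch (not part of this patch): a host-side reference for the compare-exchange
// rounds that the bitonic kernels above perform between __syncthreads() barriers,
// restricted to a power-of-two length. Useful for validating the device versions on
// small inputs.
#include <utility>
#include <vector>

void BitonicSortCPU(std::vector<int>* values) {  // values->size() must be a power of two
  const int n = static_cast<int>(values->size());
  for (int segment = 2; segment <= n; segment <<= 1) {
    for (int half = segment >> 1; half >= 1; half >>= 1) {
      for (int i = 0; i < n; ++i) {
        const int j = i ^ half;                       // partner index for this round
        const bool ascending = ((i & segment) == 0);  // direction alternates per segment
        if (j > i && (((*values)[i] > (*values)[j]) == ascending)) {
          std::swap((*values)[i], (*values)[j]);
        }
      }
    }
  }
}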
+ values[k] = values[j]; + values[j] = tmp; + } + for (int k = high - 2; k >= q; --k, ++i) { + const T tmp = values[k]; + values[k] = values[i]; + values[i] = tmp; + } + if (j > low) { + cudaStream_t cuda_stream; + cudaStreamCreateWithFlags(&cuda_stream, cudaStreamNonBlocking); + CUDAQuickSortHelper<<<1, 1>>>(values, low, j + 1, depth + 1); + cudaStreamDestroy(cuda_stream); + } + if (i + 1 < high) { + cudaStream_t cuda_stream; + cudaStreamCreateWithFlags(&cuda_stream, cudaStreamNonBlocking); + CUDAQuickSortHelper<<<1, 1>>>(values, i, high, depth + 1); + cudaStreamDestroy(cuda_stream); + } +} + +template <> +void CUDAQuickSort(int* values, const size_t n) { + CUDAQuickSortHelper<<<1, 1>>>(values, 0, static_cast(n), 0); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +template +__global__ void CUDAMergeKernel(T* values, T* buffer, int block_size, int len) { + const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const int low = block_size * 2 * thread_index; + T* values_first_part = values + low; + int num_data_first_part = min(block_size, len - low); + T* values_second_part = values + low + block_size; + int num_data_second_part = min(block_size, len - low - block_size); + T* buffer_pointer = buffer + low; + int first_part_index = 0; + int second_part_index = 0; + int buffer_index = 0; + while (first_part_index < num_data_first_part && second_part_index < num_data_second_part) { + if (ASCENDING) { + if (values_first_part[first_part_index] > values_second_part[second_part_index]) { + buffer_pointer[buffer_index++] = values_second_part[second_part_index++]; + } else { + buffer_pointer[buffer_index++] = values_first_part[first_part_index++]; + } + } else { + if (values_first_part[first_part_index] < values_second_part[second_part_index]) { + buffer_pointer[buffer_index++] = values_second_part[second_part_index++]; + } else { + buffer_pointer[buffer_index++] = values_first_part[first_part_index++]; + } + } + } + while (first_part_index < num_data_first_part) { + buffer_pointer[buffer_index++] = values_first_part[first_part_index++]; + } + for (int data_index = 0; data_index < buffer_index; ++data_index) { + values_first_part[data_index] = buffer_pointer[data_index]; + } +} + +template <> +void CUDAMergeSort(int* values, const size_t n) { + const int bitonic_num_blocks = (static_cast(n) + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; + auto start = std::chrono::steady_clock::now(); + BitonicSortForMergeSort<<>>(values, n); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + auto end = std::chrono::steady_clock::now(); + auto duration = static_cast>(end - start); + Log::Warning("bitonic sort time = %f", duration.count()); + int num_blocks_to_merge = bitonic_num_blocks; + int* buffer = nullptr; + AllocateCUDAMemoryOuter(&buffer, n, __FILE__, __LINE__); + int block_size = BITONIC_SORT_NUM_ELEMENTS; + start = std::chrono::steady_clock::now(); + while (num_blocks_to_merge > 1) { + num_blocks_to_merge = (num_blocks_to_merge + 1) / 2; + const int block_dim = 32; + const int num_kernel_blocks = (num_blocks_to_merge + block_dim - 1) / block_dim; + CUDAMergeKernel<<>>(values, buffer, block_size, static_cast(n)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + block_size <<= 1; + } + end = std::chrono::steady_clock::now(); + duration = static_cast>(end - start); + Log::Warning("merge time = %f", duration.count()); +} + +template +__device__ void BitonicArgSort_1024(const VAL_T* values, INDEX_T* indices, const size_t len) { + const int thread_index = 
static_cast(threadIdx.x); + for (int depth = 9; depth >= 1; --depth) { + const int segment_length = 1 << (10 - depth); + const int segment_index = thread_index / segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 1) : (segment_index % 2 == 0); + for (int inner_depth = depth; inner_depth < 10; ++inner_depth) { + const int inner_segment_length_half = 1 << (9 - inner_depth); + const int inner_segment_index_half = thread_index / inner_segment_length_half; + if (inner_segment_index_half % 2 == 0) { + const int index_to_compare = thread_index + inner_segment_length_half; + const INDEX_T this_index = indices[thread_index]; + const INDEX_T other_index = indices[index_to_compare]; + if ((values[this_index] > values[other_index]) == ascending && (index_to_compare < static_cast(len))) { + indices[thread_index] = other_index; + indices[index_to_compare] = this_index; + } + } + __syncthreads(); + } + } +} + +template +__device__ void BitonicArgSort(const VAL_T* values, INDEX_T* indices, size_t len) { + const int num_segments = (static_cast(len) + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; + int max_depth = 1; + int num_segments_to_move = num_segments - 1; + while (num_segments_to_move > 0) { + ++max_depth; + } + for (int depth = max_depth - 1; depth >= 1; --depth) { + const int segment_length = 1 << (max_depth - depth); + const int num_segments_in_level = 1 << depth; + for (int segment_index = 0; segment_index < num_segments_in_level; ++segment_index) { + const bool ascending = (segment_index % 2 == 0); + const size_t segment_start = segment_index * segment_length * BITONIC_SORT_NUM_ELEMENTS; + //const size_t segment_end = min(segment_start + ) + } + } +} + +template +__global__ void BitonicSortGlobalKernel(T* values, const int num_total_data) { + const int thread_index = static_cast(threadIdx.x); + const int low = static_cast(blockIdx.x * BITONIC_SORT_NUM_ELEMENTS); + const bool outer_ascending = ASCENDING ? (blockIdx.x % 2 == 0) : (blockIdx.x % 2 == 1); + T* values_pointer = values + low; + const int num_data = min(BITONIC_SORT_NUM_ELEMENTS, num_total_data - low); + __shared__ T shared_values[BITONIC_SORT_NUM_ELEMENTS]; + if (thread_index < num_data) { + shared_values[thread_index] = values_pointer[thread_index]; + } + __syncthreads(); + for (int depth = BITONIC_SORT_DEPTH - 1; depth >= 1; --depth) { + const int segment_length = 1 << (BITONIC_SORT_DEPTH - depth); + const int segment_index = thread_index / segment_length; + const bool ascending = outer_ascending ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + const int num_total_segment = (num_data + segment_length - 1) / segment_length; + { + const int inner_depth = depth; + const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); + const int inner_segment_index_half = thread_index / inner_segment_length_half; + const int offset = ((inner_segment_index_half >> 1) == num_total_segment - 1 && ascending == ASCENDING) ? 
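// Sketch (not part of this patch): the ceil(log2) depth computation the global bitonic
// routines rely on. Note the BitonicArgSort body above increments max_depth without
// shifting num_segments_to_move; the intended loop presumably mirrors
// BitonicSortGlobalHelper below, which shifts on every iteration.
#include <cassert>

int BitonicMaxDepth(int len) {
  int max_depth = 1;
  int len_to_shift = len - 1;
  while (len_to_shift > 0) {
    ++max_depth;
    len_to_shift >>= 1;  // halve each round; without this the loop never terminates
  }
  return max_depth;      // e.g. len = 1024 gives 11, matching BITONIC_SORT_DEPTH
}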
+ (num_total_segment * segment_length - num_data) : 0; + const int segment_start = segment_index * segment_length; + if (inner_segment_index_half % 2 == 0) { + if (thread_index >= offset + segment_start) { + const int index_to_compare = thread_index + inner_segment_length_half - offset; + if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { + const T tmp = shared_values[thread_index]; + shared_values[thread_index] = shared_values[index_to_compare]; + shared_values[index_to_compare] = tmp; + } + } + } + __syncthreads(); + } + for (int inner_depth = depth + 1; inner_depth < BITONIC_SORT_DEPTH; ++inner_depth) { + const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); + const int inner_segment_index_half = thread_index / inner_segment_length_half; + if (inner_segment_index_half % 2 == 0) { + const int index_to_compare = thread_index + inner_segment_length_half; + if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { + const T tmp = shared_values[thread_index]; + shared_values[thread_index] = shared_values[index_to_compare]; + shared_values[index_to_compare] = tmp; + } + } + __syncthreads(); + } + } + if (thread_index < num_data) { + values_pointer[thread_index] = shared_values[thread_index]; + } +} + +template +__global__ void BitonicSortMergeKernel(VAL_T* values, const int segment_length, const int len) { + const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const int segment_index = thread_index / segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS]; + const int offset = static_cast(blockIdx.x * blockDim.x); + const int local_len = min(BITONIC_SORT_NUM_ELEMENTS, len - offset); + if (thread_index < len) { + shared_values[threadIdx.x] = values[thread_index]; + } + __syncthreads(); + int half_segment_length = BITONIC_SORT_NUM_ELEMENTS / 2; + while (half_segment_length >= 1) { + const int half_segment_index = static_cast(threadIdx.x) / half_segment_length; + if (half_segment_index % 2 == 0) { + const int index_to_compare = static_cast(threadIdx.x) + half_segment_length; + if (index_to_compare < local_len && ((shared_values[threadIdx.x] > shared_values[index_to_compare]) == ascending)) { + const VAL_T tmp = shared_values[index_to_compare]; + shared_values[index_to_compare] = shared_values[threadIdx.x]; + shared_values[threadIdx.x] = tmp; + } + } + __syncthreads(); + half_segment_length >>= 1; + } + if (thread_index < len) { + values[thread_index] = shared_values[threadIdx.x]; + } +} + +template +__global__ void BitonicCompareKernel(VAL_T* values, const int half_segment_length, const int outer_segment_length, const int len) { + const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const int segment_index = thread_index / outer_segment_length; + const int half_segment_index = thread_index / half_segment_length; + const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); + if (half_segment_index % 2 == 0) { + const int num_total_segment = (len + outer_segment_length - 1) / outer_segment_length; + if (BEGIN && (half_segment_index >> 1) == num_total_segment - 1 && ascending == ASCENDING) { + const int offset = num_total_segment * outer_segment_length - len; + const int segment_start = segment_index * outer_segment_length; + if (thread_index >= offset + segment_start) { + const int index_to_compare = thread_index + half_segment_length - offset; + if (index_to_compare < len && (values[thread_index] > values[index_to_compare]) == ascending) { + const VAL_T tmp = values[index_to_compare]; + values[index_to_compare] = values[thread_index]; + values[thread_index] = tmp; + } + } + } else { + const int index_to_compare = thread_index + half_segment_length; + if (index_to_compare < len) { + if ((values[thread_index] > values[index_to_compare]) == ascending) { + const VAL_T tmp = values[index_to_compare]; + values[index_to_compare] = values[thread_index]; + values[thread_index] = tmp; + } + } + } + } +} + +template +void BitonicSortGlobalHelper(VAL_T* values, const size_t len) { + int max_depth = 1; + int len_to_shift = static_cast(len) - 1; + while (len_to_shift > 0) { + ++max_depth; + len_to_shift >>= 1; + } + /*std::vector tmp_result(len); + CopyFromCUDADeviceToHostOuter(tmp_result.data(), values, len, __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + Log::Warning("=============================== before sorting ==============================="); + for (size_t i = 0; i < len; ++i) { + Log::Warning("tmp_result[%d] = %d", i, tmp_result[i]); + }*/ + const int num_blocks = (static_cast(len) + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; + BitonicSortGlobalKernel<<>>(values, static_cast(len)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + /*CopyFromCUDADeviceToHostOuter(tmp_result.data(), values, len, __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + Log::Warning("=============================== after block sort stage ==============================="); + for (size_t i = 0; i < len; ++i) { + Log::Warning("tmp_result[%d] = %d", i, tmp_result[i]); + }*/ + for (int depth = max_depth - 11; depth >= 1; --depth) { + const int segment_length = (1 << (max_depth - depth)); + int half_segment_length = (segment_length >> 1); + { + BitonicCompareKernel<<>>(values, half_segment_length, segment_length, static_cast(len)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + /*CopyFromCUDADeviceToHostOuter(tmp_result.data(), values, len, __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + Log::Warning("=============================== after compare stage depth %d inner depth = %d ===============================", depth, depth); + for (size_t i = 0; i < len; ++i) { + Log::Warning("tmp_result[%d] = %d", i, tmp_result[i]); + }*/ + half_segment_length >>= 1; + } + for (int inner_depth = depth + 1; inner_depth <= max_depth - 11; ++inner_depth) { + BitonicCompareKernel<<>>(values, half_segment_length, segment_length, static_cast(len)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + /*CopyFromCUDADeviceToHostOuter(tmp_result.data(), values, len, __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + Log::Warning("=============================== after compare stage depth %d inner depth = %d ===============================", depth, inner_depth); + for (size_t i = 0; i < len; ++i) { + Log::Warning("tmp_result[%d] = %d", i, tmp_result[i]); 
+ }*/ + half_segment_length >>= 1; + } + BitonicSortMergeKernel<<>>(values, segment_length, static_cast(len)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + /*CopyFromCUDADeviceToHostOuter(tmp_result.data(), values, len, __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + Log::Warning("=============================== after merge stage depth %d ===============================", depth); + for (size_t i = 0; i < len; ++i) { + Log::Warning("tmp_result[%d] = %d", i, tmp_result[i]); + }*/ + } +} + +template <> +void BitonicSortGlobal(int* values, const size_t len) { + BitonicSortGlobalHelper(values, len); +} + } // namespace LightGBM diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp index a5bcf2bc1a98..1ccff8d76257 100644 --- a/src/cuda/cuda_utils.cpp +++ b/src/cuda/cuda_utils.cpp @@ -13,6 +13,10 @@ void SynchronizeCUDADeviceOuter(const char* file, const int line) { CUDASUCCESS_OR_FATAL_OUTER(cudaDeviceSynchronize()); } +void SynchronizeCUDADeviceOuter(cudaStream_t cuda_stream, const char* file, const int line) { + CUDASUCCESS_OR_FATAL_OUTER(cudaStreamSynchronize(cuda_stream)); +} + void PrintLastCUDAErrorOuter(const char* /*file*/, const int /*line*/) { const char* error_name = cudaGetErrorName(cudaGetLastError()); Log::Warning(error_name); diff --git a/src/io/cuda/cuda_metadata.cpp b/src/io/cuda/cuda_metadata.cpp index 29b66ef51b7f..eb3ee6f09f37 100644 --- a/src/io/cuda/cuda_metadata.cpp +++ b/src/io/cuda/cuda_metadata.cpp @@ -17,12 +17,36 @@ void CUDAMetadata::Init(const std::vector& label, const std::vector& query_weights, const std::vector& init_score, const std::vector& queries) { - InitCUDAMemoryFromHostMemoryOuter(&cuda_label_, label.data(), label.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_weights_, weight.data(), weight.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_query_boundaries_, query_boundaries.data(), query_boundaries.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_query_weights_, query_weights.data(), query_weights.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_init_score_, init_score.data(), init_score.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_queries_, queries.data(), queries.size(), __FILE__, __LINE__); + if (label.size() == 0) { + cuda_label_ = nullptr; + } else { + InitCUDAMemoryFromHostMemoryOuter(&cuda_label_, label.data(), label.size(), __FILE__, __LINE__); + } + if (weight.size() == 0) { + cuda_weights_ = nullptr; + } else { + InitCUDAMemoryFromHostMemoryOuter(&cuda_weights_, weight.data(), weight.size(), __FILE__, __LINE__); + } + if (query_boundaries.size() == 0) { + cuda_query_boundaries_ = nullptr; + } else { + InitCUDAMemoryFromHostMemoryOuter(&cuda_query_boundaries_, query_boundaries.data(), query_boundaries.size(), __FILE__, __LINE__); + } + if (query_weights.size() == 0) { + cuda_query_weights_ = nullptr; + } else { + InitCUDAMemoryFromHostMemoryOuter(&cuda_query_weights_, query_weights.data(), query_weights.size(), __FILE__, __LINE__); + } + if (init_score.size() == 0) { + cuda_init_score_ = nullptr; + } else { + InitCUDAMemoryFromHostMemoryOuter(&cuda_init_score_, init_score.data(), init_score.size(), __FILE__, __LINE__); + } + if (queries.size() == 0) { + cuda_queries_ = nullptr; + } else { + InitCUDAMemoryFromHostMemoryOuter(&cuda_queries_, queries.data(), queries.size(), __FILE__, __LINE__); + } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } diff --git 
a/src/objective/cuda/cuda_binary_objective.cpp b/src/objective/cuda/cuda_binary_objective.cpp index 3fa313af789d..51e8e24436ad 100644 --- a/src/objective/cuda/cuda_binary_objective.cpp +++ b/src/objective/cuda/cuda_binary_objective.cpp @@ -10,9 +10,8 @@ namespace LightGBM { -CUDABinaryLogloss::CUDABinaryLogloss(const Config& config, - std::function is_pos): -BinaryLogloss(config, is_pos) {} +CUDABinaryLogloss::CUDABinaryLogloss(const Config& config, const int ova_class_id): +BinaryLogloss(config), ova_class_id_(ova_class_id) {} CUDABinaryLogloss::CUDABinaryLogloss(const std::vector& strs): BinaryLogloss(strs) {} @@ -24,6 +23,11 @@ void CUDABinaryLogloss::Init(const Metadata& metadata, data_size_t num_data) { cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); AllocateCUDAMemoryOuter(&cuda_boost_from_score_, 1, __FILE__, __LINE__); SetCUDAMemoryOuter(cuda_boost_from_score_, 0, 1, __FILE__, __LINE__); + if (label_weights_[0] != 1.0f || label_weights_[1] != 1.0f) { + InitCUDAMemoryFromHostMemoryOuter(&cuda_label_weights_, label_weights_, 2, __FILE__, __LINE__); + } else { + cuda_label_weights_ = nullptr; + } } void CUDABinaryLogloss::GetGradients(const double* scores, score_t* gradients, score_t* hessians) const { diff --git a/src/objective/cuda/cuda_binary_objective.cu b/src/objective/cuda/cuda_binary_objective.cu index d49e2260f0c8..4b25fc09e8e8 100644 --- a/src/objective/cuda/cuda_binary_objective.cu +++ b/src/objective/cuda/cuda_binary_objective.cu @@ -50,32 +50,85 @@ void CUDABinaryLogloss::LaunchBoostFromScoreKernel() const { SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } +template __global__ void GetGradientsKernel_BinaryLogloss(const double* cuda_scores, const label_t* cuda_labels, + const double* cuda_label_weights, const label_t* cuda_weights, const int ova_class_id, const double sigmoid, const data_size_t num_data, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); if (data_index < num_data) { const label_t cuda_label = static_cast(cuda_labels[data_index]); - const int label = cuda_label == 0 ? -1 : 1; + const int label = IS_OVA ? (cuda_label > 0 ? 1 : -1) : (cuda_label == ova_class_id ? 
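+    // Descriptive note: label is mapped to +1/-1 before the logistic gradient below;
+    // ova_class_id identifies the positive class when this loss is used inside one-vs-all.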
1 : -1); const double response = -label * sigmoid / (1.0f + std::exp(label * sigmoid * cuda_scores[data_index])); const double abs_response = fabs(response); - cuda_out_gradients[data_index] = static_cast(response); - cuda_out_hessians[data_index] = static_cast(abs_response * (sigmoid - abs_response)); + if (USE_WEIGHT) { + if (USE_LABEL_WEIGHT) { + const double label_weight = cuda_label_weights[label]; + cuda_out_gradients[data_index] = static_cast(response * label_weight); + cuda_out_hessians[data_index] = static_cast(abs_response * (sigmoid - abs_response) * label_weight); + } else { + cuda_out_gradients[data_index] = static_cast(response); + cuda_out_hessians[data_index] = static_cast(abs_response * (sigmoid - abs_response)); + } + } else { + const double sample_weight = cuda_weights[data_index]; + if (USE_LABEL_WEIGHT) { + const double label_weight = cuda_label_weights[label]; + cuda_out_gradients[data_index] = static_cast(response * label_weight * sample_weight); + cuda_out_hessians[data_index] = static_cast(abs_response * (sigmoid - abs_response) * label_weight * sample_weight); + } else { + cuda_out_gradients[data_index] = static_cast(response * sample_weight); + cuda_out_hessians[data_index] = static_cast(abs_response * (sigmoid - abs_response) * sample_weight); + } + } } } +#define GetGradientsKernel_BinaryLogloss_ARGS \ + scores, \ + cuda_label_, \ + cuda_label_weights_, \ + cuda_weights_, \ + ova_class_id_, \ + sigmoid_, \ + num_data_, \ + gradients, \ + hessians + void CUDABinaryLogloss::LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const { const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_BINARY - 1) / GET_GRADIENTS_BLOCK_SIZE_BINARY; - GetGradientsKernel_BinaryLogloss<<>>( - scores, - cuda_label_, - sigmoid_, - num_data_, - gradients, - hessians); + if (ova_class_id_ == -1) { + if (cuda_label_weights_ == nullptr) { + if (cuda_weights_ == nullptr) { + GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); + } else { + GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); + } + } else { + if (cuda_weights_ == nullptr) { + GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); + } else { + GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); + } + } + } else { + if (cuda_label_weights_ == nullptr) { + if (cuda_weights_ == nullptr) { + GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); + } else { + GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); + } + } else { + if (cuda_weights_ == nullptr) { + GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); + } else { + GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); + } + } + } } -__global__ void ConvertOutputCUDAKernel(const double sigmoid, const data_size_t num_data, const double* input, double* output) { +__global__ void ConvertOutputCUDAKernel_BinaryLogloss(const double sigmoid, const data_size_t num_data, const double* input, double* output) { const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); if (data_index < num_data) { output[data_index] = 1.0f / (1.0f + exp(-sigmoid * input[data_index])); @@ -84,7 +137,7 @@ __global__ void ConvertOutputCUDAKernel(const double sigmoid, const data_size_t void CUDABinaryLogloss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { const int num_blocks = (num_data + 
GET_GRADIENTS_BLOCK_SIZE_BINARY - 1) / GET_GRADIENTS_BLOCK_SIZE_BINARY; - ConvertOutputCUDAKernel<<>>(sigmoid_, num_data, input, output); + ConvertOutputCUDAKernel_BinaryLogloss<<>>(sigmoid_, num_data, input, output); } } // namespace LightGBM diff --git a/src/objective/cuda/cuda_binary_objective.hpp b/src/objective/cuda/cuda_binary_objective.hpp index 4d75790ec630..7cec5637e73d 100644 --- a/src/objective/cuda/cuda_binary_objective.hpp +++ b/src/objective/cuda/cuda_binary_objective.hpp @@ -20,8 +20,7 @@ namespace LightGBM { class CUDABinaryLogloss : public CUDAObjectiveInterface, public BinaryLogloss { public: - explicit CUDABinaryLogloss(const Config& config, - std::function is_pos = nullptr); + explicit CUDABinaryLogloss(const Config& config, const int ova_class_id = -1); explicit CUDABinaryLogloss(const std::vector& strs); @@ -50,11 +49,12 @@ class CUDABinaryLogloss : public CUDAObjectiveInterface, public BinaryLogloss { // CUDA memory, held by other objects const label_t* cuda_label_; - // TODO(shiyu1994): add weighted gradients const label_t* cuda_weights_; // CUDA memory, held by this object double* cuda_boost_from_score_; + double* cuda_label_weights_; + const int ova_class_id_ = -1; }; } // namespace LightGBM diff --git a/src/objective/cuda/cuda_multiclass_objective.cpp b/src/objective/cuda/cuda_multiclass_objective.cpp new file mode 100644 index 000000000000..930fd7dd07f4 --- /dev/null +++ b/src/objective/cuda/cuda_multiclass_objective.cpp @@ -0,0 +1,63 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ +#include "cuda_multiclass_objective.hpp" + +namespace LightGBM { + +CUDAMulticlassSoftmax::CUDAMulticlassSoftmax(const Config& config): MulticlassSoftmax(config) {} + +CUDAMulticlassSoftmax::CUDAMulticlassSoftmax(const std::vector& strs): MulticlassSoftmax(strs) {} + +CUDAMulticlassSoftmax::~CUDAMulticlassSoftmax() {} + +void CUDAMulticlassSoftmax::Init(const Metadata& metadata, data_size_t num_data) { + MulticlassSoftmax::Init(metadata, num_data); + cuda_label_ = metadata.cuda_metadata()->cuda_label(); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); + AllocateCUDAMemoryOuter(&cuda_boost_from_score_, num_class_, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_softmax_buffer_, static_cast(num_data) * static_cast(num_class_), __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_boost_from_score_, 0, num_class_, __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +void CUDAMulticlassSoftmax::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { + LaunchGetGradientsKernel(score, gradients, hessians); + /*std::vector cpu_gradients(100, 0.0f); + std::vector cpu_hessians(100, 0.0f); + CopyFromCUDADeviceToHostOuter(cpu_gradients.data(), gradients, 100, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(cpu_hessians.data(), hessians, 100, __FILE__, __LINE__); + for (size_t i = 0; i < 100; ++i) { + Log::Warning("class 0 data %d gradient = %f, hessians = %f", i, cpu_gradients[i], cpu_hessians[i]); + } + CopyFromCUDADeviceToHostOuter(cpu_gradients.data(), gradients + num_data_ - 100, 100, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(cpu_hessians.data(), hessians + num_data_ - 100, 100, __FILE__, __LINE__); + for (size_t i = 0; i < 100; ++i) { + Log::Warning("class 0 data %d gradient = %f, hessians = %f", i + num_data_ - 100, cpu_gradients[i], cpu_hessians[i]); + } + 
CopyFromCUDADeviceToHostOuter(cpu_gradients.data(), gradients + num_data_, 100, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(cpu_hessians.data(), hessians + num_data_, 100, __FILE__, __LINE__); + for (size_t i = 0; i < 100; ++i) { + Log::Warning("class 1 data %d gradient = %f, hessians = %f", i, cpu_gradients[i], cpu_hessians[i]); + }*/ + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +void CUDAMulticlassSoftmax::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { + LaunchConvertOutputCUDAKernel(num_data, input, output); +} + +CUDAMulticlassOVA::CUDAMulticlassOVA(const Config& config) { + num_class_ = config.num_class; + for (int i = 0; i < num_class_; ++i) { + binary_loss_.emplace_back(new CUDABinaryLogloss(config, i)); + } + sigmoid_ = config.sigmoid; +} + +CUDAMulticlassOVA::CUDAMulticlassOVA(const std::vector& strs): MulticlassOVA(strs) {} + +CUDAMulticlassOVA::~CUDAMulticlassOVA() {} + +} // namespace LightGBM diff --git a/src/objective/cuda/cuda_multiclass_objective.cu b/src/objective/cuda/cuda_multiclass_objective.cu new file mode 100644 index 000000000000..e00842b32eb2 --- /dev/null +++ b/src/objective/cuda/cuda_multiclass_objective.cu @@ -0,0 +1,108 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ +#include "cuda_multiclass_objective.hpp" + +namespace LightGBM { + +__device__ void SoftmaxCUDA(double* softmax_buffer, int len) { + double wmax = softmax_buffer[0]; + for (int i = 1; i < len; ++i) { + wmax = max(softmax_buffer[i], wmax); + } + double wsum = 0.0f; + for (int i = 0; i < len; ++i) { + softmax_buffer[i] = exp(softmax_buffer[i] - wmax); + wsum += softmax_buffer[i]; + } + for (int i = 0; i < len; ++i) { + softmax_buffer[i] /= static_cast(wsum); + } +} + +__device__ void SoftmaxCUDA(const double* input, int len, double* output) { + double wmax = input[0]; + for (int i = 1; i < len; ++i) { + wmax = max(input[i], wmax); + } + double wsum = 0.0f; + for (int i = 0; i < len; ++i) { + output[i] = exp(input[i] - wmax); + wsum += output[i]; + } + for (int i = 0; i < len; ++i) { + output[i] /= static_cast(wsum); + } +} + +template +__global__ void GetGradientsKernel_MulticlassSoftmax( + const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, + const double factor, const int num_class, const data_size_t num_data, + double* cuda_softmax_buffer, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data) { + const data_size_t offset = data_index * num_class; + double* softmax_result = cuda_softmax_buffer + offset; + for (int k = 0; k < num_class; ++k) { + softmax_result[k] = cuda_scores[k * num_data + data_index]; + } + SoftmaxCUDA(softmax_result, num_class); + if (!USE_WEIGHT) { + for (int k = 0; k < num_class; ++k) { + const double p = softmax_result[k]; + size_t idx = static_cast(num_data) * k + data_index; + if (static_cast(cuda_labels[data_index]) == k) { + cuda_out_gradients[idx] = static_cast(p - 1.0f); + } else { + cuda_out_gradients[idx] = static_cast(p); + } + cuda_out_hessians[idx] = static_cast(factor * p * (1.0f - p)); + } + } else { + for (int k = 0; k < num_class; ++k) { + const double p = softmax_result[k]; + const double weight = cuda_weights[data_index]; + size_t idx = static_cast(num_data) * k + data_index; + if (static_cast(cuda_labels[data_index]) == 
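+        // Weighted softmax gradients: grad = (p - y_k) * w and hess = factor * p * (1 - p) * w,
+        // where y_k is 1 for the labelled class and 0 otherwise.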
k) { + cuda_out_gradients[idx] = static_cast((p - 1.0f) * weight); + } else { + cuda_out_gradients[idx] = static_cast(p * weight); + } + cuda_out_hessians[idx] = static_cast((factor * p * (1.0f - p)) * weight); + } + } + } +} + +void CUDAMulticlassSoftmax::LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_MULTICLASS - 1) / GET_GRADIENTS_BLOCK_SIZE_MULTICLASS; + if (cuda_weights_ == nullptr) { + GetGradientsKernel_MulticlassSoftmax<<>>( + scores, cuda_label_, cuda_weights_, factor_, num_class_, num_data_, + cuda_softmax_buffer_, gradients, hessians); + } else { + GetGradientsKernel_MulticlassSoftmax<<>>( + scores, cuda_label_, cuda_weights_, factor_, num_class_, num_data_, + cuda_softmax_buffer_, gradients, hessians); + } +} + +__global__ void ConvertOutputCUDAKernel_MulticlassSoftmax( + const int num_class, const data_size_t num_data, const double* input, double* output) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data) { + const data_size_t offset = data_index * num_class; + SoftmaxCUDA(input + offset, num_class, output + offset); + } +} + +void CUDAMulticlassSoftmax::LaunchConvertOutputCUDAKernel( + const data_size_t num_data, const double* input, double* output) const { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_MULTICLASS - 1) / GET_GRADIENTS_BLOCK_SIZE_MULTICLASS; + ConvertOutputCUDAKernel_MulticlassSoftmax<<>>( + num_class_, num_data, input, output); +} + +} // namespace LightGBM diff --git a/src/objective/cuda/cuda_multiclass_objective.hpp b/src/objective/cuda/cuda_multiclass_objective.hpp new file mode 100644 index 000000000000..d6e75c5f9129 --- /dev/null +++ b/src/objective/cuda/cuda_multiclass_objective.hpp @@ -0,0 +1,62 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ +#ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_ +#define LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_ + +#include +#include "cuda_binary_objective.hpp" +#include "../multiclass_objective.hpp" + +#define GET_GRADIENTS_BLOCK_SIZE_MULTICLASS (1024) + +namespace LightGBM { + +class CUDAMulticlassSoftmax: public CUDAObjectiveInterface, public MulticlassSoftmax { + public: + explicit CUDAMulticlassSoftmax(const Config& config); + + explicit CUDAMulticlassSoftmax(const std::vector& strs); + + ~CUDAMulticlassSoftmax(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; + + void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; + + std::function GetCUDAConvertOutputFunc() const override { + return [this] (data_size_t num_data, const double* input, double* output) { + ConvertOutputCUDA(num_data, input, output); + }; + } + + private: + void LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const; + + void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; + + // CUDA memory, held by other objects + const label_t* cuda_label_; + // TODO(shiyu1994): add weighted gradients + const label_t* cuda_weights_; + + // CUDA memory, held by this object + double* cuda_boost_from_score_; + double* cuda_softmax_buffer_; +}; + +class CUDAMulticlassOVA: public CUDAObjectiveInterface, public MulticlassOVA { + public: + explicit CUDAMulticlassOVA(const Config& config); + + explicit CUDAMulticlassOVA(const std::vector& strs); + + ~CUDAMulticlassOVA(); +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_ diff --git a/src/objective/cuda/cuda_rank_objective.cpp b/src/objective/cuda/cuda_rank_objective.cpp index 7a9342c3e2f9..9bc97ffb7545 100644 --- a/src/objective/cuda/cuda_rank_objective.cpp +++ b/src/objective/cuda/cuda_rank_objective.cpp @@ -15,6 +15,7 @@ LambdarankNDCG(config) {} void CUDALambdarankNDCG::Init(const Metadata& metadata, data_size_t num_data) { const int num_threads = OMP_NUM_THREADS(); + TestCUDAQuickSort(); LambdarankNDCG::Init(metadata, num_data); std::vector thread_max_num_items_in_query(num_threads); @@ -51,6 +52,27 @@ void CUDALambdarankNDCG::Init(const Metadata& metadata, data_size_t num_data) { void CUDALambdarankNDCG::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { LaunchGetGradientsKernel(score, gradients, hessians); + std::vector host_gradients(100, 0.0f); + std::vector host_hessians(100, 0.0f); + CopyFromCUDADeviceToHostOuter(host_gradients.data(), gradients, 100, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(host_hessians.data(), hessians, 100, __FILE__, __LINE__); + for (int i = 0; i < 100; ++i) { + Log::Warning("host_gradient[%d] = %f, host_hessians[%d] = %f", i, host_gradients[i], host_hessians[i]); + } +} + +CUDARankXENDCG::CUDARankXENDCG(const Config& config): RankXENDCG(config) {} + +CUDARankXENDCG::CUDARankXENDCG(const std::vector& strs): RankXENDCG(strs) {} + +CUDARankXENDCG::~CUDARankXENDCG() {} + +void CUDARankXENDCG::Init(const Metadata& metadata, data_size_t num_data) { + RankXENDCG::Init(metadata, num_data); +} + +void CUDARankXENDCG::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { + LaunchGetGradientsKernel(score, gradients, hessians); } } // namespace LightGBM diff --git 
a/src/objective/cuda/cuda_rank_objective.cu b/src/objective/cuda/cuda_rank_objective.cu index 637c1838b81b..2cfeb218568d 100644 --- a/src/objective/cuda/cuda_rank_objective.cu +++ b/src/objective/cuda/cuda_rank_objective.cu @@ -8,9 +8,83 @@ #include "cuda_rank_objective.hpp" +#include +#include +#include + +#define BITONIC_SORT_NUM_ELEMENTS_LOCAL (1024) +#define BITONIC_SORT_DEPTH_LOCAL (11) + namespace LightGBM { -__device__ void ArgSort(const score_t* scores, uint16_t* indices, const uint16_t num_items) { +template +__global__ void BitonicSortForMergeSortLocal(T* values, const int num_total_data) { + const int thread_index = static_cast(threadIdx.x); + const int low = static_cast(blockIdx.x * BITONIC_SORT_NUM_ELEMENTS_LOCAL); + T* values_pointer = values + low; + const int num_data = min(BITONIC_SORT_NUM_ELEMENTS_LOCAL, num_total_data - low); + __shared__ T shared_values[BITONIC_SORT_NUM_ELEMENTS_LOCAL]; + if (thread_index < num_data) { + shared_values[thread_index] = values_pointer[thread_index]; + } + __syncthreads(); + for (int depth = BITONIC_SORT_DEPTH_LOCAL - 1; depth >= 1; --depth) { + const int segment_length = 1 << (BITONIC_SORT_DEPTH_LOCAL - depth); + const int segment_index = thread_index / segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + for (int inner_depth = depth; inner_depth < BITONIC_SORT_DEPTH_LOCAL; ++inner_depth) { + const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH_LOCAL - 1 - inner_depth); + const int inner_segment_index_half = thread_index / inner_segment_length_half; + if (inner_segment_index_half % 2 == 0) { + const int index_to_compare = thread_index + inner_segment_length_half; + if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { + const T tmp = shared_values[thread_index]; + shared_values[thread_index] = shared_values[index_to_compare]; + shared_values[index_to_compare] = tmp; + } + } + __syncthreads(); + } + } + if (thread_index < num_data) { + values_pointer[thread_index] = shared_values[thread_index]; + } +} + +template +__global__ void BitonicSortLocal(T* values, const int low, const int high) { + const int thread_index = static_cast(threadIdx.x); + T* values_pointer = values + low; + const int num_data = high - low; + __shared__ T shared_values[1024]; + if (thread_index < num_data) { + shared_values[thread_index] = values_pointer[thread_index]; + } + __syncthreads(); + for (int depth = 10; depth >= 1; --depth) { + const int segment_length = 1 << (11 - depth); + const int segment_index = thread_index / segment_length; + const bool ascending = ASCENDING ? 
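+    // Adjacent segments sort in opposite directions so each pair forms a bitonic
+    // sequence for the next (longer) merge level.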
(segment_index % 2 == 0) : (segment_index % 2 == 1); + for (int inner_depth = depth; inner_depth < 11; ++inner_depth) { + const int inner_segment_length_half = 1 << (10 - inner_depth); + const int inner_segment_index_half = thread_index / inner_segment_length_half; + if (inner_segment_index_half % 2 == 0) { + const int index_to_compare = thread_index + inner_segment_length_half; + if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { + const T tmp = shared_values[thread_index]; + shared_values[thread_index] = shared_values[index_to_compare]; + shared_values[index_to_compare] = tmp; + } + } + __syncthreads(); + } + } + if (thread_index < num_data) { + values_pointer[thread_index] = shared_values[thread_index]; + } +} + +__device__ __forceinline__ void ArgSort(const score_t* scores, uint16_t* indices, const uint16_t num_items) { uint16_t num_items_aligned = 1; uint16_t num_items_ref = num_items - 1; uint16_t depth = 1; @@ -42,7 +116,7 @@ __device__ void ArgSort(const score_t* scores, uint16_t* indices, const uint16_t } } -__device__ void ArgSort_Partial(const score_t* scores, uint16_t* indices, const uint16_t num_items, const bool outer_decending) { +__device__ __forceinline__ void ArgSort_Partial(const score_t* scores, uint16_t* indices, const uint16_t num_items, const bool outer_decending) { uint16_t num_items_aligned = 1; uint16_t num_items_ref = num_items - 1; uint16_t depth = 1; @@ -74,23 +148,21 @@ __device__ void ArgSort_Partial(const score_t* scores, uint16_t* indices, const } } -__device__ void ArgSort_2048(const score_t* scores, uint16_t* indices, const uint16_t num_items) { - const uint16_t depth = 11; - const uint16_t half_num_items_aligned = 1024; - ArgSort_Partial(scores, indices, half_num_items_aligned, true); - ArgSort_Partial(scores + half_num_items_aligned, indices + half_num_items_aligned, half_num_items_aligned, false); - const unsigned int index_to_compare = threadIdx.x + half_num_items_aligned; +__device__ __forceinline__ void ArgSort_2048(const score_t* scores, uint16_t* indices, const uint16_t num_items) { + ArgSort_Partial(scores, indices, 1024, true); + ArgSort_Partial(scores + 1024, indices + 1024, 1024, false); + const unsigned int index_to_compare = threadIdx.x + 1024; if (scores[indices[index_to_compare]] > scores[indices[threadIdx.x]]) { const uint16_t temp_index = indices[index_to_compare]; indices[index_to_compare] = indices[threadIdx.x]; indices[threadIdx.x] = temp_index; } __syncthreads(); - for (uint16_t inner_depth = 1; inner_depth < depth; ++inner_depth) { - const uint16_t segment_length = 1 << (depth - inner_depth); + for (uint16_t inner_depth = 1; inner_depth < 11; ++inner_depth) { + const uint16_t segment_length = 1 << (11 - inner_depth); const uint16_t half_segment_length = segment_length >> 1; const uint16_t half_segment_index = threadIdx.x / half_segment_length; - if (threadIdx.x < half_num_items_aligned) { + if (threadIdx.x < 1024) { if (half_segment_index % 2 == 0) { const uint16_t index_to_compare = threadIdx.x + half_segment_length; if (scores[indices[threadIdx.x]] < scores[indices[index_to_compare]]) { @@ -102,13 +174,13 @@ __device__ void ArgSort_2048(const score_t* scores, uint16_t* indices, const uin } __syncthreads(); } - const score_t* scores_ptr = scores + half_num_items_aligned; - uint16_t* indices_ptr = indices + half_num_items_aligned; - for (uint16_t inner_depth = 1; inner_depth < depth; ++inner_depth) { - const uint16_t segment_length = 1 << (depth - inner_depth); + const 
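+  // Repeat the intra-half bitonic merge for the upper 1024 items after the cross-half compare above.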
score_t* scores_ptr = scores + 1024; + uint16_t* indices_ptr = indices + 1024; + for (uint16_t inner_depth = 1; inner_depth < 11; ++inner_depth) { + const uint16_t segment_length = 1 << (11 - inner_depth); const uint16_t half_segment_length = segment_length >> 1; const uint16_t half_segment_index = threadIdx.x / half_segment_length; - if (threadIdx.x < half_num_items_aligned) { + if (threadIdx.x < 1024) { if (half_segment_index % 2 == 0) { const uint16_t index_to_compare = threadIdx.x + half_segment_length; if (scores_ptr[indices_ptr[threadIdx.x]] < scores_ptr[indices_ptr[index_to_compare]]) { @@ -122,7 +194,7 @@ __device__ void ArgSort_2048(const score_t* scores, uint16_t* indices, const uin } } -__global__ void GetGradientsKernel_Ranking(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, +__global__ void GetGradientsKernel_LambdarankNDCG(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, const data_size_t num_queries, const data_size_t* cuda_query_boundaries, const double* cuda_inverse_max_dcgs, const bool norm, const double sigmoid, const int truncation_level, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { @@ -242,7 +314,7 @@ __global__ void GetGradientsKernel_Ranking(const double* cuda_scores, const labe } } -__global__ void GetGradientsKernel_Ranking_2048(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, +__global__ void GetGradientsKernel_LambdarankNDCG_2048(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, const data_size_t num_queries, const data_size_t* cuda_query_boundaries, const double* cuda_inverse_max_dcgs, const bool norm, const double sigmoid, const int truncation_level, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { @@ -252,7 +324,6 @@ __global__ void GetGradientsKernel_Ranking_2048(const double* cuda_scores, const __shared__ score_t shared_hessians[MAX_NUM_ITEM_IN_QUERY]; const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); - const double min_score = kMinScore; for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { const double inverse_max_dcg = cuda_inverse_max_dcgs[query_index]; const data_size_t query_start = cuda_query_boundaries[query_index]; @@ -268,7 +339,7 @@ __global__ void GetGradientsKernel_Ranking_2048(const double* cuda_scores, const shared_lambdas[threadIdx.x] = 0.0f; shared_hessians[threadIdx.x] = 0.0f; } else { - shared_scores[threadIdx.x] = min_score; + shared_scores[threadIdx.x] = kMinScore; shared_indices[threadIdx.x] = static_cast(threadIdx.x); } if (query_item_count > 1024) { @@ -279,7 +350,7 @@ __global__ void GetGradientsKernel_Ranking_2048(const double* cuda_scores, const shared_lambdas[threadIdx_x_plus_1024] = 0.0f; shared_hessians[threadIdx_x_plus_1024] = 0.0f; } else { - shared_scores[threadIdx_x_plus_1024] = min_score; + shared_scores[threadIdx_x_plus_1024] = kMinScore; shared_indices[threadIdx_x_plus_1024] = static_cast(threadIdx_x_plus_1024); } } @@ -293,7 +364,7 @@ __global__ void GetGradientsKernel_Ranking_2048(const double* cuda_scores, const // get best and worst score const double best_score = shared_scores[shared_indices[0]]; data_size_t worst_idx = query_item_count - 1; - if (worst_idx > 0 && shared_scores[shared_indices[worst_idx]] == min_score) { + if (worst_idx > 0 && shared_scores[shared_indices[worst_idx]] == 
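+      // If the lowest-ranked slot holds the kMinScore sentinel, step back one position so
+      // worst_score comes from a valid item.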
kMinScore) { worst_idx -= 1; } const double worst_score = shared_scores[shared_indices[worst_idx]]; @@ -315,7 +386,7 @@ __global__ void GetGradientsKernel_Ranking_2048(const double* cuda_scores, const const data_size_t j = pair_index % num_j_per_i + 1; if (j > i) { // skip pairs with the same labels - if (cuda_label_pointer[shared_indices[i]] != cuda_label_pointer[shared_indices[j]] && shared_scores[shared_indices[j]] != min_score) { + if (cuda_label_pointer[shared_indices[i]] != cuda_label_pointer[shared_indices[j]] && shared_scores[shared_indices[j]] != kMinScore) { data_size_t high_rank, low_rank; if (cuda_label_pointer[shared_indices[i]] > cuda_label_pointer[shared_indices[j]]) { high_rank = i; @@ -365,7 +436,7 @@ __global__ void GetGradientsKernel_Ranking_2048(const double* cuda_scores, const atomicAdd_block(&sum_lambdas, thread_sum_lambdas); __syncthreads(); if (norm && sum_lambdas > 0) { - double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas; + const double norm_factor = log2(1 + sum_lambdas) / sum_lambdas; if (threadIdx.x < static_cast(query_item_count)) { cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x] * norm_factor); cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x] * norm_factor); @@ -397,18 +468,19 @@ __global__ void GetGradientsKernel_Ranking_2048(const double* cuda_scores, const void CUDALambdarankNDCG::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; if (max_items_in_query_aligned_ <= 1024) { - GetGradientsKernel_Ranking<<>>(score, cuda_labels_, num_data_, + GetGradientsKernel_LambdarankNDCG<<>>(score, cuda_labels_, num_data_, num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, norm_, sigmoid_, truncation_level_, gradients, hessians); } else if (max_items_in_query_aligned_ <= 2048) { - GetGradientsKernel_Ranking_2048<<>>(score, cuda_labels_, num_data_, + GetGradientsKernel_LambdarankNDCG_2048<<>>(score, cuda_labels_, num_data_, num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, norm_, sigmoid_, truncation_level_, gradients, hessians); } else { Log::Fatal("Too large max_items_in_query_aligned_ = %d", max_items_in_query_aligned_); } + PrintLastCUDAErrorOuter(__FILE__, __LINE__); } __device__ void PrefixSumBankConflict(uint16_t* elements, unsigned int n) { @@ -512,6 +584,84 @@ void CUDALambdarankNDCG::LaunchCalcInverseMaxDCGKernel() { SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } +__global__ void GetGradientsKernel_RankXENDCG() {} + +void CUDARankXENDCG::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const {} + +void CUDALambdarankNDCG::TestCUDAQuickSort() const { + const int test_num_data = (1 << 24) + 13; + const int data_range = 1000; + const int num_threads = OMP_NUM_THREADS(); + std::vector rand_integers(test_num_data, 0); + std::vector distribution_prob(data_range, 1.0f / data_range); + std::discrete_distribution dist(distribution_prob.begin(), distribution_prob.end()); + std::vector rand_engines(num_threads); + Threading::For(0, test_num_data, 512, + [&rand_engines, &dist, &rand_integers] (int thread_index, int start, int end) { + rand_engines[thread_index] = std::mt19937(thread_index); + for (int i = start; i < end; ++i) { + rand_integers[i] = dist(rand_engines[thread_index]); + } + }); + + const int smaller_test_num_data = /*(1 << 11) +*/ 170; + std::vector bitonic_sort_integers(rand_integers.begin(), 
rand_integers.begin() + smaller_test_num_data); + std::vector cuda_bitonic_sort_integers = bitonic_sort_integers; + std::vector host_bitonic_sort_integers = bitonic_sort_integers; + int* cuda_bitonic_sort_integers_pointer = nullptr; + InitCUDAMemoryFromHostMemoryOuter(&cuda_bitonic_sort_integers_pointer, cuda_bitonic_sort_integers.data(), smaller_test_num_data, __FILE__, __LINE__); + auto start_1024 = std::chrono::steady_clock::now(); + BitonicSortGlobal(cuda_bitonic_sort_integers_pointer, smaller_test_num_data); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + auto end_1024 = std::chrono::steady_clock::now(); + auto duration_1024 = static_cast>(end_1024 - start_1024); + Log::Warning("bitonic sort 1024 time = %f", duration_1024.count()); + CopyFromCUDADeviceToHostOuter(cuda_bitonic_sort_integers.data(), cuda_bitonic_sort_integers_pointer, smaller_test_num_data, __FILE__, __LINE__); + start_1024 = std::chrono::steady_clock::now(); + std::sort(host_bitonic_sort_integers.begin(), host_bitonic_sort_integers.end()); + end_1024 = std::chrono::steady_clock::now(); + duration_1024 = static_cast>(end_1024 - start_1024); + Log::Warning("host sort 1024 time = %f", duration_1024.count()); + for (int i = 0; i < smaller_test_num_data; ++i) { + if (host_bitonic_sort_integers[i] != cuda_bitonic_sort_integers[i]) { + Log::Warning("error index %d host_bitonic_sort_integers = %d, cuda_bitonic_sort_integers = %d", i, host_bitonic_sort_integers[i], cuda_bitonic_sort_integers[i]); + } + } + + std::vector cuda_rand_integers = rand_integers; + std::vector host_rand_integers = rand_integers; + int* cuda_data = nullptr; + InitCUDAMemoryFromHostMemoryOuter(&cuda_data, rand_integers.data(), rand_integers.size(), __FILE__, __LINE__); + auto start = std::chrono::steady_clock::now(); + BitonicSortGlobal(cuda_data, static_cast(test_num_data)); + auto end = std::chrono::steady_clock::now(); + auto duration = static_cast>(end - start); + Log::Warning("cuda sort time = %f", duration.count()); + CopyFromCUDADeviceToHostOuter(cuda_rand_integers.data(), cuda_data, static_cast(test_num_data), __FILE__, __LINE__); + start = std::chrono::steady_clock::now(); + std::sort(host_rand_integers.begin(), host_rand_integers.end()); + end = std::chrono::steady_clock::now(); + duration = static_cast>(end - start); + Log::Warning("cpu sort time = %f", duration.count()); + std::vector parallel_rand_integers = rand_integers; + start = std::chrono::steady_clock::now(); + Common::ParallelSort(parallel_rand_integers.begin(), parallel_rand_integers.end(), [](int a, int b) { return a < b; }); + end = std::chrono::steady_clock::now(); + duration = static_cast>(end - start); + Log::Warning("parallel sort time = %f", duration.count()); + for (int i = 0; i < 100; ++i) { + Log::Warning("after sort cuda_rand_integers[%d] = %d", i, cuda_rand_integers[i]); + } + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int i = 0; i < test_num_data; ++i) { + if (cuda_rand_integers[i] != host_rand_integers[i]) { + Log::Warning("index %d cuda_rand_integers = %d, host_rand_integers = %d", i, cuda_rand_integers[i], host_rand_integers[i]); + } + CHECK_EQ(cuda_rand_integers[i], host_rand_integers[i]); + } + Log::Warning("cuda argsort test pass"); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/objective/cuda/cuda_rank_objective.hpp b/src/objective/cuda/cuda_rank_objective.hpp index a36977adae3a..dc2542dae4d8 100644 --- a/src/objective/cuda/cuda_rank_objective.hpp +++ b/src/objective/cuda/cuda_rank_objective.hpp @@ -35,6 +35,8 
@@ class CUDALambdarankNDCG : public CUDAObjectiveInterface, public LambdarankNDCG void LaunchCalcInverseMaxDCGKernel(); + void TestCUDAQuickSort() const; + // CUDA memory, held by this object double* cuda_lambdas_; double* cuda_inverse_max_dcgs_; @@ -48,6 +50,22 @@ class CUDALambdarankNDCG : public CUDAObjectiveInterface, public LambdarankNDCG int max_items_in_query_aligned_; }; +class CUDARankXENDCG : public CUDAObjectiveInterface, public RankXENDCG { + public: + explicit CUDARankXENDCG(const Config& config); + + explicit CUDARankXENDCG(const std::vector& strs); + + ~CUDARankXENDCG(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; + + private: + void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; +}; + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/objective/multiclass_objective.hpp b/src/objective/multiclass_objective.hpp index 88aa0ee040e6..c20fc43083cf 100644 --- a/src/objective/multiclass_objective.hpp +++ b/src/objective/multiclass_objective.hpp @@ -127,6 +127,12 @@ class MulticlassSoftmax: public ObjectiveFunction { } } } + for (int i = 0; i < 100; ++i) { + Log::Warning("class 0 data %d gradient %f hessian %f", i, gradients[i], hessians[i]); + } + for (int i = 0; i < 100; ++i) { + Log::Warning("class 1 data %d gradient %f hessian %f", i, gradients[i + num_data_], hessians[i + num_data_]); + } } void ConvertOutput(const double* input, double* output) const override { @@ -165,7 +171,7 @@ class MulticlassSoftmax: public ObjectiveFunction { } } - private: + protected: double factor_; /*! \brief Number of data */ data_size_t num_data_; @@ -266,7 +272,9 @@ class MulticlassOVA: public ObjectiveFunction { return binary_loss_[class_id]->ClassNeedTrain(0); } - private: + protected: + MulticlassOVA() {} + /*! \brief Number of data */ data_size_t num_data_; /*! 
\brief Number of classes */ diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index c03feb9a2b88..c436587c3ad0 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -11,6 +11,7 @@ #include "xentropy_objective.hpp" #include "cuda/cuda_binary_objective.hpp" +#include "cuda/cuda_multiclass_objective.hpp" #include "cuda/cuda_regression_objective.hpp" #include "cuda/cuda_rank_objective.hpp" @@ -24,6 +25,8 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& return new CUDARegressionL2loss(config); } else if (type == std::string("lambdarank")) { return new CUDALambdarankNDCG(config); + } else if (type == std::string("multiclass")) { + return new CUDAMulticlassSoftmax(config); } } else { if (type == std::string("regression")) { diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index c088a0ff2147..f11c76127009 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -352,13 +352,13 @@ class RankXENDCG : public RankingObjective { } } + const char* GetName() const override { return "rank_xendcg"; } + + protected: double Phi(const label_t l, double g) const { return Common::Pow(2, static_cast(l)) - g; } - const char* GetName() const override { return "rank_xendcg"; } - - private: mutable std::vector rands_; }; From fe6ed744c3023303cb7f3aa50100826ab889ad0d Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 9 Aug 2021 12:26:54 +0000 Subject: [PATCH 052/166] add global argsort for queries with many items in ranking tasks --- include/LightGBM/cuda/cuda_algorithms.hpp | 17 +- src/cuda/cuda_algorithms.cu | 590 ++++++++++++--------- src/objective/cuda/cuda_rank_objective.cpp | 7 - src/objective/cuda/cuda_rank_objective.cu | 67 --- 4 files changed, 337 insertions(+), 344 deletions(-) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 14819d6b0686..57a8fa2746c6 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -101,18 +101,17 @@ __device__ void PrefixSumConflictFree(T* values, size_t n) { PrefixSumInner(values, n, T); } -template -void CUDAQuickSort(T* values, const size_t n); - -template -void CUDAMergeSort(T* values, const size_t n); - -template -__device__ void BitonicArgSort(const VAL_T* values, INDEX_T* indices, const size_t len); - template void BitonicSortGlobal(VAL_T* values, const size_t len); +template +void BitonicArgSortGlobal(const VAL_T* values, INDEX_T* indices, const size_t len); + +void BitonicArgSortItemsGlobal(const double* values, + const int num_queries, + const data_size_t* cuda_query_boundaries, + data_size_t* out_indices); + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index a259aaf14d2c..5aa67c4a68e3 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -7,15 +7,17 @@ namespace LightGBM { -#define QUICKSORT_MAX_DEPTH (12) #define BITONIC_SORT_NUM_ELEMENTS (1024) #define BITONIC_SORT_DEPTH (11) +#define BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE (10) template -__global__ void BitonicSort(T* values, const int low, const int high) { +__global__ void BitonicSortGlobalKernel(T* values, const int num_total_data) { const int thread_index = static_cast(threadIdx.x); + const int low = static_cast(blockIdx.x * BITONIC_SORT_NUM_ELEMENTS); + const bool outer_ascending = ASCENDING ? 
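+  // Each thread block sorts one chunk of BITONIC_SORT_NUM_ELEMENTS values; blocks alternate
+  // direction so neighbouring chunks form bitonic sequences for the global merge passes
+  // launched afterwards.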
(blockIdx.x % 2 == 0) : (blockIdx.x % 2 == 1); T* values_pointer = values + low; - const int num_data = high - low; + const int num_data = min(BITONIC_SORT_NUM_ELEMENTS, num_total_data - low); __shared__ T shared_values[BITONIC_SORT_NUM_ELEMENTS]; if (thread_index < num_data) { shared_values[thread_index] = values_pointer[thread_index]; @@ -24,42 +26,28 @@ __global__ void BitonicSort(T* values, const int low, const int high) { for (int depth = BITONIC_SORT_DEPTH - 1; depth >= 1; --depth) { const int segment_length = 1 << (BITONIC_SORT_DEPTH - depth); const int segment_index = thread_index / segment_length; - const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); - for (int inner_depth = depth; inner_depth < BITONIC_SORT_DEPTH; ++inner_depth) { + const bool ascending = outer_ascending ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + const int num_total_segment = (num_data + segment_length - 1) / segment_length; + { + const int inner_depth = depth; const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); const int inner_segment_index_half = thread_index / inner_segment_length_half; + const int offset = ((inner_segment_index_half >> 1) == num_total_segment - 1 && ascending == ASCENDING) ? + (num_total_segment * segment_length - num_data) : 0; + const int segment_start = segment_index * segment_length; if (inner_segment_index_half % 2 == 0) { - const int index_to_compare = thread_index + inner_segment_length_half; - if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { - const T tmp = shared_values[thread_index]; - shared_values[thread_index] = shared_values[index_to_compare]; - shared_values[index_to_compare] = tmp; + if (thread_index >= offset + segment_start) { + const int index_to_compare = thread_index + inner_segment_length_half - offset; + if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { + const T tmp = shared_values[thread_index]; + shared_values[thread_index] = shared_values[index_to_compare]; + shared_values[index_to_compare] = tmp; + } } } __syncthreads(); } - } - if (thread_index < num_data) { - values_pointer[thread_index] = shared_values[thread_index]; - } -} - -template -__global__ void BitonicSortForMergeSort(T* values, const int num_total_data) { - const int thread_index = static_cast(threadIdx.x); - const int low = static_cast(blockIdx.x * BITONIC_SORT_NUM_ELEMENTS); - T* values_pointer = values + low; - const int num_data = min(BITONIC_SORT_NUM_ELEMENTS, num_total_data - low); - __shared__ T shared_values[BITONIC_SORT_NUM_ELEMENTS]; - if (thread_index < num_data) { - shared_values[thread_index] = values_pointer[thread_index]; - } - __syncthreads(); - for (int depth = BITONIC_SORT_DEPTH - 1; depth >= 1; --depth) { - const int segment_length = 1 << (BITONIC_SORT_DEPTH - depth); - const int segment_index = thread_index / segment_length; - const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); - for (int inner_depth = depth; inner_depth < BITONIC_SORT_DEPTH; ++inner_depth) { + for (int inner_depth = depth + 1; inner_depth < BITONIC_SORT_DEPTH; ++inner_depth) { const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); const int inner_segment_index_half = thread_index / inner_segment_length_half; if (inner_segment_index_half % 2 == 0) { @@ -78,198 +66,116 @@ __global__ void BitonicSortForMergeSort(T* values, const int num_total_data) { } } -template -__global__ void CUDAQuickSortHelper(T* values, const int low, const int high, const int depth) { - if (high - low <= BITONIC_SORT_NUM_ELEMENTS) { - cudaStream_t cuda_stream; - cudaStreamCreateWithFlags(&cuda_stream, cudaStreamNonBlocking); - BitonicSort<<<1, BITONIC_SORT_NUM_ELEMENTS, 0, cuda_stream>>>(values, low, high); - cudaStreamDestroy(cuda_stream); - return; +template +__global__ void BitonicSortMergeKernel(VAL_T* values, const int segment_length, const int len) { + const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const int segment_index = thread_index / segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS]; + const int offset = static_cast(blockIdx.x * blockDim.x); + const int local_len = min(BITONIC_SORT_NUM_ELEMENTS, len - offset); + if (thread_index < len) { + shared_values[threadIdx.x] = values[thread_index]; } - int i = low - 1; - int j = high - 1; - int p = i; - int q = j; - const T pivot = values[high - 1]; - while (i < j) { - if (ASCENDING) { - while (values[++i] < pivot); - } else { - while (values[++i] > pivot); - } - if (ASCENDING) { - while (j > low && values[--j] > pivot); - } else { - while (j > low && values[--j] < pivot); - } - if (i < j) { - const T tmp = values[j]; - values[j] = values[i]; - values[i] = tmp; - if (values[i] == pivot) { - ++p; - const T tmp = values[i]; - values[i] = values[p]; - values[p] = tmp; - } - if (values[j] == pivot) { - --q; - const T tmp = values[j]; - values[j] = values[q]; - values[q] = tmp; + __syncthreads(); + int half_segment_length = BITONIC_SORT_NUM_ELEMENTS / 2; + while (half_segment_length >= 1) { + const int half_segment_index = static_cast(threadIdx.x) / half_segment_length; + if (half_segment_index % 2 == 0) { + const int index_to_compare = static_cast(threadIdx.x) + half_segment_length; + if (index_to_compare < local_len && ((shared_values[threadIdx.x] > shared_values[index_to_compare]) == ascending)) { + const VAL_T tmp = shared_values[index_to_compare]; + shared_values[index_to_compare] = shared_values[threadIdx.x]; + shared_values[threadIdx.x] = tmp; } } + __syncthreads(); + half_segment_length >>= 1; } - values[high - 1] = values[i]; - values[i] = pivot; - j = i - 1; - i = i + 1; - for (int k = low; k <= p; ++k, --j) { - const T tmp = values[k]; - values[k] = values[j]; - values[j] = tmp; - } - for (int k = high - 2; k >= q; --k, ++i) { - const T tmp = values[k]; - values[k] = values[i]; - values[i] = tmp; - } - if (j > low) { - cudaStream_t cuda_stream; - cudaStreamCreateWithFlags(&cuda_stream, cudaStreamNonBlocking); - CUDAQuickSortHelper<<<1, 1>>>(values, low, j + 1, depth + 1); - cudaStreamDestroy(cuda_stream); - } - if (i + 1 < high) { - cudaStream_t cuda_stream; - cudaStreamCreateWithFlags(&cuda_stream, cudaStreamNonBlocking); - CUDAQuickSortHelper<<<1, 1>>>(values, i, high, depth + 1); - cudaStreamDestroy(cuda_stream); + if (thread_index < 
len) { + values[thread_index] = shared_values[threadIdx.x]; } } -template <> -void CUDAQuickSort(int* values, const size_t n) { - CUDAQuickSortHelper<<<1, 1>>>(values, 0, static_cast(n), 0); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -template -__global__ void CUDAMergeKernel(T* values, T* buffer, int block_size, int len) { +template +__global__ void BitonicCompareKernel(VAL_T* values, const int half_segment_length, const int outer_segment_length, const int len) { const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const int low = block_size * 2 * thread_index; - T* values_first_part = values + low; - int num_data_first_part = min(block_size, len - low); - T* values_second_part = values + low + block_size; - int num_data_second_part = min(block_size, len - low - block_size); - T* buffer_pointer = buffer + low; - int first_part_index = 0; - int second_part_index = 0; - int buffer_index = 0; - while (first_part_index < num_data_first_part && second_part_index < num_data_second_part) { - if (ASCENDING) { - if (values_first_part[first_part_index] > values_second_part[second_part_index]) { - buffer_pointer[buffer_index++] = values_second_part[second_part_index++]; - } else { - buffer_pointer[buffer_index++] = values_first_part[first_part_index++]; + const int segment_index = thread_index / outer_segment_length; + const int half_segment_index = thread_index / half_segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + if (half_segment_index % 2 == 0) { + const int num_total_segment = (len + outer_segment_length - 1) / outer_segment_length; + if (BEGIN && (half_segment_index >> 1) == num_total_segment - 1 && ascending == ASCENDING) { + const int offset = num_total_segment * outer_segment_length - len; + const int segment_start = segment_index * outer_segment_length; + if (thread_index >= offset + segment_start) { + const int index_to_compare = thread_index + half_segment_length - offset; + if (index_to_compare < len && (values[thread_index] > values[index_to_compare]) == ascending) { + const VAL_T tmp = values[index_to_compare]; + values[index_to_compare] = values[thread_index]; + values[thread_index] = tmp; + } } } else { - if (values_first_part[first_part_index] < values_second_part[second_part_index]) { - buffer_pointer[buffer_index++] = values_second_part[second_part_index++]; - } else { - buffer_pointer[buffer_index++] = values_first_part[first_part_index++]; - } - } - } - while (first_part_index < num_data_first_part) { - buffer_pointer[buffer_index++] = values_first_part[first_part_index++]; - } - for (int data_index = 0; data_index < buffer_index; ++data_index) { - values_first_part[data_index] = buffer_pointer[data_index]; - } -} - -template <> -void CUDAMergeSort(int* values, const size_t n) { - const int bitonic_num_blocks = (static_cast(n) + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; - auto start = std::chrono::steady_clock::now(); - BitonicSortForMergeSort<<>>(values, n); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - auto end = std::chrono::steady_clock::now(); - auto duration = static_cast>(end - start); - Log::Warning("bitonic sort time = %f", duration.count()); - int num_blocks_to_merge = bitonic_num_blocks; - int* buffer = nullptr; - AllocateCUDAMemoryOuter(&buffer, n, __FILE__, __LINE__); - int block_size = BITONIC_SORT_NUM_ELEMENTS; - start = std::chrono::steady_clock::now(); - while (num_blocks_to_merge > 1) { - num_blocks_to_merge = (num_blocks_to_merge + 1) / 2; 
- const int block_dim = 32; - const int num_kernel_blocks = (num_blocks_to_merge + block_dim - 1) / block_dim; - CUDAMergeKernel<<>>(values, buffer, block_size, static_cast(n)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - block_size <<= 1; - } - end = std::chrono::steady_clock::now(); - duration = static_cast>(end - start); - Log::Warning("merge time = %f", duration.count()); -} - -template -__device__ void BitonicArgSort_1024(const VAL_T* values, INDEX_T* indices, const size_t len) { - const int thread_index = static_cast(threadIdx.x); - for (int depth = 9; depth >= 1; --depth) { - const int segment_length = 1 << (10 - depth); - const int segment_index = thread_index / segment_length; - const bool ascending = ASCENDING ? (segment_index % 2 == 1) : (segment_index % 2 == 0); - for (int inner_depth = depth; inner_depth < 10; ++inner_depth) { - const int inner_segment_length_half = 1 << (9 - inner_depth); - const int inner_segment_index_half = thread_index / inner_segment_length_half; - if (inner_segment_index_half % 2 == 0) { - const int index_to_compare = thread_index + inner_segment_length_half; - const INDEX_T this_index = indices[thread_index]; - const INDEX_T other_index = indices[index_to_compare]; - if ((values[this_index] > values[other_index]) == ascending && (index_to_compare < static_cast(len))) { - indices[thread_index] = other_index; - indices[index_to_compare] = this_index; + const int index_to_compare = thread_index + half_segment_length; + if (index_to_compare < len) { + if ((values[thread_index] > values[index_to_compare]) == ascending) { + const VAL_T tmp = values[index_to_compare]; + values[index_to_compare] = values[thread_index]; + values[thread_index] = tmp; } } - __syncthreads(); } } } - -template -__device__ void BitonicArgSort(const VAL_T* values, INDEX_T* indices, size_t len) { - const int num_segments = (static_cast(len) + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; + +template +void BitonicSortGlobalHelper(VAL_T* values, const size_t len) { int max_depth = 1; - int num_segments_to_move = num_segments - 1; - while (num_segments_to_move > 0) { + int len_to_shift = static_cast(len) - 1; + while (len_to_shift > 0) { ++max_depth; + len_to_shift >>= 1; } - for (int depth = max_depth - 1; depth >= 1; --depth) { - const int segment_length = 1 << (max_depth - depth); - const int num_segments_in_level = 1 << depth; - for (int segment_index = 0; segment_index < num_segments_in_level; ++segment_index) { - const bool ascending = (segment_index % 2 == 0); - const size_t segment_start = segment_index * segment_length * BITONIC_SORT_NUM_ELEMENTS; - //const size_t segment_end = min(segment_start + ) + const int num_blocks = (static_cast(len) + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; + BitonicSortGlobalKernel<<>>(values, static_cast(len)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + for (int depth = max_depth - 11; depth >= 1; --depth) { + const int segment_length = (1 << (max_depth - depth)); + int half_segment_length = (segment_length >> 1); + { + BitonicCompareKernel<<>>(values, half_segment_length, segment_length, static_cast(len)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + half_segment_length >>= 1; } + for (int inner_depth = depth + 1; inner_depth <= max_depth - 11; ++inner_depth) { + BitonicCompareKernel<<>>(values, half_segment_length, segment_length, static_cast(len)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + half_segment_length >>= 1; + } + BitonicSortMergeKernel<<>>(values, segment_length, 
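// Illustrative sketch (not part of the patch): BitonicSortGlobalHelper above first block-sorts
// 1024-element tiles, then walks depths from max_depth - 11 down to 1, where max_depth is chosen
// so that 1 << (max_depth - 1) is the smallest power of two >= len. For each depth it launches
// BitonicCompareKernel for every compare distance that crosses a 1024-element block and lets
// BitonicSortMergeKernel finish the shorter distances in shared memory. The constants 1024 and 11
// assume BITONIC_SORT_NUM_ELEMENTS == 1024 (i.e. BITONIC_SORT_DEPTH == 11).
#include <cstdio>

void PrintBitonicSchedule(int len) {
  int max_depth = 1;
  int len_to_shift = len - 1;
  while (len_to_shift > 0) {  // after the loop, max_depth - 1 == ceil(log2(len))
    ++max_depth;
    len_to_shift >>= 1;
  }
  for (int depth = max_depth - 11; depth >= 1; --depth) {
    const int segment_length = 1 << (max_depth - depth);
    for (int half = segment_length >> 1; half >= 1024; half >>= 1) {
      std::printf("global compare pass: segment=%d, distance=%d\n", segment_length, half);
    }
    std::printf("in-block merge: segment=%d, distances 512..1\n", segment_length);
  }
}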
static_cast(len)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } } -template -__global__ void BitonicSortGlobalKernel(T* values, const int num_total_data) { +template <> +void BitonicSortGlobal(int* values, const size_t len) { + BitonicSortGlobalHelper(values, len); +} + +template +__global__ void BitonicArgSortGlobalKernel(const VAL_T* values, INDEX_T* indices, const int num_total_data) { const int thread_index = static_cast(threadIdx.x); const int low = static_cast(blockIdx.x * BITONIC_SORT_NUM_ELEMENTS); const bool outer_ascending = ASCENDING ? (blockIdx.x % 2 == 0) : (blockIdx.x % 2 == 1); - T* values_pointer = values + low; + const VAL_T* values_pointer = values + low; + INDEX_T* indices_pointer = indices + low; const int num_data = min(BITONIC_SORT_NUM_ELEMENTS, num_total_data - low); - __shared__ T shared_values[BITONIC_SORT_NUM_ELEMENTS]; + __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS]; + __shared__ INDEX_T shared_indices[BITONIC_SORT_NUM_ELEMENTS]; if (thread_index < num_data) { shared_values[thread_index] = values_pointer[thread_index]; + shared_indices[thread_index] = indices_pointer[thread_index]; } __syncthreads(); for (int depth = BITONIC_SORT_DEPTH - 1; depth >= 1; --depth) { @@ -287,10 +193,11 @@ __global__ void BitonicSortGlobalKernel(T* values, const int num_total_data) { if (inner_segment_index_half % 2 == 0) { if (thread_index >= offset + segment_start) { const int index_to_compare = thread_index + inner_segment_length_half - offset; - if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { - const T tmp = shared_values[thread_index]; - shared_values[thread_index] = shared_values[index_to_compare]; - shared_values[index_to_compare] = tmp; + const INDEX_T this_index = shared_indices[thread_index]; + const INDEX_T other_index = shared_indices[index_to_compare]; + if (index_to_compare < num_data && (shared_values[this_index] > shared_values[other_index]) == ascending) { + shared_indices[thread_index] = other_index; + shared_indices[index_to_compare] = this_index; } } } @@ -301,30 +208,33 @@ __global__ void BitonicSortGlobalKernel(T* values, const int num_total_data) { const int inner_segment_index_half = thread_index / inner_segment_length_half; if (inner_segment_index_half % 2 == 0) { const int index_to_compare = thread_index + inner_segment_length_half; - if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { - const T tmp = shared_values[thread_index]; - shared_values[thread_index] = shared_values[index_to_compare]; - shared_values[index_to_compare] = tmp; + const INDEX_T this_index = shared_indices[thread_index]; + const INDEX_T other_index = shared_indices[thread_index]; + if (index_to_compare < num_data && (shared_values[this_index] > shared_values[other_index]) == ascending) { + shared_indices[thread_index] = other_index; + shared_indices[index_to_compare] = this_index; } } __syncthreads(); } } if (thread_index < num_data) { - values_pointer[thread_index] = shared_values[thread_index]; + indices_pointer[thread_index] = shared_indices[thread_index]; } } -template -__global__ void BitonicSortMergeKernel(VAL_T* values, const int segment_length, const int len) { +template +__global__ void BitonicArgSortMergeKernel(const VAL_T* values, INDEX_T* indices, const int segment_length, const int len) { const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); const int segment_index = thread_index / segment_length; const 
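// Illustrative sketch (not part of the patch): the argsort kernels above never move the values;
// they permute an index array so that values[indices[k]] ends up ordered. A CPU reference for the
// intended result (modulo the ordering of equal values, since a bitonic sort is not stable):
#include <algorithm>
#include <numeric>
#include <vector>

template <typename VAL_T, typename INDEX_T, bool ASCENDING>
std::vector<INDEX_T> ArgSortCPU(const VAL_T* values, INDEX_T len) {
  std::vector<INDEX_T> indices(len);
  std::iota(indices.begin(), indices.end(), static_cast<INDEX_T>(0));
  std::sort(indices.begin(), indices.end(), [values](INDEX_T a, INDEX_T b) {
    return ASCENDING ? values[a] < values[b] : values[a] > values[b];
  });
  return indices;
}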
bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS]; + __shared__ INDEX_T shared_indices[BITONIC_SORT_NUM_ELEMENTS]; const int offset = static_cast(blockIdx.x * blockDim.x); const int local_len = min(BITONIC_SORT_NUM_ELEMENTS, len - offset); if (thread_index < len) { shared_values[threadIdx.x] = values[thread_index]; + shared_indices[threadIdx.x] = indices[thread_index]; } __syncthreads(); int half_segment_length = BITONIC_SORT_NUM_ELEMENTS / 2; @@ -332,22 +242,23 @@ __global__ void BitonicSortMergeKernel(VAL_T* values, const int segment_length, const int half_segment_index = static_cast(threadIdx.x) / half_segment_length; if (half_segment_index % 2 == 0) { const int index_to_compare = static_cast(threadIdx.x) + half_segment_length; - if (index_to_compare < local_len && ((shared_values[threadIdx.x] > shared_values[index_to_compare]) == ascending)) { - const VAL_T tmp = shared_values[index_to_compare]; - shared_values[index_to_compare] = shared_values[threadIdx.x]; - shared_values[threadIdx.x] = tmp; + const INDEX_T this_index = shared_indices[thread_index]; + const INDEX_T other_index = shared_indices[index_to_compare]; + if (index_to_compare < local_len && ((shared_values[this_index] > shared_values[other_index]) == ascending)) { + shared_indices[thread_index] = other_index; + shared_indices[index_to_compare] = this_index; } } __syncthreads(); half_segment_length >>= 1; } if (thread_index < len) { - values[thread_index] = shared_values[threadIdx.x]; + indices[thread_index] = shared_indices[threadIdx.x]; } } -template -__global__ void BitonicCompareKernel(VAL_T* values, const int half_segment_length, const int outer_segment_length, const int len) { +template +__global__ void BitonicArgCompareKernel(const VAL_T* values, INDEX_T* indices, const int half_segment_length, const int outer_segment_length, const int len) { const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); const int segment_index = thread_index / outer_segment_length; const int half_segment_index = thread_index / half_segment_length; @@ -359,88 +270,245 @@ __global__ void BitonicCompareKernel(VAL_T* values, const int half_segment_lengt const int segment_start = segment_index * outer_segment_length; if (thread_index >= offset + segment_start) { const int index_to_compare = thread_index + half_segment_length - offset; - if (index_to_compare < len && (values[thread_index] > values[index_to_compare]) == ascending) { - const VAL_T tmp = values[index_to_compare]; - values[index_to_compare] = values[thread_index]; - values[thread_index] = tmp; + const INDEX_T this_index = indices[thread_index]; + const INDEX_T other_index = indices[index_to_compare]; + if (index_to_compare < len && (values[this_index] > values[other_index]) == ascending) { + indices[thread_index] = other_index; + indices[index_to_compare] = this_index; } } } else { const int index_to_compare = thread_index + half_segment_length; + const INDEX_T this_index = indices[thread_index]; + const INDEX_T other_index = indices[index_to_compare]; if (index_to_compare < len) { - if ((values[thread_index] > values[index_to_compare]) == ascending) { - const VAL_T tmp = values[index_to_compare]; - values[index_to_compare] = values[thread_index]; - values[thread_index] = tmp; + if ((values[this_index] > values[other_index]) == ascending) { + indices[thread_index] = other_index; + indices[index_to_compare] = this_index; } } } } } -template -void BitonicSortGlobalHelper(VAL_T* 
values, const size_t len) { +template +void BitonicArgSortGlobalHelper(const VAL_T* values, INDEX_T* indices, const size_t len) { int max_depth = 1; int len_to_shift = static_cast(len) - 1; while (len_to_shift > 0) { ++max_depth; len_to_shift >>= 1; } - /*std::vector tmp_result(len); - CopyFromCUDADeviceToHostOuter(tmp_result.data(), values, len, __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - Log::Warning("=============================== before sorting ==============================="); - for (size_t i = 0; i < len; ++i) { - Log::Warning("tmp_result[%d] = %d", i, tmp_result[i]); - }*/ const int num_blocks = (static_cast(len) + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; - BitonicSortGlobalKernel<<>>(values, static_cast(len)); + BitonicArgSortGlobalKernel<<>>(values, indices, static_cast(len)); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - /*CopyFromCUDADeviceToHostOuter(tmp_result.data(), values, len, __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - Log::Warning("=============================== after block sort stage ==============================="); - for (size_t i = 0; i < len; ++i) { - Log::Warning("tmp_result[%d] = %d", i, tmp_result[i]); - }*/ for (int depth = max_depth - 11; depth >= 1; --depth) { const int segment_length = (1 << (max_depth - depth)); int half_segment_length = (segment_length >> 1); { - BitonicCompareKernel<<>>(values, half_segment_length, segment_length, static_cast(len)); + BitonicArgCompareKernel<<>>( + values, indices, half_segment_length, segment_length, static_cast(len)); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - /*CopyFromCUDADeviceToHostOuter(tmp_result.data(), values, len, __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - Log::Warning("=============================== after compare stage depth %d inner depth = %d ===============================", depth, depth); - for (size_t i = 0; i < len; ++i) { - Log::Warning("tmp_result[%d] = %d", i, tmp_result[i]); - }*/ half_segment_length >>= 1; } for (int inner_depth = depth + 1; inner_depth <= max_depth - 11; ++inner_depth) { - BitonicCompareKernel<<>>(values, half_segment_length, segment_length, static_cast(len)); + BitonicArgCompareKernel<<>>( + values, indices, half_segment_length, segment_length, static_cast(len)); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - /*CopyFromCUDADeviceToHostOuter(tmp_result.data(), values, len, __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - Log::Warning("=============================== after compare stage depth %d inner depth = %d ===============================", depth, inner_depth); - for (size_t i = 0; i < len; ++i) { - Log::Warning("tmp_result[%d] = %d", i, tmp_result[i]); - }*/ half_segment_length >>= 1; } - BitonicSortMergeKernel<<>>(values, segment_length, static_cast(len)); + BitonicArgSortMergeKernel<<>>( + values, indices, segment_length, static_cast(len)); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - /*CopyFromCUDADeviceToHostOuter(tmp_result.data(), values, len, __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - Log::Warning("=============================== after merge stage depth %d ===============================", depth); - for (size_t i = 0; i < len; ++i) { - Log::Warning("tmp_result[%d] = %d", i, tmp_result[i]); - }*/ } } template <> -void BitonicSortGlobal(int* values, const size_t len) { - BitonicSortGlobalHelper(values, len); +void BitonicArgSortGlobal(const double* values, int* indices, const size_t len) { + 
BitonicArgSortGlobalHelper(values, indices, len); +} + +template +__device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, const int len) { + __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS]; + __shared__ INDEX_T shared_indices[BITONIC_SORT_NUM_ELEMENTS]; + int len_to_shift = len - 1; + int max_depth = 1; + while (len_to_shift > 0) { + len_to_shift >>= 1; + ++max_depth; + } + const int num_blocks = (len + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; + for (int block_index = 0; block_index < num_blocks; ++block_index) { + const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS + static_cast(threadIdx.x); + if (this_index < len) { + shared_values[threadIdx.x] = values[this_index]; + shared_indices[threadIdx.x] = this_index; + } else { + shared_indices[threadIdx.x] = len; + } + __syncthreads(); + const int num_data_in_block = min(BITONIC_SORT_NUM_ELEMENTS, len - block_index * BITONIC_SORT_NUM_ELEMENTS); + for (int depth = max_depth - 1; depth > max_depth - 11; --depth) { + const int segment_length = (1 << (max_depth - depth)); + const int segment_index = this_index / segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + { + const int half_segment_length = (segment_length >> 1); + const int half_segment_index = this_index / half_segment_length; + const int num_total_segment = (num_data_in_block + segment_length - 1) / segment_length; + const int offset = (segment_index == num_total_segment - 1 && ascending == ASCENDING) ? + (num_total_segment * segment_length - num_data_in_block) : 0; + if (half_segment_index % 2 == 0) { + const int segment_start = segment_index * segment_length; + if (static_cast(threadIdx.x) >= offset + segment_start) { + const int other_index = static_cast(threadIdx.x) + half_segment_length - offset; + const INDEX_T this_data_index = shared_indices[threadIdx.x]; + const INDEX_T other_data_index = shared_indices[other_index]; + const VAL_T this_value = shared_values[threadIdx.x]; + const VAL_T other_value = shared_values[other_index]; + if (other_data_index < len && (this_value > other_value) == ascending) { + shared_indices[threadIdx.x] = other_data_index; + shared_indices[other_index] = this_data_index; + shared_values[threadIdx.x] = other_value; + shared_values[other_index] = this_value; + } + } + } + __syncthreads(); + } + for (int inner_depth = depth + 1; inner_depth < max_depth; ++inner_depth) { + const int half_segment_length = (1 << (max_depth - inner_depth - 1)); + const int half_segment_index = this_index / half_segment_length; + if (half_segment_index % 2 == 0) { + const int other_index = static_cast(threadIdx.x) + half_segment_length; + const INDEX_T this_data_index = shared_indices[threadIdx.x]; + const INDEX_T other_data_index = shared_indices[other_index]; + const VAL_T this_value = shared_values[threadIdx.x]; + const VAL_T other_value = shared_values[other_index]; + if (other_data_index < len && (this_value > other_value) == ascending) { + shared_indices[threadIdx.x] = other_data_index; + shared_indices[other_index] = this_data_index; + shared_values[threadIdx.x] = other_value; + shared_values[other_index] = this_value; + } + } + __syncthreads(); + } + } + if (this_index < len) { + indices[this_index] = shared_indices[threadIdx.x]; + } + __syncthreads(); + } + for (int depth = max_depth - 11; depth >= 1; --depth) { + const int segment_length = (1 << (max_depth - depth)); + { + const int num_total_segment = (len + segment_length - 1) / segment_length; + 
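// Illustrative sketch (not part of the patch) of calling the BitonicArgSortGlobal specialization
// defined above from the host. It uses plain CUDA runtime calls instead of LightGBM's memory
// wrappers, and because the diff elides the template argument lists, the call below assumes the
// double/int specialization is selected by the argument types.
#include <cuda_runtime.h>
#include <numeric>
#include <vector>

void ArgSortOnDevice(const std::vector<double>& host_values, std::vector<int>* sorted_indices) {
  const size_t n = host_values.size();
  double* cuda_values = nullptr;
  int* cuda_indices = nullptr;
  cudaMalloc(&cuda_values, n * sizeof(double));
  cudaMalloc(&cuda_indices, n * sizeof(int));
  std::vector<int> identity(n);
  std::iota(identity.begin(), identity.end(), 0);  // the kernel reads the input indices, so start from 0..n-1
  cudaMemcpy(cuda_values, host_values.data(), n * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(cuda_indices, identity.data(), n * sizeof(int), cudaMemcpyHostToDevice);
  BitonicArgSortGlobal(cuda_values, cuda_indices, n);  // sorts the index array by value on the GPU
  sorted_indices->resize(n);
  cudaMemcpy(sorted_indices->data(), cuda_indices, n * sizeof(int), cudaMemcpyDeviceToHost);
  cudaFree(cuda_values);
  cudaFree(cuda_indices);
}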
const int half_segment_length = (segment_length >> 1); + for (int block_index = 0; block_index < num_blocks; ++block_index) { + const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS + static_cast(threadIdx.x); + const int segment_index = this_index / segment_length; + const int half_segment_index = this_index / half_segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + const int offset = ((half_segment_index >> 1) == num_total_segment - 1 && ascending == ASCENDING) ? + (num_total_segment * segment_length - len) : 0; + if (half_segment_index % 2 == 0) { + const int segment_start = segment_index * segment_length; + if (this_index >= offset + segment_start) { + const int other_index = this_index + half_segment_length - offset; + const INDEX_T this_data_index = indices[this_index]; + const INDEX_T other_data_index = indices[other_index]; + const VAL_T this_value = values[this_index]; + const VAL_T other_value = values[other_index]; + if ((this_value > other_value) == ascending) { + indices[this_index] = other_data_index; + indices[other_index] = this_data_index; + } + } + } + } + } + for (int inner_depth = depth + 1; inner_depth < 11; ++inner_depth) { + const int half_segment_length = (1 << (max_depth - inner_depth - 1)); + for (int block_index = 0; block_index < num_blocks; ++block_index) { + const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS + static_cast(threadIdx.x); + const int segment_index = this_index / segment_length; + const int half_segment_index = this_index / half_segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + if (half_segment_index % 2 == 0) { + const int other_index = this_index + half_segment_length; + if (other_index < len) { + const INDEX_T this_data_index = indices[this_index]; + const INDEX_T other_data_index = indices[other_index]; + const VAL_T this_value = values[this_data_index]; + const VAL_T other_value = values[other_data_index]; + if ((this_value > other_value) == ascending) { + indices[this_index] = other_data_index; + indices[other_index] = this_data_index; + } + } + } + __syncthreads(); + } + } + for (int block_index = 0; block_index < num_blocks; ++block_index) { + const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS + static_cast(threadIdx.x); + const int segment_index = this_index / segment_length; + const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); + if (this_index < len) { + const INDEX_T index = indices[this_index]; + shared_values[threadIdx.x] = values[index]; + shared_indices[threadIdx.x] = index; + } else { + shared_indices[threadIdx.x] = len; + } + __syncthreads(); + for (int inner_depth = 11; inner_depth < max_depth; ++inner_depth) { + const int half_segment_length = (1 << (max_depth - inner_depth - 1)); + const int half_segment_index = this_index / half_segment_length; + if (half_segment_index % 2 == 0) { + const int other_index = static_cast(threadIdx.x) + half_segment_length; + const INDEX_T this_data_index = shared_indices[threadIdx.x]; + const INDEX_T other_data_index = shared_indices[other_index]; + const VAL_T this_value = shared_values[threadIdx.x]; + const VAL_T other_value = shared_values[other_index]; + if (other_data_index < len && (this_value > other_value) == ascending) { + shared_indices[threadIdx.x] = other_data_index; + shared_indices[other_index] = this_data_index; + shared_values[threadIdx.x] = other_value; + shared_values[other_index] = this_value; + } + } + __syncthreads(); + } + } + } +} + +__global__ void BitonicArgSortItemsGlobalKernel(const double* scores, + const int num_queries, + const data_size_t* cuda_query_boundaries, + data_size_t* out_indices) { + for (int query_index = 0; query_index < num_queries; query_index += BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE) { + const data_size_t query_item_start = cuda_query_boundaries[query_index]; + const data_size_t query_item_end = cuda_query_boundaries[query_index]; + const data_size_t num_items_in_query = query_item_end - query_item_start; + BitonicArgSortDevice(scores + query_item_start, + out_indices + query_item_start, + num_items_in_query); + } +} + +void BitonicArgSortItemsGlobal( + const double* scores, + const int num_queries, + const data_size_t* cuda_query_boundaries, + data_size_t* out_indices) { + const int num_blocks = (num_queries + BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE - 1) / BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE; + BitonicArgSortItemsGlobalKernel<<>>( + scores, num_queries, cuda_query_boundaries, out_indices); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } } // namespace LightGBM diff --git a/src/objective/cuda/cuda_rank_objective.cpp b/src/objective/cuda/cuda_rank_objective.cpp index 9bc97ffb7545..97d0c95aae5e 100644 --- a/src/objective/cuda/cuda_rank_objective.cpp +++ b/src/objective/cuda/cuda_rank_objective.cpp @@ -52,13 +52,6 @@ void CUDALambdarankNDCG::Init(const Metadata& metadata, data_size_t num_data) { void CUDALambdarankNDCG::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { LaunchGetGradientsKernel(score, gradients, hessians); - std::vector host_gradients(100, 0.0f); - std::vector host_hessians(100, 0.0f); - CopyFromCUDADeviceToHostOuter(host_gradients.data(), gradients, 100, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(host_hessians.data(), hessians, 100, __FILE__, __LINE__); - for (int i = 0; i < 100; ++i) { - Log::Warning("host_gradient[%d] = %f, host_hessians[%d] = %f", i, host_gradients[i], host_hessians[i]); - } } CUDARankXENDCG::CUDARankXENDCG(const Config& config): RankXENDCG(config) {} diff --git a/src/objective/cuda/cuda_rank_objective.cu b/src/objective/cuda/cuda_rank_objective.cu index 2cfeb218568d..36eb4cc4cfeb 100644 --- a/src/objective/cuda/cuda_rank_objective.cu +++ b/src/objective/cuda/cuda_rank_objective.cu @@ -17,73 +17,6 @@ namespace LightGBM { -template -__global__ void BitonicSortForMergeSortLocal(T* values, const int 
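// Illustrative sketch (not part of the patch): BitonicArgSortItemsGlobal above argsorts the items
// of every query independently, using the query boundary offsets to delimit the queries; the
// indices written for a query are relative to that query's start. data_size_t is assumed to come
// from LightGBM/meta.h, and the buffer names here are made up for the example.
#include <cuda_runtime.h>
#include <LightGBM/meta.h>  // assumed location of LightGBM's data_size_t typedef

void SortItemsPerQueryExample(const double* cuda_scores,                 // device: one score per item
                              const data_size_t* cuda_query_boundaries,  // device: num_queries + 1 offsets
                              int num_queries,
                              data_size_t total_num_items,
                              data_size_t** cuda_sorted_item_indices) {
  // one output slot per item; slice [boundaries[q], boundaries[q + 1]) holds query q's ordering,
  // written relative to the query start
  cudaMalloc(cuda_sorted_item_indices, total_num_items * sizeof(data_size_t));
  BitonicArgSortItemsGlobal(cuda_scores, num_queries, cuda_query_boundaries, *cuda_sorted_item_indices);
}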
num_total_data) { - const int thread_index = static_cast(threadIdx.x); - const int low = static_cast(blockIdx.x * BITONIC_SORT_NUM_ELEMENTS_LOCAL); - T* values_pointer = values + low; - const int num_data = min(BITONIC_SORT_NUM_ELEMENTS_LOCAL, num_total_data - low); - __shared__ T shared_values[BITONIC_SORT_NUM_ELEMENTS_LOCAL]; - if (thread_index < num_data) { - shared_values[thread_index] = values_pointer[thread_index]; - } - __syncthreads(); - for (int depth = BITONIC_SORT_DEPTH_LOCAL - 1; depth >= 1; --depth) { - const int segment_length = 1 << (BITONIC_SORT_DEPTH_LOCAL - depth); - const int segment_index = thread_index / segment_length; - const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); - for (int inner_depth = depth; inner_depth < BITONIC_SORT_DEPTH_LOCAL; ++inner_depth) { - const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH_LOCAL - 1 - inner_depth); - const int inner_segment_index_half = thread_index / inner_segment_length_half; - if (inner_segment_index_half % 2 == 0) { - const int index_to_compare = thread_index + inner_segment_length_half; - if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { - const T tmp = shared_values[thread_index]; - shared_values[thread_index] = shared_values[index_to_compare]; - shared_values[index_to_compare] = tmp; - } - } - __syncthreads(); - } - } - if (thread_index < num_data) { - values_pointer[thread_index] = shared_values[thread_index]; - } -} - -template -__global__ void BitonicSortLocal(T* values, const int low, const int high) { - const int thread_index = static_cast(threadIdx.x); - T* values_pointer = values + low; - const int num_data = high - low; - __shared__ T shared_values[1024]; - if (thread_index < num_data) { - shared_values[thread_index] = values_pointer[thread_index]; - } - __syncthreads(); - for (int depth = 10; depth >= 1; --depth) { - const int segment_length = 1 << (11 - depth); - const int segment_index = thread_index / segment_length; - const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); - for (int inner_depth = depth; inner_depth < 11; ++inner_depth) { - const int inner_segment_length_half = 1 << (10 - inner_depth); - const int inner_segment_index_half = thread_index / inner_segment_length_half; - if (inner_segment_index_half % 2 == 0) { - const int index_to_compare = thread_index + inner_segment_length_half; - if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { - const T tmp = shared_values[thread_index]; - shared_values[thread_index] = shared_values[index_to_compare]; - shared_values[index_to_compare] = tmp; - } - } - __syncthreads(); - } - } - if (thread_index < num_data) { - values_pointer[thread_index] = shared_values[thread_index]; - } -} - __device__ __forceinline__ void ArgSort(const score_t* scores, uint16_t* indices, const uint16_t num_items) { uint16_t num_items_aligned = 1; uint16_t num_items_ref = num_items - 1; From 7808455dc5ae5d0642dd0ff45543ab1e4de5ccd6 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 11 Aug 2021 07:15:21 +0000 Subject: [PATCH 053/166] remove limitation of maximum number of items per query in ranking --- include/LightGBM/cuda/cuda_algorithms.hpp | 151 +++++- src/cuda/cuda_algorithms.cu | 43 +- src/objective/cuda/cuda_rank_objective.cpp | 8 +- src/objective/cuda/cuda_rank_objective.cu | 495 ++++++++---------- src/objective/cuda/cuda_rank_objective.hpp | 4 +- .../cuda/cuda_histogram_constructor.hpp | 2 +- 6 files changed, 393 insertions(+), 310 deletions(-) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 57a8fa2746c6..84be24556f63 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -94,11 +94,77 @@ template __device__ void ReduceMax(T* values, size_t n); template -__device__ void PrefixSum(T* values, size_t n); +__device__ void PrefixSum(T* values, size_t n) { + unsigned int offset = 1; + unsigned int threadIdx_x = static_cast(threadIdx.x); + const T last_element = values[n - 1]; + __syncthreads(); + for (int d = (n >> 1); d > 0; d >>= 1) { + if (threadIdx_x < d) { + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + values[dst_pos] += values[src_pos]; + } + offset <<= 1; + __syncthreads(); + } + if (threadIdx_x == 0) { + values[n - 1] = 0; + } + __syncthreads(); + for (int d = 1; d < n; d <<= 1) { + offset >>= 1; + if (threadIdx_x < d) { + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const T src_val = values[src_pos]; + values[src_pos] = values[dst_pos]; + values[dst_pos] += src_val; + } + __syncthreads(); + } + if (threadIdx.x == 0) { + values[n] = values[n - 1] + last_element; + } + __syncthreads(); +} template -__device__ void PrefixSumConflictFree(T* values, size_t n) { - PrefixSumInner(values, n, T); +__device__ __forceinline__ void PrefixSumConflictFree(T* values, size_t n) { + size_t offset = 1; + unsigned int threadIdx_x = threadIdx.x; + const size_t conflict_free_n_minus_1 = CONFLICT_FREE_INDEX(n - 1); + const T last_element = values[conflict_free_n_minus_1]; + __syncthreads(); + for (int d = (n >> 1); d > 0; d >>= 1) { + if (threadIdx_x < d) { + const size_t src_pos = offset * (2 * threadIdx_x + 1) - 1; + const size_t dst_pos = offset * (2 * threadIdx_x + 2) - 1; + values[CONFLICT_FREE_INDEX(dst_pos)] += 
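// Illustrative sketch (not part of the patch): the PrefixSum device function above is a
// work-efficient (up-sweep / down-sweep) exclusive scan executed by one thread block, and
// PrefixSumConflictFree is the same computation with shared-memory indices remapped through
// CONFLICT_FREE_INDEX to avoid bank conflicts. A sequential CPU reference for the result,
// assuming n is a power of two and the buffer has room for n + 1 elements:
#include <cstddef>

template <typename T>
void ExclusivePrefixSumCPU(T* values, size_t n) {
  T running = 0;
  for (size_t i = 0; i < n; ++i) {
    const T current = values[i];
    values[i] = running;  // element i receives the sum of elements 0..i-1
    running += current;
  }
  values[n] = running;    // grand total, matching values[n] = values[n - 1] + last_element above
}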
values[CONFLICT_FREE_INDEX(src_pos)]; + } + offset <<= 1; + __syncthreads(); + } + if (threadIdx_x == 0) { + values[conflict_free_n_minus_1] = 0; + } + __syncthreads(); + for (int d = 1; d < n; d <<= 1) { + offset >>= 1; + if (threadIdx_x < d) { + const size_t dst_pos = offset * (2 * threadIdx_x + 2) - 1; + const size_t src_pos = offset * (2 * threadIdx_x + 1) - 1; + const size_t conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); + const size_t conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); + const T src_val = values[conflict_free_src_pos]; + values[conflict_free_src_pos] = values[conflict_free_dst_pos]; + values[conflict_free_dst_pos] += src_val; + } + __syncthreads(); + } + if (threadIdx_x == 0) { + values[CONFLICT_FREE_INDEX(n)] = values[conflict_free_n_minus_1] + last_element; + } } template @@ -112,6 +178,85 @@ void BitonicArgSortItemsGlobal(const double* values, const data_size_t* cuda_query_boundaries, data_size_t* out_indices); +__device__ __forceinline__ void BitonicArgSort_1024(const score_t* scores, uint16_t* indices, const uint16_t num_items) { + uint16_t depth = 1; + uint16_t num_items_aligend = 1; + uint16_t num_items_ref = num_items - 1; + while (num_items_ref > 0) { + num_items_ref >>= 1; + num_items_aligend <<= 1; + ++depth; + } + for (uint16_t outer_depth = depth - 1; outer_depth >= 1; --outer_depth) { + const uint16_t outer_segment_length = 1 << (depth - outer_depth); + const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; + const bool ascending = (outer_segment_index % 2 > 0); + for (uint16_t inner_depth = outer_depth; inner_depth < depth; ++inner_depth) { + const uint16_t segment_length = 1 << (depth - inner_depth); + const uint16_t half_segment_length = segment_length >> 1; + const uint16_t half_segment_index = threadIdx.x / half_segment_length; + if (threadIdx.x < num_items_aligend) { + if (half_segment_index % 2 == 0) { + const uint16_t index_to_compare = threadIdx.x + half_segment_length; + if ((scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) == ascending) { + const uint16_t index = indices[threadIdx.x]; + indices[threadIdx.x] = indices[index_to_compare]; + indices[index_to_compare] = index; + } + } + } + __syncthreads(); + } + } +} + +__device__ __forceinline__ void BitonicArgSort_2048(const score_t* scores, uint16_t* indices) { + for (uint16_t base = 0; base < 2048; base += 1024) { + for (uint16_t outer_depth = 10; outer_depth >= 1; --outer_depth) { + const uint16_t outer_segment_length = 1 << (11 - outer_depth); + const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; + const bool ascending = (base == 0) ? 
(outer_segment_index % 2 > 0) : (outer_segment_index % 2 == 0); + for (uint16_t inner_depth = outer_depth; inner_depth < 11; ++inner_depth) { + const uint16_t segment_length = 1 << (11 - inner_depth); + const uint16_t half_segment_length = segment_length >> 1; + const uint16_t half_segment_index = threadIdx.x / half_segment_length; + if (half_segment_index % 2 == 0) { + const uint16_t index_to_compare = threadIdx.x + half_segment_length + base; + if ((scores[indices[threadIdx.x + base]] > scores[indices[index_to_compare]]) == ascending) { + const uint16_t index = indices[threadIdx.x + base]; + indices[threadIdx.x + base] = indices[index_to_compare]; + indices[index_to_compare] = index; + } + } + __syncthreads(); + } + } + } + const unsigned int index_to_compare = threadIdx.x + 1024; + if (scores[indices[index_to_compare]] > scores[indices[threadIdx.x]]) { + const uint16_t temp_index = indices[index_to_compare]; + indices[index_to_compare] = indices[threadIdx.x]; + indices[threadIdx.x] = temp_index; + } + __syncthreads(); + for (uint16_t base = 0; base < 2048; base += 1024) { + for (uint16_t inner_depth = 1; inner_depth < 11; ++inner_depth) { + const uint16_t segment_length = 1 << (11 - inner_depth); + const uint16_t half_segment_length = segment_length >> 1; + const uint16_t half_segment_index = threadIdx.x / half_segment_length; + if (half_segment_index % 2 == 0) { + const uint16_t index_to_compare = threadIdx.x + half_segment_length + base; + if (scores[indices[threadIdx.x + base]] < scores[indices[index_to_compare]]) { + const uint16_t index = indices[threadIdx.x + base]; + indices[threadIdx.x + base] = indices[index_to_compare]; + indices[index_to_compare] = index; + } + } + __syncthreads(); + } + } +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 5aa67c4a68e3..5806664b7389 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -348,7 +348,6 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons shared_indices[threadIdx.x] = len; } __syncthreads(); - const int num_data_in_block = min(BITONIC_SORT_NUM_ELEMENTS, len - block_index * BITONIC_SORT_NUM_ELEMENTS); for (int depth = max_depth - 1; depth > max_depth - 11; --depth) { const int segment_length = (1 << (max_depth - depth)); const int segment_index = this_index / segment_length; @@ -356,12 +355,12 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons { const int half_segment_length = (segment_length >> 1); const int half_segment_index = this_index / half_segment_length; - const int num_total_segment = (num_data_in_block + segment_length - 1) / segment_length; + const int num_total_segment = (len + segment_length - 1) / segment_length; const int offset = (segment_index == num_total_segment - 1 && ascending == ASCENDING) ? 
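// Illustrative sketch (not part of the patch): BitonicArgSort_2048 above handles 2048 items with
// 1024 threads by sorting the first half descending and the second half ascending (so the whole
// sequence is bitonic), doing one compare-exchange across the halves, and then running a bitonic
// clean-up inside each half. The device version permutes a shared index array keyed by scores;
// the CPU analogue below sorts plain values to show why that network shape is sufficient:
#include <algorithm>
#include <functional>
#include <utility>
#include <vector>

template <typename T>
void SortTwoTilesDescendingCPU(std::vector<T>* data) {  // data->size() must be 2048
  std::vector<T>& v = *data;
  std::sort(v.begin(), v.begin() + 1024, std::greater<T>());  // first tile descending
  std::sort(v.begin() + 1024, v.end());                       // second tile ascending -> bitonic overall
  for (int i = 0; i < 1024; ++i) {                            // cross-tile compare at distance 1024
    if (v[i + 1024] > v[i]) std::swap(v[i], v[i + 1024]);
  }
  // each tile is now bitonic and no element of the second tile exceeds any element of the first,
  // so a descending bitonic merge per tile finishes the sort
  for (int tile = 0; tile < 2; ++tile) {
    T* base = v.data() + tile * 1024;
    for (int dist = 512; dist >= 1; dist >>= 1) {
      for (int i = 0; i < 1024; ++i) {
        if ((i / dist) % 2 == 0 && base[i] < base[i + dist]) std::swap(base[i], base[i + dist]);
      }
    }
  }
}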
- (num_total_segment * segment_length - num_data_in_block) : 0; + (num_total_segment * segment_length - len) : 0; if (half_segment_index % 2 == 0) { const int segment_start = segment_index * segment_length; - if (static_cast(threadIdx.x) >= offset + segment_start) { + if (this_index >= offset + segment_start) { const int other_index = static_cast(threadIdx.x) + half_segment_length - offset; const INDEX_T this_data_index = shared_indices[threadIdx.x]; const INDEX_T other_data_index = shared_indices[other_index]; @@ -411,25 +410,28 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons const int segment_index = this_index / segment_length; const int half_segment_index = this_index / half_segment_length; const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); - const int offset = ((half_segment_index >> 1) == num_total_segment - 1 && ascending == ASCENDING) ? + const int offset = (segment_index == num_total_segment - 1 && ascending == ASCENDING) ? (num_total_segment * segment_length - len) : 0; if (half_segment_index % 2 == 0) { const int segment_start = segment_index * segment_length; if (this_index >= offset + segment_start) { const int other_index = this_index + half_segment_length - offset; - const INDEX_T this_data_index = indices[this_index]; - const INDEX_T other_data_index = indices[other_index]; - const VAL_T this_value = values[this_index]; - const VAL_T other_value = values[other_index]; - if ((this_value > other_value) == ascending) { - indices[this_index] = other_data_index; - indices[other_index] = this_data_index; + if (other_index < len) { + const INDEX_T this_data_index = indices[this_index]; + const INDEX_T other_data_index = indices[other_index]; + const VAL_T this_value = values[this_data_index]; + const VAL_T other_value = values[other_data_index]; + if ((this_value > other_value) == ascending) { + indices[this_index] = other_data_index; + indices[other_index] = this_data_index; + } } } } } + __syncthreads(); } - for (int inner_depth = depth + 1; inner_depth < 11; ++inner_depth) { + for (int inner_depth = depth + 1; inner_depth <= max_depth - 11; ++inner_depth) { const int half_segment_length = (1 << (max_depth - inner_depth - 1)); for (int block_index = 0; block_index < num_blocks; ++block_index) { const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS + static_cast(threadIdx.x); @@ -464,7 +466,7 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons shared_indices[threadIdx.x] = len; } __syncthreads(); - for (int inner_depth = 11; inner_depth < max_depth; ++inner_depth) { + for (int inner_depth = max_depth - 10; inner_depth < max_depth; ++inner_depth) { const int half_segment_length = (1 << (max_depth - inner_depth - 1)); const int half_segment_index = this_index / half_segment_length; if (half_segment_index % 2 == 0) { @@ -482,6 +484,10 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons } __syncthreads(); } + if (this_index < len) { + indices[this_index] = shared_indices[threadIdx.x]; + } + __syncthreads(); } } } @@ -490,13 +496,16 @@ __global__ void BitonicArgSortItemsGlobalKernel(const double* scores, const int num_queries, const data_size_t* cuda_query_boundaries, data_size_t* out_indices) { - for (int query_index = 0; query_index < num_queries; query_index += BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE) { + const int query_index_start = static_cast(blockIdx.x) * BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE; + const int query_index_end = 
min(query_index_start + BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE, num_queries); + for (int query_index = query_index_start; query_index < query_index_end; ++query_index) { const data_size_t query_item_start = cuda_query_boundaries[query_index]; - const data_size_t query_item_end = cuda_query_boundaries[query_index]; + const data_size_t query_item_end = cuda_query_boundaries[query_index + 1]; const data_size_t num_items_in_query = query_item_end - query_item_start; BitonicArgSortDevice(scores + query_item_start, out_indices + query_item_start, num_items_in_query); + __syncthreads(); } } @@ -506,7 +515,7 @@ void BitonicArgSortItemsGlobal( const data_size_t* cuda_query_boundaries, data_size_t* out_indices) { const int num_blocks = (num_queries + BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE - 1) / BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE; - BitonicArgSortItemsGlobalKernel<<>>( + BitonicArgSortItemsGlobalKernel<<>>( scores, num_queries, cuda_query_boundaries, out_indices); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } diff --git a/src/objective/cuda/cuda_rank_objective.cpp b/src/objective/cuda/cuda_rank_objective.cpp index 97d0c95aae5e..ba4f6c3d69fd 100644 --- a/src/objective/cuda/cuda_rank_objective.cpp +++ b/src/objective/cuda/cuda_rank_objective.cpp @@ -15,7 +15,7 @@ LambdarankNDCG(config) {} void CUDALambdarankNDCG::Init(const Metadata& metadata, data_size_t num_data) { const int num_threads = OMP_NUM_THREADS(); - TestCUDAQuickSort(); + TestCUDABitonicSortForQueryItems(); LambdarankNDCG::Init(metadata, num_data); std::vector thread_max_num_items_in_query(num_threads); @@ -40,8 +40,10 @@ void CUDALambdarankNDCG::Init(const Metadata& metadata, data_size_t num_data) { max_items_in_query >>= 1; max_items_in_query_aligned_ <<= 1; } - if (max_items_in_query_aligned_ > MAX_NUM_ITEM_IN_QUERY) { - Log::Warning("Too many items (%d) in a query.", max_items_in_query_aligned_); + if (max_items_in_query_aligned_ > 2048) { + AllocateCUDAMemoryOuter(&cuda_item_indices_buffer_, + static_cast(metadata.query_boundaries()[metadata.num_queries()]), + __FILE__, __LINE__); } cuda_labels_ = metadata.cuda_metadata()->cuda_label(); cuda_query_boundaries_ = metadata.cuda_metadata()->cuda_query_boundaries(); diff --git a/src/objective/cuda/cuda_rank_objective.cu b/src/objective/cuda/cuda_rank_objective.cu index 36eb4cc4cfeb..1b8724f41d81 100644 --- a/src/objective/cuda/cuda_rank_objective.cu +++ b/src/objective/cuda/cuda_rank_objective.cu @@ -17,127 +17,17 @@ namespace LightGBM { -__device__ __forceinline__ void ArgSort(const score_t* scores, uint16_t* indices, const uint16_t num_items) { - uint16_t num_items_aligned = 1; - uint16_t num_items_ref = num_items - 1; - uint16_t depth = 1; - while (num_items_ref > 0) { - num_items_aligned <<= 1; - num_items_ref >>= 1; - ++depth; - } - for (uint16_t outer_depth = depth - 1; outer_depth >= 1; --outer_depth) { - const uint16_t outer_segment_length = 1 << (depth - outer_depth); - const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; - const bool ascending = (outer_segment_index % 2 > 0); - for (uint16_t inner_depth = outer_depth; inner_depth < depth; ++inner_depth) { - const uint16_t segment_length = 1 << (depth - inner_depth); - const uint16_t half_segment_length = segment_length >> 1; - const uint16_t half_segment_index = threadIdx.x / half_segment_length; - if (threadIdx.x < num_items_aligned) { - if (half_segment_index % 2 == 0) { - const uint16_t index_to_compare = threadIdx.x + half_segment_length; - if ((scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) 
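// Illustrative sketch (not part of the patch): the Init logic above keeps the shared-memory
// kernels as long as the largest query, rounded up to a power of two, fits in 2048 items; beyond
// that it allocates cuda_item_indices_buffer_ with one slot per item (sized by the last query
// boundary). One common rounding convention is shown below; whether Init subtracts one before
// shifting is not visible in the diff, so treat this as an assumption.
int RoundUpToPowerOfTwo(int num_items) {
  int aligned = 1;
  int remaining = num_items - 1;  // smallest power of two >= num_items
  while (remaining > 0) {
    remaining >>= 1;
    aligned <<= 1;
  }
  return aligned;
}
// e.g. a maximum query size of 1500 gives 2048 (still the shared-memory path),
// while 3000 gives 4096 and triggers the global index buffer.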
== ascending) { - const uint16_t index = indices[threadIdx.x]; - indices[threadIdx.x] = indices[index_to_compare]; - indices[index_to_compare] = index; - } - } - } - __syncthreads(); - } - } -} - -__device__ __forceinline__ void ArgSort_Partial(const score_t* scores, uint16_t* indices, const uint16_t num_items, const bool outer_decending) { - uint16_t num_items_aligned = 1; - uint16_t num_items_ref = num_items - 1; - uint16_t depth = 1; - while (num_items_ref > 0) { - num_items_aligned <<= 1; - num_items_ref >>= 1; - ++depth; - } - for (uint16_t outer_depth = depth - 1; outer_depth >= 1; --outer_depth) { - const uint16_t outer_segment_length = 1 << (depth - outer_depth); - const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; - const bool ascending = outer_decending ? (outer_segment_index % 2 > 0) : (outer_segment_index % 2 == 0); - for (uint16_t inner_depth = outer_depth; inner_depth < depth; ++inner_depth) { - const uint16_t segment_length = 1 << (depth - inner_depth); - const uint16_t half_segment_length = segment_length >> 1; - const uint16_t half_segment_index = threadIdx.x / half_segment_length; - if (threadIdx.x < num_items_aligned) { - if (half_segment_index % 2 == 0) { - const uint16_t index_to_compare = threadIdx.x + half_segment_length; - if ((scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) == ascending) { - const uint16_t index = indices[threadIdx.x]; - indices[threadIdx.x] = indices[index_to_compare]; - indices[index_to_compare] = index; - } - } - } - __syncthreads(); - } - } -} - -__device__ __forceinline__ void ArgSort_2048(const score_t* scores, uint16_t* indices, const uint16_t num_items) { - ArgSort_Partial(scores, indices, 1024, true); - ArgSort_Partial(scores + 1024, indices + 1024, 1024, false); - const unsigned int index_to_compare = threadIdx.x + 1024; - if (scores[indices[index_to_compare]] > scores[indices[threadIdx.x]]) { - const uint16_t temp_index = indices[index_to_compare]; - indices[index_to_compare] = indices[threadIdx.x]; - indices[threadIdx.x] = temp_index; - } - __syncthreads(); - for (uint16_t inner_depth = 1; inner_depth < 11; ++inner_depth) { - const uint16_t segment_length = 1 << (11 - inner_depth); - const uint16_t half_segment_length = segment_length >> 1; - const uint16_t half_segment_index = threadIdx.x / half_segment_length; - if (threadIdx.x < 1024) { - if (half_segment_index % 2 == 0) { - const uint16_t index_to_compare = threadIdx.x + half_segment_length; - if (scores[indices[threadIdx.x]] < scores[indices[index_to_compare]]) { - const uint16_t index = indices[threadIdx.x]; - indices[threadIdx.x] = indices[index_to_compare]; - indices[index_to_compare] = index; - } - } - } - __syncthreads(); - } - const score_t* scores_ptr = scores + 1024; - uint16_t* indices_ptr = indices + 1024; - for (uint16_t inner_depth = 1; inner_depth < 11; ++inner_depth) { - const uint16_t segment_length = 1 << (11 - inner_depth); - const uint16_t half_segment_length = segment_length >> 1; - const uint16_t half_segment_index = threadIdx.x / half_segment_length; - if (threadIdx.x < 1024) { - if (half_segment_index % 2 == 0) { - const uint16_t index_to_compare = threadIdx.x + half_segment_length; - if (scores_ptr[indices_ptr[threadIdx.x]] < scores_ptr[indices_ptr[index_to_compare]]) { - const uint16_t index = indices_ptr[threadIdx.x]; - indices_ptr[threadIdx.x] = indices_ptr[index_to_compare]; - indices_ptr[index_to_compare] = index; - } - } - } - __syncthreads(); - } -} - +template __global__ void 
GetGradientsKernel_LambdarankNDCG(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, const data_size_t num_queries, const data_size_t* cuda_query_boundaries, const double* cuda_inverse_max_dcgs, const bool norm, const double sigmoid, const int truncation_level, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - __shared__ score_t shared_scores[MAX_NUM_ITEM_IN_QUERY]; - __shared__ uint16_t shared_indices[MAX_NUM_ITEM_IN_QUERY]; - __shared__ score_t shared_lambdas[MAX_NUM_ITEM_IN_QUERY]; - __shared__ score_t shared_hessians[MAX_NUM_ITEM_IN_QUERY]; + __shared__ score_t shared_scores[MAX_ITEM_GREATER_THAN_1024 ? 2048 : 1024]; + __shared__ uint16_t shared_indices[MAX_ITEM_GREATER_THAN_1024 ? 2048 : 1024]; + __shared__ score_t shared_lambdas[MAX_ITEM_GREATER_THAN_1024 ? 2048 : 1024]; + __shared__ score_t shared_hessians[MAX_ITEM_GREATER_THAN_1024 ? 2048 : 1024]; const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); - const double min_score = kMinScore; for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { const double inverse_max_dcg = cuda_inverse_max_dcgs[query_index]; const data_size_t query_start = cuda_query_boundaries[query_index]; @@ -153,16 +43,38 @@ __global__ void GetGradientsKernel_LambdarankNDCG(const double* cuda_scores, con shared_lambdas[threadIdx.x] = 0.0f; shared_hessians[threadIdx.x] = 0.0f; } else { - shared_scores[threadIdx.x] = min_score; + shared_scores[threadIdx.x] = kMinScore; shared_indices[threadIdx.x] = static_cast(threadIdx.x); } + if (MAX_ITEM_GREATER_THAN_1024) { + if (query_item_count > 1024) { + const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; + if (threadIdx_x_plus_1024 < query_item_count) { + shared_scores[threadIdx_x_plus_1024] = cuda_scores_pointer[threadIdx_x_plus_1024]; + shared_indices[threadIdx_x_plus_1024] = static_cast(threadIdx_x_plus_1024); + shared_lambdas[threadIdx_x_plus_1024] = 0.0f; + shared_hessians[threadIdx_x_plus_1024] = 0.0f; + } else { + shared_scores[threadIdx_x_plus_1024] = kMinScore; + shared_indices[threadIdx_x_plus_1024] = static_cast(threadIdx_x_plus_1024); + } + } + } __syncthreads(); - ArgSort(shared_scores, shared_indices, static_cast(query_item_count)); + if (MAX_ITEM_GREATER_THAN_1024) { + if (query_item_count > 1024) { + BitonicArgSort_2048(shared_scores, shared_indices); + } else { + BitonicArgSort_1024(shared_scores, shared_indices, static_cast(query_item_count)); + } + } else { + BitonicArgSort_1024(shared_scores, shared_indices, static_cast(query_item_count)); + } __syncthreads(); // get best and worst score const double best_score = shared_scores[shared_indices[0]]; data_size_t worst_idx = query_item_count - 1; - if (worst_idx > 0 && shared_scores[shared_indices[worst_idx]] == min_score) { + if (worst_idx > 0 && shared_scores[shared_indices[worst_idx]] == kMinScore) { worst_idx -= 1; } const double worst_score = shared_scores[shared_indices[worst_idx]]; @@ -174,87 +86,102 @@ __global__ void GetGradientsKernel_LambdarankNDCG(const double* cuda_scores, con // start accumulate lambdas by pairs that contain at least one document above truncation level const data_size_t num_items_i = min(query_item_count - 1, truncation_level); const data_size_t num_j_per_i = query_item_count - 1; - const data_size_t num_pairs = num_items_i * num_j_per_i; - const data_size_t num_pairs_per_thread = (num_pairs + blockDim.x - 
1) / blockDim.x; - const data_size_t thread_start = static_cast(threadIdx.x) * num_pairs_per_thread; - const data_size_t thread_end = min(thread_start + num_pairs_per_thread, num_pairs); - for (data_size_t pair_index = thread_start; pair_index < thread_end; ++pair_index) { - const data_size_t i = pair_index / num_j_per_i; - const data_size_t j = pair_index % num_j_per_i + 1; - if (j > i) { - // skip pairs with the same labels - if (cuda_label_pointer[shared_indices[i]] != cuda_label_pointer[shared_indices[j]] && shared_scores[shared_indices[j]] != min_score) { - data_size_t high_rank, low_rank; - if (cuda_label_pointer[shared_indices[i]] > cuda_label_pointer[shared_indices[j]]) { - high_rank = i; - low_rank = j; - } else { - high_rank = j; - low_rank = i; - } - const data_size_t high = shared_indices[high_rank]; - const int high_label = static_cast(cuda_label_pointer[high]); - const double high_score = shared_scores[high]; - const double high_label_gain = static_cast((1 << high_label) - 1); - const double high_discount = log2(2.0f + high_rank); - const data_size_t low = shared_indices[low_rank]; - const int low_label = static_cast(cuda_label_pointer[low]); - const double low_score = shared_scores[low]; - const double low_label_gain = static_cast((1 << low_label) - 1); - const double low_discount = log2(2.0f + low_rank); + const data_size_t s = num_j_per_i - num_items_i + 1; + const data_size_t num_pairs = (num_j_per_i + s) * num_items_i / 2; + double thread_sum_lambdas = 0.0f; + for (data_size_t pair_index = static_cast(threadIdx.x); pair_index < num_pairs; pair_index += static_cast(blockDim.x)) { + const double square = 2 * static_cast(pair_index) + s * s - s; + const double sqrt_result = floor(sqrt(square)); + const data_size_t row_index = static_cast(floor(sqrt(square - sqrt_result)) + 1 - s); + const data_size_t i = num_items_i - 1 - row_index; + const data_size_t j = num_j_per_i - (pair_index - (2 * s + row_index - 1) * row_index / 2); + if (cuda_label_pointer[shared_indices[i]] != cuda_label_pointer[shared_indices[j]] && shared_scores[shared_indices[j]] != kMinScore) { + data_size_t high_rank, low_rank; + if (cuda_label_pointer[shared_indices[i]] > cuda_label_pointer[shared_indices[j]]) { + high_rank = i; + low_rank = j; + } else { + high_rank = j; + low_rank = i; + } + const data_size_t high = shared_indices[high_rank]; + const int high_label = static_cast(cuda_label_pointer[high]); + const double high_score = shared_scores[high]; + const double high_label_gain = static_cast((1 << high_label) - 1); + const double high_discount = log2(2.0f + high_rank); + const data_size_t low = shared_indices[low_rank]; + const int low_label = static_cast(cuda_label_pointer[low]); + const double low_score = shared_scores[low]; + const double low_label_gain = static_cast((1 << low_label) - 1); + const double low_discount = log2(2.0f + low_rank); - const double delta_score = high_score - low_score; + const double delta_score = high_score - low_score; - // get dcg gap - const double dcg_gap = high_label_gain - low_label_gain; - // get discount of this pair - const double paired_discount = fabs(high_discount - low_discount); - // get delta NDCG - double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg; - // regular the delta_pair_NDCG by score distance - if (norm && best_score != worst_score) { - delta_pair_NDCG /= (0.01f + fabs(delta_score)); - } - // calculate lambda for this pair - double p_lambda = 1.0f / (1.0f + exp(sigmoid * delta_score)); - double p_hessian = p_lambda * (1.0f - 
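// Illustrative sketch (not part of the patch): the rewritten kernel strides threads over a flat
// pair index instead of giving each thread a contiguous pair range. Only pairs (i, j) with i below
// the truncation level and j > i are counted, so row i contributes num_j_per_i - i pairs and the
// total is the arithmetic series used above: with s = num_j_per_i - num_items_i + 1,
// num_pairs = (num_j_per_i + s) * num_items_i / 2. The closed-form inverse in the kernel recovers
// (i, j) from pair_index; the direct enumeration below only documents where the count comes from.
#include <algorithm>
#include <cassert>

void CheckTruncatedPairCount(int query_item_count, int truncation_level) {
  const int num_items_i = std::min(query_item_count - 1, truncation_level);
  const int num_j_per_i = query_item_count - 1;
  const int s = num_j_per_i - num_items_i + 1;
  int enumerated = 0;
  for (int i = 0; i < num_items_i; ++i) {
    for (int j = i + 1; j <= num_j_per_i; ++j) {  // j indexes the lower-ranked item of the pair
      ++enumerated;
    }
  }
  assert(enumerated == (num_j_per_i + s) * num_items_i / 2);  // rows contribute num_j_per_i, ..., s pairs
}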
p_lambda); - // update - p_lambda *= -sigmoid * delta_pair_NDCG; - p_hessian *= sigmoid * sigmoid * delta_pair_NDCG; - atomicAdd_block(shared_lambdas + low, -static_cast(p_lambda)); - atomicAdd_block(shared_hessians + low, static_cast(p_hessian)); - atomicAdd_block(shared_lambdas + high, static_cast(p_lambda)); - atomicAdd_block(shared_hessians + high, static_cast(p_hessian)); - // lambda is negative, so use minus to accumulate - atomicAdd_block(&sum_lambdas, -2 * p_lambda); + // get dcg gap + const double dcg_gap = high_label_gain - low_label_gain; + // get discount of this pair + const double paired_discount = fabs(high_discount - low_discount); + // get delta NDCG + double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg; + // regular the delta_pair_NDCG by score distance + if (norm && best_score != worst_score) { + delta_pair_NDCG /= (0.01f + fabs(delta_score)); } + // calculate lambda for this pair + double p_lambda = 1.0f / (1.0f + exp(sigmoid * delta_score)); + double p_hessian = p_lambda * (1.0f - p_lambda); + // update + p_lambda *= -sigmoid * delta_pair_NDCG; + p_hessian *= sigmoid * sigmoid * delta_pair_NDCG; + atomicAdd_block(shared_lambdas + low, -static_cast(p_lambda)); + atomicAdd_block(shared_hessians + low, static_cast(p_hessian)); + atomicAdd_block(shared_lambdas + high, static_cast(p_lambda)); + atomicAdd_block(shared_hessians + high, static_cast(p_hessian)); + // lambda is negative, so use minus to accumulate + thread_sum_lambdas -= 2 * p_lambda; } } + atomicAdd_block(&sum_lambdas, thread_sum_lambdas); __syncthreads(); if (norm && sum_lambdas > 0) { - double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas; + const double norm_factor = log2(1 + sum_lambdas) / sum_lambdas; if (threadIdx.x < static_cast(query_item_count)) { cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x] * norm_factor); cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x] * norm_factor); } + if (MAX_ITEM_GREATER_THAN_1024) { + if (query_item_count > 1024) { + const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; + if (threadIdx_x_plus_1024 < static_cast(query_item_count)) { + cuda_out_gradients_pointer[threadIdx_x_plus_1024] = static_cast(shared_lambdas[threadIdx_x_plus_1024] * norm_factor); + cuda_out_hessians_pointer[threadIdx_x_plus_1024] = static_cast(shared_hessians[threadIdx_x_plus_1024] * norm_factor); + } + } + } } else { if (threadIdx.x < static_cast(query_item_count)) { cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x]); cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x]); } + if (MAX_ITEM_GREATER_THAN_1024) { + if (query_item_count > 1024) { + const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; + if (threadIdx_x_plus_1024 < static_cast(query_item_count)) { + cuda_out_gradients_pointer[threadIdx_x_plus_1024] = static_cast(shared_lambdas[threadIdx_x_plus_1024]); + cuda_out_hessians_pointer[threadIdx_x_plus_1024] = static_cast(shared_hessians[threadIdx_x_plus_1024]); + } + } + } } __syncthreads(); } } -__global__ void GetGradientsKernel_LambdarankNDCG_2048(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, +__global__ void GetGradientsKernel_LambdarankNDCG_Sorted( + const double* cuda_scores, const int* cuda_item_indices_buffer, const label_t* cuda_labels, const data_size_t num_data, const data_size_t num_queries, const data_size_t* cuda_query_boundaries, const double* cuda_inverse_max_dcgs, const bool 
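// Illustrative sketch (not part of the patch): once a pair's higher-labelled and lower-labelled
// items are identified, the kernel computes the pair's lambda and hessian exactly as below, then
// accumulates the resulting (negative) lambda into the higher-labelled item and its negation into
// the lower-labelled one, adding the hessian to both.
#include <cmath>

struct PairGrad { double lambda; double hessian; };

PairGrad LambdarankPairGradient(int high_label, double high_score, int high_rank,
                                int low_label, double low_score, int low_rank,
                                double inverse_max_dcg, double sigmoid,
                                bool norm, double best_score, double worst_score) {
  const double delta_score = high_score - low_score;
  const double dcg_gap = static_cast<double>((1 << high_label) - 1) -
                         static_cast<double>((1 << low_label) - 1);      // label gain gap
  const double paired_discount = std::fabs(std::log2(2.0 + high_rank) -
                                           std::log2(2.0 + low_rank));   // position discount gap
  double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg;  // |delta NDCG| of swapping the pair
  if (norm && best_score != worst_score) {
    delta_pair_NDCG /= (0.01 + std::fabs(delta_score));                  // regularize by score distance
  }
  double p_lambda = 1.0 / (1.0 + std::exp(sigmoid * delta_score));
  double p_hessian = p_lambda * (1.0 - p_lambda);
  p_lambda *= -sigmoid * delta_pair_NDCG;
  p_hessian *= sigmoid * sigmoid * delta_pair_NDCG;
  return {p_lambda, p_hessian};
}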
norm, const double sigmoid, const int truncation_level, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - __shared__ score_t shared_scores[MAX_NUM_ITEM_IN_QUERY]; - __shared__ uint16_t shared_indices[MAX_NUM_ITEM_IN_QUERY]; - __shared__ score_t shared_lambdas[MAX_NUM_ITEM_IN_QUERY]; - __shared__ score_t shared_hessians[MAX_NUM_ITEM_IN_QUERY]; const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { @@ -263,79 +190,57 @@ __global__ void GetGradientsKernel_LambdarankNDCG_2048(const double* cuda_scores const data_size_t query_end = cuda_query_boundaries[query_index + 1]; const data_size_t query_item_count = query_end - query_start; const double* cuda_scores_pointer = cuda_scores + query_start; + const int* cuda_item_indices_buffer_pointer = cuda_item_indices_buffer + query_start; score_t* cuda_out_gradients_pointer = cuda_out_gradients + query_start; score_t* cuda_out_hessians_pointer = cuda_out_hessians + query_start; const label_t* cuda_label_pointer = cuda_labels + query_start; - if (threadIdx.x < query_item_count) { - shared_scores[threadIdx.x] = cuda_scores_pointer[threadIdx.x]; - shared_indices[threadIdx.x] = static_cast(threadIdx.x); - shared_lambdas[threadIdx.x] = 0.0f; - shared_hessians[threadIdx.x] = 0.0f; - } else { - shared_scores[threadIdx.x] = kMinScore; - shared_indices[threadIdx.x] = static_cast(threadIdx.x); - } - if (query_item_count > 1024) { - const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; - if (threadIdx_x_plus_1024 < query_item_count) { - shared_scores[threadIdx_x_plus_1024] = cuda_scores_pointer[threadIdx_x_plus_1024]; - shared_indices[threadIdx_x_plus_1024] = static_cast(threadIdx_x_plus_1024); - shared_lambdas[threadIdx_x_plus_1024] = 0.0f; - shared_hessians[threadIdx_x_plus_1024] = 0.0f; - } else { - shared_scores[threadIdx_x_plus_1024] = kMinScore; - shared_indices[threadIdx_x_plus_1024] = static_cast(threadIdx_x_plus_1024); - } - } - __syncthreads(); - if (query_item_count > 1024) { - ArgSort_2048(shared_scores, shared_indices, static_cast(query_item_count)); - } else { - ArgSort(shared_scores, shared_indices, static_cast(query_item_count)); - } - __syncthreads(); // get best and worst score - const double best_score = shared_scores[shared_indices[0]]; + const double best_score = cuda_scores_pointer[cuda_item_indices_buffer_pointer[0]]; data_size_t worst_idx = query_item_count - 1; - if (worst_idx > 0 && shared_scores[shared_indices[worst_idx]] == kMinScore) { + if (worst_idx > 0 && cuda_scores_pointer[cuda_item_indices_buffer_pointer[worst_idx]] == kMinScore) { worst_idx -= 1; } - const double worst_score = shared_scores[shared_indices[worst_idx]]; + const double worst_score = cuda_scores_pointer[cuda_item_indices_buffer_pointer[worst_idx]]; __shared__ double sum_lambdas; if (threadIdx.x == 0) { sum_lambdas = 0.0f; } + for (int item_index = static_cast(threadIdx.x); item_index < query_item_count; item_index += static_cast(blockDim.x)) { + cuda_out_gradients_pointer[item_index] = 0.0f; + cuda_out_hessians_pointer[item_index] = 0.0f; + } __syncthreads(); // start accumulate lambdas by pairs that contain at least one document above truncation level const data_size_t num_items_i = min(query_item_count - 1, truncation_level); const data_size_t num_j_per_i = query_item_count - 1; - const data_size_t num_pairs = num_items_i * 
num_j_per_i; - const data_size_t num_pairs_per_thread = (num_pairs + blockDim.x - 1) / blockDim.x; - const data_size_t thread_start = static_cast(threadIdx.x) * num_pairs_per_thread; - const data_size_t thread_end = min(thread_start + num_pairs_per_thread, num_pairs); + const data_size_t s = num_j_per_i - num_items_i + 1; + const data_size_t num_pairs = (num_j_per_i + s) * num_items_i / 2; double thread_sum_lambdas = 0.0f; - for (data_size_t pair_index = thread_start; pair_index < thread_end; ++pair_index) { - const data_size_t i = pair_index / num_j_per_i; - const data_size_t j = pair_index % num_j_per_i + 1; + for (data_size_t pair_index = static_cast(threadIdx.x); pair_index < num_pairs; pair_index += static_cast(blockDim.x)) { + const double square = 2 * static_cast(pair_index) + s * s - s; + const double sqrt_result = floor(sqrt(square)); + const data_size_t row_index = static_cast(floor(sqrt(square - sqrt_result)) + 1 - s); + const data_size_t i = num_items_i - 1 - row_index; + const data_size_t j = num_j_per_i - (pair_index - (2 * s + row_index - 1) * row_index / 2); if (j > i) { // skip pairs with the same labels - if (cuda_label_pointer[shared_indices[i]] != cuda_label_pointer[shared_indices[j]] && shared_scores[shared_indices[j]] != kMinScore) { + if (cuda_label_pointer[cuda_item_indices_buffer_pointer[i]] != cuda_label_pointer[cuda_item_indices_buffer_pointer[j]] && cuda_scores_pointer[cuda_item_indices_buffer_pointer[j]] != kMinScore) { data_size_t high_rank, low_rank; - if (cuda_label_pointer[shared_indices[i]] > cuda_label_pointer[shared_indices[j]]) { + if (cuda_label_pointer[cuda_item_indices_buffer_pointer[i]] > cuda_label_pointer[cuda_item_indices_buffer_pointer[j]]) { high_rank = i; low_rank = j; } else { high_rank = j; low_rank = i; } - const data_size_t high = shared_indices[high_rank]; + const data_size_t high = cuda_item_indices_buffer_pointer[high_rank]; const int high_label = static_cast(cuda_label_pointer[high]); - const double high_score = shared_scores[high]; + const double high_score = cuda_scores_pointer[high]; const double high_label_gain = static_cast((1 << high_label) - 1); const double high_discount = log2(2.0f + high_rank); - const data_size_t low = shared_indices[low_rank]; + const data_size_t low = cuda_item_indices_buffer_pointer[low_rank]; const int low_label = static_cast(cuda_label_pointer[low]); - const double low_score = shared_scores[low]; + const double low_score = cuda_scores_pointer[low]; const double low_label_gain = static_cast((1 << low_label) - 1); const double low_discount = log2(2.0f + low_rank); @@ -357,10 +262,10 @@ __global__ void GetGradientsKernel_LambdarankNDCG_2048(const double* cuda_scores // update p_lambda *= -sigmoid * delta_pair_NDCG; p_hessian *= sigmoid * sigmoid * delta_pair_NDCG; - atomicAdd_block(shared_lambdas + low, -static_cast(p_lambda)); - atomicAdd_block(shared_hessians + low, static_cast(p_hessian)); - atomicAdd_block(shared_lambdas + high, static_cast(p_lambda)); - atomicAdd_block(shared_hessians + high, static_cast(p_hessian)); + atomicAdd_block(cuda_out_gradients_pointer + low, -static_cast(p_lambda)); + atomicAdd_block(cuda_out_hessians_pointer + low, static_cast(p_hessian)); + atomicAdd_block(cuda_out_gradients_pointer + high, static_cast(p_lambda)); + atomicAdd_block(cuda_out_hessians_pointer + high, static_cast(p_hessian)); // lambda is negative, so use minus to accumulate thread_sum_lambdas -= 2 * p_lambda; } @@ -370,28 +275,9 @@ __global__ void GetGradientsKernel_LambdarankNDCG_2048(const double* 
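The sorted kernel strides its threads over a flattened pair index and recovers the (i, j) document pair with a closed-form square-root decode. The host sketch below only spells out, row by row, the pair space that decode is meant to invert and checks the (num_j_per_i + s) * num_items_i / 2 pair-count formula; it is an illustrative harness with names of my choosing and does not reimplement the sqrt step.

#include <cstdio>

// Enumerate the pair space used by the sorted lambdarank kernel:
// row_index 0 corresponds to i = num_items_i - 1 (the shortest row, s pairs),
// each following row gains one more pair, up to num_j_per_i pairs for i = 0,
// and within a row the flat index walks j downward from num_j_per_i to i + 1.
int main() {
  const int query_item_count = 37;
  const int truncation_level = 20;
  const int num_items_i = (query_item_count - 1 < truncation_level) ? (query_item_count - 1) : truncation_level;
  const int num_j_per_i = query_item_count - 1;
  const int s = num_j_per_i - num_items_i + 1;
  // arithmetic series s + (s + 1) + ... + num_j_per_i, as used by the kernel
  const int num_pairs = (num_j_per_i + s) * num_items_i / 2;
  int pair_index = 0;
  for (int row_index = 0; row_index < num_items_i; ++row_index) {
    const int i = num_items_i - 1 - row_index;  // document above the truncation level
    const int row_length = s + row_index;       // pairs in this row
    for (int k = 0; k < row_length; ++k) {
      const int j = num_j_per_i - k;            // partner document, always j > i here
      if (j <= i) std::printf("unexpected pair (%d, %d)\n", i, j);
      ++pair_index;
    }
  }
  std::printf("enumerated %d pairs, formula gives %d\n", pair_index, num_pairs);
  return 0;
}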
cuda_scores __syncthreads(); if (norm && sum_lambdas > 0) { const double norm_factor = log2(1 + sum_lambdas) / sum_lambdas; - if (threadIdx.x < static_cast(query_item_count)) { - cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x] * norm_factor); - cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x] * norm_factor); - } - if (query_item_count > 1024) { - const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; - if (threadIdx_x_plus_1024 < static_cast(query_item_count)) { - cuda_out_gradients_pointer[threadIdx_x_plus_1024] = static_cast(shared_lambdas[threadIdx_x_plus_1024] * norm_factor); - cuda_out_hessians_pointer[threadIdx_x_plus_1024] = static_cast(shared_hessians[threadIdx_x_plus_1024] * norm_factor); - } - } - } else { - if (threadIdx.x < static_cast(query_item_count)) { - cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x]); - cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x]); - } - if (query_item_count > 1024) { - const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; - if (threadIdx_x_plus_1024 < static_cast(query_item_count)) { - cuda_out_gradients_pointer[threadIdx_x_plus_1024] = static_cast(shared_lambdas[threadIdx_x_plus_1024]); - cuda_out_hessians_pointer[threadIdx_x_plus_1024] = static_cast(shared_hessians[threadIdx_x_plus_1024]); - } + for (int item_index = static_cast(threadIdx.x); item_index < query_item_count; item_index += static_cast(blockDim.x)) { + cuda_out_gradients_pointer[item_index] *= norm_factor; + cuda_out_hessians_pointer[item_index] *= norm_factor; } } __syncthreads(); @@ -401,54 +287,23 @@ __global__ void GetGradientsKernel_LambdarankNDCG_2048(const double* cuda_scores void CUDALambdarankNDCG::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; if (max_items_in_query_aligned_ <= 1024) { - GetGradientsKernel_LambdarankNDCG<<>>(score, cuda_labels_, num_data_, + GetGradientsKernel_LambdarankNDCG<<>>(score, cuda_labels_, num_data_, num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, norm_, sigmoid_, truncation_level_, gradients, hessians); } else if (max_items_in_query_aligned_ <= 2048) { - GetGradientsKernel_LambdarankNDCG_2048<<>>(score, cuda_labels_, num_data_, + GetGradientsKernel_LambdarankNDCG<<>>(score, cuda_labels_, num_data_, num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, norm_, sigmoid_, truncation_level_, gradients, hessians); } else { - Log::Fatal("Too large max_items_in_query_aligned_ = %d", max_items_in_query_aligned_); - } - PrintLastCUDAErrorOuter(__FILE__, __LINE__); -} - -__device__ void PrefixSumBankConflict(uint16_t* elements, unsigned int n) { - unsigned int offset = 1; - unsigned int threadIdx_x = threadIdx.x; - const uint16_t last_element = elements[n - 1]; - __syncthreads(); - for (int d = (n >> 1); d > 0; d >>= 1) { - if (threadIdx_x < d) { - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - elements[dst_pos] += elements[src_pos]; - } - offset <<= 1; - __syncthreads(); - } - if (threadIdx_x == 0) { - elements[n - 1] = 0; - } - __syncthreads(); - for (int d = 1; d < n; d <<= 1) { - offset >>= 1; - if (threadIdx_x < d) { - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const uint32_t 
src_val = elements[src_pos]; - elements[src_pos] = elements[dst_pos]; - elements[dst_pos] += src_val; - } - __syncthreads(); - } - if (threadIdx.x == 0) { - elements[n] = elements[n - 1] + last_element; + BitonicArgSortItemsGlobal(score, num_queries_, cuda_query_boundaries_, cuda_item_indices_buffer_); + GetGradientsKernel_LambdarankNDCG_Sorted<<>>(score, cuda_item_indices_buffer_, cuda_labels_, num_data_, + num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, + norm_, sigmoid_, truncation_level_, + gradients, hessians); } - __syncthreads(); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } __global__ void CalcInverseMaxDCGKernel( @@ -478,7 +333,7 @@ __global__ void CalcInverseMaxDCGKernel( label_pos[threadIdx.x] = label_sum[threadIdx.x]; } __syncthreads(); - PrefixSumBankConflict(label_pos, MAX_RANK_LABEL); + PrefixSum(label_pos, MAX_RANK_LABEL); __syncthreads(); __shared__ double gain; if (threadIdx.x == 0) { @@ -595,6 +450,76 @@ void CUDALambdarankNDCG::TestCUDAQuickSort() const { Log::Warning("cuda argsort test pass"); } +void CUDALambdarankNDCG::TestCUDABitonicSortForQueryItems() const { + int num_queries = 1000; + std::vector items_per_query(num_queries + 1, 0); + std::vector item_scores; + const int max_item_per_query = 5000; + std::vector num_item_probs(max_item_per_query, 1.0f / max_item_per_query); + std::discrete_distribution num_item_distribution(num_item_probs.begin(), num_item_probs.end()); + std::uniform_real_distribution score_dist; + const int num_threads = OMP_NUM_THREADS(); + std::vector thread_random_engines(num_threads); + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + thread_random_engines[thread_index] = std::mt19937(thread_index); + } + int num_total_items = 0; + #pragma omp parallel for schedule(static) num_threads(num_threads) reduction(+:num_total_items) + for (int query_index = 0; query_index < num_queries; ++query_index) { + const int thread_index = omp_get_thread_num(); + items_per_query[query_index + 1] = num_item_distribution(thread_random_engines[thread_index]); + num_total_items += items_per_query[query_index + 1]; + } + for (int query_index = 0; query_index < num_queries; ++query_index) { + items_per_query[query_index + 1] += items_per_query[query_index]; + } + item_scores.resize(num_total_items, 0.0f); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int item_index = 0; item_index < num_total_items; ++item_index) { + const int thread_index = omp_get_thread_num(); + item_scores[item_index] = score_dist(thread_random_engines[thread_index]); + } + double* cuda_score = nullptr; + data_size_t* cuda_query_boundaries = nullptr; + data_size_t* cuda_out_indices = nullptr; + InitCUDAMemoryFromHostMemoryOuter(&cuda_score, item_scores.data(), item_scores.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_query_boundaries, items_per_query.data(), items_per_query.size(), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_out_indices, item_scores.size(), __FILE__, __LINE__); + const auto start = std::chrono::steady_clock::now(); + BitonicArgSortItemsGlobal(cuda_score, num_queries, cuda_query_boundaries, cuda_out_indices); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + const auto end = std::chrono::steady_clock::now(); + const std::chrono::duration duration = static_cast>(end - start); + Log::Warning("bitonic arg sort items global time = %f", duration.count()); + std::vector sorted_item_indices(item_scores.size()); + CopyFromCUDADeviceToHostOuter(sorted_item_indices.data(), 
cuda_out_indices, item_scores.size(), __FILE__, __LINE__); + std::vector host_sorted_item_indices(item_scores.size()); + PrintLastCUDAErrorOuter(__FILE__, __LINE__); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int i = 0; i < num_queries; ++i) { + const int query_start = items_per_query[i]; + const int query_end = items_per_query[i + 1]; + for (int j = query_start; j < query_end; ++j) { + host_sorted_item_indices[j] = j - query_start; + } + std::sort(host_sorted_item_indices.data() + query_start, host_sorted_item_indices.data() + query_end, [&item_scores, query_start] (int a, int b) { + return item_scores[query_start + a] > item_scores[query_start + b]; + }); + } + for (int query_index = 0; query_index < num_queries; ++query_index) { + const int query_start = items_per_query[query_index]; + const int query_end = items_per_query[query_index + 1]; + for (int item_index = query_start; item_index < query_end; ++item_index) { + const double cuda_item_score = item_scores[query_start + sorted_item_indices[item_index]]; + const double host_item_score = item_scores[query_start + host_sorted_item_indices[item_index]]; + if (cuda_item_score != host_item_score) { + Log::Warning("item_index = %d, query_start = %d, cuda_item_score = %f, host_item_score = %f, sorted_item_indices = %d", + item_index, query_start, cuda_item_score, host_item_score, sorted_item_indices[item_index]); + } + } + } +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/objective/cuda/cuda_rank_objective.hpp b/src/objective/cuda/cuda_rank_objective.hpp index dc2542dae4d8..5a9cd2bc2a52 100644 --- a/src/objective/cuda/cuda_rank_objective.hpp +++ b/src/objective/cuda/cuda_rank_objective.hpp @@ -9,7 +9,6 @@ #ifdef USE_CUDA -#define MAX_NUM_ITEM_IN_QUERY (2048) #define NUM_QUERY_PER_BLOCK (10) #define MAX_RANK_LABEL (32) @@ -37,9 +36,12 @@ class CUDALambdarankNDCG : public CUDAObjectiveInterface, public LambdarankNDCG void TestCUDAQuickSort() const; + void TestCUDABitonicSortForQueryItems() const; + // CUDA memory, held by this object double* cuda_lambdas_; double* cuda_inverse_max_dcgs_; + int* cuda_item_indices_buffer_; // CUDA memory, held by other objects const label_t* cuda_labels_; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index fc30731a92c7..613ba969acf6 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -105,7 +105,7 @@ class CUDAHistogramConstructor { /*! \brief aligned number of bins of the features whose histograms need to be fixed */ std::vector need_fix_histogram_features_num_bin_aligend_; /*! 
\brief minimum number of blocks allowed in the y dimension */ - const int min_grid_dim_y_ = 160; + const int min_grid_dim_y_ = 10; // CUDA memory, held by this object From 7a0d218036f8e7d6d188986f1f05177bd5157181 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 16 Aug 2021 02:05:29 +0000 Subject: [PATCH 054/166] add cuda metrics --- include/LightGBM/cuda/cuda_algorithms.hpp | 153 +++++-- .../LightGBM/cuda/cuda_objective_function.hpp | 4 +- src/boosting/gbdt.cpp | 4 +- src/cuda/cuda_algorithms.cu | 329 +++++++++++++- src/metric/binary_metric.hpp | 4 +- src/metric/cuda/cuda_binary_metric.cpp | 63 +++ src/metric/cuda/cuda_binary_metric.cu | 88 ++++ src/metric/cuda/cuda_binary_metric.hpp | 123 ++++++ src/metric/cuda/cuda_metric.hpp | 19 + src/metric/metric.cpp | 104 +++-- src/objective/cuda/cuda_binary_objective.cu | 8 +- .../cuda/cuda_multiclass_objective.cpp | 17 - src/objective/cuda/cuda_rank_objective.cpp | 35 +- src/objective/cuda/cuda_rank_objective.cu | 409 ++++++++++++------ src/objective/cuda/cuda_rank_objective.hpp | 16 +- src/objective/objective_function.cpp | 2 + src/objective/rank_objective.hpp | 4 + .../cuda/cuda_histogram_constructor.hpp | 2 +- 18 files changed, 1120 insertions(+), 264 deletions(-) create mode 100644 src/metric/cuda/cuda_binary_metric.cpp create mode 100644 src/metric/cuda/cuda_binary_metric.cu create mode 100644 src/metric/cuda/cuda_binary_metric.hpp create mode 100644 src/metric/cuda/cuda_metric.hpp diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 84be24556f63..3e4ae7172b47 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -18,6 +18,7 @@ #define NUM_BANKS_DATA_PARTITION (16) #define LOG_NUM_BANKS_DATA_PARTITION (4) +#define GLOBAL_PREFIX_SUM_BLOCK_SIZE (1024) #define CONFLICT_FREE_INDEX(n) \ ((n) + ((n) >> LOG_NUM_BANKS_DATA_PARTITION)) \ @@ -43,43 +44,6 @@ namespace LightGBM { __syncthreads(); \ } \ - -#define PrefixSumInner(elements, n, type) \ - size_t offset = 1; \ - unsigned int threadIdx_x = threadIdx.x; \ - const size_t conflict_free_n_minus_1 = CONFLICT_FREE_INDEX(n - 1); \ - const type last_element = elements[conflict_free_n_minus_1]; \ - __syncthreads(); \ - for (int d = (n >> 1); d > 0; d >>= 1) { \ - if (threadIdx_x < d) { \ - const size_t src_pos = offset * (2 * threadIdx_x + 1) - 1; \ - const size_t dst_pos = offset * (2 * threadIdx_x + 2) - 1; \ - elements[CONFLICT_FREE_INDEX(dst_pos)] += elements[CONFLICT_FREE_INDEX(src_pos)]; \ - } \ - offset <<= 1; \ - __syncthreads(); \ - } \ - if (threadIdx_x == 0) { \ - elements[conflict_free_n_minus_1] = 0; \ - } \ - __syncthreads(); \ - for (int d = 1; d < n; d <<= 1) { \ - offset >>= 1; \ - if (threadIdx_x < d) { \ - const size_t dst_pos = offset * (2 * threadIdx_x + 2) - 1; \ - const size_t src_pos = offset * (2 * threadIdx_x + 1) - 1; \ - const size_t conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); \ - const size_t conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); \ - const type src_val = elements[conflict_free_src_pos]; \ - elements[conflict_free_src_pos] = elements[conflict_free_dst_pos]; \ - elements[conflict_free_dst_pos] += src_val; \ - } \ - __syncthreads(); \ - } \ - if (threadIdx_x == 0) { \ - elements[CONFLICT_FREE_INDEX(n)] = elements[conflict_free_n_minus_1] + last_element; \ - } \ - template __device__ void ReduceSum(T* values, size_t n) { ReduceSumInner(values, n); @@ -93,6 +57,29 @@ __device__ void ReduceSumConflictFree(T* values, size_t n) { template __device__ void 
ReduceMax(T* values, size_t n); +template +void GlobalInclusivePrefixSum(T* values, size_t n); + +template +void GlobalGenAUCPosNegSum(const label_t* labels, + const label_t* weights, + const data_size_t* sorted_indices, + double* sum_pos_buffer, + double* block_sum_pos_buffer, + const data_size_t num_data); + +void GloblGenAUCMark(const double* scores, + const data_size_t* sorted_indices, + data_size_t* mark_buffer, + data_size_t* block_mark_buffer, + uint16_t* block_mark_first_zero, + const data_size_t num_data); + +void GlobalCalcAUC(const double* sum_pos_buffer, + const data_size_t* mark_buffer, + const data_size_t num_data, + double* block_buffer); + template __device__ void PrefixSum(T* values, size_t n) { unsigned int offset = 1; @@ -257,6 +244,98 @@ __device__ __forceinline__ void BitonicArgSort_2048(const score_t* scores, uint1 } } +template +__device__ __forceinline__ T ShuffleReduceSumWarp(T value, const data_size_t len) { + if (len > 0) { + const uint32_t mask = (0xffffffff >> (warpSize - len)); + for (int offset = warpSize / 2; offset > 0; offset >>= 1) { + value += __shfl_down_sync(mask, value, offset); + } + } + return value; +} + +// reduce values from an 1-dimensional block (block size must be no greather than 1024) +template +__device__ __forceinline__ T ShuffleReduceSum(T value, T* shared_mem_buffer, const size_t len) { + const uint32_t warpLane = threadIdx.x % warpSize; + const uint32_t warpID = threadIdx.x / warpSize; + const data_size_t warp_len = min(static_cast(warpSize), static_cast(len) - static_cast(warpID * warpSize)); + value = ShuffleReduceSumWarp(value, warp_len); + if (warpLane == 0) { + shared_mem_buffer[warpID] = value; + } + __syncthreads(); + const data_size_t num_warp = static_cast((len + warpSize - 1) / warpSize); + if (warpID == 0) { + value = shared_mem_buffer[warpLane]; + value = ShuffleReduceSumWarp(value, num_warp); + } + return value; +} + +template +__device__ __forceinline__ T ShuffleReduceMaxWarp(T value, const data_size_t len) { + if (len > 0) { + const uint32_t mask = (0xffffffff >> (warpSize - len)); + for (int offset = warpSize / 2; offset > 0; offset >>= 1) { + const T other_value = __shfl_down_sync(mask, value, offset); + value = (other_value > value) ? other_value : value; + } + } + return value; +} + +// reduce values from an 1-dimensional block (block size must be no greather than 1024) +template +__device__ __forceinline__ T ShuffleReduceMax(T value, T* shared_mem_buffer, const size_t len) { + const uint32_t warpLane = threadIdx.x % warpSize; + const uint32_t warpID = threadIdx.x / warpSize; + const data_size_t warp_len = min(static_cast(warpSize), static_cast(len) - static_cast(warpID * warpSize)); + value = ShuffleReduceMaxWarp(value, warp_len); + if (warpLane == 0) { + shared_mem_buffer[warpID] = value; + } + __syncthreads(); + const data_size_t num_warp = static_cast((len + warpSize - 1) / warpSize); + if (warpID == 0) { + value = shared_mem_buffer[warpLane]; + value = ShuffleReduceMaxWarp(value, num_warp); + } + return value; +} + +template +__device__ __forceinline__ T ShuffleReduceMinWarp(T value, const data_size_t len) { + if (len > 0) { + const uint32_t mask = (0xffffffff >> (warpSize - len)); + for (int offset = warpSize / 2; offset > 0; offset >>= 1) { + const T other_value = __shfl_down_sync(mask, value, offset); + value = (other_value < value) ? 
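The ShuffleReduceSum / Max / Min helpers added here follow the usual two-level pattern: each warp reduces its lanes with __shfl_down_sync, warp leaders park their partial results in shared memory, and the first warp finishes the reduction. A minimal standalone kernel using the same pattern (full warps, block size a multiple of 32, all names illustrative) might look like this:

#include <cstdio>
#include <cuda_runtime.h>

// Minimal block-wide sum reduction in the same style as ShuffleReduceSum:
// warp-level shuffle reduction, one shared-memory exchange, then the first warp
// reduces the per-warp partial sums and accumulates into the output.
__global__ void BlockSumKernel(const float* values, int n, float* out) {
  __shared__ float warp_sums[32];
  const int tid = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
  float value = (tid < n) ? values[tid] : 0.0f;
  const unsigned mask = 0xffffffffu;
  for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
    value += __shfl_down_sync(mask, value, offset);
  }
  const int lane = threadIdx.x % warpSize;
  const int warp_id = threadIdx.x / warpSize;
  if (lane == 0) warp_sums[warp_id] = value;
  __syncthreads();
  if (warp_id == 0) {
    const int num_warps = blockDim.x / warpSize;
    value = (lane < num_warps) ? warp_sums[lane] : 0.0f;
    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
      value += __shfl_down_sync(mask, value, offset);
    }
    if (lane == 0) atomicAdd(out, value);
  }
}

int main() {
  const int n = 1 << 20;
  float* values = nullptr;
  float* out = nullptr;
  cudaMallocManaged(&values, n * sizeof(float));
  cudaMallocManaged(&out, sizeof(float));
  for (int i = 0; i < n; ++i) values[i] = 1.0f;
  *out = 0.0f;
  BlockSumKernel<<<(n + 255) / 256, 256>>>(values, n, out);
  cudaDeviceSynchronize();
  std::printf("sum = %f (expected %d)\n", *out, n);
  cudaFree(values);
  cudaFree(out);
  return 0;
}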
other_value : value; + } + } + return value; +} + +// reduce values from an 1-dimensional block (block size must be no greather than 1024) +template +__device__ __forceinline__ T ShuffleReduceMin(T value, T* shared_mem_buffer, const size_t len) { + const uint32_t warpLane = threadIdx.x % warpSize; + const uint32_t warpID = threadIdx.x / warpSize; + const data_size_t warp_len = min(static_cast(warpSize), static_cast(len) - static_cast(warpID * warpSize)); + value = ShuffleReduceMinWarp(value, warp_len); + if (warpLane == 0) { + shared_mem_buffer[warpID] = value; + } + __syncthreads(); + const data_size_t num_warp = static_cast((len + warpSize - 1) / warpSize); + if (warpID == 0) { + value = shared_mem_buffer[warpLane]; + value = ShuffleReduceMinWarp(value, num_warp); + } + return value; +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp index 830ec29f98c8..44af57132105 100644 --- a/include/LightGBM/cuda/cuda_objective_function.hpp +++ b/include/LightGBM/cuda/cuda_objective_function.hpp @@ -17,9 +17,7 @@ namespace LightGBM { class CUDAObjectiveInterface { public: - virtual void ConvertOutputCUDA(const data_size_t /*num_data*/, const double* /*input*/, double* /*output*/) const { - Log::Warning("in naive convert output"); - }; + virtual void ConvertOutputCUDA(const data_size_t /*num_data*/, const double* /*input*/, double* /*output*/) const {} }; } // namespace LightGBM diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index cb76ef445c05..81dd1bc27bde 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -526,12 +526,12 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { } std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const { - if (config_->device_type == std::string("cuda")) { + /*if (config_->device_type == std::string("cuda")) { std::vector tmp_score(num_data * num_class_, 0.0f); CopyFromCUDADeviceToHostOuter(tmp_score.data(), score, static_cast(num_data * num_class_), __FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); return metric->Eval(tmp_score.data(), objective_function_); - } else { + } else*/ { return metric->Eval(score, objective_function_); } } diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 5806664b7389..48685fe450a0 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -175,7 +175,7 @@ __global__ void BitonicArgSortGlobalKernel(const VAL_T* values, INDEX_T* indices __shared__ INDEX_T shared_indices[BITONIC_SORT_NUM_ELEMENTS]; if (thread_index < num_data) { shared_values[thread_index] = values_pointer[thread_index]; - shared_indices[thread_index] = indices_pointer[thread_index]; + shared_indices[thread_index] = static_cast(thread_index + blockIdx.x * blockDim.x); } __syncthreads(); for (int depth = BITONIC_SORT_DEPTH - 1; depth >= 1; --depth) { @@ -195,9 +195,13 @@ __global__ void BitonicArgSortGlobalKernel(const VAL_T* values, INDEX_T* indices const int index_to_compare = thread_index + inner_segment_length_half - offset; const INDEX_T this_index = shared_indices[thread_index]; const INDEX_T other_index = shared_indices[index_to_compare]; - if (index_to_compare < num_data && (shared_values[this_index] > shared_values[other_index]) == ascending) { + const VAL_T this_value = shared_values[thread_index]; + const VAL_T other_value = shared_values[index_to_compare]; + if (index_to_compare < num_data && 
(this_value > other_value) == ascending) { shared_indices[thread_index] = other_index; shared_indices[index_to_compare] = this_index; + shared_values[thread_index] = other_value; + shared_values[index_to_compare] = this_value; } } } @@ -209,10 +213,14 @@ __global__ void BitonicArgSortGlobalKernel(const VAL_T* values, INDEX_T* indices if (inner_segment_index_half % 2 == 0) { const int index_to_compare = thread_index + inner_segment_length_half; const INDEX_T this_index = shared_indices[thread_index]; - const INDEX_T other_index = shared_indices[thread_index]; - if (index_to_compare < num_data && (shared_values[this_index] > shared_values[other_index]) == ascending) { + const INDEX_T other_index = shared_indices[index_to_compare]; + const VAL_T this_value = shared_values[thread_index]; + const VAL_T other_value = shared_values[index_to_compare]; + if (index_to_compare < num_data && (this_value > other_value) == ascending) { shared_indices[thread_index] = other_index; shared_indices[index_to_compare] = this_index; + shared_values[thread_index] = other_value; + shared_values[index_to_compare] = this_value; } } __syncthreads(); @@ -242,11 +250,15 @@ __global__ void BitonicArgSortMergeKernel(const VAL_T* values, INDEX_T* indices, const int half_segment_index = static_cast(threadIdx.x) / half_segment_length; if (half_segment_index % 2 == 0) { const int index_to_compare = static_cast(threadIdx.x) + half_segment_length; - const INDEX_T this_index = shared_indices[thread_index]; + const INDEX_T this_index = shared_indices[threadIdx.x]; const INDEX_T other_index = shared_indices[index_to_compare]; - if (index_to_compare < local_len && ((shared_values[this_index] > shared_values[other_index]) == ascending)) { - shared_indices[thread_index] = other_index; + const VAL_T this_value = shared_values[threadIdx.x]; + const VAL_T other_value = shared_values[index_to_compare]; + if (index_to_compare < local_len && ((this_value > other_value) == ascending)) { + shared_indices[threadIdx.x] = other_index; shared_indices[index_to_compare] = this_index; + shared_values[threadIdx.x] = other_value; + shared_values[index_to_compare] = this_value; } } __syncthreads(); @@ -324,8 +336,8 @@ void BitonicArgSortGlobalHelper(const VAL_T* values, INDEX_T* indices, const siz } template <> -void BitonicArgSortGlobal(const double* values, int* indices, const size_t len) { - BitonicArgSortGlobalHelper(values, indices, len); +void BitonicArgSortGlobal(const double* values, data_size_t* indices, const size_t len) { + BitonicArgSortGlobalHelper(values, indices, len); } template @@ -520,4 +532,303 @@ void BitonicArgSortItemsGlobal( SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } +__device__ void PrefixSumZeroOut(data_size_t* values, size_t n) { + unsigned int offset = 1; + unsigned int threadIdx_x = static_cast(threadIdx.x); + const data_size_t last_element = values[n - 1]; + __syncthreads(); + for (int d = (n >> 1); d > 0; d >>= 1) { + if (threadIdx_x < d) { + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + if (values[dst_pos] != 0) { + values[dst_pos] += values[src_pos]; + } + } + offset <<= 1; + __syncthreads(); + } + if (threadIdx_x == 0) { + values[n - 1] = 0; + } + __syncthreads(); + for (int d = 1; d < n; d <<= 1) { + offset >>= 1; + if (threadIdx_x < d) { + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const data_size_t src_val = values[src_pos]; + 
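The bitonic argsort fixes above make every compare-exchange swap the score and the index together, so the two shared arrays stay consistent. A host-side reference for the same sorting network (power-of-two length, descending order as used for ranking; names are mine) sidesteps the issue by permuting only the index array and always comparing through it; it is handy for validating the device kernels on small inputs.

#include <cstdio>
#include <utility>
#include <vector>

// Host reference bitonic argsort (descending) for a power-of-two length.
// Only the index array is permuted; comparisons always go through it,
// so scores and indices can never fall out of sync.
void BitonicArgSortHost(const std::vector<double>& values, std::vector<int>* indices) {
  const int n = static_cast<int>(indices->size());
  for (int k = 2; k <= n; k <<= 1) {
    for (int j = k >> 1; j > 0; j >>= 1) {
      for (int i = 0; i < n; ++i) {
        const int l = i ^ j;
        if (l > i) {
          const bool descending_segment = ((i & k) == 0);
          const double vi = values[(*indices)[i]];
          const double vl = values[(*indices)[l]];
          const bool out_of_order = descending_segment ? (vi < vl) : (vi > vl);
          if (out_of_order) std::swap((*indices)[i], (*indices)[l]);
        }
      }
    }
  }
}

int main() {
  const std::vector<double> values = {0.3, 0.9, 0.1, 0.9, 0.5, 0.2, 0.7, 0.4};
  std::vector<int> indices = {0, 1, 2, 3, 4, 5, 6, 7};
  BitonicArgSortHost(values, &indices);
  for (int idx : indices) std::printf("%d -> %.1f\n", idx, values[idx]);
  return 0;
}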
values[src_pos] = values[dst_pos]; + if (src_val != 0) { + values[dst_pos] += src_val; + } else { + values[dst_pos] = 0; + } + } + __syncthreads(); + } + if (threadIdx.x == 0) { + if (last_element != 0) { + values[n] = values[n - 1] + last_element; + } else { + values[n] = 0; + } + } + __syncthreads(); +} + +template +__global__ void GlobalInclusivePrefixSumKernel(T* values, T* block_buffer, data_size_t num_data) { + __shared__ T shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + shared_buffer[threadIdx.x] = (data_index < num_data ? values[data_index] : 0); + __syncthreads(); + PrefixSum(shared_buffer, blockDim.x); + if (data_index < num_data) { + values[data_index] = shared_buffer[threadIdx.x + 1]; + } + if (threadIdx.x == 0) { + block_buffer[blockIdx.x] = shared_buffer[blockDim.x]; + } +} + +template +__global__ void GlobalInclusivePrefixSumReduceBlockKernel(T* block_buffer, data_size_t num_blocks) { + __shared__ T shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; + T thread_sum = 0; + const data_size_t num_blocks_per_thread = (num_blocks + static_cast(blockDim.x) - 1) / static_cast(blockDim.x); + const data_size_t thread_start_block_index = static_cast(threadIdx.x) * num_blocks_per_thread; + const data_size_t thread_end_block_index = min(thread_start_block_index + num_blocks_per_thread, num_blocks); + for (data_size_t block_index = thread_start_block_index; block_index < thread_end_block_index; ++block_index) { + thread_sum += block_buffer[block_index]; + } + shared_buffer[threadIdx.x] = thread_sum; + __syncthreads(); + PrefixSum(shared_buffer, blockDim.x); + const T thread_sum_base = shared_buffer[threadIdx.x]; + for (data_size_t block_index = thread_start_block_index; block_index < thread_end_block_index; ++block_index) { + block_buffer[block_index] += thread_sum_base; + } +} + +__global__ void GlobalInclusivePrefixSumReduceBlockZeroOutKernel(data_size_t* block_buffer, data_size_t num_blocks) { + __shared__ data_size_t shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; + data_size_t thread_sum = 0; + const data_size_t num_blocks_per_thread = (num_blocks + static_cast(blockDim.x) - 1) / static_cast(blockDim.x); + const data_size_t thread_start_block_index = static_cast(threadIdx.x) * num_blocks_per_thread; + const data_size_t thread_end_block_index = min(thread_start_block_index + num_blocks_per_thread, num_blocks); + for (data_size_t block_index = thread_start_block_index; block_index < thread_end_block_index; ++block_index) { + const data_size_t block_buffer_value = block_buffer[block_index]; + if (block_buffer_value != 0) { + thread_sum += block_buffer[block_index]; + } else { + thread_sum = 0; + } + } + shared_buffer[threadIdx.x] = thread_sum; + __syncthreads(); + PrefixSumZeroOut(shared_buffer, blockDim.x); + data_size_t thread_sum_base = shared_buffer[threadIdx.x]; + for (data_size_t block_index = thread_start_block_index; block_index < thread_end_block_index; ++block_index) { + if (block_buffer[block_index] != 0) { + block_buffer[block_index] += thread_sum_base; + } else { + thread_sum_base = 0; + } + } +} + +template +__global__ void GlobalInclusivePrefixSumAddBlockBaseKernel(const T* block_buffer, T* values, data_size_t num_data) { + const T block_sum_base = block_buffer[blockIdx.x]; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data) { + values[data_index] += block_sum_base; + } +} + +__global__ void 
GlobalInclusivePrefixSumAddBlockBaseGenAUCMarkKernel(const data_size_t* block_buffer, data_size_t* values, const uint16_t* block_first_zero, data_size_t num_data) { + const data_size_t block_sum_base = block_buffer[blockIdx.x]; + const uint16_t first_zero = block_first_zero[blockIdx.x]; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data && threadIdx.x < first_zero) { + values[data_index] += block_sum_base; + } +} + +template +void GlobalInclusivePrefixSum(T* values, T* block_buffer, size_t n) { + const data_size_t num_data = static_cast(n); + const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; + GlobalInclusivePrefixSumKernel<<>>( + values, block_buffer, num_data); + GlobalInclusivePrefixSumReduceBlockKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>( + block_buffer, num_blocks); + GlobalInclusivePrefixSumAddBlockBaseKernel<<>>( + block_buffer, values, num_data); +} + +__global__ void GlobalGenAUCMarkKernel(const double* scores, + const data_size_t* sorted_indices, + data_size_t* mark_buffer, + data_size_t* block_mark_buffer, + uint16_t* block_mark_first_zero, + data_size_t num_data) { + __shared__ data_size_t shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; + __shared__ uint16_t shuffle_reduce_shared_buffer[32]; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data) { + if (data_index > 1) { + shared_buffer[threadIdx.x] = (scores[sorted_indices[data_index]] == scores[sorted_indices[data_index - 1]]); + } else { + shared_buffer[threadIdx.x] = 0; + } + } else { + shared_buffer[threadIdx.x] = 0; + } + __syncthreads(); + PrefixSumZeroOut(shared_buffer, blockDim.x); + uint16_t block_first_zero = (shared_buffer[threadIdx.x] == 0 ? threadIdx.x : blockDim.x); + block_first_zero = ShuffleReduceMin(block_first_zero, shuffle_reduce_shared_buffer, blockDim.x); + if (data_index < num_data) { + mark_buffer[data_index] = shared_buffer[threadIdx.x + 1]; + } + if (threadIdx.x == 0) { + block_mark_buffer[blockIdx.x] = shared_buffer[blockDim.x]; + block_mark_first_zero[blockIdx.x] = block_first_zero; + } +} + +void GloblGenAUCMark(const double* scores, + const data_size_t* sorted_indices, + data_size_t* mark_buffer, + data_size_t* block_mark_buffer, + uint16_t* block_mark_first_zero, + const data_size_t num_data) { + const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; + GlobalGenAUCMarkKernel<<>>(scores, sorted_indices, mark_buffer, block_mark_buffer, block_mark_first_zero, num_data); + GlobalInclusivePrefixSumReduceBlockZeroOutKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>( + block_mark_buffer, num_blocks); + GlobalInclusivePrefixSumAddBlockBaseGenAUCMarkKernel<<>>( + block_mark_buffer, mark_buffer, block_mark_first_zero, num_data); +} + +template +__global__ void GlobalGenAUCPosSumKernel(const label_t* labels, + const label_t* weights, + const data_size_t* sorted_indices, + double* sum_pos_buffer, + double* block_sum_pos_buffer, + const data_size_t num_data) { + __shared__ double shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const double pos = IS_POS ? + (USE_WEIGHT ? + (data_index < num_data ? static_cast(labels[sorted_indices[data_index]] > 0) * weights[sorted_indices[data_index]] : 0.0f) : + (data_index < num_data ? 
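GlobalInclusivePrefixSum above is assembled from three launches: an inclusive scan inside each block, a scan over the per-block totals, and a pass that adds each block's base offset back in. The host sketch below walks through that decomposition with plain loops; the block size and all names are illustrative, not the device implementation itself.

#include <algorithm>
#include <cstdio>
#include <vector>

// Host sketch of the three-phase global inclusive prefix sum:
// 1) inclusive scan inside each fixed-size block,
// 2) inclusive scan over the per-block totals,
// 3) add the preceding blocks' total to every element of each block.
void GlobalInclusivePrefixSumHost(std::vector<int>* values, int block_size) {
  const int n = static_cast<int>(values->size());
  const int num_blocks = (n + block_size - 1) / block_size;
  std::vector<int> block_totals(num_blocks, 0);
  for (int b = 0; b < num_blocks; ++b) {  // phase 1: one CUDA block each in the kernel version
    int running = 0;
    for (int i = b * block_size; i < std::min(n, (b + 1) * block_size); ++i) {
      running += (*values)[i];
      (*values)[i] = running;
    }
    block_totals[b] = running;
  }
  for (int b = 1; b < num_blocks; ++b) {  // phase 2: single block in the kernel version
    block_totals[b] += block_totals[b - 1];
  }
  for (int b = 1; b < num_blocks; ++b) {  // phase 3: add block base offsets
    for (int i = b * block_size; i < std::min(n, (b + 1) * block_size); ++i) {
      (*values)[i] += block_totals[b - 1];
    }
  }
}

int main() {
  std::vector<int> values(10, 1);
  GlobalInclusivePrefixSumHost(&values, 4);
  for (int v : values) std::printf("%d ", v);  // expected: 1 2 3 ... 10
  std::printf("\n");
  return 0;
}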
static_cast(labels[sorted_indices[data_index]] > 0) : 0.0f)) : + (USE_WEIGHT ? + (data_index < num_data ? static_cast(labels[sorted_indices[data_index]] <= 0) * weights[sorted_indices[data_index]] : 0.0f) : + (data_index < num_data ? static_cast(labels[sorted_indices[data_index]] <= 0) : 0.0f)); + + shared_buffer[threadIdx.x] = pos; + __syncthreads(); + PrefixSum(shared_buffer, blockDim.x); + if (data_index < num_data) { + sum_pos_buffer[data_index] = shared_buffer[threadIdx.x + 1]; + } + if (threadIdx.x == 0) { + block_sum_pos_buffer[blockIdx.x] = shared_buffer[blockDim.x]; + } +} + +template +void GlobalGenAUCPosNegSumInner(const label_t* labels, + const label_t* weights, + const data_size_t* sorted_indices, + double* sum_pos_buffer, + double* block_sum_pos_buffer, + const data_size_t num_data) { + const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; + GlobalGenAUCPosSumKernel<<>>(labels, weights, sorted_indices, sum_pos_buffer, block_sum_pos_buffer, num_data); + GlobalInclusivePrefixSumReduceBlockKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>( + block_sum_pos_buffer, num_blocks); + GlobalInclusivePrefixSumAddBlockBaseKernel<<>>( + block_sum_pos_buffer, sum_pos_buffer, num_data); +} + +template <> +void GlobalGenAUCPosNegSum(const label_t* labels, + const label_t* weights, + const data_size_t* sorted_indices, + double* sum_pos_buffer, + double* block_sum_pos_buffer, + const data_size_t num_data) { + GlobalGenAUCPosNegSumInner(labels, weights, sorted_indices, sum_pos_buffer, block_sum_pos_buffer, num_data); +} + +template <> +void GlobalGenAUCPosNegSum(const label_t* labels, + const label_t* weights, + const data_size_t* sorted_indices, + double* sum_pos_buffer, + double* block_sum_pos_buffer, + const data_size_t num_data) { + GlobalGenAUCPosNegSumInner(labels, weights, sorted_indices, sum_pos_buffer, block_sum_pos_buffer, num_data); +} + +template <> +void GlobalGenAUCPosNegSum(const label_t* labels, + const label_t* weights, + const data_size_t* sorted_indices, + double* sum_pos_buffer, + double* block_sum_pos_buffer, + const data_size_t num_data) { + GlobalGenAUCPosNegSumInner(labels, weights, sorted_indices, sum_pos_buffer, block_sum_pos_buffer, num_data); +} + +__global__ void GlobalCalcAUCKernel( + const double* sum_pos_buffer, + const data_size_t* mark_buffer, + const data_size_t num_data, + double* block_buffer) { + __shared__ double shared_buffer[32]; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + double area = 0.0f; + if (data_index < num_data) { + if (data_index == num_data - 1 || mark_buffer[data_index + 1] == 0) { + const data_size_t prev_data_index = data_index - mark_buffer[data_index] - 1; + const double prev_sum_pos = (prev_data_index < 0 ? 
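GlobalGenAUCPosNegSumInner and its explicit instantiations use bool template parameters (USE_WEIGHT, IS_POS) so that the weighted and unweighted paths are fixed per kernel instantiation rather than branched per element. A minimal sketch of the same dispatch idiom, with a toy kernel and illustrative names only:

#include <cstdio>
#include <cuda_runtime.h>

// Compile-time dispatch on a bool template parameter, in the same spirit as the
// USE_WEIGHT / IS_POS kernels above: the branch is fixed per instantiation.
template <bool USE_WEIGHT>
__global__ void WeightedCountKernel(const float* weights, int n, float* out) {
  const int i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
  if (i < n) {
    atomicAdd(out, USE_WEIGHT ? weights[i] : 1.0f);
  }
}

// Host-side dispatch mirrors the nullptr check used by the metric/objective launchers.
void LaunchWeightedCount(const float* weights, int n, float* out) {
  const int num_blocks = (n + 255) / 256;
  if (weights == nullptr) {
    WeightedCountKernel<false><<<num_blocks, 256>>>(weights, n, out);
  } else {
    WeightedCountKernel<true><<<num_blocks, 256>>>(weights, n, out);
  }
}

int main() {
  const int n = 1000;
  float* weights = nullptr;
  float* out = nullptr;
  cudaMallocManaged(&weights, n * sizeof(float));
  cudaMallocManaged(&out, sizeof(float));
  for (int i = 0; i < n; ++i) weights[i] = 0.5f;
  *out = 0.0f;
  LaunchWeightedCount(nullptr, n, out);   // unweighted path: expect 1000
  cudaDeviceSynchronize();
  std::printf("unweighted count = %f\n", *out);
  *out = 0.0f;
  LaunchWeightedCount(weights, n, out);   // weighted path: expect 500
  cudaDeviceSynchronize();
  std::printf("weighted sum = %f\n", *out);
  cudaFree(weights);
  cudaFree(out);
  return 0;
}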
0.0f : sum_pos_buffer[prev_data_index]); + const double cur_pos = sum_pos_buffer[data_index] - prev_sum_pos; + const double cur_neg = static_cast(data_index - prev_data_index) - cur_pos; + area = cur_neg * (cur_pos * 0.5f + prev_sum_pos); + } + } + area = ShuffleReduceSum(area, shared_buffer, blockDim.x); + if (threadIdx.x == 0) { + block_buffer[blockIdx.x] = area; + } +} + +template +__global__ void BlockReduceSum(T* block_buffer, const data_size_t num_blocks) { + __shared__ T shared_buffer[32]; + T thread_sum = 0; + for (data_size_t block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { + thread_sum += block_buffer[block_index]; + } + thread_sum = ShuffleReduceSum(thread_sum, shared_buffer, blockDim.x); + if (threadIdx.x == 0) { + block_buffer[0] = thread_sum; + } +} + +void GlobalCalcAUC(const double* sum_pos_buffer, + const data_size_t* mark_buffer, + const data_size_t num_data, + double* block_buffer) { + const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; + GlobalCalcAUCKernel<<>>(sum_pos_buffer, mark_buffer, num_data, block_buffer); + BlockReduceSum<<>>(block_buffer, num_blocks); +} + } // namespace LightGBM diff --git a/src/metric/binary_metric.hpp b/src/metric/binary_metric.hpp index f0e1c886bb1c..b4df28b781ae 100644 --- a/src/metric/binary_metric.hpp +++ b/src/metric/binary_metric.hpp @@ -96,7 +96,7 @@ class BinaryMetric: public Metric { return std::vector(1, loss); } - private: + protected: /*! \brief Number of data */ data_size_t num_data_; /*! \brief Pointer of label */ @@ -250,7 +250,7 @@ class AUCMetric: public Metric { return std::vector(1, auc); } - private: + protected: /*! \brief Number of data */ data_size_t num_data_; /*! \brief Pointer of label */ diff --git a/src/metric/cuda/cuda_binary_metric.cpp b/src/metric/cuda/cuda_binary_metric.cpp new file mode 100644 index 000000000000..0f629744420b --- /dev/null +++ b/src/metric/cuda/cuda_binary_metric.cpp @@ -0,0 +1,63 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#include "cuda_binary_metric.hpp" + +namespace LightGBM { + +template +CUDABinaryMetric::CUDABinaryMetric(const Config& config): BinaryMetric(config) {} + +template +CUDABinaryMetric::~CUDABinaryMetric() {} + +template +void CUDABinaryMetric::Init(const Metadata& metadata, data_size_t num_data) { + BinaryMetric::Init(metadata, num_data); + + cuda_label_ = metadata.cuda_metadata()->cuda_label(); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); + + const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_BINARY_METRIC - 1) / EVAL_BLOCK_SIZE_BINARY_METRIC; + AllocateCUDAMemoryOuter(&cuda_sum_loss_buffer_, static_cast(num_blocks), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_score_convert_buffer_, static_cast(num_data), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_sum_loss_, 1, __FILE__, __LINE__); +} + +template +std::vector CUDABinaryMetric::Eval(const double* score, const ObjectiveFunction* objective) const { + double sum_loss = 0.0f; + objective->GetCUDAConvertOutputFunc()(this->num_data_, score, cuda_score_convert_buffer_); + LaunchEvalKernel(cuda_score_convert_buffer_); + CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); + return std::vector(1, sum_loss / this->sum_weights_); +} + +CUDABinaryLoglossMetric::CUDABinaryLoglossMetric(const Config& config): CUDABinaryMetric(config) {} + +CUDABinaryErrorMetric::CUDABinaryErrorMetric(const Config& config) : CUDABinaryMetric(config) {} + +CUDAAUCMetric::CUDAAUCMetric(const Config& config): AUCMetric(config) {} + +CUDAAUCMetric::~CUDAAUCMetric() {} + +void CUDAAUCMetric::Init(const Metadata& metadata, data_size_t num_data) { + AUCMetric::Init(metadata, num_data); + AllocateCUDAMemoryOuter(&cuda_indices_buffer_, static_cast(num_data), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_sum_pos_buffer_, static_cast(num_data), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_threshold_mark_, static_cast(num_data), __FILE__, __LINE__); + const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_BINARY_METRIC - 1) / EVAL_BLOCK_SIZE_BINARY_METRIC; + AllocateCUDAMemoryOuter(&cuda_block_sum_pos_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_block_threshold_mark_buffer_, static_cast(num_blocks), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_block_mark_first_zero_, static_cast(num_blocks), __FILE__, __LINE__); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); + cuda_label_ = metadata.cuda_metadata()->cuda_label(); +} + +std::vector CUDAAUCMetric::Eval(const double* score, const ObjectiveFunction*) const { + LaunchEvalKernel(score); +} + +} // namespace LightGBM diff --git a/src/metric/cuda/cuda_binary_metric.cu b/src/metric/cuda/cuda_binary_metric.cu new file mode 100644 index 000000000000..051864edf375 --- /dev/null +++ b/src/metric/cuda/cuda_binary_metric.cu @@ -0,0 +1,88 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#include +#include "cuda_binary_metric.hpp" + +#include + +namespace LightGBM { + +template +__global__ void EvalKernel_BinaryPointWiseLoss(const double* score, + const label_t* label, + const label_t* weights, + const data_size_t num_data, + const double sum_weight, + double* cuda_sum_loss_buffer) { + // assert that warpSize == 32 and maximum number of threads per block is 1024 + __shared__ double shared_buffer[32]; + const int data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const double pointwise_loss = data_index < num_data ? + (USE_WEIGHT ? CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score[data_index]) * weights[data_index] : + CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score[data_index])) : + 0.0f; + const double loss = ShuffleReduceSum(pointwise_loss, shared_buffer, blockDim.x); + if (threadIdx.x == 0) { + cuda_sum_loss_buffer[blockIdx.x] = loss; + } +} + +__global__ void ReduceLossKernel(const double* cuda_sum_loss_buffer, const data_size_t num_blocks, double* out_loss) { + __shared__ double shared_buffer[32]; + double thread_sum_loss = 0.0f; + for (int block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { + thread_sum_loss += cuda_sum_loss_buffer[block_index]; + } + const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, static_cast(num_blocks)); + if (threadIdx.x == 0) { + *out_loss = sum_loss; + } +} + +template +void CUDABinaryMetric::LaunchEvalKernelInner(const double* score) const { + const data_size_t num_blocks = (BinaryMetric::num_data_ + EVAL_BLOCK_SIZE_BINARY_METRIC - 1) / EVAL_BLOCK_SIZE_BINARY_METRIC; + if (cuda_weights_ == nullptr) { + EvalKernel_BinaryPointWiseLoss<<>>( + score, cuda_label_, cuda_weights_, + this->num_data_, + this->sum_weights_, + cuda_sum_loss_buffer_); + } else { + EvalKernel_BinaryPointWiseLoss<<>>( + score, cuda_label_, cuda_weights_, + this->num_data_, + this->sum_weights_, + cuda_sum_loss_buffer_); + } + ReduceLossKernel<<<1, EVAL_BLOCK_SIZE_BINARY_METRIC>>>(cuda_sum_loss_buffer_, num_blocks, cuda_sum_loss_); +} + +template <> +void CUDABinaryMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +template <> +void CUDABinaryMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +void CUDAAUCMetric::LaunchEvalKernel(const double* score) const { + BitonicArgSortGlobal(score, cuda_indices_buffer_, static_cast(num_data_)); + if (cuda_weights_ == nullptr) { + GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_pos_buffer_, cuda_block_sum_pos_buffer_, num_data_); + } else { + GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_pos_buffer_, cuda_block_sum_pos_buffer_, num_data_); + Log::Fatal("CUDA AUC with weights is not supported."); + } + GloblGenAUCMark(score, cuda_indices_buffer_, cuda_threshold_mark_, cuda_block_threshold_mark_buffer_, cuda_block_mark_first_zero_, num_data_); + GlobalCalcAUC(cuda_sum_pos_buffer_, cuda_threshold_mark_, num_data_, cuda_block_sum_pos_buffer_); + double total_area = 0.0f; + CopyFromCUDADeviceToHostOuter(&total_area, cuda_block_sum_pos_buffer_, 1, __FILE__, __LINE__); +} + +} // namespace LightGBM diff --git a/src/metric/cuda/cuda_binary_metric.hpp b/src/metric/cuda/cuda_binary_metric.hpp new file mode 100644 index 000000000000..e2d0f86b3e1c --- /dev/null +++ b/src/metric/cuda/cuda_binary_metric.hpp @@ -0,0 +1,123 @@ +/*! 
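CUDAAUCMetric::LaunchEvalKernel above assembles the AUC from four device passes: a global argsort of the scores, a prefix sum of positives in sorted order, a tie-group mark per item, and a final pass that accumulates one trapezoid per score threshold. A compact host-side reference with the same structure (unweighted, illustrative names), useful for validating the device result on small inputs:

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// Host reference for the device AUC pipeline: sort by score (descending),
// walk the sorted list, and add one trapezoid per distinct score threshold.
double ReferenceAUC(const std::vector<double>& scores, const std::vector<int>& labels) {
  const int n = static_cast<int>(scores.size());
  std::vector<int> order(n);
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(),
            [&scores](int a, int b) { return scores[a] > scores[b]; });
  double area = 0.0;
  double pos_so_far = 0.0, neg_so_far = 0.0;      // counts before the current tie group
  double pos_in_group = 0.0, neg_in_group = 0.0;  // counts inside the current tie group
  for (int i = 0; i < n; ++i) {
    const bool is_pos = labels[order[i]] > 0;
    pos_in_group += is_pos ? 1.0 : 0.0;
    neg_in_group += is_pos ? 0.0 : 1.0;
    const bool group_ends = (i == n - 1) || (scores[order[i]] != scores[order[i + 1]]);
    if (group_ends) {
      // same form as the kernel: cur_neg * (cur_pos * 0.5 + prev_sum_pos)
      area += neg_in_group * (pos_so_far + 0.5 * pos_in_group);
      pos_so_far += pos_in_group;
      neg_so_far += neg_in_group;
      pos_in_group = neg_in_group = 0.0;
    }
  }
  if (pos_so_far <= 0.0 || neg_so_far <= 0.0) return 1.0;  // single-class input, degenerate
  return area / (pos_so_far * neg_so_far);
}

int main() {
  const std::vector<double> scores = {0.9, 0.8, 0.8, 0.4, 0.3, 0.1};
  const std::vector<int> labels = {1, 1, 0, 1, 0, 0};
  // 7.5 correctly ordered pairs (ties counted as 0.5) out of 9, so roughly 0.8333
  std::printf("AUC = %f\n", ReferenceAUC(scores, labels));
  return 0;
}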
+ * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ +#ifndef LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_ +#define LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_ + +#include "cuda_metric.hpp" +#include "../binary_metric.hpp" + +#define EVAL_BLOCK_SIZE_BINARY_METRIC (1024) + +namespace LightGBM { + +template +class CUDABinaryMetric : public CUDAMetricInterface, public BinaryMetric { + public: + explicit CUDABinaryMetric(const Config& config); + + ~CUDABinaryMetric(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + std::vector Eval(const double* score, const ObjectiveFunction* objective) const override; + + protected: + void LaunchEvalKernel(const double* score) const; + + void LaunchEvalKernelInner(const double* score) const; + + const label_t* cuda_label_; + const label_t* cuda_weights_; + double* cuda_sum_loss_buffer_; + double* cuda_score_convert_buffer_; + double* cuda_sum_loss_; +}; + +class CUDABinaryLoglossMetric : public CUDABinaryMetric { + public: + explicit CUDABinaryLoglossMetric(const Config& config); + + inline static double LossOnPoint(label_t label, double prob) { + if (label <= 0) { + if (1.0f - prob > kEpsilon) { + return -std::log(1.0f - prob); + } + } else { + if (prob > kEpsilon) { + return -std::log(prob); + } + } + return -std::log(kEpsilon); + } + + __device__ inline static double LossOnPointCUDA(label_t label, double prob) { + if (label <= 0) { + if (1.0f - prob > kEpsilon) { + return -log(1.0f - prob); + } + } else { + if (prob > kEpsilon) { + return -log(prob); + } + } + return -log(kEpsilon); + } + + inline static const char* Name() { + return "binary_logloss"; + } +}; + +class CUDABinaryErrorMetric: public CUDABinaryMetric { + public: + explicit CUDABinaryErrorMetric(const Config& config); + + inline static double LossOnPoint(label_t label, double prob) { + if (prob <= 0.5f) { + return label > 0; + } else { + return label <= 0; + } + } + + __device__ inline static double LossOnPointCUDA(label_t label, double prob) { + if (prob <= 0.5f) { + return label > 0; + } else { + return label <= 0; + } + } + + inline static const char* Name() { + return "binary_error"; + } +}; + +class CUDAAUCMetric : public AUCMetric { + public: + CUDAAUCMetric(const Config& config); + + ~CUDAAUCMetric(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + std::vector Eval(const double* score, const ObjectiveFunction*) const override; + + private: + void LaunchEvalKernel(const double* score) const; + + data_size_t* cuda_indices_buffer_; + double* cuda_sum_pos_buffer_; + double* cuda_block_sum_pos_buffer_; + data_size_t* cuda_threshold_mark_; + data_size_t* cuda_block_threshold_mark_buffer_; + uint16_t* cuda_block_mark_first_zero_; + const label_t* cuda_label_; + const label_t* cuda_weights_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_ diff --git a/src/metric/cuda/cuda_metric.hpp b/src/metric/cuda/cuda_metric.hpp new file mode 100644 index 000000000000..56fb79d572fa --- /dev/null +++ b/src/metric/cuda/cuda_metric.hpp @@ -0,0 +1,19 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
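The value the binary metrics report is the weighted mean of the pointwise losses defined above: the objective first converts raw scores into probabilities (ConvertOutputCUDA), then Eval divides the weighted loss sum by the weight sum. A tiny host check of that contract; the sigmoid conversion with coefficient 1 and the epsilon value are stated assumptions, not taken from the patch.

#include <cmath>
#include <cstdio>
#include <vector>

const double kEps = 1e-15;  // stand-in for LightGBM's kEpsilon

// Pointwise binary log loss with the same epsilon clamping as LossOnPoint above.
double BinaryLoglossOnPoint(int label, double prob) {
  if (label <= 0) {
    return (1.0 - prob > kEps) ? -std::log(1.0 - prob) : -std::log(kEps);
  }
  return (prob > kEps) ? -std::log(prob) : -std::log(kEps);
}

int main() {
  // raw scores -> probabilities (the convert-output step), then the weighted mean loss
  const std::vector<double> raw_scores = {2.0, -1.0, 0.5, -3.0};
  const std::vector<int> labels = {1, 0, 0, 0};
  const std::vector<double> weights = {1.0, 1.0, 2.0, 0.5};
  double sum_loss = 0.0, sum_weights = 0.0;
  for (size_t i = 0; i < raw_scores.size(); ++i) {
    const double prob = 1.0 / (1.0 + std::exp(-raw_scores[i]));  // sigmoid coefficient 1 assumed
    sum_loss += BinaryLoglossOnPoint(labels[i], prob) * weights[i];
    sum_weights += weights[i];
  }
  std::printf("binary_logloss = %f\n", sum_loss / sum_weights);
  return 0;
}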
+ */ +#ifndef LIGHTGBM_METRIC_CUDA_CUDA_METRIC_HPP_ +#define LIGHTGBM_METRIC_CUDA_CUDA_METRIC_HPP_ + +#include +#include + +namespace LightGBM { + +class CUDAMetricInterface { + +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_METRIC_CUDA_CUDA_METRIC_HPP_ diff --git a/src/metric/metric.cpp b/src/metric/metric.cpp index a7104c2a7880..9a3b41547209 100644 --- a/src/metric/metric.cpp +++ b/src/metric/metric.cpp @@ -11,55 +11,67 @@ #include "regression_metric.hpp" #include "xentropy_metric.hpp" +#include "cuda/cuda_binary_metric.hpp" + namespace LightGBM { Metric* Metric::CreateMetric(const std::string& type, const Config& config) { - if (type == std::string("l2")) { - return new L2Metric(config); - } else if (type == std::string("rmse")) { - return new RMSEMetric(config); - } else if (type == std::string("l1")) { - return new L1Metric(config); - } else if (type == std::string("quantile")) { - return new QuantileMetric(config); - } else if (type == std::string("huber")) { - return new HuberLossMetric(config); - } else if (type == std::string("fair")) { - return new FairLossMetric(config); - } else if (type == std::string("poisson")) { - return new PoissonMetric(config); - } else if (type == std::string("binary_logloss")) { - return new BinaryLoglossMetric(config); - } else if (type == std::string("binary_error")) { - return new BinaryErrorMetric(config); - } else if (type == std::string("auc")) { - return new AUCMetric(config); - } else if (type == std::string("average_precision")) { - return new AveragePrecisionMetric(config); - } else if (type == std::string("auc_mu")) { - return new AucMuMetric(config); - } else if (type == std::string("ndcg")) { - return new NDCGMetric(config); - } else if (type == std::string("map")) { - return new MapMetric(config); - } else if (type == std::string("multi_logloss")) { - return new MultiSoftmaxLoglossMetric(config); - } else if (type == std::string("multi_error")) { - return new MultiErrorMetric(config); - } else if (type == std::string("cross_entropy")) { - return new CrossEntropyMetric(config); - } else if (type == std::string("cross_entropy_lambda")) { - return new CrossEntropyLambdaMetric(config); - } else if (type == std::string("kullback_leibler")) { - return new KullbackLeiblerDivergence(config); - } else if (type == std::string("mape")) { - return new MAPEMetric(config); - } else if (type == std::string("gamma")) { - return new GammaMetric(config); - } else if (type == std::string("gamma_deviance")) { - return new GammaDevianceMetric(config); - } else if (type == std::string("tweedie")) { - return new TweedieMetric(config); + if (config.device_type == std::string("cuda")) { + if (type == std::string("binary_logloss")) { + return new CUDABinaryLoglossMetric(config); + } else if (type == std::string("binary_error")) { + return new CUDABinaryErrorMetric(config); + } else if (type == std::string("auc")) { + return new CUDAAUCMetric(config); + } + } else { + if (type == std::string("l2")) { + return new L2Metric(config); + } else if (type == std::string("rmse")) { + return new RMSEMetric(config); + } else if (type == std::string("l1")) { + return new L1Metric(config); + } else if (type == std::string("quantile")) { + return new QuantileMetric(config); + } else if (type == std::string("huber")) { + return new HuberLossMetric(config); + } else if (type == std::string("fair")) { + return new FairLossMetric(config); + } else if (type == std::string("poisson")) { + return new PoissonMetric(config); + } else if (type == std::string("binary_logloss")) { + 
return new BinaryLoglossMetric(config); + } else if (type == std::string("binary_error")) { + return new BinaryErrorMetric(config); + } else if (type == std::string("auc")) { + return new AUCMetric(config); + } else if (type == std::string("average_precision")) { + return new AveragePrecisionMetric(config); + } else if (type == std::string("auc_mu")) { + return new AucMuMetric(config); + } else if (type == std::string("ndcg")) { + return new NDCGMetric(config); + } else if (type == std::string("map")) { + return new MapMetric(config); + } else if (type == std::string("multi_logloss")) { + return new MultiSoftmaxLoglossMetric(config); + } else if (type == std::string("multi_error")) { + return new MultiErrorMetric(config); + } else if (type == std::string("cross_entropy")) { + return new CrossEntropyMetric(config); + } else if (type == std::string("cross_entropy_lambda")) { + return new CrossEntropyLambdaMetric(config); + } else if (type == std::string("kullback_leibler")) { + return new KullbackLeiblerDivergence(config); + } else if (type == std::string("mape")) { + return new MAPEMetric(config); + } else if (type == std::string("gamma")) { + return new GammaMetric(config); + } else if (type == std::string("gamma_deviance")) { + return new GammaDevianceMetric(config); + } else if (type == std::string("tweedie")) { + return new TweedieMetric(config); + } } return nullptr; } diff --git a/src/objective/cuda/cuda_binary_objective.cu b/src/objective/cuda/cuda_binary_objective.cu index 4b25fc09e8e8..590e20ea8c00 100644 --- a/src/objective/cuda/cuda_binary_objective.cu +++ b/src/objective/cuda/cuda_binary_objective.cu @@ -58,10 +58,10 @@ __global__ void GetGradientsKernel_BinaryLogloss(const double* cuda_scores, cons const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); if (data_index < num_data) { const label_t cuda_label = static_cast(cuda_labels[data_index]); - const int label = IS_OVA ? (cuda_label > 0 ? 1 : -1) : (cuda_label == ova_class_id ? 1 : -1); - const double response = -label * sigmoid / (1.0f + std::exp(label * sigmoid * cuda_scores[data_index])); + const int label = IS_OVA ? (cuda_label == ova_class_id ? 1 : -1) : (cuda_label > 0 ? 
1 : -1); + const double response = -label * sigmoid / (1.0f + exp(label * sigmoid * cuda_scores[data_index])); const double abs_response = fabs(response); - if (USE_WEIGHT) { + if (!USE_WEIGHT) { if (USE_LABEL_WEIGHT) { const double label_weight = cuda_label_weights[label]; cuda_out_gradients[data_index] = static_cast(response * label_weight); @@ -128,6 +128,8 @@ void CUDABinaryLogloss::LaunchGetGradientsKernel(const double* scores, score_t* } } +#undef GetGradientsKernel_BinaryLogloss_ARGS + __global__ void ConvertOutputCUDAKernel_BinaryLogloss(const double sigmoid, const data_size_t num_data, const double* input, double* output) { const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); if (data_index < num_data) { diff --git a/src/objective/cuda/cuda_multiclass_objective.cpp b/src/objective/cuda/cuda_multiclass_objective.cpp index 930fd7dd07f4..9d68ad051736 100644 --- a/src/objective/cuda/cuda_multiclass_objective.cpp +++ b/src/objective/cuda/cuda_multiclass_objective.cpp @@ -24,23 +24,6 @@ void CUDAMulticlassSoftmax::Init(const Metadata& metadata, data_size_t num_data) void CUDAMulticlassSoftmax::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { LaunchGetGradientsKernel(score, gradients, hessians); - /*std::vector cpu_gradients(100, 0.0f); - std::vector cpu_hessians(100, 0.0f); - CopyFromCUDADeviceToHostOuter(cpu_gradients.data(), gradients, 100, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(cpu_hessians.data(), hessians, 100, __FILE__, __LINE__); - for (size_t i = 0; i < 100; ++i) { - Log::Warning("class 0 data %d gradient = %f, hessians = %f", i, cpu_gradients[i], cpu_hessians[i]); - } - CopyFromCUDADeviceToHostOuter(cpu_gradients.data(), gradients + num_data_ - 100, 100, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(cpu_hessians.data(), hessians + num_data_ - 100, 100, __FILE__, __LINE__); - for (size_t i = 0; i < 100; ++i) { - Log::Warning("class 0 data %d gradient = %f, hessians = %f", i + num_data_ - 100, cpu_gradients[i], cpu_hessians[i]); - } - CopyFromCUDADeviceToHostOuter(cpu_gradients.data(), gradients + num_data_, 100, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(cpu_hessians.data(), hessians + num_data_, 100, __FILE__, __LINE__); - for (size_t i = 0; i < 100; ++i) { - Log::Warning("class 1 data %d gradient = %f, hessians = %f", i, cpu_gradients[i], cpu_hessians[i]); - }*/ SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } diff --git a/src/objective/cuda/cuda_rank_objective.cpp b/src/objective/cuda/cuda_rank_objective.cpp index ba4f6c3d69fd..e3c84158f1d2 100644 --- a/src/objective/cuda/cuda_rank_objective.cpp +++ b/src/objective/cuda/cuda_rank_objective.cpp @@ -13,9 +13,10 @@ namespace LightGBM { CUDALambdarankNDCG::CUDALambdarankNDCG(const Config& config): LambdarankNDCG(config) {} +CUDALambdarankNDCG::CUDALambdarankNDCG(const std::vector& strs): LambdarankNDCG(strs) {} + void CUDALambdarankNDCG::Init(const Metadata& metadata, data_size_t num_data) { const int num_threads = OMP_NUM_THREADS(); - TestCUDABitonicSortForQueryItems(); LambdarankNDCG::Init(metadata, num_data); std::vector thread_max_num_items_in_query(num_threads); @@ -56,17 +57,43 @@ void CUDALambdarankNDCG::GetGradients(const double* score, score_t* gradients, s LaunchGetGradientsKernel(score, gradients, hessians); } -CUDARankXENDCG::CUDARankXENDCG(const Config& config): RankXENDCG(config) {} +CUDARankXENDCG::CUDARankXENDCG(const Config& config): CUDALambdarankNDCG(config) {} -CUDARankXENDCG::CUDARankXENDCG(const std::vector& strs): 
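The binary objective fix above changes how labels map to the internal {-1, +1} convention and keeps the gradient of the sigmoid cross-entropy loss, response = -label * sigmoid / (1 + exp(label * sigmoid * score)). The sketch below verifies that gradient, and the matching hessian |g| * (sigmoid - |g|), against finite differences of log(1 + exp(-label * sigmoid * score)); the hessian form is the standard one for this loss and is included here as an assumption since the hunk does not show it.

#include <cmath>
#include <cstdio>

// Loss for a single point with label in {-1, +1}: log(1 + exp(-label * sigmoid * score)).
double Loss(int label, double sigmoid, double score) {
  return std::log(1.0 + std::exp(-label * sigmoid * score));
}

int main() {
  const double sigmoid = 2.0;
  const double eps = 1e-5;
  const int labels[] = {-1, 1};
  const double scores[] = {-1.5, 0.0, 0.7};
  for (int label : labels) {
    for (double score : scores) {
      // analytic gradient as in the kernel, plus the standard hessian for this loss
      const double response = -label * sigmoid / (1.0 + std::exp(label * sigmoid * score));
      const double abs_response = std::fabs(response);
      const double hessian = abs_response * (sigmoid - abs_response);
      // central finite differences of the loss
      const double num_grad =
          (Loss(label, sigmoid, score + eps) - Loss(label, sigmoid, score - eps)) / (2 * eps);
      const double num_hess =
          (Loss(label, sigmoid, score + eps) - 2 * Loss(label, sigmoid, score) + Loss(label, sigmoid, score - eps)) / (eps * eps);
      std::printf("label=%+d score=%+.2f grad %.6f vs %.6f, hess %.6f vs %.6f\n",
                  label, score, response, num_grad, hessian, num_hess);
    }
  }
  return 0;
}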
RankXENDCG(strs) {} +CUDARankXENDCG::CUDARankXENDCG(const std::vector& strs): CUDALambdarankNDCG(strs) {} CUDARankXENDCG::~CUDARankXENDCG() {} void CUDARankXENDCG::Init(const Metadata& metadata, data_size_t num_data) { - RankXENDCG::Init(metadata, num_data); + CUDALambdarankNDCG::Init(metadata, num_data); + for (data_size_t i = 0; i < num_queries_; ++i) { + rands_.emplace_back(seed_ + i); + } + item_rands_.resize(num_data, 0.0f); + AllocateCUDAMemoryOuter(&cuda_item_rands_, static_cast(num_data), __FILE__, __LINE__); + //if (max_items_in_query_aligned_ >= 2048) { + AllocateCUDAMemoryOuter(&cuda_params_buffer_, static_cast(num_data_), __FILE__, __LINE__); + //} +} + +void CUDARankXENDCG::GenerateItemRands() const { + const int num_threads = OMP_NUM_THREADS(); + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (data_size_t i = 0; i < num_queries_; ++i) { + OMP_LOOP_EX_BEGIN(); + const data_size_t start = query_boundaries_[i]; + const data_size_t end = query_boundaries_[i + 1]; + for (data_size_t j = start; j < end; ++j) { + item_rands_[j] = rands_[i].NextFloat(); + } + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); } void CUDARankXENDCG::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { + GenerateItemRands(); + CopyFromHostToCUDADeviceOuter(cuda_item_rands_, item_rands_.data(), item_rands_.size(), __FILE__, __LINE__); LaunchGetGradientsKernel(score, gradients, hessians); } diff --git a/src/objective/cuda/cuda_rank_objective.cu b/src/objective/cuda/cuda_rank_objective.cu index 1b8724f41d81..a1eeb1bd1a61 100644 --- a/src/objective/cuda/cuda_rank_objective.cu +++ b/src/objective/cuda/cuda_rank_objective.cu @@ -372,151 +372,292 @@ void CUDALambdarankNDCG::LaunchCalcInverseMaxDCGKernel() { SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } -__global__ void GetGradientsKernel_RankXENDCG() {} - -void CUDARankXENDCG::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const {} +__device__ __forceinline__ double CUDAPhi(const label_t l, double g) { + return pow(2.0f, static_cast(l)) - g; +} -void CUDALambdarankNDCG::TestCUDAQuickSort() const { - const int test_num_data = (1 << 24) + 13; - const int data_range = 1000; - const int num_threads = OMP_NUM_THREADS(); - std::vector rand_integers(test_num_data, 0); - std::vector distribution_prob(data_range, 1.0f / data_range); - std::discrete_distribution dist(distribution_prob.begin(), distribution_prob.end()); - std::vector rand_engines(num_threads); - Threading::For(0, test_num_data, 512, - [&rand_engines, &dist, &rand_integers] (int thread_index, int start, int end) { - rand_engines[thread_index] = std::mt19937(thread_index); - for (int i = start; i < end; ++i) { - rand_integers[i] = dist(rand_engines[thread_index]); - } - }); +template +__global__ void GetGradientsKernel_RankXENDCG_SharedMemory( + const double* cuda_scores, + const label_t* cuda_labels, + const double* cuda_item_rands, + const data_size_t num_data, + const data_size_t num_queries, + const data_size_t* cuda_query_boundaries, + score_t* cuda_out_gradients, + score_t* cuda_out_hessians) { + const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; + const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); + for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { + const data_size_t item_index_start = cuda_query_boundaries[query_index]; + const data_size_t item_index_end = 
cuda_query_boundaries[query_index + 1]; + const data_size_t query_item_count = item_index_end - item_index_start; + score_t* cuda_out_gradients_pointer = cuda_out_gradients + item_index_start; + score_t* cuda_out_hessians_pointer = cuda_out_hessians + item_index_start; + const label_t* cuda_labels_pointer = cuda_labels + item_index_start; + const double* cuda_scores_pointer = cuda_scores + item_index_start; + const double* cuda_item_rands_pointer = cuda_item_rands + item_index_start; + const data_size_t block_reduce_size = query_item_count >= 1024 ? 1024 : query_item_count; + __shared__ double shared_rho[SHARED_MEMORY_SIZE]; + // assert that warpSize == 32 + __shared__ double shared_buffer[32]; + __shared__ double shared_params[SHARED_MEMORY_SIZE]; + __shared__ score_t shared_lambdas[SHARED_MEMORY_SIZE]; + __shared__ double reduce_result; + if (query_item_count <= 1) { + for (data_size_t i = 0; i <= query_item_count; ++i) { + cuda_out_gradients_pointer[i] = 0.0f; + cuda_out_hessians_pointer[i] = 0.0f; + } + __syncthreads(); + } else { + // compute softmax + double thread_reduce_result = kMinScore; + for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { + const double rho = cuda_scores_pointer[i]; + shared_rho[i] = rho; + if (rho > thread_reduce_result) { + thread_reduce_result = rho; + } + } + __syncthreads(); + thread_reduce_result = ShuffleReduceMax(thread_reduce_result, shared_buffer, block_reduce_size); + if (threadIdx.x == 0) { + reduce_result = thread_reduce_result; + if (blockIdx.x == 0) { + printf("reduce max score = %f\n", reduce_result); + } + } + __syncthreads(); + thread_reduce_result = 0.0f; + for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { + const double exp_value = exp(shared_rho[i] - reduce_result); + shared_rho[i] = exp_value; + thread_reduce_result += exp_value; + } + thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); + if (threadIdx.x == 0) { + reduce_result = thread_reduce_result; + } + __syncthreads(); + for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { + shared_rho[i] /= reduce_result; + } + __syncthreads(); - const int smaller_test_num_data = /*(1 << 11) +*/ 170; - std::vector bitonic_sort_integers(rand_integers.begin(), rand_integers.begin() + smaller_test_num_data); - std::vector cuda_bitonic_sort_integers = bitonic_sort_integers; - std::vector host_bitonic_sort_integers = bitonic_sort_integers; - int* cuda_bitonic_sort_integers_pointer = nullptr; - InitCUDAMemoryFromHostMemoryOuter(&cuda_bitonic_sort_integers_pointer, cuda_bitonic_sort_integers.data(), smaller_test_num_data, __FILE__, __LINE__); - auto start_1024 = std::chrono::steady_clock::now(); - BitonicSortGlobal(cuda_bitonic_sort_integers_pointer, smaller_test_num_data); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - auto end_1024 = std::chrono::steady_clock::now(); - auto duration_1024 = static_cast>(end_1024 - start_1024); - Log::Warning("bitonic sort 1024 time = %f", duration_1024.count()); - CopyFromCUDADeviceToHostOuter(cuda_bitonic_sort_integers.data(), cuda_bitonic_sort_integers_pointer, smaller_test_num_data, __FILE__, __LINE__); - start_1024 = std::chrono::steady_clock::now(); - std::sort(host_bitonic_sort_integers.begin(), host_bitonic_sort_integers.end()); - end_1024 = std::chrono::steady_clock::now(); - duration_1024 = static_cast>(end_1024 - start_1024); - Log::Warning("host sort 1024 time = %f", 
duration_1024.count()); - for (int i = 0; i < smaller_test_num_data; ++i) { - if (host_bitonic_sort_integers[i] != cuda_bitonic_sort_integers[i]) { - Log::Warning("error index %d host_bitonic_sort_integers = %d, cuda_bitonic_sort_integers = %d", i, host_bitonic_sort_integers[i], cuda_bitonic_sort_integers[i]); + // compute params + thread_reduce_result = 0.0f; + for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { + const double param_value = CUDAPhi(cuda_labels_pointer[i], cuda_item_rands_pointer[i]); + shared_params[i] = param_value; + thread_reduce_result += param_value; + } + thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); + if (threadIdx.x == 0) { + reduce_result = thread_reduce_result; + reduce_result = 1.0f / max(kEpsilon, reduce_result); + } + __syncthreads(); + const double inv_denominator = reduce_result; + thread_reduce_result = 0.0f; + for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { + const double term = -shared_params[i] * inv_denominator + shared_rho[i]; + shared_lambdas[i] = static_cast(term); + shared_params[i] = term / (1.0f - shared_rho[i]); + thread_reduce_result += shared_params[i]; + } + thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); + if (threadIdx.x == 0) { + reduce_result = thread_reduce_result; + } + __syncthreads(); + const double sum_l1 = reduce_result; + thread_reduce_result = 0.0f; + for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { + const double term = shared_rho[i] * (sum_l1 - shared_params[i]); + shared_lambdas[i] += static_cast(term); + shared_params[i] = term / (1.0f - shared_rho[i]); + thread_reduce_result += shared_params[i]; + } + thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); + if (threadIdx.x == 0) { + reduce_result = thread_reduce_result; + } + __syncthreads(); + const double sum_l2 = reduce_result; + for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { + shared_lambdas[i] += static_cast(shared_rho[i] * (sum_l2 - shared_params[i])); + cuda_out_hessians_pointer[i] = static_cast(shared_rho[i] * (1.0f - shared_rho[i])); + } + for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { + cuda_out_gradients_pointer[i] = shared_lambdas[i]; + } + __syncthreads(); } - } - - std::vector cuda_rand_integers = rand_integers; - std::vector host_rand_integers = rand_integers; - int* cuda_data = nullptr; - InitCUDAMemoryFromHostMemoryOuter(&cuda_data, rand_integers.data(), rand_integers.size(), __FILE__, __LINE__); - auto start = std::chrono::steady_clock::now(); - BitonicSortGlobal(cuda_data, static_cast(test_num_data)); - auto end = std::chrono::steady_clock::now(); - auto duration = static_cast>(end - start); - Log::Warning("cuda sort time = %f", duration.count()); - CopyFromCUDADeviceToHostOuter(cuda_rand_integers.data(), cuda_data, static_cast(test_num_data), __FILE__, __LINE__); - start = std::chrono::steady_clock::now(); - std::sort(host_rand_integers.begin(), host_rand_integers.end()); - end = std::chrono::steady_clock::now(); - duration = static_cast>(end - start); - Log::Warning("cpu sort time = %f", duration.count()); - std::vector parallel_rand_integers = rand_integers; - start = std::chrono::steady_clock::now(); - Common::ParallelSort(parallel_rand_integers.begin(), 
parallel_rand_integers.end(), [](int a, int b) { return a < b; }); - end = std::chrono::steady_clock::now(); - duration = static_cast>(end - start); - Log::Warning("parallel sort time = %f", duration.count()); - for (int i = 0; i < 100; ++i) { - Log::Warning("after sort cuda_rand_integers[%d] = %d", i, cuda_rand_integers[i]); } - #pragma omp parallel for schedule(static) num_threads(num_threads) - for (int i = 0; i < test_num_data; ++i) { - if (cuda_rand_integers[i] != host_rand_integers[i]) { - Log::Warning("index %d cuda_rand_integers = %d, host_rand_integers = %d", i, cuda_rand_integers[i], host_rand_integers[i]); +} + +__global__ void GetGradientsKernel_RankXENDCG_GlobalMemory( + const double* cuda_scores, + const label_t* cuda_labels, + const double* cuda_item_rands, + const data_size_t num_data, + const data_size_t num_queries, + const data_size_t* cuda_query_boundaries, + double* cuda_params_buffer, + score_t* cuda_out_gradients, + score_t* cuda_out_hessians) { + const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; + const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); + for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { + const data_size_t item_index_start = cuda_query_boundaries[query_index]; + const data_size_t item_index_end = cuda_query_boundaries[query_index + 1]; + const data_size_t query_item_count = item_index_end - item_index_start; + score_t* cuda_out_gradients_pointer = cuda_out_gradients + item_index_start; + score_t* cuda_out_hessians_pointer = cuda_out_hessians + item_index_start; + const label_t* cuda_labels_pointer = cuda_labels + item_index_start; + const double* cuda_scores_pointer = cuda_scores + item_index_start; + const double* cuda_item_rands_pointer = cuda_item_rands + item_index_start; + double* cuda_params_buffer_pointer = cuda_params_buffer + item_index_start; + const data_size_t block_reduce_size = query_item_count > 1024 ? 
1024 : query_item_count; + // assert that warpSize == 32, so we use buffer size 1024 / 32 = 32 + __shared__ double shared_buffer[32]; + __shared__ double reduce_result; + if (query_item_count <= 1) { + for (data_size_t i = 0; i <= query_item_count; ++i) { + cuda_out_gradients_pointer[i] = 0.0f; + cuda_out_hessians_pointer[i] = 0.0f; + } + __syncthreads(); + } else { + // compute softmax + double thread_reduce_result = kMinScore; + for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { + const double rho = cuda_scores_pointer[i]; + if (rho > thread_reduce_result) { + thread_reduce_result = rho; + } + } + __syncthreads(); + thread_reduce_result = ShuffleReduceMax(thread_reduce_result, shared_buffer, block_reduce_size); + if (threadIdx.x == 0) { + reduce_result = thread_reduce_result; + } + __syncthreads(); + thread_reduce_result = 0.0f; + for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { + const double exp_value = exp(cuda_scores_pointer[i] - reduce_result); + cuda_out_hessians_pointer[i] = exp_value; + thread_reduce_result += exp_value; + } + thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); + if (threadIdx.x == 0) { + reduce_result = thread_reduce_result; + } + __syncthreads(); + // store probability into hessians + for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { + cuda_out_hessians_pointer[i] /= reduce_result; + } + __syncthreads(); + + // compute params + thread_reduce_result = 0.0f; + for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { + const double param_value = CUDAPhi(cuda_labels_pointer[i], cuda_item_rands_pointer[i]); + cuda_params_buffer_pointer[i] = param_value; + thread_reduce_result += param_value; + } + thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); + if (threadIdx.x == 0) { + reduce_result = thread_reduce_result; + reduce_result = 1.0f / max(kEpsilon, reduce_result); + } + __syncthreads(); + const double inv_denominator = reduce_result; + thread_reduce_result = 0.0f; + for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { + const double term = -cuda_params_buffer_pointer[i] * inv_denominator + cuda_out_hessians_pointer[i]; + cuda_out_gradients_pointer[i] = static_cast(term); + const double param = term / (1.0f - cuda_out_hessians_pointer[i]); + cuda_params_buffer_pointer[i] = param; + thread_reduce_result += param; + } + thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); + if (threadIdx.x == 0) { + reduce_result = thread_reduce_result; + } + __syncthreads(); + const double sum_l1 = reduce_result; + thread_reduce_result = 0.0f; + for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { + const double term = cuda_out_hessians_pointer[i] * (sum_l1 - cuda_params_buffer_pointer[i]); + cuda_out_gradients_pointer[i] += static_cast(term); + const double param = term / (1.0f - cuda_out_hessians_pointer[i]); + cuda_params_buffer_pointer[i] = param; + thread_reduce_result += param; + } + thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); + if (threadIdx.x == 0) { + reduce_result = thread_reduce_result; + } + __syncthreads(); + const double sum_l2 = reduce_result; + for (data_size_t i = static_cast(threadIdx.x); i < 
query_item_count; i += static_cast(blockDim.x)) { + const double prob = cuda_out_hessians_pointer[i]; + cuda_out_gradients_pointer[i] += static_cast(prob * (sum_l2 - cuda_params_buffer_pointer[i])); + cuda_out_hessians_pointer[i] = static_cast(prob * (1.0f - prob)); + } + __syncthreads(); } - CHECK_EQ(cuda_rand_integers[i], host_rand_integers[i]); } - Log::Warning("cuda argsort test pass"); } -void CUDALambdarankNDCG::TestCUDABitonicSortForQueryItems() const { - int num_queries = 1000; - std::vector items_per_query(num_queries + 1, 0); - std::vector item_scores; - const int max_item_per_query = 5000; - std::vector num_item_probs(max_item_per_query, 1.0f / max_item_per_query); - std::discrete_distribution num_item_distribution(num_item_probs.begin(), num_item_probs.end()); - std::uniform_real_distribution score_dist; - const int num_threads = OMP_NUM_THREADS(); - std::vector thread_random_engines(num_threads); - for (int thread_index = 0; thread_index < num_threads; ++thread_index) { - thread_random_engines[thread_index] = std::mt19937(thread_index); - } - int num_total_items = 0; - #pragma omp parallel for schedule(static) num_threads(num_threads) reduction(+:num_total_items) - for (int query_index = 0; query_index < num_queries; ++query_index) { - const int thread_index = omp_get_thread_num(); - items_per_query[query_index + 1] = num_item_distribution(thread_random_engines[thread_index]); - num_total_items += items_per_query[query_index + 1]; - } - for (int query_index = 0; query_index < num_queries; ++query_index) { - items_per_query[query_index + 1] += items_per_query[query_index]; - } - item_scores.resize(num_total_items, 0.0f); - #pragma omp parallel for schedule(static) num_threads(num_threads) - for (int item_index = 0; item_index < num_total_items; ++item_index) { - const int thread_index = omp_get_thread_num(); - item_scores[item_index] = score_dist(thread_random_engines[thread_index]); +void CUDARankXENDCG::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; + if (max_items_in_query_aligned_ <= 1024) { + GetGradientsKernel_RankXENDCG_SharedMemory<1024><<>>( + score, + cuda_labels_, + cuda_item_rands_, + num_data_, + num_queries_, + cuda_query_boundaries_, + gradients, + hessians); + } else if (max_items_in_query_aligned_ <= 2 * 1024) { + GetGradientsKernel_RankXENDCG_SharedMemory<2 * 1024><<>>( + score, + cuda_labels_, + cuda_item_rands_, + num_data_, + num_queries_, + cuda_query_boundaries_, + gradients, + hessians); + } else { + GetGradientsKernel_RankXENDCG_GlobalMemory<<>>( + score, + cuda_labels_, + cuda_item_rands_, + num_data_, + num_queries_, + cuda_query_boundaries_, + cuda_params_buffer_, + gradients, + hessians); } - double* cuda_score = nullptr; - data_size_t* cuda_query_boundaries = nullptr; - data_size_t* cuda_out_indices = nullptr; - InitCUDAMemoryFromHostMemoryOuter(&cuda_score, item_scores.data(), item_scores.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_query_boundaries, items_per_query.data(), items_per_query.size(), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_out_indices, item_scores.size(), __FILE__, __LINE__); - const auto start = std::chrono::steady_clock::now(); - BitonicArgSortItemsGlobal(cuda_score, num_queries, cuda_query_boundaries, cuda_out_indices); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - const auto end = std::chrono::steady_clock::now(); - const std::chrono::duration duration = 
static_cast>(end - start); - Log::Warning("bitonic arg sort items global time = %f", duration.count()); - std::vector sorted_item_indices(item_scores.size()); - CopyFromCUDADeviceToHostOuter(sorted_item_indices.data(), cuda_out_indices, item_scores.size(), __FILE__, __LINE__); - std::vector host_sorted_item_indices(item_scores.size()); PrintLastCUDAErrorOuter(__FILE__, __LINE__); - #pragma omp parallel for schedule(static) num_threads(num_threads) - for (int i = 0; i < num_queries; ++i) { - const int query_start = items_per_query[i]; - const int query_end = items_per_query[i + 1]; - for (int j = query_start; j < query_end; ++j) { - host_sorted_item_indices[j] = j - query_start; - } - std::sort(host_sorted_item_indices.data() + query_start, host_sorted_item_indices.data() + query_end, [&item_scores, query_start] (int a, int b) { - return item_scores[query_start + a] > item_scores[query_start + b]; - }); - } - for (int query_index = 0; query_index < num_queries; ++query_index) { - const int query_start = items_per_query[query_index]; - const int query_end = items_per_query[query_index + 1]; - for (int item_index = query_start; item_index < query_end; ++item_index) { - const double cuda_item_score = item_scores[query_start + sorted_item_indices[item_index]]; - const double host_item_score = item_scores[query_start + host_sorted_item_indices[item_index]]; - if (cuda_item_score != host_item_score) { - Log::Warning("item_index = %d, query_start = %d, cuda_item_score = %f, host_item_score = %f, sorted_item_indices = %d", - item_index, query_start, cuda_item_score, host_item_score, sorted_item_indices[item_index]); - } - } + const int num_show = 1000; + std::vector host_gradients(num_show, 0.0f); + std::vector host_hessians(num_show, 0.0f); + std::vector host_scores(num_show, 0.0f); + CopyFromCUDADeviceToHostOuter(host_gradients.data(), gradients, num_show, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(host_hessians.data(), hessians, num_show, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(host_scores.data(), score, num_show, __FILE__, __LINE__); + for (int i = 0; i < num_show; ++i) { + Log::Warning("host_gradients[%d] = %f, host_hessians[%d] = %f, host_scores[%d] = %f", i, host_gradients[i], i, host_hessians[i], i, host_scores[i]); } } diff --git a/src/objective/cuda/cuda_rank_objective.hpp b/src/objective/cuda/cuda_rank_objective.hpp index 5a9cd2bc2a52..4e71b68f87b4 100644 --- a/src/objective/cuda/cuda_rank_objective.hpp +++ b/src/objective/cuda/cuda_rank_objective.hpp @@ -28,16 +28,12 @@ class CUDALambdarankNDCG : public CUDAObjectiveInterface, public LambdarankNDCG void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; - private: + protected: void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; void LaunchCalcInverseMaxDCGKernel(); - void TestCUDAQuickSort() const; - - void TestCUDABitonicSortForQueryItems() const; - // CUDA memory, held by this object double* cuda_lambdas_; double* cuda_inverse_max_dcgs_; @@ -52,7 +48,7 @@ class CUDALambdarankNDCG : public CUDAObjectiveInterface, public LambdarankNDCG int max_items_in_query_aligned_; }; -class CUDARankXENDCG : public CUDAObjectiveInterface, public RankXENDCG { +class CUDARankXENDCG : public CUDALambdarankNDCG { public: explicit CUDARankXENDCG(const Config& config); @@ -66,6 +62,14 @@ class CUDARankXENDCG : public CUDAObjectiveInterface, public RankXENDCG { private: void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* 
hessians) const; + + // TODO(shiyu1994): move random number generation into CUDA + void GenerateItemRands() const; + + mutable std::vector item_rands_; + mutable std::vector rands_; + mutable double* cuda_item_rands_; + mutable double* cuda_params_buffer_; }; } // namespace LightGBM diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index c436587c3ad0..3ec74414405b 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -25,6 +25,8 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& return new CUDARegressionL2loss(config); } else if (type == std::string("lambdarank")) { return new CUDALambdarankNDCG(config); + } else if (type == std::string("rank_xendcg")) { + return new CUDARankXENDCG(config); } else if (type == std::string("multiclass")) { return new CUDAMulticlassSoftmax(config); } diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index f11c76127009..552750152c5f 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -62,6 +62,10 @@ class RankingObjective : public ObjectiveFunction { } } } + const int num_show = 1000; + for (int i = 0; i < num_show; ++i) { + Log::Warning("gradients[%d] = %f, hessians[%d] = %f, score[%d] = %f", i, gradients[i], i, hessians[i], i, score[i]); + } } virtual void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt, diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 613ba969acf6..fc30731a92c7 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -105,7 +105,7 @@ class CUDAHistogramConstructor { /*! \brief aligned number of bins of the features whose histograms need to be fixed */ std::vector need_fix_histogram_features_num_bin_aligend_; /*! 
\brief minimum number of blocks allowed in the y dimension */ - const int min_grid_dim_y_ = 10; + const int min_grid_dim_y_ = 160; // CUDA memory, held by this object From ca42f3b9a0919e6ee3f57c3ffb18b39a3063895b Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 18 Aug 2021 13:55:45 +0000 Subject: [PATCH 055/166] fix CUDA AUC --- src/boosting/gbdt.cpp | 1 - src/cuda/cuda_algorithms.cu | 170 ++++++++--- src/cuda/from_git | 146 ++++++++++ src/io/cuda/cuda_tree.cu | 9 + src/metric/cuda/cuda_binary_metric.cpp | 20 +- src/metric/cuda/cuda_binary_metric.cu | 375 ++++++++++++++++++++++++- src/metric/cuda/cuda_binary_metric.hpp | 2 + 7 files changed, 680 insertions(+), 43 deletions(-) create mode 100644 src/cuda/from_git diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 81dd1bc27bde..c34c2610720c 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -393,7 +393,6 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } // bagging logic Bagging(iter_); - bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { const size_t offset = static_cast(cur_tree_id) * num_data_; diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 48685fe450a0..1c7e2b00b851 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -32,7 +32,7 @@ __global__ void BitonicSortGlobalKernel(T* values, const int num_total_data) { const int inner_depth = depth; const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); const int inner_segment_index_half = thread_index / inner_segment_length_half; - const int offset = ((inner_segment_index_half >> 1) == num_total_segment - 1 && ascending == ASCENDING) ? + const int offset = ((inner_segment_index_half >> 1) == num_total_segment - 1 && ascending == outer_ascending) ? (num_total_segment * segment_length - num_data) : 0; const int segment_start = segment_index * segment_length; if (inner_segment_index_half % 2 == 0) { @@ -163,6 +163,16 @@ void BitonicSortGlobal(int* values, const size_t len) { BitonicSortGlobalHelper(values, len); } +template <> +void BitonicSortGlobal(double* values, const size_t len) { + BitonicSortGlobalHelper(values, len); +} + +template <> +void BitonicSortGlobal(double* values, const size_t len) { + BitonicSortGlobalHelper(values, len); +} + template __global__ void BitonicArgSortGlobalKernel(const VAL_T* values, INDEX_T* indices, const int num_total_data) { const int thread_index = static_cast(threadIdx.x); @@ -187,7 +197,7 @@ __global__ void BitonicArgSortGlobalKernel(const VAL_T* values, INDEX_T* indices const int inner_depth = depth; const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); const int inner_segment_index_half = thread_index / inner_segment_length_half; - const int offset = ((inner_segment_index_half >> 1) == num_total_segment - 1 && ascending == ASCENDING) ? + const int offset = ((inner_segment_index_half >> 1) == num_total_segment - 1 && ascending == outer_ascending) ? 
(num_total_segment * segment_length - num_data) : 0; const int segment_start = segment_index * segment_length; if (inner_segment_index_half % 2 == 0) { @@ -241,8 +251,9 @@ __global__ void BitonicArgSortMergeKernel(const VAL_T* values, INDEX_T* indices, const int offset = static_cast(blockIdx.x * blockDim.x); const int local_len = min(BITONIC_SORT_NUM_ELEMENTS, len - offset); if (thread_index < len) { - shared_values[threadIdx.x] = values[thread_index]; - shared_indices[threadIdx.x] = indices[thread_index]; + const INDEX_T index = indices[thread_index]; + shared_values[threadIdx.x] = values[index]; + shared_indices[threadIdx.x] = index; } __syncthreads(); int half_segment_length = BITONIC_SORT_NUM_ELEMENTS / 2; @@ -282,18 +293,20 @@ __global__ void BitonicArgCompareKernel(const VAL_T* values, INDEX_T* indices, c const int segment_start = segment_index * outer_segment_length; if (thread_index >= offset + segment_start) { const int index_to_compare = thread_index + half_segment_length - offset; - const INDEX_T this_index = indices[thread_index]; - const INDEX_T other_index = indices[index_to_compare]; - if (index_to_compare < len && (values[this_index] > values[other_index]) == ascending) { - indices[thread_index] = other_index; - indices[index_to_compare] = this_index; + if (index_to_compare < len) { + const INDEX_T this_index = indices[thread_index]; + const INDEX_T other_index = indices[index_to_compare]; + if ((values[this_index] > values[other_index]) == ascending) { + indices[thread_index] = other_index; + indices[index_to_compare] = this_index; + } } } } else { const int index_to_compare = thread_index + half_segment_length; - const INDEX_T this_index = indices[thread_index]; - const INDEX_T other_index = indices[index_to_compare]; if (index_to_compare < len) { + const INDEX_T this_index = indices[thread_index]; + const INDEX_T other_index = indices[index_to_compare]; if ((values[this_index] > values[other_index]) == ascending) { indices[thread_index] = other_index; indices[index_to_compare] = this_index; @@ -340,6 +353,11 @@ void BitonicArgSortGlobal(const double* values, data BitonicArgSortGlobalHelper(values, indices, len); } +template <> +void BitonicArgSortGlobal(const double* values, data_size_t* indices, const size_t len) { + BitonicArgSortGlobalHelper(values, indices, len); +} + template __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, const int len) { __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS]; @@ -577,6 +595,59 @@ __device__ void PrefixSumZeroOut(data_size_t* values, size_t n) { __syncthreads(); } +__device__ void PrefixSumZeroOut(data_size_t* values, bool* is_all_non_zero, size_t n) { + unsigned int offset = 1; + unsigned int threadIdx_x = static_cast(threadIdx.x); + const data_size_t last_element = values[n - 1]; + const bool last_is_all_non_zero = is_all_non_zero[n - 1]; + __syncthreads(); + for (int d = (n >> 1); d > 0; d >>= 1) { + if (threadIdx_x < d) { + const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + if (is_all_non_zero[dst_pos]) { + values[dst_pos] += values[src_pos]; + is_all_non_zero[dst_pos] &= is_all_non_zero[src_pos]; + } + } + offset <<= 1; + __syncthreads(); + } + if (threadIdx_x == 0) { + values[n - 1] = 0; + is_all_non_zero[n - 1] = true; + } + __syncthreads(); + for (int d = 1; d < n; d <<= 1) { + offset >>= 1; + if (threadIdx_x < d) { + const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; + const unsigned int src_pos = 
offset * (2 * threadIdx_x + 1) - 1; + const data_size_t src_val = values[src_pos]; + const bool src_is_all_non_zero = is_all_non_zero[src_pos]; + values[src_pos] = values[dst_pos]; + is_all_non_zero[src_pos] = is_all_non_zero[dst_pos]; + if (src_is_all_non_zero) { + values[dst_pos] += src_val; + } else { + values[dst_pos] = src_val; + } + is_all_non_zero[dst_pos] &= src_is_all_non_zero; + } + __syncthreads(); + } + if (threadIdx.x == 0) { + if (last_is_all_non_zero) { + values[n] = values[n - 1] + last_element; + is_all_non_zero[n] = is_all_non_zero[n - 1]; + } else { + values[n] = last_element; + is_all_non_zero[n] = last_is_all_non_zero; + } + } + __syncthreads(); +} + template __global__ void GlobalInclusivePrefixSumKernel(T* values, T* block_buffer, data_size_t num_data) { __shared__ T shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; @@ -588,7 +659,7 @@ __global__ void GlobalInclusivePrefixSumKernel(T* values, T* block_buffer, data_ values[data_index] = shared_buffer[threadIdx.x + 1]; } if (threadIdx.x == 0) { - block_buffer[blockIdx.x] = shared_buffer[blockDim.x]; + block_buffer[blockIdx.x + 1] = shared_buffer[blockDim.x]; } } @@ -596,9 +667,9 @@ template __global__ void GlobalInclusivePrefixSumReduceBlockKernel(T* block_buffer, data_size_t num_blocks) { __shared__ T shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; T thread_sum = 0; - const data_size_t num_blocks_per_thread = (num_blocks + static_cast(blockDim.x) - 1) / static_cast(blockDim.x); + const data_size_t num_blocks_per_thread = (num_blocks + static_cast(blockDim.x)) / static_cast(blockDim.x); const data_size_t thread_start_block_index = static_cast(threadIdx.x) * num_blocks_per_thread; - const data_size_t thread_end_block_index = min(thread_start_block_index + num_blocks_per_thread, num_blocks); + const data_size_t thread_end_block_index = min(thread_start_block_index + num_blocks_per_thread, num_blocks + 1); for (data_size_t block_index = thread_start_block_index; block_index < thread_end_block_index; ++block_index) { thread_sum += block_buffer[block_index]; } @@ -611,30 +682,35 @@ __global__ void GlobalInclusivePrefixSumReduceBlockKernel(T* block_buffer, data_ } } -__global__ void GlobalInclusivePrefixSumReduceBlockZeroOutKernel(data_size_t* block_buffer, data_size_t num_blocks) { +__global__ void GlobalInclusivePrefixSumReduceBlockZeroOutKernel(data_size_t* block_buffer, const uint16_t* block_mark_first_zero, data_size_t num_blocks) { __shared__ data_size_t shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; + __shared__ bool is_all_non_zero[GLOBAL_PREFIX_SUM_BLOCK_SIZE]; data_size_t thread_sum = 0; const data_size_t num_blocks_per_thread = (num_blocks + static_cast(blockDim.x) - 1) / static_cast(blockDim.x); const data_size_t thread_start_block_index = static_cast(threadIdx.x) * num_blocks_per_thread; const data_size_t thread_end_block_index = min(thread_start_block_index + num_blocks_per_thread, num_blocks); + bool thread_is_all_non_zero = true; + data_size_t first_with_zero_block = thread_end_block_index; for (data_size_t block_index = thread_start_block_index; block_index < thread_end_block_index; ++block_index) { + const uint16_t mark_first_zero = block_mark_first_zero[block_index]; const data_size_t block_buffer_value = block_buffer[block_index]; - if (block_buffer_value != 0) { - thread_sum += block_buffer[block_index]; + if (mark_first_zero == GLOBAL_PREFIX_SUM_BLOCK_SIZE) { + thread_sum += block_buffer_value; } else { - thread_sum = 0; + thread_is_all_non_zero = false; + thread_sum = block_buffer_value; + if 
(first_with_zero_block == thread_end_block_index) { + first_with_zero_block = block_index; + } } } + is_all_non_zero[threadIdx.x] = thread_is_all_non_zero; shared_buffer[threadIdx.x] = thread_sum; __syncthreads(); - PrefixSumZeroOut(shared_buffer, blockDim.x); + PrefixSumZeroOut(shared_buffer, is_all_non_zero, blockDim.x); data_size_t thread_sum_base = shared_buffer[threadIdx.x]; - for (data_size_t block_index = thread_start_block_index; block_index < thread_end_block_index; ++block_index) { - if (block_buffer[block_index] != 0) { - block_buffer[block_index] += thread_sum_base; - } else { - thread_sum_base = 0; - } + for (data_size_t block_index = thread_start_block_index; block_index < first_with_zero_block; ++block_index) { + block_buffer[block_index] += thread_sum_base; } } @@ -647,8 +723,12 @@ __global__ void GlobalInclusivePrefixSumAddBlockBaseKernel(const T* block_buffer } } -__global__ void GlobalInclusivePrefixSumAddBlockBaseGenAUCMarkKernel(const data_size_t* block_buffer, data_size_t* values, const uint16_t* block_first_zero, data_size_t num_data) { - const data_size_t block_sum_base = block_buffer[blockIdx.x]; +__global__ void GlobalInclusivePrefixSumAddBlockBaseGenAUCMarkKernel( + const data_size_t* block_buffer, + data_size_t* values, + const uint16_t* block_first_zero, + data_size_t num_data) { + const data_size_t block_sum_base = (blockIdx.x == 0 ? 0 : block_buffer[blockIdx.x - 1]); const uint16_t first_zero = block_first_zero[blockIdx.x]; const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (data_index < num_data && threadIdx.x < first_zero) { @@ -676,19 +756,34 @@ __global__ void GlobalGenAUCMarkKernel(const double* scores, data_size_t num_data) { __shared__ data_size_t shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; __shared__ uint16_t shuffle_reduce_shared_buffer[32]; + __shared__ bool is_all_non_zero[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (data_index < num_data) { - if (data_index > 1) { - shared_buffer[threadIdx.x] = (scores[sorted_indices[data_index]] == scores[sorted_indices[data_index - 1]]); + if (data_index > 0) { + shared_buffer[threadIdx.x] = static_cast(scores[sorted_indices[data_index]] == scores[sorted_indices[data_index - 1]]); } else { shared_buffer[threadIdx.x] = 0; } } else { shared_buffer[threadIdx.x] = 0; } + is_all_non_zero[threadIdx.x] = static_cast(shared_buffer[threadIdx.x]); __syncthreads(); - PrefixSumZeroOut(shared_buffer, blockDim.x); uint16_t block_first_zero = (shared_buffer[threadIdx.x] == 0 ? 
threadIdx.x : blockDim.x); + /*if (blockIdx.x == 0 && threadIdx.x == 0) { + for (uint32_t i = 0; i < blockDim.x; ++i) { + printf("before prefix sum shared_buffer[%d] = %d\n", i, shared_buffer[i]); + } + }*/ + PrefixSumZeroOut(shared_buffer, is_all_non_zero, blockDim.x); + /*if (blockIdx.x == 0 && threadIdx.x == 0) { + for (uint32_t i = 0; i < blockDim.x; ++i) { + const data_size_t local_data_index = static_cast(i + blockIdx.x * blockDim.x); + printf("shared_buffer[%d] = %d, scores[%d] = %f, original_offset = %d\n", i, shared_buffer[i], + sorted_indices[local_data_index], scores[sorted_indices[local_data_index]], + static_cast(local_data_index > 0 && scores[sorted_indices[local_data_index]] == scores[sorted_indices[local_data_index - 1]])); + } + }*/ block_first_zero = ShuffleReduceMin(block_first_zero, shuffle_reduce_shared_buffer, blockDim.x); if (data_index < num_data) { mark_buffer[data_index] = shared_buffer[threadIdx.x + 1]; @@ -708,13 +803,14 @@ void GloblGenAUCMark(const double* scores, const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; GlobalGenAUCMarkKernel<<>>(scores, sorted_indices, mark_buffer, block_mark_buffer, block_mark_first_zero, num_data); GlobalInclusivePrefixSumReduceBlockZeroOutKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>( - block_mark_buffer, num_blocks); + block_mark_buffer, block_mark_first_zero, num_blocks); GlobalInclusivePrefixSumAddBlockBaseGenAUCMarkKernel<<>>( block_mark_buffer, mark_buffer, block_mark_first_zero, num_data); } template -__global__ void GlobalGenAUCPosSumKernel(const label_t* labels, +__global__ void GlobalGenAUCPosSumKernel( + const label_t* labels, const label_t* weights, const data_size_t* sorted_indices, double* sum_pos_buffer, @@ -722,7 +818,7 @@ __global__ void GlobalGenAUCPosSumKernel(const label_t* labels, const data_size_t num_data) { __shared__ double shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const double pos = IS_POS ? + const double pos = IS_POS ? (USE_WEIGHT ? (data_index < num_data ? static_cast(labels[sorted_indices[data_index]] > 0) * weights[sorted_indices[data_index]] : 0.0f) : (data_index < num_data ? 
static_cast(labels[sorted_indices[data_index]] > 0) : 0.0f)) : @@ -737,7 +833,7 @@ __global__ void GlobalGenAUCPosSumKernel(const label_t* labels, sum_pos_buffer[data_index] = shared_buffer[threadIdx.x + 1]; } if (threadIdx.x == 0) { - block_sum_pos_buffer[blockIdx.x] = shared_buffer[blockDim.x]; + block_sum_pos_buffer[blockIdx.x + 1] = shared_buffer[blockDim.x]; } } @@ -757,13 +853,13 @@ void GlobalGenAUCPosNegSumInner(const label_t* labels, } template <> -void GlobalGenAUCPosNegSum(const label_t* labels, +void GlobalGenAUCPosNegSum(const label_t* labels, const label_t* weights, const data_size_t* sorted_indices, double* sum_pos_buffer, double* block_sum_pos_buffer, const data_size_t num_data) { - GlobalGenAUCPosNegSumInner(labels, weights, sorted_indices, sum_pos_buffer, block_sum_pos_buffer, num_data); + GlobalGenAUCPosNegSumInner(labels, weights, sorted_indices, sum_pos_buffer, block_sum_pos_buffer, num_data); } template <> @@ -828,7 +924,7 @@ void GlobalCalcAUC(const double* sum_pos_buffer, double* block_buffer) { const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; GlobalCalcAUCKernel<<>>(sum_pos_buffer, mark_buffer, num_data, block_buffer); - BlockReduceSum<<>>(block_buffer, num_blocks); + BlockReduceSum<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks); } } // namespace LightGBM diff --git a/src/cuda/from_git b/src/cuda/from_git new file mode 100644 index 000000000000..2263cf1869e1 --- /dev/null +++ b/src/cuda/from_git @@ -0,0 +1,146 @@ +template +__global__ void BitonicSortGlobalKernel(T* values, const int num_total_data) { + const int thread_index = static_cast(threadIdx.x); + const int low = static_cast(blockIdx.x * BITONIC_SORT_NUM_ELEMENTS); + const bool outer_ascending = ASCENDING ? (blockIdx.x % 2 == 0) : (blockIdx.x % 2 == 1); + T* values_pointer = values + low; + const int num_data = min(BITONIC_SORT_NUM_ELEMENTS, num_total_data - low); + __shared__ T shared_values[BITONIC_SORT_NUM_ELEMENTS]; + if (thread_index < num_data) { + shared_values[thread_index] = values_pointer[thread_index]; + } + __syncthreads(); + for (int depth = BITONIC_SORT_DEPTH - 1; depth >= 1; --depth) { + const int segment_length = 1 << (BITONIC_SORT_DEPTH - depth); + const int segment_index = thread_index / segment_length; + const bool ascending = outer_ascending ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + const int num_total_segment = (num_data + segment_length - 1) / segment_length; + { + const int inner_depth = depth; + const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); + const int inner_segment_index_half = thread_index / inner_segment_length_half; + const int offset = ((inner_segment_index_half >> 1) == num_total_segment - 1 && ascending == ASCENDING) ? 
+ (num_total_segment * segment_length - num_data) : 0; + const int segment_start = segment_index * segment_length; + if (inner_segment_index_half % 2 == 0) { + if (thread_index >= offset + segment_start) { + const int index_to_compare = thread_index + inner_segment_length_half - offset; + if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { + const T tmp = shared_values[thread_index]; + shared_values[thread_index] = shared_values[index_to_compare]; + shared_values[index_to_compare] = tmp; + } + } + } + __syncthreads(); + } + for (int inner_depth = depth + 1; inner_depth < BITONIC_SORT_DEPTH; ++inner_depth) { + const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); + const int inner_segment_index_half = thread_index / inner_segment_length_half; + if (inner_segment_index_half % 2 == 0) { + const int index_to_compare = thread_index + inner_segment_length_half; + if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { + const T tmp = shared_values[thread_index]; + shared_values[thread_index] = shared_values[index_to_compare]; + shared_values[index_to_compare] = tmp; + } + } + __syncthreads(); + } + } + if (thread_index < num_data) { + values_pointer[thread_index] = shared_values[thread_index]; + } +} + +template +__global__ void BitonicSortMergeKernel(VAL_T* values, const int segment_length, const int len) { + const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const int segment_index = thread_index / segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS]; + const int offset = static_cast(blockIdx.x * blockDim.x); + const int local_len = min(BITONIC_SORT_NUM_ELEMENTS, len - offset); + if (thread_index < len) { + shared_values[threadIdx.x] = values[thread_index]; + } + __syncthreads(); + int half_segment_length = BITONIC_SORT_NUM_ELEMENTS / 2; + while (half_segment_length >= 1) { + const int half_segment_index = static_cast(threadIdx.x) / half_segment_length; + if (half_segment_index % 2 == 0) { + const int index_to_compare = static_cast(threadIdx.x) + half_segment_length; + if (index_to_compare < local_len && ((shared_values[threadIdx.x] > shared_values[index_to_compare]) == ascending)) { + const VAL_T tmp = shared_values[index_to_compare]; + shared_values[index_to_compare] = shared_values[threadIdx.x]; + shared_values[threadIdx.x] = tmp; + } + } + __syncthreads(); + half_segment_length >>= 1; + } + if (thread_index < len) { + values[thread_index] = shared_values[threadIdx.x]; + } +} + +template +__global__ void BitonicCompareKernel(VAL_T* values, const int half_segment_length, const int outer_segment_length, const int len) { + const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const int segment_index = thread_index / outer_segment_length; + const int half_segment_index = thread_index / half_segment_length; + const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); + if (half_segment_index % 2 == 0) { + const int num_total_segment = (len + outer_segment_length - 1) / outer_segment_length; + if (BEGIN && (half_segment_index >> 1) == num_total_segment - 1 && ascending == ASCENDING) { + const int offset = num_total_segment * outer_segment_length - len; + const int segment_start = segment_index * outer_segment_length; + if (thread_index >= offset + segment_start) { + const int index_to_compare = thread_index + half_segment_length - offset; + if (index_to_compare < len && (values[thread_index] > values[index_to_compare]) == ascending) { + const VAL_T tmp = values[index_to_compare]; + values[index_to_compare] = values[thread_index]; + values[thread_index] = tmp; + } + } + } else { + const int index_to_compare = thread_index + half_segment_length; + if (index_to_compare < len) { + if ((values[thread_index] > values[index_to_compare]) == ascending) { + const VAL_T tmp = values[index_to_compare]; + values[index_to_compare] = values[thread_index]; + values[thread_index] = tmp; + } + } + } + } +} + +template +void BitonicSortGlobalHelper(VAL_T* values, const size_t len) { + int max_depth = 1; + int len_to_shift = static_cast(len) - 1; + while (len_to_shift > 0) { + ++max_depth; + len_to_shift >>= 1; + } + const int num_blocks = (static_cast(len) + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; + BitonicSortGlobalKernel<<>>(values, static_cast(len)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + for (int depth = max_depth - 11; depth >= 1; --depth) { + const int segment_length = (1 << (max_depth - depth)); + int half_segment_length = (segment_length >> 1); + { + BitonicCompareKernel<<>>(values, half_segment_length, segment_length, static_cast(len)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + half_segment_length >>= 1; + } + for (int inner_depth = depth + 1; inner_depth <= max_depth - 11; ++inner_depth) { + BitonicCompareKernel<<>>(values, half_segment_length, segment_length, static_cast(len)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + half_segment_length >>= 1; + } + BitonicSortMergeKernel<<>>(values, segment_length, static_cast(len)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + } +} \ No newline at end of file diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index dea410e205d5..0e5b4cb1c922 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -59,6 +59,9 @@ __global__ void SplitKernel(// split information const int new_node_index = num_leaves - 1; const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); const int parent_index = leaf_parent[leaf_index]; + if (parent_index == new_node_index) { + printf("error !!!! parent_index = %d, new_node_index = %d\n", parent_index, new_node_index); + } if (thread_index == 0) { if (parent_index >= 0) { // if cur node is left child @@ -176,6 +179,7 @@ __global__ void AddPredictionToScoreKernel( const data_size_t data_index = USE_INDICES ? 
cuda_used_indices[inner_data_index] : inner_data_index; if (data_index < num_data) { int node = 0; + int iter = 0; while (node >= 0) { const int split_feature_inner = cuda_split_feature_inner[node]; const int column = cuda_feature_to_column[split_feature_inner]; @@ -215,6 +219,11 @@ __global__ void AddPredictionToScoreKernel( node = cuda_right_child[node]; } } + ++iter; + if (iter > 1000) { + printf("error iter = %d\n", iter); + printf("node = %d, cuda_left_child[%d] = %d, cuda_right_child[%d] = %d\n", node, node, cuda_left_child[node], node, cuda_right_child[node]); + } } score[data_index] += cuda_leaf_value[~node]; } diff --git a/src/metric/cuda/cuda_binary_metric.cpp b/src/metric/cuda/cuda_binary_metric.cpp index 0f629744420b..6070611a3103 100644 --- a/src/metric/cuda/cuda_binary_metric.cpp +++ b/src/metric/cuda/cuda_binary_metric.cpp @@ -50,14 +50,30 @@ void CUDAAUCMetric::Init(const Metadata& metadata, data_size_t num_data) { AllocateCUDAMemoryOuter(&cuda_threshold_mark_, static_cast(num_data), __FILE__, __LINE__); const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_BINARY_METRIC - 1) / EVAL_BLOCK_SIZE_BINARY_METRIC; AllocateCUDAMemoryOuter(&cuda_block_sum_pos_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_block_threshold_mark_buffer_, static_cast(num_blocks), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_block_mark_first_zero_, static_cast(num_blocks), __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_block_sum_pos_buffer_, 0, 1, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_block_threshold_mark_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_block_threshold_mark_buffer_, 0, 1, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_block_mark_first_zero_, static_cast(num_blocks) + 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_block_mark_first_zero_, 0, 1, __FILE__, __LINE__); cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); cuda_label_ = metadata.cuda_metadata()->cuda_label(); + + TestCUDABitonicSortForQueryItems(); } std::vector CUDAAUCMetric::Eval(const double* score, const ObjectiveFunction*) const { + Log::Warning("before evaluate AUC"); LaunchEvalKernel(score); + double total_area = 0.0f, sum_pos = 0.0f; + CopyFromCUDADeviceToHostOuter(&total_area, cuda_block_sum_pos_buffer_, 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(&sum_pos, cuda_sum_pos_buffer_ + static_cast(num_data_ - 1), 1, __FILE__, __LINE__); + Log::Warning("sum_pos = %f", sum_pos); + Log::Warning("after evaluate AUC"); + if (sum_pos != sum_weights_ && sum_pos > 0.0f) { + return std::vector(1, total_area / (sum_pos * (sum_weights_ - sum_pos))); + } else { + return std::vector(1, 1.0f); + } } } // namespace LightGBM diff --git a/src/metric/cuda/cuda_binary_metric.cu b/src/metric/cuda/cuda_binary_metric.cu index 051864edf375..4a31b1c5f94c 100644 --- a/src/metric/cuda/cuda_binary_metric.cu +++ b/src/metric/cuda/cuda_binary_metric.cu @@ -72,17 +72,386 @@ void CUDABinaryMetric::LaunchEvalKernel(const double* sco } void CUDAAUCMetric::LaunchEvalKernel(const double* score) const { + int num_pos = 0; + for (int data_index = 0; data_index < num_data_; ++data_index) { + num_pos += static_cast(label_[data_index]); + } + Log::Warning("sum_pos = %d", num_pos); BitonicArgSortGlobal(score, cuda_indices_buffer_, static_cast(num_data_)); + std::vector host_sorted_indices(num_data_, 0); + std::vector host_score(num_data_, 0.0f); + CopyFromCUDADeviceToHostOuter(host_sorted_indices.data(), cuda_indices_buffer_, 
static_cast(num_data_), __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(host_score.data(), score, static_cast(num_data_), __FILE__, __LINE__); + //Log::Warning("host_sorted_indices[%d] = %d, host_score[%d] = %f", 0, host_sorted_indices[0], host_score[host_sorted_indices[0]]); + for (int i = 0; i < num_data_ - 1; ++i) { + //Log::Warning("host_sorted_indices[%d] = %d, host_score[%d] = %f", i + 1, host_sorted_indices[i + 1], host_sorted_indices[i + 1], host_score[host_sorted_indices[i + 1]]); + CHECK_GE(host_score[host_sorted_indices[i]], host_score[host_sorted_indices[i + 1]]); + } + SetCUDAMemoryOuter(cuda_block_sum_pos_buffer_, 0, 1, __FILE__, __LINE__); if (cuda_weights_ == nullptr) { - GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_pos_buffer_, cuda_block_sum_pos_buffer_, num_data_); + GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_pos_buffer_, cuda_block_sum_pos_buffer_, num_data_); + std::vector host_cuda_sum_pos_buffer(num_data_); + CopyFromCUDADeviceToHostOuter(host_cuda_sum_pos_buffer.data(), cuda_sum_pos_buffer_, static_cast(num_data_), __FILE__, __LINE__); + double cur_sum_pos = 0.0f; + for (data_size_t data_index = 0; data_index < num_data_; ++data_index) { + cur_sum_pos += static_cast(label_[host_sorted_indices[data_index]] > 0); + CHECK_EQ(cur_sum_pos, host_cuda_sum_pos_buffer[data_index]); + } } else { GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_pos_buffer_, cuda_block_sum_pos_buffer_, num_data_); Log::Fatal("CUDA AUC with weights is not supported."); } GloblGenAUCMark(score, cuda_indices_buffer_, cuda_threshold_mark_, cuda_block_threshold_mark_buffer_, cuda_block_mark_first_zero_, num_data_); + std::vector host_threshold_mask(num_data_, 0); + CopyFromCUDADeviceToHostOuter(host_threshold_mask.data(), cuda_threshold_mark_, static_cast(num_data_), __FILE__, __LINE__); + for (int i = 0; i < num_data_; ++i) { + //Log::Warning("host_threshold_mask[%d] = %d", i, host_threshold_mask[i]); + const bool is_valid = i == 0 || host_threshold_mask[i] == 0 || (host_threshold_mask[i] == host_threshold_mask[i - 1] + 1); + if (!is_valid) { + Log::Warning("host_threshold_mask[%d] = %d, host_threshold_mask[%d] = %d", i, host_threshold_mask[i], i - 1, host_threshold_mask[i - 1]); + } + CHECK(is_valid); + if (i > 0) { + const bool should_increase = (host_score[host_sorted_indices[i]] == host_score[host_sorted_indices[i - 1]]); + if (should_increase) { + CHECK_EQ(host_threshold_mask[i], host_threshold_mask[i - 1] + 1); + } else { + CHECK_EQ(host_threshold_mask[i], 0); + } + } + } GlobalCalcAUC(cuda_sum_pos_buffer_, cuda_threshold_mark_, num_data_, cuda_block_sum_pos_buffer_); - double total_area = 0.0f; - CopyFromCUDADeviceToHostOuter(&total_area, cuda_block_sum_pos_buffer_, 1, __FILE__, __LINE__); +} + +void CUDAAUCMetric::TestCUDABitonicSortForQueryItems() const { + int num_queries = 1000; + std::vector items_per_query(num_queries + 1, 0); + std::vector item_scores; + const int max_item_per_query = 5000; + std::vector num_item_probs(max_item_per_query, 1.0f / max_item_per_query); + std::discrete_distribution num_item_distribution(num_item_probs.begin(), num_item_probs.end()); + std::uniform_real_distribution score_dist; + const int num_threads = OMP_NUM_THREADS(); + std::vector thread_random_engines(num_threads); + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + thread_random_engines[thread_index] = std::mt19937(thread_index); + } + int num_total_items = 0; + 
#pragma omp parallel for schedule(static) num_threads(num_threads) reduction(+:num_total_items) + for (int query_index = 0; query_index < num_queries; ++query_index) { + const int thread_index = omp_get_thread_num(); + items_per_query[query_index + 1] = num_item_distribution(thread_random_engines[thread_index]); + num_total_items += items_per_query[query_index + 1]; + } + for (int query_index = 0; query_index < num_queries; ++query_index) { + items_per_query[query_index + 1] += items_per_query[query_index]; + } + item_scores.resize(num_total_items, 0.0f); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int item_index = 0; item_index < num_total_items; ++item_index) { + const int thread_index = omp_get_thread_num(); + item_scores[item_index] = score_dist(thread_random_engines[thread_index]); + } + double* cuda_score = nullptr; + data_size_t* cuda_query_boundaries = nullptr; + data_size_t* cuda_out_indices = nullptr; + InitCUDAMemoryFromHostMemoryOuter(&cuda_score, item_scores.data(), item_scores.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_query_boundaries, items_per_query.data(), items_per_query.size(), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_out_indices, item_scores.size(), __FILE__, __LINE__); + const auto start = std::chrono::steady_clock::now(); + BitonicArgSortItemsGlobal(cuda_score, num_queries, cuda_query_boundaries, cuda_out_indices); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + const auto end = std::chrono::steady_clock::now(); + const std::chrono::duration duration = static_cast>(end - start); + Log::Warning("bitonic arg sort items global time = %f", duration.count()); + std::vector sorted_item_indices(item_scores.size()); + CopyFromCUDADeviceToHostOuter(sorted_item_indices.data(), cuda_out_indices, item_scores.size(), __FILE__, __LINE__); + std::vector host_sorted_item_indices(item_scores.size()); + PrintLastCUDAErrorOuter(__FILE__, __LINE__); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int i = 0; i < num_queries; ++i) { + const int query_start = items_per_query[i]; + const int query_end = items_per_query[i + 1]; + for (int j = query_start; j < query_end; ++j) { + host_sorted_item_indices[j] = j - query_start; + } + std::sort(host_sorted_item_indices.data() + query_start, host_sorted_item_indices.data() + query_end, [&item_scores, query_start] (int a, int b) { + return item_scores[query_start + a] > item_scores[query_start + b]; + }); + } + for (int query_index = 0; query_index < num_queries; ++query_index) { + const int query_start = items_per_query[query_index]; + const int query_end = items_per_query[query_index + 1]; + for (int item_index = query_start; item_index < query_end; ++item_index) { + const double cuda_item_score = item_scores[query_start + sorted_item_indices[item_index]]; + const double host_item_score = item_scores[query_start + host_sorted_item_indices[item_index]]; + if (cuda_item_score != host_item_score) { + Log::Warning("item_index = %d, query_start = %d, cuda_item_score = %f, host_item_score = %f, sorted_item_indices = %d", + item_index, query_start, cuda_item_score, host_item_score, sorted_item_indices[item_index]); + } + } + } + Log::Warning("bitonic argsort items test pass"); + std::vector copied_scores = item_scores; + const std::vector const_copied_scores = item_scores; + std::vector host_indices(item_scores.size()); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (data_size_t data_index = 0; data_index < 
static_cast(host_indices.size()); ++data_index) { + host_indices[data_index] = data_index; + } + data_size_t* cuda_indices = nullptr; + AllocateCUDAMemoryOuter(&cuda_indices, item_scores.size(), __FILE__, __LINE__); + BitonicArgSortGlobal(cuda_score, cuda_indices, host_indices.size()); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + PrintLastCUDAErrorOuter(__FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(host_indices.data(), cuda_indices, host_indices.size(), __FILE__, __LINE__); + std::vector host_cuda_score(item_scores.size()); + CopyFromCUDADeviceToHostOuter(host_cuda_score.data(), cuda_score, item_scores.size(), __FILE__, __LINE__); + for (size_t i = 0; i < host_indices.size() - 1; ++i) { + const data_size_t index_1 = host_indices[i]; + const data_size_t index_2 = host_indices[i + 1]; + const double score_1 = host_cuda_score[index_1]; + const double score_2 = host_cuda_score[index_2]; + if (score_1 > score_2) { + Log::Warning("error in argsort score_1 = %.20f, score_2 = %.20f", score_1, score_2); + break; + } + } + std::vector host_sort_item_scores = item_scores; + std::sort(host_sort_item_scores.begin(), host_sort_item_scores.end()); + double* new_cuda_score = nullptr; + InitCUDAMemoryFromHostMemoryOuter(&new_cuda_score, item_scores.data(), item_scores.size(), __FILE__, __LINE__); + BitonicSortGlobal(cuda_score, item_scores.size()); + BitonicSortGlobal(new_cuda_score, item_scores.size()); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + PrintLastCUDAErrorOuter(__FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(item_scores.data(), cuda_score, item_scores.size(), __FILE__, __LINE__); + std::vector cuda_score_sorted(item_scores.size()); + CopyFromCUDADeviceToHostOuter(cuda_score_sorted.data(), new_cuda_score, cuda_score_sorted.size(), __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + for (size_t i = 0; i < item_scores.size() - 1; ++i) { + const double score_1 = item_scores[i]; + const double score_2 = item_scores[i + 1]; + if (score_1 > score_2) { + Log::Warning("error in sort score_1 = %.20f, score_2 = %.20f", score_1, score_2); + break; + } + } + for (size_t i = 0; i < cuda_score_sorted.size() - 1; ++i) { + const double score_1 = cuda_score_sorted[i]; + const double score_2 = cuda_score_sorted[i + 1]; + if (score_1 > score_2) { + Log::Warning("error in new sort score_1 = %.20f, score_2 = %.20f", score_1, score_2); + break; + } + } + { + const int num_test_int = 2508113; + std::vector random_ints(num_test_int, 0); + Threading::For(0, num_test_int, 512, [&random_ints, &thread_random_engines, &num_item_distribution] (int thread_index, int start, int end) { + for (int data_index = start; data_index < end; ++data_index) { + random_ints[data_index] = num_item_distribution(thread_random_engines[thread_index]); + } + }); + int* cuda_rand_ints = nullptr; + InitCUDAMemoryFromHostMemoryOuter(&cuda_rand_ints, random_ints.data(), random_ints.size(), __FILE__, __LINE__); + BitonicSortGlobal(cuda_rand_ints, random_ints.size()); + CopyFromCUDADeviceToHostOuter(random_ints.data(), cuda_rand_ints, random_ints.size(), __FILE__, __LINE__); + /*const int segment_length = 1024; + const int num_segments = (num_test_int + segment_length - 1) / segment_length; + for (int segment_index = 0; segment_index < num_segments; ++segment_index) { + const int segment_start = segment_index * segment_length; + const int segment_end = std::min(segment_start + segment_length, num_test_int); + const bool ascending = (segment_index % 2 == 0); + for (int data_index = segment_start; 
data_index < segment_end - 1; ++data_index) { + const int value_1 = random_ints[data_index]; + const int value_2 = random_ints[data_index + 1]; + if (ascending) { + if (value_1 > value_2) { + Log::Warning("data_index = %d error in first stage of bitonic sort value_1 = %d, value_2 = %d", data_index, value_1, value_2); + } + } else { + if (value_1 < value_2) { + Log::Warning("data_index = %d error in first stage of bitonic sort value_1 = %d, value_2 = %d", data_index, value_1, value_2); + } + } + } + }*/ + for (int i = 0; i < num_test_int - 1; ++i) { + const int value_1 = random_ints[i]; + const int value_2 = random_ints[i + 1]; + if (value_1 > value_2) { + Log::Warning("error in int value_1 = %d, value_2 = %d", value_1, value_2); + break; + } + } + Log::Warning("test int global sort passed"); + } + { + const int num_test_double = 2508113; + std::vector random_double(num_test_double, 0); + Threading::For(0, num_test_double, 512, [&random_double, &thread_random_engines, &score_dist] (int thread_index, int start, int end) { + for (int data_index = start; data_index < end; ++data_index) { + if (data_index % 10 == 0) { + random_double[data_index] = random_double[data_index / 10]; + } else { + random_double[data_index] = score_dist(thread_random_engines[thread_index]); + } + } + }); + double* cuda_rand_double = nullptr; + for (int i = 0; i < num_test_double; ++i) { + CHECK_GE(random_double[i], 0.0f); + CHECK_LE(random_double[i], 1.0f); + } + InitCUDAMemoryFromHostMemoryOuter(&cuda_rand_double, random_double.data(), random_double.size(), __FILE__, __LINE__); + BitonicSortGlobal(cuda_rand_double, random_double.size()); + CopyFromCUDADeviceToHostOuter(random_double.data(), cuda_rand_double, random_double.size(), __FILE__, __LINE__); + /*const int segment_length = 1024; + const int num_segments = (num_test_double + segment_length - 1) / segment_length; + for (int segment_index = 0; segment_index < num_segments; ++segment_index) { + const int segment_start = segment_index * segment_length; + const int segment_end = std::min(segment_start + segment_length, num_test_double); + const bool ascending = (segment_index % 2 == 0); + for (int data_index = segment_start; data_index < segment_end - 1; ++data_index) { + const double value_1 = random_double[data_index]; + const double value_2 = random_double[data_index + 1]; + if (ascending) { + if (value_1 > value_2) { + Log::Warning("data_index = %d error in first stage of bitonic sort value_1 = %f, value_2 = %f", data_index, value_1, value_2); + } + } else { + if (value_1 < value_2) { + Log::Warning("data_index = %d error in first stage of bitonic sort value_1 = %f, value_2 = %f", data_index, value_1, value_2); + } + } + } + }*/ + for (int i = 0; i < num_test_double - 1; ++i) { + const double value_1 = random_double[i]; + const double value_2 = random_double[i + 1]; + if (value_1 > value_2) { + Log::Warning("error in double value_1 = %.20f, value_2 = %.20f", value_1, value_2); + break; + } + } + for (int i = 0; i < num_test_double; ++i) { + CHECK_GE(random_double[i], 0.0f); + CHECK_LE(random_double[i], 1.0f); + } + Log::Warning("test doublecd global sort passed"); + } + { + double* cuda_copied_scores = nullptr; + std::vector host_copied_scores = const_copied_scores; + InitCUDAMemoryFromHostMemoryOuter(&cuda_copied_scores, copied_scores.data(), copied_scores.size(), __FILE__, __LINE__); + BitonicSortGlobal(cuda_copied_scores, copied_scores.size()); + std::sort(host_copied_scores.begin(), host_copied_scores.end(), [] (double a, double b) { return a > b; }); + 
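+    // Verification sketch: copy the CUDA-sorted scores back to the host and compare them
+    // element by element with host_copied_scores, which was sorted on the CPU with the same
+    // descending comparator; a mismatch is logged once and the comparison loop stops.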
CopyFromCUDADeviceToHostOuter(copied_scores.data(), cuda_copied_scores, copied_scores.size(), __FILE__, __LINE__); + for (int i = 0; i < copied_scores.size(); ++i) { + const double host_value = host_copied_scores[i]; + const double cuda_value = copied_scores[i]; + const double host_sort_value = host_sort_item_scores[i]; + if (host_value != cuda_value) { + Log::Warning("error in sort item scores %f vs %f", host_value, cuda_value); + break; + } + } + } + { + double* cuda_copied_scores = nullptr; + InitCUDAMemoryFromHostMemoryOuter(&cuda_copied_scores, const_copied_scores.data(), const_copied_scores.size(), __FILE__, __LINE__); + data_size_t* cuda_indices = nullptr; + AllocateCUDAMemoryOuter(&cuda_indices, const_copied_scores.size(), __FILE__, __LINE__); + BitonicArgSortGlobal(cuda_copied_scores, cuda_indices, const_copied_scores.size()); + std::vector host_indices(const_copied_scores.size()); + const int num_threads = OMP_NUM_THREADS(); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (data_size_t i = 0; i < static_cast(const_copied_scores.size()); ++i) { + host_indices[i] = i; + } + std::sort(host_indices.begin(), host_indices.end(), + [&const_copied_scores] (data_size_t a, data_size_t b) { return const_copied_scores[a] > const_copied_scores[b]; }); + std::vector host_cuda_indices(const_copied_scores.size()); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(host_cuda_indices.data(), cuda_indices, const_copied_scores.size(), __FILE__, __LINE__); + /*const int segment_length = 1024; + const int num_segments = (static_cast(const_copied_scores.size()) + segment_length - 1) / segment_length; + for (int segment_index = 0; segment_index < num_segments; ++segment_index) { + const int segment_start = segment_index * segment_length; + const int segment_end = std::min(segment_start + segment_length, static_cast(const_copied_scores.size())); + const bool ascending = (segment_index % 2 == 1); + for (data_size_t data_index = segment_start; data_index < segment_end - 1; ++data_index) { + const data_size_t index_1 = host_cuda_indices[data_index]; + const data_size_t index_2 = host_cuda_indices[data_index + 1]; + const double value_1 = const_copied_scores[index_1]; + const double value_2 = const_copied_scores[index_2]; + if (ascending) { + if (value_1 > value_2) { + Log::Warning("error ascending = %d, index_1 = %d, index_2 = %d, value_1 = %f, value_2 = %f", static_cast(ascending), + index_1, index_2, value_1, value_2); + } + } else { + if (value_1 < value_2) { + Log::Warning("error ascending = %d, index_1 = %d, index_2 = %d, value_1 = %f, value_2 = %f", static_cast(ascending), + index_1, index_2, value_1, value_2); + } + } + } + }*/ + BitonicSortGlobal(cuda_copied_scores, const_copied_scores.size()); + std::vector host_cuda_sort_scores(const_copied_scores.size()); + CopyFromCUDADeviceToHostOuter(host_cuda_sort_scores.data(), cuda_copied_scores, const_copied_scores.size(), __FILE__, __LINE__); + for (int i = 0; i < static_cast(const_copied_scores.size()); ++i) { + const double sort_score = host_cuda_sort_scores[i]; + const double argsort_score = const_copied_scores[host_cuda_indices[i]]; + if (sort_score != argsort_score) { + Log::Warning("error sort_score = %.20f, argsort_score = %.20f", sort_score, argsort_score); + } + CHECK_EQ(sort_score, argsort_score); + } + for (int i = 0; i < copied_scores.size(); ++i) { + const data_size_t host_index = host_indices[i]; + const data_size_t cuda_index = host_cuda_indices[i]; + const double host_value = 
const_copied_scores[host_index]; + const double cuda_value = const_copied_scores[cuda_index]; + if (host_index != cuda_index) { + Log::Warning("i = %d error in arg sort scores %d vs %d, host_value = %f, cuda_value = %f", i, host_index, cuda_index, host_value, cuda_value); + break; + } + } + } + { + std::vector item_argsort_scores = const_copied_scores; + std::vector item_argsort_query_boundaries{0, static_cast(item_scores.size())}; + data_size_t* cuda_query_boundaries = nullptr; + InitCUDAMemoryFromHostMemoryOuter(&cuda_query_boundaries, item_argsort_query_boundaries.data(), item_argsort_query_boundaries.size(), __FILE__, __LINE__); + double* cuda_argsort_scores = nullptr; + InitCUDAMemoryFromHostMemoryOuter(&cuda_argsort_scores, item_argsort_scores.data(), item_argsort_scores.size(), __FILE__, __LINE__); + data_size_t* out_indices = nullptr; + AllocateCUDAMemoryOuter(&out_indices, item_argsort_scores.size(), __FILE__, __LINE__); + BitonicArgSortItemsGlobal(cuda_argsort_scores, 1, cuda_query_boundaries, out_indices); + std::vector cuda_sort_indices(item_argsort_scores.size()); + CopyFromCUDADeviceToHostOuter(cuda_sort_indices.data(), out_indices, cuda_sort_indices.size(), __FILE__, __LINE__); + + double* cuda_sort_scores = nullptr; + InitCUDAMemoryFromHostMemoryOuter(&cuda_sort_scores, const_copied_scores.data(), const_copied_scores.size(), __FILE__, __LINE__); + BitonicSortGlobal(cuda_sort_scores, const_copied_scores.size()); + std::vector cuda_sort_scores_to_host(const_copied_scores.size()); + CopyFromCUDADeviceToHostOuter(cuda_sort_scores_to_host.data(), cuda_sort_scores, const_copied_scores.size(), __FILE__, __LINE__); + + std::vector host_sort_result = const_copied_scores; + Log::Warning("num scores = %d", const_copied_scores.size()); + std::sort(host_sort_result.begin(), host_sort_result.end(), [] (const double a, const double b) { return a > b; }); + for (data_size_t i = 0; i < static_cast(const_copied_scores.size()); ++i) { + CHECK_EQ(host_sort_result[i], const_copied_scores[cuda_sort_indices[i]]); + } + Log::Warning("bitonic item arg sort items success"); + for (data_size_t i = 0; i < static_cast(const_copied_scores.size()); ++i) { + CHECK_EQ(host_sort_result[i], cuda_sort_scores_to_host[i]); + } + Log::Warning("bitonic sort items success"); + } } } // namespace LightGBM diff --git a/src/metric/cuda/cuda_binary_metric.hpp b/src/metric/cuda/cuda_binary_metric.hpp index e2d0f86b3e1c..9eab91de03d6 100644 --- a/src/metric/cuda/cuda_binary_metric.hpp +++ b/src/metric/cuda/cuda_binary_metric.hpp @@ -108,6 +108,8 @@ class CUDAAUCMetric : public AUCMetric { private: void LaunchEvalKernel(const double* score) const; + void TestCUDABitonicSortForQueryItems() const; + data_size_t* cuda_indices_buffer_; double* cuda_sum_pos_buffer_; double* cuda_block_sum_pos_buffer_; From c6811023e54e9b00bd14b642e84ba882602ce3b3 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 18 Aug 2021 14:22:55 +0000 Subject: [PATCH 056/166] remove debug code --- CMakeLists.txt | 2 + include/LightGBM/cuda/cuda_algorithms.hpp | 2 + src/cuda/cuda_algorithms.cu | 52 +-- src/io/cuda/cuda_tree.cpp | 1 - src/io/cuda/cuda_tree.cu | 9 - src/metric/cuda/cuda_binary_metric.cpp | 9 +- src/metric/cuda/cuda_binary_metric.cu | 376 +--------------------- src/metric/cuda/cuda_binary_metric.hpp | 4 +- 8 files changed, 49 insertions(+), 406 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a64ceb968ee..4060c2ef63da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -352,6 +352,8 @@ if(USE_CUDA) src/boosting/cuda/*.cu 
src/application/cuda/*.cpp src/application/cuda/*.cu + src/metric/cuda/*.cpp + src/metric/cuda/*.cu endif(USE_CUDA) ) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 3e4ae7172b47..855dfd06676f 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -75,7 +75,9 @@ void GloblGenAUCMark(const double* scores, uint16_t* block_mark_first_zero, const data_size_t num_data); +template void GlobalCalcAUC(const double* sum_pos_buffer, + const double* sum_neg_buffer, const data_size_t* mark_buffer, const data_size_t num_data, double* block_buffer); diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 1c7e2b00b851..5488e4d9ee93 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -770,20 +770,7 @@ __global__ void GlobalGenAUCMarkKernel(const double* scores, is_all_non_zero[threadIdx.x] = static_cast(shared_buffer[threadIdx.x]); __syncthreads(); uint16_t block_first_zero = (shared_buffer[threadIdx.x] == 0 ? threadIdx.x : blockDim.x); - /*if (blockIdx.x == 0 && threadIdx.x == 0) { - for (uint32_t i = 0; i < blockDim.x; ++i) { - printf("before prefix sum shared_buffer[%d] = %d\n", i, shared_buffer[i]); - } - }*/ PrefixSumZeroOut(shared_buffer, is_all_non_zero, blockDim.x); - /*if (blockIdx.x == 0 && threadIdx.x == 0) { - for (uint32_t i = 0; i < blockDim.x; ++i) { - const data_size_t local_data_index = static_cast(i + blockIdx.x * blockDim.x); - printf("shared_buffer[%d] = %d, scores[%d] = %f, original_offset = %d\n", i, shared_buffer[i], - sorted_indices[local_data_index], scores[sorted_indices[local_data_index]], - static_cast(local_data_index > 0 && scores[sorted_indices[local_data_index]] == scores[sorted_indices[local_data_index - 1]])); - } - }*/ block_first_zero = ShuffleReduceMin(block_first_zero, shuffle_reduce_shared_buffer, blockDim.x); if (data_index < num_data) { mark_buffer[data_index] = shared_buffer[threadIdx.x + 1]; @@ -882,8 +869,10 @@ void GlobalGenAUCPosNegSum(const label_t* labels, GlobalGenAUCPosNegSumInner(labels, weights, sorted_indices, sum_pos_buffer, block_sum_pos_buffer, num_data); } +template __global__ void GlobalCalcAUCKernel( const double* sum_pos_buffer, + const double* sum_neg_buffer, const data_size_t* mark_buffer, const data_size_t num_data, double* block_buffer) { @@ -894,9 +883,16 @@ __global__ void GlobalCalcAUCKernel( if (data_index == num_data - 1 || mark_buffer[data_index + 1] == 0) { const data_size_t prev_data_index = data_index - mark_buffer[data_index] - 1; const double prev_sum_pos = (prev_data_index < 0 ? 0.0f : sum_pos_buffer[prev_data_index]); - const double cur_pos = sum_pos_buffer[data_index] - prev_sum_pos; - const double cur_neg = static_cast(data_index - prev_data_index) - cur_pos; - area = cur_neg * (cur_pos * 0.5f + prev_sum_pos); + if (USE_WEIGHT) { + const double prev_sum_neg = (prev_data_index < 0 ? 
0.0f : sum_neg_buffer[prev_data_index]); + const double cur_pos = sum_pos_buffer[data_index] - prev_sum_pos; + const double cur_neg = sum_neg_buffer[data_index] - prev_sum_neg; + area = cur_neg * (cur_pos * 0.5f + prev_sum_pos); + } else { + const double cur_pos = sum_pos_buffer[data_index] - prev_sum_pos; + const double cur_neg = static_cast(data_index - prev_data_index) - cur_pos; + area = cur_neg * (cur_pos * 0.5f + prev_sum_pos); + } } } area = ShuffleReduceSum(area, shared_buffer, blockDim.x); @@ -918,13 +914,33 @@ __global__ void BlockReduceSum(T* block_buffer, const data_size_t num_blocks) { } } -void GlobalCalcAUC(const double* sum_pos_buffer, +template +void GlobalCalcAUCInner(const double* sum_pos_buffer, + const double* sum_neg_buffer, const data_size_t* mark_buffer, const data_size_t num_data, double* block_buffer) { const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; - GlobalCalcAUCKernel<<>>(sum_pos_buffer, mark_buffer, num_data, block_buffer); + GlobalCalcAUCKernel<<>>(sum_pos_buffer, sum_neg_buffer, mark_buffer, num_data, block_buffer); BlockReduceSum<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks); } +template <> +void GlobalCalcAUC(const double* sum_pos_buffer, + const double* sum_neg_buffer, + const data_size_t* mark_buffer, + const data_size_t num_data, + double* block_buffer) { + GlobalCalcAUCInner(sum_pos_buffer, sum_neg_buffer, mark_buffer, num_data, block_buffer); +} + +template <> +void GlobalCalcAUC(const double* sum_pos_buffer, + const double* sum_neg_buffer, + const data_size_t* mark_buffer, + const data_size_t num_data, + double* block_buffer) { + GlobalCalcAUCInner(sum_pos_buffer, sum_neg_buffer, mark_buffer, num_data, block_buffer); +} + } // namespace LightGBM diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index aa4f6989a114..cacac6ef0775 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -88,7 +88,6 @@ void CUDATree::InitCUDAMemory() { static_cast(max_leaves_), __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_leaf_parent_, 0, 1, __FILE__, __LINE__); SetCUDAMemoryOuter(cuda_leaf_value_, 0.0f, 1, __FILE__, __LINE__); SetCUDAMemoryOuter(cuda_leaf_weight_, 0.0f, 1, __FILE__, __LINE__); SetCUDAMemoryOuter(cuda_leaf_parent_, -1, 1, __FILE__, __LINE__); diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index 0e5b4cb1c922..dea410e205d5 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -59,9 +59,6 @@ __global__ void SplitKernel(// split information const int new_node_index = num_leaves - 1; const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); const int parent_index = leaf_parent[leaf_index]; - if (parent_index == new_node_index) { - printf("error !!!! parent_index = %d, new_node_index = %d\n", parent_index, new_node_index); - } if (thread_index == 0) { if (parent_index >= 0) { // if cur node is left child @@ -179,7 +176,6 @@ __global__ void AddPredictionToScoreKernel( const data_size_t data_index = USE_INDICES ? 
cuda_used_indices[inner_data_index] : inner_data_index; if (data_index < num_data) { int node = 0; - int iter = 0; while (node >= 0) { const int split_feature_inner = cuda_split_feature_inner[node]; const int column = cuda_feature_to_column[split_feature_inner]; @@ -219,11 +215,6 @@ __global__ void AddPredictionToScoreKernel( node = cuda_right_child[node]; } } - ++iter; - if (iter > 1000) { - printf("error iter = %d\n", iter); - printf("node = %d, cuda_left_child[%d] = %d, cuda_right_child[%d] = %d\n", node, node, cuda_left_child[node], node, cuda_right_child[node]); - } } score[data_index] += cuda_leaf_value[~node]; } diff --git a/src/metric/cuda/cuda_binary_metric.cpp b/src/metric/cuda/cuda_binary_metric.cpp index 6070611a3103..e802ad142508 100644 --- a/src/metric/cuda/cuda_binary_metric.cpp +++ b/src/metric/cuda/cuda_binary_metric.cpp @@ -57,18 +57,17 @@ void CUDAAUCMetric::Init(const Metadata& metadata, data_size_t num_data) { SetCUDAMemoryOuter(cuda_block_mark_first_zero_, 0, 1, __FILE__, __LINE__); cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); cuda_label_ = metadata.cuda_metadata()->cuda_label(); - - TestCUDABitonicSortForQueryItems(); + if (cuda_weights_ != nullptr) { + AllocateCUDAMemoryOuter(&cuda_block_sum_neg_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_block_sum_neg_buffer_, 0, 1, __FILE__, __LINE__); + } } std::vector CUDAAUCMetric::Eval(const double* score, const ObjectiveFunction*) const { - Log::Warning("before evaluate AUC"); LaunchEvalKernel(score); double total_area = 0.0f, sum_pos = 0.0f; CopyFromCUDADeviceToHostOuter(&total_area, cuda_block_sum_pos_buffer_, 1, __FILE__, __LINE__); CopyFromCUDADeviceToHostOuter(&sum_pos, cuda_sum_pos_buffer_ + static_cast(num_data_ - 1), 1, __FILE__, __LINE__); - Log::Warning("sum_pos = %f", sum_pos); - Log::Warning("after evaluate AUC"); if (sum_pos != sum_weights_ && sum_pos > 0.0f) { return std::vector(1, total_area / (sum_pos * (sum_weights_ - sum_pos))); } else { diff --git a/src/metric/cuda/cuda_binary_metric.cu b/src/metric/cuda/cuda_binary_metric.cu index 4a31b1c5f94c..5d70351ff514 100644 --- a/src/metric/cuda/cuda_binary_metric.cu +++ b/src/metric/cuda/cuda_binary_metric.cu @@ -72,385 +72,19 @@ void CUDABinaryMetric::LaunchEvalKernel(const double* sco } void CUDAAUCMetric::LaunchEvalKernel(const double* score) const { - int num_pos = 0; - for (int data_index = 0; data_index < num_data_; ++data_index) { - num_pos += static_cast(label_[data_index]); - } - Log::Warning("sum_pos = %d", num_pos); BitonicArgSortGlobal(score, cuda_indices_buffer_, static_cast(num_data_)); - std::vector host_sorted_indices(num_data_, 0); - std::vector host_score(num_data_, 0.0f); - CopyFromCUDADeviceToHostOuter(host_sorted_indices.data(), cuda_indices_buffer_, static_cast(num_data_), __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(host_score.data(), score, static_cast(num_data_), __FILE__, __LINE__); - //Log::Warning("host_sorted_indices[%d] = %d, host_score[%d] = %f", 0, host_sorted_indices[0], host_score[host_sorted_indices[0]]); - for (int i = 0; i < num_data_ - 1; ++i) { - //Log::Warning("host_sorted_indices[%d] = %d, host_score[%d] = %f", i + 1, host_sorted_indices[i + 1], host_sorted_indices[i + 1], host_score[host_sorted_indices[i + 1]]); - CHECK_GE(host_score[host_sorted_indices[i]], host_score[host_sorted_indices[i + 1]]); - } SetCUDAMemoryOuter(cuda_block_sum_pos_buffer_, 0, 1, __FILE__, __LINE__); if (cuda_weights_ == nullptr) { GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, 
cuda_indices_buffer_, cuda_sum_pos_buffer_, cuda_block_sum_pos_buffer_, num_data_); - std::vector host_cuda_sum_pos_buffer(num_data_); - CopyFromCUDADeviceToHostOuter(host_cuda_sum_pos_buffer.data(), cuda_sum_pos_buffer_, static_cast(num_data_), __FILE__, __LINE__); - double cur_sum_pos = 0.0f; - for (data_size_t data_index = 0; data_index < num_data_; ++data_index) { - cur_sum_pos += static_cast(label_[host_sorted_indices[data_index]] > 0); - CHECK_EQ(cur_sum_pos, host_cuda_sum_pos_buffer[data_index]); - } } else { GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_pos_buffer_, cuda_block_sum_pos_buffer_, num_data_); - Log::Fatal("CUDA AUC with weights is not supported."); + GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_neg_buffer_, cuda_block_sum_neg_buffer_, num_data_); } GloblGenAUCMark(score, cuda_indices_buffer_, cuda_threshold_mark_, cuda_block_threshold_mark_buffer_, cuda_block_mark_first_zero_, num_data_); - std::vector host_threshold_mask(num_data_, 0); - CopyFromCUDADeviceToHostOuter(host_threshold_mask.data(), cuda_threshold_mark_, static_cast(num_data_), __FILE__, __LINE__); - for (int i = 0; i < num_data_; ++i) { - //Log::Warning("host_threshold_mask[%d] = %d", i, host_threshold_mask[i]); - const bool is_valid = i == 0 || host_threshold_mask[i] == 0 || (host_threshold_mask[i] == host_threshold_mask[i - 1] + 1); - if (!is_valid) { - Log::Warning("host_threshold_mask[%d] = %d, host_threshold_mask[%d] = %d", i, host_threshold_mask[i], i - 1, host_threshold_mask[i - 1]); - } - CHECK(is_valid); - if (i > 0) { - const bool should_increase = (host_score[host_sorted_indices[i]] == host_score[host_sorted_indices[i - 1]]); - if (should_increase) { - CHECK_EQ(host_threshold_mask[i], host_threshold_mask[i - 1] + 1); - } else { - CHECK_EQ(host_threshold_mask[i], 0); - } - } - } - GlobalCalcAUC(cuda_sum_pos_buffer_, cuda_threshold_mark_, num_data_, cuda_block_sum_pos_buffer_); -} - -void CUDAAUCMetric::TestCUDABitonicSortForQueryItems() const { - int num_queries = 1000; - std::vector items_per_query(num_queries + 1, 0); - std::vector item_scores; - const int max_item_per_query = 5000; - std::vector num_item_probs(max_item_per_query, 1.0f / max_item_per_query); - std::discrete_distribution num_item_distribution(num_item_probs.begin(), num_item_probs.end()); - std::uniform_real_distribution score_dist; - const int num_threads = OMP_NUM_THREADS(); - std::vector thread_random_engines(num_threads); - for (int thread_index = 0; thread_index < num_threads; ++thread_index) { - thread_random_engines[thread_index] = std::mt19937(thread_index); - } - int num_total_items = 0; - #pragma omp parallel for schedule(static) num_threads(num_threads) reduction(+:num_total_items) - for (int query_index = 0; query_index < num_queries; ++query_index) { - const int thread_index = omp_get_thread_num(); - items_per_query[query_index + 1] = num_item_distribution(thread_random_engines[thread_index]); - num_total_items += items_per_query[query_index + 1]; - } - for (int query_index = 0; query_index < num_queries; ++query_index) { - items_per_query[query_index + 1] += items_per_query[query_index]; - } - item_scores.resize(num_total_items, 0.0f); - #pragma omp parallel for schedule(static) num_threads(num_threads) - for (int item_index = 0; item_index < num_total_items; ++item_index) { - const int thread_index = omp_get_thread_num(); - item_scores[item_index] = score_dist(thread_random_engines[thread_index]); - } - double* cuda_score = nullptr; - 
data_size_t* cuda_query_boundaries = nullptr; - data_size_t* cuda_out_indices = nullptr; - InitCUDAMemoryFromHostMemoryOuter(&cuda_score, item_scores.data(), item_scores.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_query_boundaries, items_per_query.data(), items_per_query.size(), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_out_indices, item_scores.size(), __FILE__, __LINE__); - const auto start = std::chrono::steady_clock::now(); - BitonicArgSortItemsGlobal(cuda_score, num_queries, cuda_query_boundaries, cuda_out_indices); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - const auto end = std::chrono::steady_clock::now(); - const std::chrono::duration duration = static_cast>(end - start); - Log::Warning("bitonic arg sort items global time = %f", duration.count()); - std::vector sorted_item_indices(item_scores.size()); - CopyFromCUDADeviceToHostOuter(sorted_item_indices.data(), cuda_out_indices, item_scores.size(), __FILE__, __LINE__); - std::vector host_sorted_item_indices(item_scores.size()); - PrintLastCUDAErrorOuter(__FILE__, __LINE__); - #pragma omp parallel for schedule(static) num_threads(num_threads) - for (int i = 0; i < num_queries; ++i) { - const int query_start = items_per_query[i]; - const int query_end = items_per_query[i + 1]; - for (int j = query_start; j < query_end; ++j) { - host_sorted_item_indices[j] = j - query_start; - } - std::sort(host_sorted_item_indices.data() + query_start, host_sorted_item_indices.data() + query_end, [&item_scores, query_start] (int a, int b) { - return item_scores[query_start + a] > item_scores[query_start + b]; - }); - } - for (int query_index = 0; query_index < num_queries; ++query_index) { - const int query_start = items_per_query[query_index]; - const int query_end = items_per_query[query_index + 1]; - for (int item_index = query_start; item_index < query_end; ++item_index) { - const double cuda_item_score = item_scores[query_start + sorted_item_indices[item_index]]; - const double host_item_score = item_scores[query_start + host_sorted_item_indices[item_index]]; - if (cuda_item_score != host_item_score) { - Log::Warning("item_index = %d, query_start = %d, cuda_item_score = %f, host_item_score = %f, sorted_item_indices = %d", - item_index, query_start, cuda_item_score, host_item_score, sorted_item_indices[item_index]); - } - } - } - Log::Warning("bitonic argsort items test pass"); - std::vector copied_scores = item_scores; - const std::vector const_copied_scores = item_scores; - std::vector host_indices(item_scores.size()); - #pragma omp parallel for schedule(static) num_threads(num_threads) - for (data_size_t data_index = 0; data_index < static_cast(host_indices.size()); ++data_index) { - host_indices[data_index] = data_index; - } - data_size_t* cuda_indices = nullptr; - AllocateCUDAMemoryOuter(&cuda_indices, item_scores.size(), __FILE__, __LINE__); - BitonicArgSortGlobal(cuda_score, cuda_indices, host_indices.size()); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - PrintLastCUDAErrorOuter(__FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(host_indices.data(), cuda_indices, host_indices.size(), __FILE__, __LINE__); - std::vector host_cuda_score(item_scores.size()); - CopyFromCUDADeviceToHostOuter(host_cuda_score.data(), cuda_score, item_scores.size(), __FILE__, __LINE__); - for (size_t i = 0; i < host_indices.size() - 1; ++i) { - const data_size_t index_1 = host_indices[i]; - const data_size_t index_2 = host_indices[i + 1]; - const double score_1 = host_cuda_score[index_1]; - const double score_2 = 
host_cuda_score[index_2]; - if (score_1 > score_2) { - Log::Warning("error in argsort score_1 = %.20f, score_2 = %.20f", score_1, score_2); - break; - } - } - std::vector host_sort_item_scores = item_scores; - std::sort(host_sort_item_scores.begin(), host_sort_item_scores.end()); - double* new_cuda_score = nullptr; - InitCUDAMemoryFromHostMemoryOuter(&new_cuda_score, item_scores.data(), item_scores.size(), __FILE__, __LINE__); - BitonicSortGlobal(cuda_score, item_scores.size()); - BitonicSortGlobal(new_cuda_score, item_scores.size()); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - PrintLastCUDAErrorOuter(__FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(item_scores.data(), cuda_score, item_scores.size(), __FILE__, __LINE__); - std::vector cuda_score_sorted(item_scores.size()); - CopyFromCUDADeviceToHostOuter(cuda_score_sorted.data(), new_cuda_score, cuda_score_sorted.size(), __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - for (size_t i = 0; i < item_scores.size() - 1; ++i) { - const double score_1 = item_scores[i]; - const double score_2 = item_scores[i + 1]; - if (score_1 > score_2) { - Log::Warning("error in sort score_1 = %.20f, score_2 = %.20f", score_1, score_2); - break; - } - } - for (size_t i = 0; i < cuda_score_sorted.size() - 1; ++i) { - const double score_1 = cuda_score_sorted[i]; - const double score_2 = cuda_score_sorted[i + 1]; - if (score_1 > score_2) { - Log::Warning("error in new sort score_1 = %.20f, score_2 = %.20f", score_1, score_2); - break; - } - } - { - const int num_test_int = 2508113; - std::vector random_ints(num_test_int, 0); - Threading::For(0, num_test_int, 512, [&random_ints, &thread_random_engines, &num_item_distribution] (int thread_index, int start, int end) { - for (int data_index = start; data_index < end; ++data_index) { - random_ints[data_index] = num_item_distribution(thread_random_engines[thread_index]); - } - }); - int* cuda_rand_ints = nullptr; - InitCUDAMemoryFromHostMemoryOuter(&cuda_rand_ints, random_ints.data(), random_ints.size(), __FILE__, __LINE__); - BitonicSortGlobal(cuda_rand_ints, random_ints.size()); - CopyFromCUDADeviceToHostOuter(random_ints.data(), cuda_rand_ints, random_ints.size(), __FILE__, __LINE__); - /*const int segment_length = 1024; - const int num_segments = (num_test_int + segment_length - 1) / segment_length; - for (int segment_index = 0; segment_index < num_segments; ++segment_index) { - const int segment_start = segment_index * segment_length; - const int segment_end = std::min(segment_start + segment_length, num_test_int); - const bool ascending = (segment_index % 2 == 0); - for (int data_index = segment_start; data_index < segment_end - 1; ++data_index) { - const int value_1 = random_ints[data_index]; - const int value_2 = random_ints[data_index + 1]; - if (ascending) { - if (value_1 > value_2) { - Log::Warning("data_index = %d error in first stage of bitonic sort value_1 = %d, value_2 = %d", data_index, value_1, value_2); - } - } else { - if (value_1 < value_2) { - Log::Warning("data_index = %d error in first stage of bitonic sort value_1 = %d, value_2 = %d", data_index, value_1, value_2); - } - } - } - }*/ - for (int i = 0; i < num_test_int - 1; ++i) { - const int value_1 = random_ints[i]; - const int value_2 = random_ints[i + 1]; - if (value_1 > value_2) { - Log::Warning("error in int value_1 = %d, value_2 = %d", value_1, value_2); - break; - } - } - Log::Warning("test int global sort passed"); - } - { - const int num_test_double = 2508113; - std::vector 
random_double(num_test_double, 0); - Threading::For(0, num_test_double, 512, [&random_double, &thread_random_engines, &score_dist] (int thread_index, int start, int end) { - for (int data_index = start; data_index < end; ++data_index) { - if (data_index % 10 == 0) { - random_double[data_index] = random_double[data_index / 10]; - } else { - random_double[data_index] = score_dist(thread_random_engines[thread_index]); - } - } - }); - double* cuda_rand_double = nullptr; - for (int i = 0; i < num_test_double; ++i) { - CHECK_GE(random_double[i], 0.0f); - CHECK_LE(random_double[i], 1.0f); - } - InitCUDAMemoryFromHostMemoryOuter(&cuda_rand_double, random_double.data(), random_double.size(), __FILE__, __LINE__); - BitonicSortGlobal(cuda_rand_double, random_double.size()); - CopyFromCUDADeviceToHostOuter(random_double.data(), cuda_rand_double, random_double.size(), __FILE__, __LINE__); - /*const int segment_length = 1024; - const int num_segments = (num_test_double + segment_length - 1) / segment_length; - for (int segment_index = 0; segment_index < num_segments; ++segment_index) { - const int segment_start = segment_index * segment_length; - const int segment_end = std::min(segment_start + segment_length, num_test_double); - const bool ascending = (segment_index % 2 == 0); - for (int data_index = segment_start; data_index < segment_end - 1; ++data_index) { - const double value_1 = random_double[data_index]; - const double value_2 = random_double[data_index + 1]; - if (ascending) { - if (value_1 > value_2) { - Log::Warning("data_index = %d error in first stage of bitonic sort value_1 = %f, value_2 = %f", data_index, value_1, value_2); - } - } else { - if (value_1 < value_2) { - Log::Warning("data_index = %d error in first stage of bitonic sort value_1 = %f, value_2 = %f", data_index, value_1, value_2); - } - } - } - }*/ - for (int i = 0; i < num_test_double - 1; ++i) { - const double value_1 = random_double[i]; - const double value_2 = random_double[i + 1]; - if (value_1 > value_2) { - Log::Warning("error in double value_1 = %.20f, value_2 = %.20f", value_1, value_2); - break; - } - } - for (int i = 0; i < num_test_double; ++i) { - CHECK_GE(random_double[i], 0.0f); - CHECK_LE(random_double[i], 1.0f); - } - Log::Warning("test doublecd global sort passed"); - } - { - double* cuda_copied_scores = nullptr; - std::vector host_copied_scores = const_copied_scores; - InitCUDAMemoryFromHostMemoryOuter(&cuda_copied_scores, copied_scores.data(), copied_scores.size(), __FILE__, __LINE__); - BitonicSortGlobal(cuda_copied_scores, copied_scores.size()); - std::sort(host_copied_scores.begin(), host_copied_scores.end(), [] (double a, double b) { return a > b; }); - CopyFromCUDADeviceToHostOuter(copied_scores.data(), cuda_copied_scores, copied_scores.size(), __FILE__, __LINE__); - for (int i = 0; i < copied_scores.size(); ++i) { - const double host_value = host_copied_scores[i]; - const double cuda_value = copied_scores[i]; - const double host_sort_value = host_sort_item_scores[i]; - if (host_value != cuda_value) { - Log::Warning("error in sort item scores %f vs %f", host_value, cuda_value); - break; - } - } - } - { - double* cuda_copied_scores = nullptr; - InitCUDAMemoryFromHostMemoryOuter(&cuda_copied_scores, const_copied_scores.data(), const_copied_scores.size(), __FILE__, __LINE__); - data_size_t* cuda_indices = nullptr; - AllocateCUDAMemoryOuter(&cuda_indices, const_copied_scores.size(), __FILE__, __LINE__); - BitonicArgSortGlobal(cuda_copied_scores, cuda_indices, const_copied_scores.size()); - std::vector 
host_indices(const_copied_scores.size()); - const int num_threads = OMP_NUM_THREADS(); - #pragma omp parallel for schedule(static) num_threads(num_threads) - for (data_size_t i = 0; i < static_cast(const_copied_scores.size()); ++i) { - host_indices[i] = i; - } - std::sort(host_indices.begin(), host_indices.end(), - [&const_copied_scores] (data_size_t a, data_size_t b) { return const_copied_scores[a] > const_copied_scores[b]; }); - std::vector host_cuda_indices(const_copied_scores.size()); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(host_cuda_indices.data(), cuda_indices, const_copied_scores.size(), __FILE__, __LINE__); - /*const int segment_length = 1024; - const int num_segments = (static_cast(const_copied_scores.size()) + segment_length - 1) / segment_length; - for (int segment_index = 0; segment_index < num_segments; ++segment_index) { - const int segment_start = segment_index * segment_length; - const int segment_end = std::min(segment_start + segment_length, static_cast(const_copied_scores.size())); - const bool ascending = (segment_index % 2 == 1); - for (data_size_t data_index = segment_start; data_index < segment_end - 1; ++data_index) { - const data_size_t index_1 = host_cuda_indices[data_index]; - const data_size_t index_2 = host_cuda_indices[data_index + 1]; - const double value_1 = const_copied_scores[index_1]; - const double value_2 = const_copied_scores[index_2]; - if (ascending) { - if (value_1 > value_2) { - Log::Warning("error ascending = %d, index_1 = %d, index_2 = %d, value_1 = %f, value_2 = %f", static_cast(ascending), - index_1, index_2, value_1, value_2); - } - } else { - if (value_1 < value_2) { - Log::Warning("error ascending = %d, index_1 = %d, index_2 = %d, value_1 = %f, value_2 = %f", static_cast(ascending), - index_1, index_2, value_1, value_2); - } - } - } - }*/ - BitonicSortGlobal(cuda_copied_scores, const_copied_scores.size()); - std::vector host_cuda_sort_scores(const_copied_scores.size()); - CopyFromCUDADeviceToHostOuter(host_cuda_sort_scores.data(), cuda_copied_scores, const_copied_scores.size(), __FILE__, __LINE__); - for (int i = 0; i < static_cast(const_copied_scores.size()); ++i) { - const double sort_score = host_cuda_sort_scores[i]; - const double argsort_score = const_copied_scores[host_cuda_indices[i]]; - if (sort_score != argsort_score) { - Log::Warning("error sort_score = %.20f, argsort_score = %.20f", sort_score, argsort_score); - } - CHECK_EQ(sort_score, argsort_score); - } - for (int i = 0; i < copied_scores.size(); ++i) { - const data_size_t host_index = host_indices[i]; - const data_size_t cuda_index = host_cuda_indices[i]; - const double host_value = const_copied_scores[host_index]; - const double cuda_value = const_copied_scores[cuda_index]; - if (host_index != cuda_index) { - Log::Warning("i = %d error in arg sort scores %d vs %d, host_value = %f, cuda_value = %f", i, host_index, cuda_index, host_value, cuda_value); - break; - } - } - } - { - std::vector item_argsort_scores = const_copied_scores; - std::vector item_argsort_query_boundaries{0, static_cast(item_scores.size())}; - data_size_t* cuda_query_boundaries = nullptr; - InitCUDAMemoryFromHostMemoryOuter(&cuda_query_boundaries, item_argsort_query_boundaries.data(), item_argsort_query_boundaries.size(), __FILE__, __LINE__); - double* cuda_argsort_scores = nullptr; - InitCUDAMemoryFromHostMemoryOuter(&cuda_argsort_scores, item_argsort_scores.data(), item_argsort_scores.size(), __FILE__, __LINE__); - data_size_t* out_indices = nullptr; - 
AllocateCUDAMemoryOuter(&out_indices, item_argsort_scores.size(), __FILE__, __LINE__); - BitonicArgSortItemsGlobal(cuda_argsort_scores, 1, cuda_query_boundaries, out_indices); - std::vector cuda_sort_indices(item_argsort_scores.size()); - CopyFromCUDADeviceToHostOuter(cuda_sort_indices.data(), out_indices, cuda_sort_indices.size(), __FILE__, __LINE__); - - double* cuda_sort_scores = nullptr; - InitCUDAMemoryFromHostMemoryOuter(&cuda_sort_scores, const_copied_scores.data(), const_copied_scores.size(), __FILE__, __LINE__); - BitonicSortGlobal(cuda_sort_scores, const_copied_scores.size()); - std::vector cuda_sort_scores_to_host(const_copied_scores.size()); - CopyFromCUDADeviceToHostOuter(cuda_sort_scores_to_host.data(), cuda_sort_scores, const_copied_scores.size(), __FILE__, __LINE__); - - std::vector host_sort_result = const_copied_scores; - Log::Warning("num scores = %d", const_copied_scores.size()); - std::sort(host_sort_result.begin(), host_sort_result.end(), [] (const double a, const double b) { return a > b; }); - for (data_size_t i = 0; i < static_cast(const_copied_scores.size()); ++i) { - CHECK_EQ(host_sort_result[i], const_copied_scores[cuda_sort_indices[i]]); - } - Log::Warning("bitonic item arg sort items success"); - for (data_size_t i = 0; i < static_cast(const_copied_scores.size()); ++i) { - CHECK_EQ(host_sort_result[i], cuda_sort_scores_to_host[i]); - } - Log::Warning("bitonic sort items success"); + if (cuda_weights_ == nullptr) { + GlobalCalcAUC(cuda_sum_pos_buffer_, nullptr, cuda_threshold_mark_, num_data_, cuda_block_sum_pos_buffer_); + } else { + GlobalCalcAUC(cuda_sum_pos_buffer_, cuda_sum_neg_buffer_, cuda_threshold_mark_, num_data_, cuda_block_sum_pos_buffer_); } } diff --git a/src/metric/cuda/cuda_binary_metric.hpp b/src/metric/cuda/cuda_binary_metric.hpp index 9eab91de03d6..7c069198d246 100644 --- a/src/metric/cuda/cuda_binary_metric.hpp +++ b/src/metric/cuda/cuda_binary_metric.hpp @@ -108,11 +108,11 @@ class CUDAAUCMetric : public AUCMetric { private: void LaunchEvalKernel(const double* score) const; - void TestCUDABitonicSortForQueryItems() const; - data_size_t* cuda_indices_buffer_; double* cuda_sum_pos_buffer_; double* cuda_block_sum_pos_buffer_; + double* cuda_sum_neg_buffer_; + double* cuda_block_sum_neg_buffer_; data_size_t* cuda_threshold_mark_; data_size_t* cuda_block_threshold_mark_buffer_; uint16_t* cuda_block_mark_first_zero_; From ea605661ec2f442f4033efa90fa402b71d0c1a32 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 19 Aug 2021 05:56:37 +0000 Subject: [PATCH 057/166] add regression metrics --- include/LightGBM/cuda/cuda_algorithms.hpp | 7 + src/boosting/gbdt.cpp | 1 + src/cuda/cuda_algorithms.cu | 61 +++++ src/io/cuda/cuda_tree.cpp | 1 + src/io/cuda/cuda_tree.cu | 5 + src/metric/binary_metric.hpp | 2 +- src/metric/cuda/cuda_binary_metric.cpp | 36 +++ src/metric/cuda/cuda_binary_metric.cu | 17 ++ src/metric/cuda/cuda_binary_metric.hpp | 53 ++-- src/metric/cuda/cuda_regression_metric.cpp | 59 ++++ src/metric/cuda/cuda_regression_metric.cu | 128 +++++++++ src/metric/cuda/cuda_regression_metric.hpp | 258 ++++++++++++++++++ src/metric/metric.cpp | 29 +- src/metric/regression_metric.hpp | 2 +- .../cuda/cuda_regression_objective.cu | 6 +- 15 files changed, 637 insertions(+), 28 deletions(-) create mode 100644 src/metric/cuda/cuda_regression_metric.cpp create mode 100644 src/metric/cuda/cuda_regression_metric.cu create mode 100644 src/metric/cuda/cuda_regression_metric.hpp diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp 
b/include/LightGBM/cuda/cuda_algorithms.hpp index 855dfd06676f..4cb9d7162aa4 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -82,6 +82,13 @@ void GlobalCalcAUC(const double* sum_pos_buffer, const data_size_t num_data, double* block_buffer); +template +void GlobalCalcAveragePrecision(const double* sum_pos_buffer, + const double* sum_neg_buffer, + const data_size_t* mark_buffer, + const data_size_t num_data, + double* block_buffer); + template __device__ void PrefixSum(T* values, size_t n) { unsigned int offset = 1; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index c34c2610720c..e49173beb19e 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -529,6 +529,7 @@ std::vector GBDT::EvalOneMetric(const Metric* metric, const double* scor std::vector tmp_score(num_data * num_class_, 0.0f); CopyFromCUDADeviceToHostOuter(tmp_score.data(), score, static_cast(num_data * num_class_), __FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + Log::Warning("evaluating"); return metric->Eval(tmp_score.data(), objective_function_); } else*/ { return metric->Eval(score, objective_function_); diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 5488e4d9ee93..4cb8489ca317 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -943,4 +943,65 @@ void GlobalCalcAUC(const double* sum_pos_buffer, GlobalCalcAUCInner(sum_pos_buffer, sum_neg_buffer, mark_buffer, num_data, block_buffer); } +template +__global__ void GlobalCalcAveragePrecisionKernel( + const double* sum_pos_buffer, + const double* sum_neg_buffer, + const data_size_t* mark_buffer, + const data_size_t num_data, + double* block_buffer) { + __shared__ double shared_buffer[32]; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + double area = 0.0f; + if (data_index < num_data) { + if (data_index == num_data - 1 || mark_buffer[data_index + 1] == 0) { + const data_size_t prev_data_index = data_index - mark_buffer[data_index] - 1; + const double prev_sum_pos = (prev_data_index < 0 ? 0.0f : sum_pos_buffer[prev_data_index]); + if (USE_WEIGHT) { + const double prev_sum_neg = (prev_data_index < 0 ? 
0.0f : sum_neg_buffer[prev_data_index]); + const double cur_pos = sum_pos_buffer[data_index] - prev_sum_pos; + const double cur_neg = sum_neg_buffer[data_index] - prev_sum_neg; + area = cur_pos * (cur_pos + prev_sum_pos) / (prev_sum_neg + prev_sum_pos + cur_pos + cur_neg); + } else { + const double cur_pos = sum_pos_buffer[data_index] - prev_sum_pos; + const double cur_neg = static_cast(data_index - prev_data_index) - cur_pos; + area = cur_pos * (cur_pos + prev_sum_pos) / static_cast(data_index + 1); + } + } + } + area = ShuffleReduceSum(area, shared_buffer, blockDim.x); + if (threadIdx.x == 0) { + block_buffer[blockIdx.x] = area; + } +} + +template +void GlobalCalcAveragePrecisionInner(const double* sum_pos_buffer, + const double* sum_neg_buffer, + const data_size_t* mark_buffer, + const data_size_t num_data, + double* block_buffer) { + const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; + GlobalCalcAveragePrecisionKernel<<>>(sum_pos_buffer, sum_neg_buffer, mark_buffer, num_data, block_buffer); + BlockReduceSum<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks); +} + +template <> +void GlobalCalcAveragePrecision(const double* sum_pos_buffer, + const double* sum_neg_buffer, + const data_size_t* mark_buffer, + const data_size_t num_data, + double* block_buffer) { + GlobalCalcAveragePrecisionInner(sum_pos_buffer, sum_neg_buffer, mark_buffer, num_data, block_buffer); +} + +template <> +void GlobalCalcAveragePrecision(const double* sum_pos_buffer, + const double* sum_neg_buffer, + const data_size_t* mark_buffer, + const data_size_t num_data, + double* block_buffer) { + GlobalCalcAveragePrecisionInner(sum_pos_buffer, sum_neg_buffer, mark_buffer, num_data, block_buffer); +} + } // namespace LightGBM diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index cacac6ef0775..9e5a9e87ebe4 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -92,6 +92,7 @@ void CUDATree::InitCUDAMemory() { SetCUDAMemoryOuter(cuda_leaf_weight_, 0.0f, 1, __FILE__, __LINE__); SetCUDAMemoryOuter(cuda_leaf_parent_, -1, 1, __FILE__, __LINE__); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } void CUDATree::InitCUDA() { diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index dea410e205d5..450081e9bea2 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -176,6 +176,7 @@ __global__ void AddPredictionToScoreKernel( const data_size_t data_index = USE_INDICES ? cuda_used_indices[inner_data_index] : inner_data_index; if (data_index < num_data) { int node = 0; + int iter = 0; while (node >= 0) { const int split_feature_inner = cuda_split_feature_inner[node]; const int column = cuda_feature_to_column[split_feature_inner]; @@ -215,6 +216,10 @@ __global__ void AddPredictionToScoreKernel( node = cuda_right_child[node]; } } + ++iter; + if (iter > 1000) { + printf("error iter = %d\n", iter); + } } score[data_index] += cuda_leaf_value[~node]; } diff --git a/src/metric/binary_metric.hpp b/src/metric/binary_metric.hpp index b4df28b781ae..c75fd75bcf1e 100644 --- a/src/metric/binary_metric.hpp +++ b/src/metric/binary_metric.hpp @@ -371,7 +371,7 @@ class AveragePrecisionMetric: public Metric { return std::vector(1, ap); } - private: + protected: /*! \brief Number of data */ data_size_t num_data_; /*! 
\brief Pointer of label */ diff --git a/src/metric/cuda/cuda_binary_metric.cpp b/src/metric/cuda/cuda_binary_metric.cpp index e802ad142508..df4d4f975369 100644 --- a/src/metric/cuda/cuda_binary_metric.cpp +++ b/src/metric/cuda/cuda_binary_metric.cpp @@ -75,4 +75,40 @@ std::vector CUDAAUCMetric::Eval(const double* score, const ObjectiveFunc } } +CUDAAveragePrecisionMetric::CUDAAveragePrecisionMetric(const Config& config): AveragePrecisionMetric(config) {} + +CUDAAveragePrecisionMetric::~CUDAAveragePrecisionMetric() {} + +void CUDAAveragePrecisionMetric::Init(const Metadata& metadata, data_size_t num_data) { + AveragePrecisionMetric::Init(metadata, num_data); + AllocateCUDAMemoryOuter(&cuda_indices_buffer_, static_cast(num_data), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_sum_pos_buffer_, static_cast(num_data), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_threshold_mark_, static_cast(num_data), __FILE__, __LINE__); + const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_BINARY_METRIC - 1) / EVAL_BLOCK_SIZE_BINARY_METRIC; + AllocateCUDAMemoryOuter(&cuda_block_sum_pos_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_block_sum_pos_buffer_, 0, 1, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_block_threshold_mark_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_block_threshold_mark_buffer_, 0, 1, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_block_mark_first_zero_, static_cast(num_blocks) + 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_block_mark_first_zero_, 0, 1, __FILE__, __LINE__); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); + cuda_label_ = metadata.cuda_metadata()->cuda_label(); + if (cuda_weights_ != nullptr) { + AllocateCUDAMemoryOuter(&cuda_block_sum_neg_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_block_sum_neg_buffer_, 0, 1, __FILE__, __LINE__); + } +} + +std::vector CUDAAveragePrecisionMetric::Eval(const double* score, const ObjectiveFunction*) const { + LaunchEvalKernel(score); + double total_area = 0.0f, sum_pos = 0.0f; + CopyFromCUDADeviceToHostOuter(&total_area, cuda_block_sum_pos_buffer_, 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(&sum_pos, cuda_sum_pos_buffer_ + static_cast(num_data_ - 1), 1, __FILE__, __LINE__); + if (sum_pos != sum_weights_ && sum_pos > 0.0f) { + return std::vector(1, total_area / sum_pos); + } else { + return std::vector(1, 1.0f); + } +} + } // namespace LightGBM diff --git a/src/metric/cuda/cuda_binary_metric.cu b/src/metric/cuda/cuda_binary_metric.cu index 5d70351ff514..b15df092d48a 100644 --- a/src/metric/cuda/cuda_binary_metric.cu +++ b/src/metric/cuda/cuda_binary_metric.cu @@ -88,4 +88,21 @@ void CUDAAUCMetric::LaunchEvalKernel(const double* score) const { } } +void CUDAAveragePrecisionMetric::LaunchEvalKernel(const double* score) const { + BitonicArgSortGlobal(score, cuda_indices_buffer_, static_cast(num_data_)); + SetCUDAMemoryOuter(cuda_block_sum_pos_buffer_, 0, 1, __FILE__, __LINE__); + if (cuda_weights_ == nullptr) { + GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_pos_buffer_, cuda_block_sum_pos_buffer_, num_data_); + } else { + GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_pos_buffer_, cuda_block_sum_pos_buffer_, num_data_); + GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_neg_buffer_, cuda_block_sum_neg_buffer_, num_data_); + } + GloblGenAUCMark(score, 
cuda_indices_buffer_, cuda_threshold_mark_, cuda_block_threshold_mark_buffer_, cuda_block_mark_first_zero_, num_data_); + if (cuda_weights_ == nullptr) { + GlobalCalcAveragePrecision(cuda_sum_pos_buffer_, nullptr, cuda_threshold_mark_, num_data_, cuda_block_sum_pos_buffer_); + } else { + GlobalCalcAveragePrecision(cuda_sum_pos_buffer_, cuda_sum_neg_buffer_, cuda_threshold_mark_, num_data_, cuda_block_sum_pos_buffer_); + } +} + } // namespace LightGBM diff --git a/src/metric/cuda/cuda_binary_metric.hpp b/src/metric/cuda/cuda_binary_metric.hpp index 7c069198d246..f28ee59092c6 100644 --- a/src/metric/cuda/cuda_binary_metric.hpp +++ b/src/metric/cuda/cuda_binary_metric.hpp @@ -23,6 +23,11 @@ class CUDABinaryMetric : public CUDAMetricInterface, public BinaryMetric Eval(const double* score, const ObjectiveFunction* objective) const override; + inline static double LossOnPoint(label_t /*label*/, double /*score*/) { + Log::Fatal("Calling host LossOnPoint for a CUDA metric."); + return 0.0f; + } + protected: void LaunchEvalKernel(const double* score) const; @@ -39,19 +44,6 @@ class CUDABinaryLoglossMetric : public CUDABinaryMetric public: explicit CUDABinaryLoglossMetric(const Config& config); - inline static double LossOnPoint(label_t label, double prob) { - if (label <= 0) { - if (1.0f - prob > kEpsilon) { - return -std::log(1.0f - prob); - } - } else { - if (prob > kEpsilon) { - return -std::log(prob); - } - } - return -std::log(kEpsilon); - } - __device__ inline static double LossOnPointCUDA(label_t label, double prob) { if (label <= 0) { if (1.0f - prob > kEpsilon) { @@ -74,14 +66,6 @@ class CUDABinaryErrorMetric: public CUDABinaryMetric { public: explicit CUDABinaryErrorMetric(const Config& config); - inline static double LossOnPoint(label_t label, double prob) { - if (prob <= 0.5f) { - return label > 0; - } else { - return label <= 0; - } - } - __device__ inline static double LossOnPointCUDA(label_t label, double prob) { if (prob <= 0.5f) { return label > 0; @@ -95,7 +79,7 @@ class CUDABinaryErrorMetric: public CUDABinaryMetric { } }; -class CUDAAUCMetric : public AUCMetric { +class CUDAAUCMetric : public CUDAMetricInterface, public AUCMetric { public: CUDAAUCMetric(const Config& config); @@ -120,6 +104,31 @@ class CUDAAUCMetric : public AUCMetric { const label_t* cuda_weights_; }; +class CUDAAveragePrecisionMetric : public CUDAMetricInterface, public AveragePrecisionMetric { + public: + explicit CUDAAveragePrecisionMetric(const Config&); + + ~CUDAAveragePrecisionMetric(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + std::vector Eval(const double* score, const ObjectiveFunction*) const override; + + private: + void LaunchEvalKernel(const double* score) const; + + data_size_t* cuda_indices_buffer_; + double* cuda_sum_pos_buffer_; + double* cuda_block_sum_pos_buffer_; + double* cuda_sum_neg_buffer_; + double* cuda_block_sum_neg_buffer_; + data_size_t* cuda_threshold_mark_; + data_size_t* cuda_block_threshold_mark_buffer_; + uint16_t* cuda_block_mark_first_zero_; + const label_t* cuda_label_; + const label_t* cuda_weights_; +}; + } // namespace LightGBM #endif // LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_ diff --git a/src/metric/cuda/cuda_regression_metric.cpp b/src/metric/cuda/cuda_regression_metric.cpp new file mode 100644 index 000000000000..325f6e214351 --- /dev/null +++ b/src/metric/cuda/cuda_regression_metric.cpp @@ -0,0 +1,59 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. 
See LICENSE file in the project root for license information. + */ + +#include "cuda_regression_metric.hpp" + +namespace LightGBM { + +template +CUDARegressionMetric::CUDARegressionMetric(const Config& config): RegressionMetric(config) {} + +template +CUDARegressionMetric::~CUDARegressionMetric() {} + +template +void CUDARegressionMetric::Init(const Metadata& metadata, data_size_t num_data) { + RegressionMetric::Init(metadata, num_data); + cuda_label_ = metadata.cuda_metadata()->cuda_label(); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); + + const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_REGRESSION_METRIC - 1) / EVAL_BLOCK_SIZE_REGRESSION_METRIC; + AllocateCUDAMemoryOuter(&cuda_sum_loss_buffer_, static_cast(num_blocks), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_score_convert_buffer_, static_cast(num_data), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_sum_loss_, 1, __FILE__, __LINE__); +} + +template +std::vector CUDARegressionMetric::Eval(const double* score, const ObjectiveFunction* objective) const { + double sum_loss = 0.0f; + objective->GetCUDAConvertOutputFunc()(this->num_data_, score, cuda_score_convert_buffer_); + LaunchEvalKernel(cuda_score_convert_buffer_); + CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); + return std::vector(1, CUDAPointWiseLossCalculator::AverageLoss(sum_loss, this->sum_weights_)); +} + +CUDARMSEMetric::CUDARMSEMetric(const Config& config): CUDARegressionMetric(config) {} + +CUDAL2Metric::CUDAL2Metric(const Config& config): CUDARegressionMetric(config) {} + +CUDAL1Metric::CUDAL1Metric(const Config& config): CUDARegressionMetric(config) {} + +CUDAQuantileMetric::CUDAQuantileMetric(const Config& config): CUDARegressionMetric(config) {} + +CUDAHuberLossMetric::CUDAHuberLossMetric(const Config& config): CUDARegressionMetric(config) {} + +CUDAFairLossMetric::CUDAFairLossMetric(const Config& config): CUDARegressionMetric(config) {} + +CUDAPoissonMetric::CUDAPoissonMetric(const Config& config): CUDARegressionMetric(config) {} + +CUDAMAPEMetric::CUDAMAPEMetric(const Config& config): CUDARegressionMetric(config) {} + +CUDAGammaMetric::CUDAGammaMetric(const Config& config): CUDARegressionMetric(config) {} + +CUDAGammaDevianceMetric::CUDAGammaDevianceMetric(const Config& config): CUDARegressionMetric(config) {} + +CUDATweedieMetric::CUDATweedieMetric(const Config& config): CUDARegressionMetric(config) {} + +} // namespace LightGBM diff --git a/src/metric/cuda/cuda_regression_metric.cu b/src/metric/cuda/cuda_regression_metric.cu new file mode 100644 index 000000000000..2b5a0f12ed0f --- /dev/null +++ b/src/metric/cuda/cuda_regression_metric.cu @@ -0,0 +1,128 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + + +#include +#include "cuda_regression_metric.hpp" + +namespace LightGBM { + +template +__global__ void EvalKernel_RegressionPointWiseLoss(const double* score, + const label_t* label, + const label_t* weights, + const data_size_t num_data, + const double sum_weight, + double* cuda_sum_loss_buffer, + const double alpha, + const double fair_c, + const double tweedie_variance_power) { + // assert that warpSize == 32 and maximum number of threads per block is 1024 + __shared__ double shared_buffer[32]; + const int data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const double pointwise_loss = data_index < num_data ? + (USE_WEIGHT ? 
CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score[data_index], alpha, fair_c, tweedie_variance_power) * weights[data_index] : + CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score[data_index], alpha, fair_c, tweedie_variance_power)) : + 0.0f; + const double loss = ShuffleReduceSum(pointwise_loss, shared_buffer, blockDim.x); + if (threadIdx.x == 0) { + cuda_sum_loss_buffer[blockIdx.x] = loss; + } +} + +template +__global__ void ReduceLossKernel_Regression(const double* cuda_sum_loss_buffer, const data_size_t num_blocks, double* out_loss) { + __shared__ double shared_buffer[32]; + double thread_sum_loss = 0.0f; + for (int block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { + thread_sum_loss += cuda_sum_loss_buffer[block_index]; + } + const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, static_cast(num_blocks)); + if (threadIdx.x == 0) { + *out_loss = sum_loss; + } +} + +template +void CUDARegressionMetric::LaunchEvalKernelInner(const double* score) const { + const data_size_t num_blocks = (RegressionMetric::num_data_ + EVAL_BLOCK_SIZE_REGRESSION_METRIC - 1) / EVAL_BLOCK_SIZE_REGRESSION_METRIC; + if (cuda_weights_ == nullptr) { + EvalKernel_RegressionPointWiseLoss<<>>( + score, cuda_label_, cuda_weights_, + this->num_data_, + this->sum_weights_, + cuda_sum_loss_buffer_, + this->config_.alpha, + this->config_.fair_c, + this->config_.tweedie_variance_power); + } else { + EvalKernel_RegressionPointWiseLoss<<>>( + score, cuda_label_, cuda_weights_, + this->num_data_, + this->sum_weights_, + cuda_sum_loss_buffer_, + this->config_.alpha, + this->config_.fair_c, + this->config_.tweedie_variance_power); + } + ReduceLossKernel_Regression<<<1, EVAL_BLOCK_SIZE_REGRESSION_METRIC>>>(cuda_sum_loss_buffer_, num_blocks, cuda_sum_loss_); +} + +template <> +void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +template <> +void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +template <> +void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +template <> +void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +template <> +void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +template <> +void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +template <> +void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +template <> +void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +template <> +void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +template <> +void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +template <> +void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +} // namespace LightGBM diff --git a/src/metric/cuda/cuda_regression_metric.hpp b/src/metric/cuda/cuda_regression_metric.hpp new file mode 100644 index 000000000000..fe82d2739daf --- /dev/null +++ b/src/metric/cuda/cuda_regression_metric.hpp @@ -0,0 +1,258 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. 
All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ +#ifndef LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_ +#define LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_ + +#include "cuda_metric.hpp" +#include "../regression_metric.hpp" + +#define EVAL_BLOCK_SIZE_REGRESSION_METRIC (1024) + +namespace LightGBM { + +// TODO(shiyu1994): merge CUDARegressionMetric and CUDABinaryLossMetric into CUDAPointWiseMetric +template +class CUDARegressionMetric : public CUDAMetricInterface, public RegressionMetric { + public: + explicit CUDARegressionMetric(const Config& config); + + ~CUDARegressionMetric(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + std::vector Eval(const double* score, const ObjectiveFunction* objective) const override; + + inline static double AverageLoss(double sum_loss, double sum_weights) { + // need sqrt the result for RMSE loss + return (sum_loss / sum_weights); + } + + inline static double LossOnPoint(label_t /*label*/, double /*score*/, const Config& /*config*/) { + Log::Fatal("Calling host LossOnPoint for a CUDA metric."); + return 0.0f; + } + + protected: + void LaunchEvalKernel(const double* score) const; + + void LaunchEvalKernelInner(const double* score) const; + + __device__ inline static double SafeLogCUDA(const double x) { + if (x > 0) { + return log(x); + } else { + return -INFINITY; + } + } + + const label_t* cuda_label_; + const label_t* cuda_weights_; + double* cuda_score_convert_buffer_; + double* cuda_sum_loss_buffer_; + double* cuda_sum_loss_; +}; + +class CUDARMSEMetric : public CUDARegressionMetric { + public: + explicit CUDARMSEMetric(const Config& config); + + __device__ inline static double LossOnPointCUDA(label_t label, double score, + const double /*alpha*/, const double /*fair_c*/, const double /*tweedie_variance_power*/) { + return (score - label) * (score - label); + } + + inline static double AverageLoss(double sum_loss, double sum_weights) { + // need sqrt the result for RMSE loss + return std::sqrt(sum_loss / sum_weights); + } + + inline static const char* Name() { + return "rmse"; + } +}; + +class CUDAL2Metric : public CUDARegressionMetric { + public: + explicit CUDAL2Metric(const Config& config); + + __device__ inline static double LossOnPointCUDA(label_t label, double score, + const double /*alpha*/, const double /*fair_c*/, const double /*tweedie_variance_power*/) { + return (score - label)*(score - label); + } + + inline static const char* Name() { + return "l2"; + } +}; + +class CUDAQuantileMetric : public CUDARegressionMetric { + public: + explicit CUDAQuantileMetric(const Config& config); + + __device__ inline static double LossOnPointCUDA(label_t label, double score, + const double alpha, const double /*fair_c*/, const double /*tweedie_variance_power*/) { + double delta = label - score; + if (delta < 0) { + return (alpha - 1.0f) * delta; + } else { + return alpha * delta; + } + } + + inline static const char* Name() { + return "quantile"; + } +}; + +class CUDAL1Metric : public CUDARegressionMetric { + public: + explicit CUDAL1Metric(const Config& config); + + __device__ inline static double LossOnPointCUDA(label_t label, double score, + const double /*alpha*/, const double /*fair_c*/, const double /*tweedie_variance_power*/) { + return fabs(score - label); + } + + inline static const char* Name() { + return "l1"; + } +}; + +class CUDAHuberLossMetric : public CUDARegressionMetric { + public: + explicit CUDAHuberLossMetric(const Config& 
config); + + __device__ inline static double LossOnPointCUDA(label_t label, double score, + const double alpha, const double /*fair_c*/, const double /*tweedie_variance_power*/) { + const double diff = score - label; + if (fabs(diff) <= alpha) { + return 0.5f * diff * diff; + } else { + return alpha * (fabs(diff) - 0.5f * alpha); + } + } + + inline static const char* Name() { + return "huber"; + } +}; + +class CUDAFairLossMetric: public CUDARegressionMetric { + public: + explicit CUDAFairLossMetric(const Config& config); + + __device__ inline static double LossOnPointCUDA(label_t label, double score, + const double /*alpha*/, const double fair_c, const double /*tweedie_variance_power*/) { + const double x = fabs(score - label); + const double c = fair_c; + return c * x - c * c * log(1.0f + x / c); + } + + inline static const char* Name() { + return "fair"; + } +}; + +class CUDAPoissonMetric: public CUDARegressionMetric { + public: + explicit CUDAPoissonMetric(const Config& config); + + __device__ inline static double LossOnPointCUDA(label_t label, double score, + const double /*alpha*/, const double /*fair_c*/, const double /*tweedie_variance_power*/) { + const double eps = 1e-10f; + if (score < eps) { + score = eps; + } + return score - label * log(score); + } + + inline static const char* Name() { + return "poisson"; + } +}; + +class CUDAMAPEMetric : public CUDARegressionMetric { + public: + explicit CUDAMAPEMetric(const Config& config); + + __device__ inline static double LossOnPointCUDA(label_t label, double score, + const double /*alpha*/, const double /*fair_c*/, const double /*tweedie_variance_power*/) { + return fabs((label - score)) / fmax(1.0f, fabs(label)); + } + inline static const char* Name() { + return "mape"; + } +}; + +class CUDAGammaMetric : public CUDARegressionMetric { + public: + explicit CUDAGammaMetric(const Config& config); + + __device__ inline static double LossOnPointCUDA(label_t label, double score, + const double /*alpha*/, const double /*fair_c*/, const double /*tweedie_variance_power*/) { + const double psi = 1.0; + const double theta = -1.0 / score; + const double a = psi; + const double b = -SafeLogCUDA(-theta); + const double c = 1. 
/ psi * SafeLogCUDA(label / psi) - SafeLogCUDA(label) - 0; // 0 = std::lgamma(1.0 / psi) = std::lgamma(1.0); + return -((label * theta - b) / a + c); + } + inline static const char* Name() { + return "gamma"; + } + + inline static void CheckLabel(label_t label) { + CHECK_GT(label, 0); + } +}; + +class CUDAGammaDevianceMetric : public CUDARegressionMetric { + public: + explicit CUDAGammaDevianceMetric(const Config& config); + + __device__ inline static double LossOnPointCUDA(label_t label, double score, + const double /*alpha*/, const double /*fair_c*/, const double /*tweedie_variance_power*/) { + const double epsilon = 1.0e-9; + const double tmp = label / (score + epsilon); + return tmp - SafeLogCUDA(tmp) - 1; + } + + inline static const char* Name() { + return "gamma_deviance"; + } + + inline static double AverageLoss(double sum_loss, double) { + return sum_loss * 2; + } + + inline static void CheckLabel(label_t label) { + CHECK_GT(label, 0); + } +}; + +class CUDATweedieMetric : public CUDARegressionMetric { + public: + explicit CUDATweedieMetric(const Config& config); + + __device__ inline static double LossOnPointCUDA(label_t label, double score, + const double /*alpha*/, const double /*fair_c*/, const double tweedie_variance_power) { + const double rho = tweedie_variance_power; + const double eps = 1e-10f; + if (score < eps) { + score = eps; + } + const double a = label * exp((1 - rho) * log(score)) / (1 - rho); + const double b = exp((2 - rho) * log(score)) / (2 - rho); + return -a + b; + } + + inline static const char* Name() { + return "tweedie"; + } +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_ diff --git a/src/metric/metric.cpp b/src/metric/metric.cpp index 9a3b41547209..82fb4416805c 100644 --- a/src/metric/metric.cpp +++ b/src/metric/metric.cpp @@ -12,17 +12,44 @@ #include "xentropy_metric.hpp" #include "cuda/cuda_binary_metric.hpp" +#include "cuda/cuda_regression_metric.hpp" namespace LightGBM { Metric* Metric::CreateMetric(const std::string& type, const Config& config) { if (config.device_type == std::string("cuda")) { - if (type == std::string("binary_logloss")) { + if (type == std::string("l2")) { + return new CUDAL2Metric(config); + } else if (type == std::string("rmse")) { + return new CUDARMSEMetric(config); + } else if (type == std::string("rmse")) { + return new CUDARMSEMetric(config); + } else if (type == std::string("l1")) { + return new CUDAL1Metric(config); + } else if (type == std::string("quantile")) { + return new CUDAQuantileMetric(config); + } else if (type == std::string("huber")) { + return new CUDAHuberLossMetric(config); + } else if (type == std::string("fair")) { + return new CUDAFairLossMetric(config); + } else if (type == std::string("poisson")) { + return new CUDAPoissonMetric(config); + } else if (type == std::string("binary_logloss")) { return new CUDABinaryLoglossMetric(config); } else if (type == std::string("binary_error")) { return new CUDABinaryErrorMetric(config); } else if (type == std::string("auc")) { return new CUDAAUCMetric(config); + } else if (type == std::string("average_precision")) { + return new CUDAAveragePrecisionMetric(config); + } else if (type == std::string("mape")) { + return new CUDAMAPEMetric(config); + } else if (type == std::string("gamma")) { + return new CUDAGammaMetric(config); + } else if (type == std::string("gamma_deviance")) { + return new CUDAGammaDevianceMetric(config); + } else if (type == std::string("tweedie")) { + return new CUDATweedieMetric(config); } } else { if 
(type == std::string("l2")) { diff --git a/src/metric/regression_metric.hpp b/src/metric/regression_metric.hpp index 4d1a36621424..69b52f90305d 100644 --- a/src/metric/regression_metric.hpp +++ b/src/metric/regression_metric.hpp @@ -101,7 +101,7 @@ class RegressionMetric: public Metric { inline static void CheckLabel(label_t) { } - private: + protected: /*! \brief Number of data */ data_size_t num_data_; /*! \brief Pointer of label */ diff --git a/src/objective/cuda/cuda_regression_objective.cu b/src/objective/cuda/cuda_regression_objective.cu index 002610a7bae1..3bdb78057007 100644 --- a/src/objective/cuda/cuda_regression_objective.cu +++ b/src/objective/cuda/cuda_regression_objective.cu @@ -69,10 +69,10 @@ __global__ void ConvertOutputCUDAKernel(const bool sqrt, const data_size_t num_d const int data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); if (data_index < num_data) { if (sqrt) { - const double sign = input[0] >= 0.0f ? 1 : -1; - output[0] = sign * input[0] * input[0]; + const double sign = input[data_index] >= 0.0f ? 1 : -1; + output[data_index] = sign * input[data_index] * input[data_index]; } else { - output[0] = input[0]; + output[data_index] = input[data_index]; } } } From 5c84788fc57ecec2ebf1b8c1283d8c7916fe6531 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 19 Aug 2021 05:57:52 +0000 Subject: [PATCH 058/166] remove useless file --- src/cuda/from_git | 146 ---------------------------------------------- 1 file changed, 146 deletions(-) delete mode 100644 src/cuda/from_git diff --git a/src/cuda/from_git b/src/cuda/from_git deleted file mode 100644 index 2263cf1869e1..000000000000 --- a/src/cuda/from_git +++ /dev/null @@ -1,146 +0,0 @@ -template -__global__ void BitonicSortGlobalKernel(T* values, const int num_total_data) { - const int thread_index = static_cast(threadIdx.x); - const int low = static_cast(blockIdx.x * BITONIC_SORT_NUM_ELEMENTS); - const bool outer_ascending = ASCENDING ? (blockIdx.x % 2 == 0) : (blockIdx.x % 2 == 1); - T* values_pointer = values + low; - const int num_data = min(BITONIC_SORT_NUM_ELEMENTS, num_total_data - low); - __shared__ T shared_values[BITONIC_SORT_NUM_ELEMENTS]; - if (thread_index < num_data) { - shared_values[thread_index] = values_pointer[thread_index]; - } - __syncthreads(); - for (int depth = BITONIC_SORT_DEPTH - 1; depth >= 1; --depth) { - const int segment_length = 1 << (BITONIC_SORT_DEPTH - depth); - const int segment_index = thread_index / segment_length; - const bool ascending = outer_ascending ? (segment_index % 2 == 0) : (segment_index % 2 == 1); - const int num_total_segment = (num_data + segment_length - 1) / segment_length; - { - const int inner_depth = depth; - const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); - const int inner_segment_index_half = thread_index / inner_segment_length_half; - const int offset = ((inner_segment_index_half >> 1) == num_total_segment - 1 && ascending == ASCENDING) ? 
- (num_total_segment * segment_length - num_data) : 0; - const int segment_start = segment_index * segment_length; - if (inner_segment_index_half % 2 == 0) { - if (thread_index >= offset + segment_start) { - const int index_to_compare = thread_index + inner_segment_length_half - offset; - if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { - const T tmp = shared_values[thread_index]; - shared_values[thread_index] = shared_values[index_to_compare]; - shared_values[index_to_compare] = tmp; - } - } - } - __syncthreads(); - } - for (int inner_depth = depth + 1; inner_depth < BITONIC_SORT_DEPTH; ++inner_depth) { - const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); - const int inner_segment_index_half = thread_index / inner_segment_length_half; - if (inner_segment_index_half % 2 == 0) { - const int index_to_compare = thread_index + inner_segment_length_half; - if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { - const T tmp = shared_values[thread_index]; - shared_values[thread_index] = shared_values[index_to_compare]; - shared_values[index_to_compare] = tmp; - } - } - __syncthreads(); - } - } - if (thread_index < num_data) { - values_pointer[thread_index] = shared_values[thread_index]; - } -} - -template -__global__ void BitonicSortMergeKernel(VAL_T* values, const int segment_length, const int len) { - const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const int segment_index = thread_index / segment_length; - const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); - __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS]; - const int offset = static_cast(blockIdx.x * blockDim.x); - const int local_len = min(BITONIC_SORT_NUM_ELEMENTS, len - offset); - if (thread_index < len) { - shared_values[threadIdx.x] = values[thread_index]; - } - __syncthreads(); - int half_segment_length = BITONIC_SORT_NUM_ELEMENTS / 2; - while (half_segment_length >= 1) { - const int half_segment_index = static_cast(threadIdx.x) / half_segment_length; - if (half_segment_index % 2 == 0) { - const int index_to_compare = static_cast(threadIdx.x) + half_segment_length; - if (index_to_compare < local_len && ((shared_values[threadIdx.x] > shared_values[index_to_compare]) == ascending)) { - const VAL_T tmp = shared_values[index_to_compare]; - shared_values[index_to_compare] = shared_values[threadIdx.x]; - shared_values[threadIdx.x] = tmp; - } - } - __syncthreads(); - half_segment_length >>= 1; - } - if (thread_index < len) { - values[thread_index] = shared_values[threadIdx.x]; - } -} - -template -__global__ void BitonicCompareKernel(VAL_T* values, const int half_segment_length, const int outer_segment_length, const int len) { - const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const int segment_index = thread_index / outer_segment_length; - const int half_segment_index = thread_index / half_segment_length; - const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); - if (half_segment_index % 2 == 0) { - const int num_total_segment = (len + outer_segment_length - 1) / outer_segment_length; - if (BEGIN && (half_segment_index >> 1) == num_total_segment - 1 && ascending == ASCENDING) { - const int offset = num_total_segment * outer_segment_length - len; - const int segment_start = segment_index * outer_segment_length; - if (thread_index >= offset + segment_start) { - const int index_to_compare = thread_index + half_segment_length - offset; - if (index_to_compare < len && (values[thread_index] > values[index_to_compare]) == ascending) { - const VAL_T tmp = values[index_to_compare]; - values[index_to_compare] = values[thread_index]; - values[thread_index] = tmp; - } - } - } else { - const int index_to_compare = thread_index + half_segment_length; - if (index_to_compare < len) { - if ((values[thread_index] > values[index_to_compare]) == ascending) { - const VAL_T tmp = values[index_to_compare]; - values[index_to_compare] = values[thread_index]; - values[thread_index] = tmp; - } - } - } - } -} - -template -void BitonicSortGlobalHelper(VAL_T* values, const size_t len) { - int max_depth = 1; - int len_to_shift = static_cast(len) - 1; - while (len_to_shift > 0) { - ++max_depth; - len_to_shift >>= 1; - } - const int num_blocks = (static_cast(len) + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; - BitonicSortGlobalKernel<<>>(values, static_cast(len)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - for (int depth = max_depth - 11; depth >= 1; --depth) { - const int segment_length = (1 << (max_depth - depth)); - int half_segment_length = (segment_length >> 1); - { - BitonicCompareKernel<<>>(values, half_segment_length, segment_length, static_cast(len)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - half_segment_length >>= 1; - } - for (int inner_depth = depth + 1; inner_depth <= max_depth - 11; ++inner_depth) { - BitonicCompareKernel<<>>(values, half_segment_length, segment_length, static_cast(len)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - half_segment_length >>= 1; - } - BitonicSortMergeKernel<<>>(values, segment_length, static_cast(len)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - } -} \ No newline at end of file From c2c24073988dbe81b228ec554028b4031b34dca4 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 19 Aug 2021 15:51:52 +0000 Subject: [PATCH 059/166] don't use mask in shuffle reduce --- include/LightGBM/cuda/cuda_algorithms.hpp | 10 ++++++---- src/boosting/gbdt.cpp | 5 ++--- src/metric/metric.cpp | 2 ++ src/objective/cuda/cuda_rank_objective.cpp | 4 ++-- src/objective/cuda/cuda_rank_objective.cu | 14 -------------- src/objective/cuda/cuda_rank_objective.hpp | 2 ++ 6 files changed, 14 insertions(+), 23 deletions(-) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 4cb9d7162aa4..883d585dcc6e 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -256,7 +256,8 @@ __device__ __forceinline__ void BitonicArgSort_2048(const score_t* scores, uint1 template __device__ __forceinline__ T ShuffleReduceSumWarp(T value, const data_size_t len) { if (len > 0) { - const uint32_t mask = (0xffffffff >> (warpSize - len)); + // TODO(shiyu1994): check how mask works + const uint32_t mask = 0xffffffff; for (int offset = warpSize / 2; offset > 0; offset >>= 1) { value += __shfl_down_sync(mask, value, offset); } @@ -277,7 +278,7 @@ __device__ __forceinline__ T ShuffleReduceSum(T 
value, T* shared_mem_buffer, con __syncthreads(); const data_size_t num_warp = static_cast((len + warpSize - 1) / warpSize); if (warpID == 0) { - value = shared_mem_buffer[warpLane]; + value = (warpLane < num_warp ? shared_mem_buffer[warpLane] : 0); value = ShuffleReduceSumWarp(value, num_warp); } return value; @@ -286,7 +287,8 @@ __device__ __forceinline__ T ShuffleReduceSum(T value, T* shared_mem_buffer, con template __device__ __forceinline__ T ShuffleReduceMaxWarp(T value, const data_size_t len) { if (len > 0) { - const uint32_t mask = (0xffffffff >> (warpSize - len)); + // TODO(shiyu1994): check how mask works + const uint32_t mask = 0xffffffff; for (int offset = warpSize / 2; offset > 0; offset >>= 1) { const T other_value = __shfl_down_sync(mask, value, offset); value = (other_value > value) ? other_value : value; @@ -308,7 +310,7 @@ __device__ __forceinline__ T ShuffleReduceMax(T value, T* shared_mem_buffer, con __syncthreads(); const data_size_t num_warp = static_cast((len + warpSize - 1) / warpSize); if (warpID == 0) { - value = shared_mem_buffer[warpLane]; + value = (warpLane < num_warp ? shared_mem_buffer[warpLane] : shared_mem_buffer[0]); value = ShuffleReduceMaxWarp(value, num_warp); } return value; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index e49173beb19e..f3a2521c11eb 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -525,13 +525,12 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { } std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const { - /*if (config_->device_type == std::string("cuda")) { + if (config_->device_type == std::string("cuda")) { std::vector tmp_score(num_data * num_class_, 0.0f); CopyFromCUDADeviceToHostOuter(tmp_score.data(), score, static_cast(num_data * num_class_), __FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - Log::Warning("evaluating"); return metric->Eval(tmp_score.data(), objective_function_); - } else*/ { + } else { return metric->Eval(score, objective_function_); } } diff --git a/src/metric/metric.cpp b/src/metric/metric.cpp index 82fb4416805c..b09b640a54f8 100644 --- a/src/metric/metric.cpp +++ b/src/metric/metric.cpp @@ -50,6 +50,8 @@ Metric* Metric::CreateMetric(const std::string& type, const Config& config) { return new CUDAGammaDevianceMetric(config); } else if (type == std::string("tweedie")) { return new CUDATweedieMetric(config); + } else if (type == std::string("ndcg")) { + return new NDCGMetric(config); } } else { if (type == std::string("l2")) { diff --git a/src/objective/cuda/cuda_rank_objective.cpp b/src/objective/cuda/cuda_rank_objective.cpp index e3c84158f1d2..12143543f190 100644 --- a/src/objective/cuda/cuda_rank_objective.cpp +++ b/src/objective/cuda/cuda_rank_objective.cpp @@ -70,9 +70,9 @@ void CUDARankXENDCG::Init(const Metadata& metadata, data_size_t num_data) { } item_rands_.resize(num_data, 0.0f); AllocateCUDAMemoryOuter(&cuda_item_rands_, static_cast(num_data), __FILE__, __LINE__); - //if (max_items_in_query_aligned_ >= 2048) { + if (max_items_in_query_aligned_ >= 2048) { AllocateCUDAMemoryOuter(&cuda_params_buffer_, static_cast(num_data_), __FILE__, __LINE__); - //} + } } void CUDARankXENDCG::GenerateItemRands() const { diff --git a/src/objective/cuda/cuda_rank_objective.cu b/src/objective/cuda/cuda_rank_objective.cu index a1eeb1bd1a61..8322531df3ab 100644 --- a/src/objective/cuda/cuda_rank_objective.cu +++ b/src/objective/cuda/cuda_rank_objective.cu @@ -424,9 +424,6 @@ __global__ void 
GetGradientsKernel_RankXENDCG_SharedMemory( thread_reduce_result = ShuffleReduceMax(thread_reduce_result, shared_buffer, block_reduce_size); if (threadIdx.x == 0) { reduce_result = thread_reduce_result; - if (blockIdx.x == 0) { - printf("reduce max score = %f\n", reduce_result); - } } __syncthreads(); thread_reduce_result = 0.0f; @@ -648,17 +645,6 @@ void CUDARankXENDCG::LaunchGetGradientsKernel(const double* score, score_t* grad hessians); } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - PrintLastCUDAErrorOuter(__FILE__, __LINE__); - const int num_show = 1000; - std::vector host_gradients(num_show, 0.0f); - std::vector host_hessians(num_show, 0.0f); - std::vector host_scores(num_show, 0.0f); - CopyFromCUDADeviceToHostOuter(host_gradients.data(), gradients, num_show, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(host_hessians.data(), hessians, num_show, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(host_scores.data(), score, num_show, __FILE__, __LINE__); - for (int i = 0; i < num_show; ++i) { - Log::Warning("host_gradients[%d] = %f, host_hessians[%d] = %f, host_scores[%d] = %f", i, host_gradients[i], i, host_hessians[i], i, host_scores[i]); - } } } // namespace LightGBM diff --git a/src/objective/cuda/cuda_rank_objective.hpp b/src/objective/cuda/cuda_rank_objective.hpp index 4e71b68f87b4..b35152c423ba 100644 --- a/src/objective/cuda/cuda_rank_objective.hpp +++ b/src/objective/cuda/cuda_rank_objective.hpp @@ -16,6 +16,8 @@ #include "../rank_objective.hpp" #include +#include + namespace LightGBM { class CUDALambdarankNDCG : public CUDAObjectiveInterface, public LambdarankNDCG { From b43d3671d429ed6967ca1915cec0762e6caa9763 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 2 Sep 2021 03:12:15 +0000 Subject: [PATCH 060/166] add more regression objectives --- include/LightGBM/cuda/cuda_algorithms.hpp | 119 +++- include/LightGBM/cuda/cuda_tree.hpp | 2 + include/LightGBM/objective_function.h | 3 + include/LightGBM/tree_learner.h | 2 +- src/boosting/gbdt.cpp | 6 +- src/boosting/rf.hpp | 2 +- src/cuda/cuda_algorithms.cu | 385 +++++++++++ src/io/cuda/cuda_tree.cu | 39 +- .../cuda/cuda_regression_objective.cpp | 159 ++++- .../cuda/cuda_regression_objective.cu | 616 ++++++++++++++++-- .../cuda/cuda_regression_objective.hpp | 231 ++++++- src/objective/objective_function.cpp | 16 + src/objective/regression_objective.hpp | 2 + src/treelearner/cuda/cuda_data_partition.cpp | 4 +- src/treelearner/cuda/cuda_data_partition.cu | 12 +- src/treelearner/cuda/cuda_data_partition.hpp | 8 +- .../cuda/cuda_histogram_constructor.hpp | 2 +- .../cuda/new_cuda_tree_learner.cpp | 28 +- .../cuda/new_cuda_tree_learner.hpp | 3 + src/treelearner/serial_tree_learner.cpp | 8 +- src/treelearner/serial_tree_learner.h | 2 +- 21 files changed, 1549 insertions(+), 100 deletions(-) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 883d585dcc6e..82c05bec324a 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -54,11 +54,20 @@ __device__ void ReduceSumConflictFree(T* values, size_t n) { ReduceSumConflictFreeInner(values, n); } +template +void ReduceSumGlobal(const VAL_T* values, size_t n, REDUCE_T* block_buffer); + +template +void ReduceMaxGlobal(const VAL_T* values, size_t n, REDUCE_T* block_buffer); + +template +void ReduceMinGlobal(const VAL_T* values, size_t n, REDUCE_T* block_buffer); + template __device__ void ReduceMax(T* values, size_t n); template -void GlobalInclusivePrefixSum(T* values, size_t n); 
+void GlobalInclusivePrefixSum(T* values, T* block_buffer, size_t n); template void GlobalGenAUCPosNegSum(const label_t* labels, @@ -341,12 +350,118 @@ __device__ __forceinline__ T ShuffleReduceMin(T value, T* shared_mem_buffer, con __syncthreads(); const data_size_t num_warp = static_cast((len + warpSize - 1) / warpSize); if (warpID == 0) { - value = shared_mem_buffer[warpLane]; + value = (warpLane < num_warp ? shared_mem_buffer[warpLane] : shared_mem_buffer[0]); value = ShuffleReduceMinWarp(value, num_warp); } return value; } +template +__device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, const int len); + +template +__device__ void PrefixSumDevice(const VAL_T* in_values, + const INDEX_T* sorted_indices, + REDUCE_VAL_T* out_values, + const INDEX_T num_data) { + __shared__ REDUCE_VAL_T shared_buffer[1025]; + const INDEX_T num_data_per_thread = (num_data + static_cast(blockDim.x) - 1) / static_cast(blockDim.x); + const INDEX_T start = num_data_per_thread * static_cast(threadIdx.x); + const INDEX_T end = min(start + num_data_per_thread, num_data); + REDUCE_VAL_T thread_sum = 0; + for (INDEX_T index = start; index < end; ++index) { + thread_sum += static_cast(in_values[sorted_indices[index]]); + } + shared_buffer[threadIdx.x] = thread_sum; + __syncthreads(); + PrefixSum(shared_buffer, num_data); + const REDUCE_VAL_T thread_base = shared_buffer[threadIdx.x]; + for (INDEX_T index = start; index < end; ++index) { + out_values[index] = thread_base + static_cast(in_values[sorted_indices[index]]); + } + __syncthreads(); +} + +template +__device__ VAL_T PercentileDevice(const VAL_T* values, + const WEIGHT_T* weights, + INDEX_T* indices, + REDUCE_WEIGHT_T* weights_prefix_sum, + const double alpha, + const INDEX_T len); + +template +__global__ void PercentileGlobalKernel(const VAL_T* values, + const WEIGHT_T* weights, + const INDEX_T* sorted_indices, + const WEIGHT_REDUCE_T* weights_prefix_sum, + const double alpha, + const INDEX_T len, + VAL_T* out_value) { + if (!USE_WEIGHT) { + const double float_pos = (1.0f - alpha) * len; + const INDEX_T pos = static_cast(float_pos); + if (pos < 1) { + *out_value = values[sorted_indices[0]]; + } else if (pos >= len) { + *out_value = values[sorted_indices[len - 1]]; + } else { + const double bias = float_pos - static_cast(pos); + const VAL_T v1 = values[sorted_indices[pos - 1]]; + const VAL_T v2 = values[sorted_indices[pos]]; + *out_value = static_cast(v1 - (v1 - v2) * bias); + } + } else { + const WEIGHT_REDUCE_T threshold = weights_prefix_sum[len - 1] * (1.0f - alpha); + __shared__ INDEX_T pos; + if (threadIdx.x == 0) { + pos = len; + } + __syncthreads(); + for (INDEX_T index = static_cast(threadIdx.x); index < len; index += static_cast(blockDim.x)) { + if (weights_prefix_sum[index] > threshold && (index == 0 || weights_prefix_sum[index - 1] <= threshold)) { + pos = index; + } + } + __syncthreads(); + pos = min(pos, len - 1); + if (pos == 0 || pos == len - 1) { + *out_value = values[pos]; + } + const VAL_T v1 = values[sorted_indices[pos - 1]]; + const VAL_T v2 = values[sorted_indices[pos]]; + *out_value = static_cast(v1 - (v1 - v2) * (threshold - weights_prefix_sum[pos - 1]) / (weights_prefix_sum[pos] - weights_prefix_sum[pos - 1])); + } +} + +template +void GlobalInclusiveArgPrefixSum(const INDEX_T* sorted_indices, const VAL_T* in_values, REDUCE_T* out_values, REDUCE_T* block_buffer, size_t n); + +template +void PercentileGlobal(const VAL_T* values, + const WEIGHT_T* weights, + INDEX_T* indices, + WEIGHT_REDUCE_T* weights_prefix_sum, + 
WEIGHT_REDUCE_T* weights_prefix_sum_buffer, + const double alpha, + const INDEX_T len, + VAL_T* cuda_out_value) { + if (len <= 1) { + CopyFromCUDADeviceToCUDADeviceOuter(cuda_out_value, values, 1, __FILE__, __LINE__); + } + BitonicArgSortGlobal(values, indices, len); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + if (USE_WEIGHT) { + Log::Warning("before prefix sum"); + GlobalInclusiveArgPrefixSum(indices, weights, weights_prefix_sum, weights_prefix_sum_buffer, static_cast(len)); + } + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + Log::Warning("after prefix sum"); + PercentileGlobalKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(values, weights, indices, weights_prefix_sum, alpha, len, cuda_out_value); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + Log::Warning("after percentile"); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index f1c41222b3d1..b00b1406c1d1 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -80,6 +80,8 @@ class CUDATree : public Tree { const double* cuda_leaf_value() const { return cuda_leaf_value_; } + double* cuda_leaf_value_ref() { return cuda_leaf_value_; } + inline void Shrinkage(double rate) override; inline void AddBias(double val) override; diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index 7fc9123cc49a..c5f16769d4b2 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -48,6 +48,9 @@ class ObjectiveFunction { const data_size_t*, data_size_t) const { return ori_output; } + virtual void RenewTreeOutputCUDA(const double* /*score*/, const data_size_t* /*data_indices_in_leaf*/, const data_size_t* /*num_data_in_leaf*/, + const data_size_t* /*data_start_in_leaf*/, const int /*num_leaves*/, double* /*leaf_value*/) const {} + virtual double BoostFromScore(int /*class_id*/) const { return 0.0; } virtual bool ClassNeedTrain(int /*class_id*/) const { return true; } diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 55343ad714ee..f65fc591200d 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -86,7 +86,7 @@ class TreeLearner { virtual void AddPredictionToScore(const Tree* tree, double* out_score) const = 0; virtual void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, - data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const = 0; + const double* score, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const = 0; TreeLearner() = default; /*! 
\brief Disable copy */ diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index f3a2521c11eb..a1c13d7b27fc 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -419,7 +419,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto score_ptr = train_score_updater_->score() + offset; auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast(label[i]) - score_ptr[i]; }; tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter, - num_data_, bag_data_indices_.data(), bag_data_cnt_); + score_ptr, num_data_, bag_data_indices_.data(), bag_data_cnt_); // shrinkage by learning rate new_tree->Shrinkage(shrinkage_rate_); // update score @@ -525,12 +525,12 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { } std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const { - if (config_->device_type == std::string("cuda")) { + /*if (config_->device_type == std::string("cuda")) { std::vector tmp_score(num_data * num_class_, 0.0f); CopyFromCUDADeviceToHostOuter(tmp_score.data(), score, static_cast(num_data * num_class_), __FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); return metric->Eval(tmp_score.data(), objective_function_); - } else { + } else*/ { return metric->Eval(score, objective_function_); } } diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 5a9eb226fef5..35646c6a170e 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -132,7 +132,7 @@ class RF : public GBDT { double pred = init_scores_[cur_tree_id]; auto residual_getter = [pred](const label_t* label, int i) {return static_cast(label[i]) - pred; }; tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter, - num_data_, bag_data_indices_.data(), bag_data_cnt_); + train_score_updater_->score(), num_data_, bag_data_indices_.data(), bag_data_cnt_); if (std::fabs(init_scores_[cur_tree_id]) > kEpsilon) { new_tree->AddBias(init_scores_[cur_tree_id]); } diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 4cb8489ca317..d16817e14c08 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -358,6 +358,12 @@ void BitonicArgSortGlobal(const double* values, data_ BitonicArgSortGlobalHelper(values, indices, len); } +template <> +void BitonicArgSortGlobal(const label_t* values, data_size_t* indices, const size_t len) { + BitonicArgSortGlobalHelper(values, indices, len); +} + + template __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, const int len) { __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS]; @@ -522,6 +528,170 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons } } +template +__device__ void BitonicArgSortDevice512(const VAL_T* values, INDEX_T* indices, const int len) { + __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS / 2]; + __shared__ INDEX_T shared_indices[BITONIC_SORT_NUM_ELEMENTS / 2]; + int len_to_shift = len - 1; + int max_depth = 1; + while (len_to_shift > 0) { + len_to_shift >>= 1; + ++max_depth; + } + const int num_blocks = (len + (BITONIC_SORT_NUM_ELEMENTS / 2) - 1) / (BITONIC_SORT_NUM_ELEMENTS / 2); + for (int block_index = 0; block_index < num_blocks; ++block_index) { + const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS / 2 + static_cast(threadIdx.x); + if (this_index < len) { + shared_values[threadIdx.x] = values[this_index]; + shared_indices[threadIdx.x] = this_index; + } 
else { + shared_indices[threadIdx.x] = len; + } + __syncthreads(); + for (int depth = max_depth - 1; depth > max_depth - 10; --depth) { + const int segment_length = (1 << (max_depth - depth)); + const int segment_index = this_index / segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + { + const int half_segment_length = (segment_length >> 1); + const int half_segment_index = this_index / half_segment_length; + const int num_total_segment = (len + segment_length - 1) / segment_length; + const int offset = (segment_index == num_total_segment - 1 && ascending == ASCENDING) ? + (num_total_segment * segment_length - len) : 0; + if (half_segment_index % 2 == 0) { + const int segment_start = segment_index * segment_length; + if (this_index >= offset + segment_start) { + const int other_index = static_cast(threadIdx.x) + half_segment_length - offset; + const INDEX_T this_data_index = shared_indices[threadIdx.x]; + const INDEX_T other_data_index = shared_indices[other_index]; + const VAL_T this_value = shared_values[threadIdx.x]; + const VAL_T other_value = shared_values[other_index]; + if (other_data_index < len && (this_value > other_value) == ascending) { + shared_indices[threadIdx.x] = other_data_index; + shared_indices[other_index] = this_data_index; + shared_values[threadIdx.x] = other_value; + shared_values[other_index] = this_value; + } + } + } + __syncthreads(); + } + for (int inner_depth = depth + 1; inner_depth < max_depth; ++inner_depth) { + const int half_segment_length = (1 << (max_depth - inner_depth - 1)); + const int half_segment_index = this_index / half_segment_length; + if (half_segment_index % 2 == 0) { + const int other_index = static_cast(threadIdx.x) + half_segment_length; + const INDEX_T this_data_index = shared_indices[threadIdx.x]; + const INDEX_T other_data_index = shared_indices[other_index]; + const VAL_T this_value = shared_values[threadIdx.x]; + const VAL_T other_value = shared_values[other_index]; + if (other_data_index < len && (this_value > other_value) == ascending) { + shared_indices[threadIdx.x] = other_data_index; + shared_indices[other_index] = this_data_index; + shared_values[threadIdx.x] = other_value; + shared_values[other_index] = this_value; + } + } + __syncthreads(); + } + } + if (this_index < len) { + indices[this_index] = shared_indices[threadIdx.x]; + } + __syncthreads(); + } + for (int depth = max_depth - 10; depth >= 1; --depth) { + const int segment_length = (1 << (max_depth - depth)); + { + const int num_total_segment = (len + segment_length - 1) / segment_length; + const int half_segment_length = (segment_length >> 1); + for (int block_index = 0; block_index < num_blocks; ++block_index) { + const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS / 2 + static_cast(threadIdx.x); + const int segment_index = this_index / segment_length; + const int half_segment_index = this_index / half_segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + const int offset = (segment_index == num_total_segment - 1 && ascending == ASCENDING) ? 
+ (num_total_segment * segment_length - len) : 0; + if (half_segment_index % 2 == 0) { + const int segment_start = segment_index * segment_length; + if (this_index >= offset + segment_start) { + const int other_index = this_index + half_segment_length - offset; + if (other_index < len) { + const INDEX_T this_data_index = indices[this_index]; + const INDEX_T other_data_index = indices[other_index]; + const VAL_T this_value = values[this_data_index]; + const VAL_T other_value = values[other_data_index]; + if ((this_value > other_value) == ascending) { + indices[this_index] = other_data_index; + indices[other_index] = this_data_index; + } + } + } + } + } + __syncthreads(); + } + for (int inner_depth = depth + 1; inner_depth <= max_depth - 10; ++inner_depth) { + const int half_segment_length = (1 << (max_depth - inner_depth - 1)); + for (int block_index = 0; block_index < num_blocks; ++block_index) { + const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS / 2 + static_cast(threadIdx.x); + const int segment_index = this_index / segment_length; + const int half_segment_index = this_index / half_segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + if (half_segment_index % 2 == 0) { + const int other_index = this_index + half_segment_length; + if (other_index < len) { + const INDEX_T this_data_index = indices[this_index]; + const INDEX_T other_data_index = indices[other_index]; + const VAL_T this_value = values[this_data_index]; + const VAL_T other_value = values[other_data_index]; + if ((this_value > other_value) == ascending) { + indices[this_index] = other_data_index; + indices[other_index] = this_data_index; + } + } + } + __syncthreads(); + } + } + for (int block_index = 0; block_index < num_blocks; ++block_index) { + const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS / 2 + static_cast(threadIdx.x); + const int segment_index = this_index / segment_length; + const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); + if (this_index < len) { + const INDEX_T index = indices[this_index]; + shared_values[threadIdx.x] = values[index]; + shared_indices[threadIdx.x] = index; + } else { + shared_indices[threadIdx.x] = len; + } + __syncthreads(); + for (int inner_depth = max_depth - 9; inner_depth < max_depth; ++inner_depth) { + const int half_segment_length = (1 << (max_depth - inner_depth - 1)); + const int half_segment_index = this_index / half_segment_length; + if (half_segment_index % 2 == 0) { + const int other_index = static_cast(threadIdx.x) + half_segment_length; + const INDEX_T this_data_index = shared_indices[threadIdx.x]; + const INDEX_T other_data_index = shared_indices[other_index]; + const VAL_T this_value = shared_values[threadIdx.x]; + const VAL_T other_value = shared_values[other_index]; + if (other_data_index < len && (this_value > other_value) == ascending) { + shared_indices[threadIdx.x] = other_data_index; + shared_indices[other_index] = this_data_index; + shared_values[threadIdx.x] = other_value; + shared_values[other_index] = this_value; + } + } + __syncthreads(); + } + if (this_index < len) { + indices[this_index] = shared_indices[threadIdx.x]; + } + __syncthreads(); + } + } +} + __global__ void BitonicArgSortItemsGlobalKernel(const double* scores, const int num_queries, const data_size_t* cuda_query_boundaries, @@ -663,6 +833,27 @@ __global__ void GlobalInclusivePrefixSumKernel(T* values, T* block_buffer, data_ } } +template +__global__ void GlobalInclusiveArgPrefixSumKernel( + const INDEX_T* sorted_indices, const VAL_T* in_values, REDUCE_T* out_values, REDUCE_T* block_buffer, data_size_t num_data) { + __shared__ REDUCE_T shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data) { + if (sorted_indices[data_index] >= num_data || sorted_indices[data_index] < 0) { + printf("error find sorted_indices[%d] = %d\n", data_index, sorted_indices[data_index]); + } + } + shared_buffer[threadIdx.x] = (data_index < num_data ? 
in_values[sorted_indices[data_index]] : 0); + __syncthreads(); + PrefixSum(shared_buffer, blockDim.x); + if (data_index < num_data) { + out_values[data_index] = shared_buffer[threadIdx.x + 1]; + } + if (threadIdx.x == 0) { + block_buffer[blockIdx.x + 1] = shared_buffer[blockDim.x]; + } +} + template __global__ void GlobalInclusivePrefixSumReduceBlockKernel(T* block_buffer, data_size_t num_blocks) { __shared__ T shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; @@ -748,6 +939,26 @@ void GlobalInclusivePrefixSum(T* values, T* block_buffer, size_t n) { block_buffer, values, num_data); } +template +void GlobalInclusiveArgPrefixSumInner(const INDEX_T* sorted_indices, const VAL_T* in_values, REDUCE_T* out_values, REDUCE_T* block_buffer, size_t n) { + const data_size_t num_data = static_cast(n); + const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; + GlobalInclusiveArgPrefixSumKernel<<>>( + sorted_indices, in_values, out_values, block_buffer, num_data); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + GlobalInclusivePrefixSumReduceBlockKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>( + block_buffer, num_blocks); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + GlobalInclusivePrefixSumAddBlockBaseKernel<<>>( + block_buffer, out_values, num_data); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +template <> +void GlobalInclusiveArgPrefixSum(const data_size_t* sorted_indices, const label_t* in_values, double* out_values, double* block_buffer, size_t n) { + GlobalInclusiveArgPrefixSumInner(sorted_indices, in_values, out_values, block_buffer, n); +} + __global__ void GlobalGenAUCMarkKernel(const double* scores, const data_size_t* sorted_indices, data_size_t* mark_buffer, @@ -914,6 +1125,38 @@ __global__ void BlockReduceSum(T* block_buffer, const data_size_t num_blocks) { } } +template +__global__ void BlockReduceMax(T* block_buffer, const data_size_t num_blocks) { + __shared__ T shared_buffer[32]; + T thread_max = 0; + for (data_size_t block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { + const T value = block_buffer[block_index]; + if (value > thread_max) { + thread_max = value; + } + } + thread_max = ShuffleReduceMax(thread_max, shared_buffer, blockDim.x); + if (threadIdx.x == 0) { + block_buffer[0] = thread_max; + } +} + +template +__global__ void BlockReduceMin(T* block_buffer, const data_size_t num_blocks) { + __shared__ T shared_buffer[32]; + T thread_min = 0; + for (data_size_t block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { + const T value = block_buffer[block_index]; + if (value < thread_min) { + thread_min = value; + } + } + thread_min = ShuffleReduceMin(thread_min, shared_buffer, blockDim.x); + if (threadIdx.x == 0) { + block_buffer[0] = thread_min; + } +} + template void GlobalCalcAUCInner(const double* sum_pos_buffer, const double* sum_neg_buffer, @@ -1004,4 +1247,146 @@ void GlobalCalcAveragePrecision(const double* sum_pos_buffer, GlobalCalcAveragePrecisionInner(sum_pos_buffer, sum_neg_buffer, mark_buffer, num_data, block_buffer); } +template +__global__ void ReduceSumGlobalKernel(const VAL_T* values, const data_size_t num_value, REDUCE_T* block_buffer) { + __shared__ REDUCE_T shared_buffer[32]; + const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + const REDUCE_T value = (data_index < num_value ? 
static_cast(values[data_index]) : 0.0f); + const REDUCE_T reduce_value = ShuffleReduceSum(value, shared_buffer, blockDim.x); + if (threadIdx.x == 0) { + block_buffer[blockIdx.x] = reduce_value; + } +} + +template +void ReduceSumGlobalInner(const VAL_T* values, size_t n, REDUCE_T* block_buffer) { + const data_size_t num_value = static_cast(n); + const data_size_t num_blocks = (num_value + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; + ReduceSumGlobalKernel<<>>(values, num_value, block_buffer); + BlockReduceSum<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks); +} + +template <> +void ReduceSumGlobal(const label_t* values, size_t n, double* block_buffer) { + ReduceSumGlobalInner(values, n, block_buffer); +} + +template +__global__ void ReduceMaxGlobalKernel(const VAL_T* values, const data_size_t num_value, REDUCE_T* block_buffer) { + __shared__ REDUCE_T shared_buffer[32]; + const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + const REDUCE_T value = (data_index < num_value ? static_cast(values[data_index]) : 0.0f); + const REDUCE_T reduce_value = ShuffleReduceMax(value, shared_buffer, blockDim.x); + if (threadIdx.x == 0) { + block_buffer[blockIdx.x] = reduce_value; + } +} + +template +void ReduceMaxGlobalInner(const VAL_T* values, size_t n, REDUCE_T* block_buffer) { + const data_size_t num_value = static_cast(n); + const data_size_t num_blocks = (num_value + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; + ReduceMaxGlobalKernel<<>>(values, num_value, block_buffer); + BlockReduceMax<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks); +} + +template <> +void ReduceMaxGlobal(const label_t* values, size_t n, double* block_buffer) { + ReduceMaxGlobalInner(values, n, block_buffer); +} + +template +__global__ void ReduceMinGlobalKernel(const VAL_T* values, const data_size_t num_value, REDUCE_T* block_buffer) { + __shared__ REDUCE_T shared_buffer[32]; + const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + const REDUCE_T value = (data_index < num_value ? 
static_cast(values[data_index]) : 0.0f); + const REDUCE_T reduce_value = ShuffleReduceMin(value, shared_buffer, blockDim.x); + if (threadIdx.x == 0) { + block_buffer[blockIdx.x] = reduce_value; + } +} + +template +void ReduceMinGlobalInner(const VAL_T* values, size_t n, REDUCE_T* block_buffer) { + const data_size_t num_value = static_cast(n); + const data_size_t num_blocks = (num_value + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; + ReduceMinGlobalKernel<<>>(values, num_value, block_buffer); + BlockReduceMin<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks); +} + +template <> +void ReduceMinGlobal(const label_t* values, size_t n, double* block_buffer) { + ReduceMinGlobalInner(values, n, block_buffer); +} + +template +__device__ VAL_T PercentileDeviceInner(const VAL_T* values, + const WEIGHT_T* weights, + INDEX_T* indices, + REDUCE_WEIGHT_T* weights_prefix_sum, + const double alpha, + const INDEX_T len) { + if (len <= 1) { + return values[0]; + } + BitonicArgSortDevice512(values, indices, len); + if (!USE_WEIGHT) { + const double float_pos = (1.0f - alpha) * len; + const INDEX_T pos = static_cast(float_pos); + if (pos < 1) { + return values[indices[0]]; + } else if (pos >= len) { + return values[indices[len - 1]]; + } else { + const double bias = float_pos - pos; + const VAL_T v1 = values[indices[pos - 1]]; + const VAL_T v2 = values[indices[pos]]; + return static_cast(v1 - (v1 - v2) * bias); + } + } else { + PrefixSumDevice(weights, indices, weights_prefix_sum, len); + const REDUCE_WEIGHT_T threshold = weights_prefix_sum[len - 1] * (1.0f - alpha); + __shared__ INDEX_T pos; + if (threadIdx.x == 0) { + pos = len; + } + __syncthreads(); + for (INDEX_T index = static_cast(threadIdx.x); index < len; index += static_cast(blockDim.x)) { + if (weights_prefix_sum[index] > threshold && (index == 0 || weights_prefix_sum[index - 1] <= threshold)) { + pos = index; + } + } + __syncthreads(); + pos = min(pos, len - 1); + if (pos == 0 || pos == len - 1) { + return values[pos]; + } + const VAL_T v1 = values[indices[pos - 1]]; + const VAL_T v2 = values[indices[pos]]; + return static_cast(v1 - (v1 - v2) * (threshold - weights_prefix_sum[pos - 1]) / (weights_prefix_sum[pos] - weights_prefix_sum[pos - 1])); + } +} + +template <> +__device__ double PercentileDevice( + const double* values, + const label_t* weights, + data_size_t* indices, + double* weights_prefix_sum, + const double alpha, + const data_size_t len) { + return PercentileDeviceInner(values, weights, indices, weights_prefix_sum, alpha, len); +} + +template <> +__device__ double PercentileDevice( + const double* values, + const label_t* weights, + data_size_t* indices, + double* weights_prefix_sum, + const double alpha, + const data_size_t len) { + return PercentileDeviceInner(values, weights, indices, weights_prefix_sum, alpha, len); +} + } // namespace LightGBM diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index 450081e9bea2..ded06eb875f3 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -68,6 +68,10 @@ __global__ void SplitKernel(// split information right_child[parent_index] = new_node_index; } } + left_child[new_node_index] = ~leaf_index; + right_child[new_node_index] = ~num_leaves; + leaf_parent[leaf_index] = new_node_index; + leaf_parent[num_leaves] = new_node_index; } else if (thread_index == 1) { // add new node split_feature_inner[new_node_index] = cuda_split_info->inner_feature_index; @@ -76,44 +80,34 @@ __global__ void SplitKernel(// split information } else if 
(thread_index == 3) { split_gain[new_node_index] = static_cast(cuda_split_info->gain); } else if (thread_index == 4) { - // add two new leaves - left_child[new_node_index] = ~leaf_index; - } else if (thread_index == 5) { - right_child[new_node_index] = ~num_leaves; - } else if (thread_index == 6) { - // update new leaves - leaf_parent[leaf_index] = new_node_index; - } else if (thread_index == 7) { - leaf_parent[num_leaves] = new_node_index; - } else if (thread_index == 8) { // save current leaf value to internal node before change internal_weight[new_node_index] = leaf_weight[leaf_index]; leaf_weight[leaf_index] = cuda_split_info->left_sum_hessians; - } else if (thread_index == 9) { + } else if (thread_index == 5) { internal_value[new_node_index] = leaf_value[leaf_index]; leaf_value[leaf_index] = isnan(cuda_split_info->left_value) ? 0.0f : cuda_split_info->left_value; - } else if (thread_index == 10) { + } else if (thread_index == 6) { internal_count[new_node_index] = cuda_split_info->left_count + cuda_split_info->right_count; - } else if (thread_index == 11) { + } else if (thread_index == 7) { leaf_count[leaf_index] = cuda_split_info->left_count; - } else if (thread_index == 12) { + } else if (thread_index == 8) { leaf_value[num_leaves] = isnan(cuda_split_info->right_value) ? 0.0f : cuda_split_info->right_value; - } else if (thread_index == 13) { + } else if (thread_index == 9) { leaf_weight[num_leaves] = cuda_split_info->right_sum_hessians; - } else if (thread_index == 14) { + } else if (thread_index == 10) { leaf_count[num_leaves] = cuda_split_info->right_count; - } else if (thread_index == 15) { + } else if (thread_index == 11) { // update leaf depth leaf_depth[num_leaves] = leaf_depth[leaf_index] + 1; leaf_depth[leaf_index]++; - } else if (thread_index == 16) { + } else if (thread_index == 12) { decision_type[new_node_index] = 0; SetDecisionTypeCUDA(&decision_type[new_node_index], false, kCategoricalMask); SetDecisionTypeCUDA(&decision_type[new_node_index], cuda_split_info->default_left, kDefaultLeftMask); SetMissingTypeCUDA(&decision_type[new_node_index], static_cast(missing_type)); - } else if (thread_index == 17) { + } else if (thread_index == 13) { threshold_in_bin[new_node_index] = cuda_split_info->threshold; - } else if (thread_index == 18) { + } else if (thread_index == 14) { threshold[new_node_index] = real_threshold; } } @@ -123,7 +117,7 @@ void CUDATree::LaunchSplitKernel(const int leaf_index, const double real_threshold, const MissingType missing_type, const CUDASplitInfo* cuda_split_info) { - SplitKernel<<<4, 5, 0, cuda_stream_>>>( + SplitKernel<<<3, 5, 0, cuda_stream_>>>( // split information leaf_index, real_feature_index, @@ -218,7 +212,8 @@ __global__ void AddPredictionToScoreKernel( } ++iter; if (iter > 1000) { - printf("error iter = %d\n", iter); + printf("error iter = %d, node = %d, cuda_left_child[%d] = %d, cuda_right_child[%d] = %d\n", + iter, node, node, cuda_left_child[node], node, cuda_right_child[node]); } } score[data_index] += cuda_leaf_value[~node]; diff --git a/src/objective/cuda/cuda_regression_objective.cpp b/src/objective/cuda/cuda_regression_objective.cpp index 0c3a5333c016..c8b9576c8ded 100644 --- a/src/objective/cuda/cuda_regression_objective.cpp +++ b/src/objective/cuda/cuda_regression_objective.cpp @@ -22,8 +22,12 @@ void CUDARegressionL2loss::Init(const Metadata& metadata, data_size_t num_data) RegressionL2loss::Init(metadata, num_data); cuda_labels_ = metadata.cuda_metadata()->cuda_label(); cuda_weights_ = 
metadata.cuda_metadata()->cuda_weights(); - AllocateCUDAMemoryOuter(&cuda_boost_from_score_, 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_boost_from_score_, 0, 1, __FILE__, __LINE__); + num_get_gradients_blocks_ = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + AllocateCUDAMemoryOuter(&cuda_block_buffer_, static_cast(num_get_gradients_blocks_), __FILE__, __LINE__); + if (sqrt_) { + InitCUDAMemoryFromHostMemoryOuter(&cuda_trans_label_, trans_label_.data(), trans_label_.size(), __FILE__, __LINE__); + cuda_labels_ = cuda_trans_label_; + } } void CUDARegressionL2loss::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { @@ -31,16 +35,159 @@ void CUDARegressionL2loss::GetGradients(const double* score, score_t* gradients, } double CUDARegressionL2loss::BoostFromScore(int) const { - LaunchCalcInitScoreKernel(); - double boost_from_score = 0.0f; - CopyFromCUDADeviceToHostOuter(&boost_from_score, cuda_boost_from_score_, 1, __FILE__, __LINE__); - return boost_from_score; + return LaunchCalcInitScoreKernel(); } void CUDARegressionL2loss::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { LaunchConvertOutputCUDAKernel(num_data, input, output); } +void CUDARegressionL2loss::RenewTreeOutputCUDA( + const double* score, + const data_size_t* data_indices_in_leaf, + const data_size_t* num_data_in_leaf, + const data_size_t* data_start_in_leaf, + const int num_leaves, + double* leaf_value) const { + global_timer.Start("CUDARegressionL1loss::LaunchRenewTreeOutputCUDAKernel"); + LaunchRenewTreeOutputCUDAKernel(score, data_indices_in_leaf, num_data_in_leaf, data_start_in_leaf, num_leaves, leaf_value); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + global_timer.Stop("CUDARegressionL1loss::LaunchRenewTreeOutputCUDAKernel"); +} + +CUDARegressionL1loss::CUDARegressionL1loss(const Config& config): +CUDARegressionL2loss(config) {} + +CUDARegressionL1loss::CUDARegressionL1loss(const std::vector& strs): +CUDARegressionL2loss(strs) {} + +CUDARegressionL1loss::~CUDARegressionL1loss() {} + +void CUDARegressionL1loss::Init(const Metadata& metadata, data_size_t num_data) { + CUDARegressionL2loss::Init(metadata, num_data); + AllocateCUDAMemoryOuter(&cuda_data_indices_buffer_, static_cast(num_data), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_percentile_result_, 1, __FILE__, __LINE__); + if (cuda_weights_ != nullptr) { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION + 1; + AllocateCUDAMemoryOuter(&cuda_weights_prefix_sum_, static_cast(num_data), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_weights_prefix_sum_buffer_, static_cast(num_blocks), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_weight_by_leaf_buffer_, static_cast(num_data_), __FILE__, __LINE__); + } + AllocateCUDAMemoryOuter(&cuda_residual_buffer_, static_cast(num_data_), __FILE__, __LINE__); +} + +CUDARegressionHuberLoss::CUDARegressionHuberLoss(const Config& config): +CUDARegressionL2loss(config), alpha_(config.alpha) { + if (sqrt_) { + Log::Warning("Cannot use sqrt transform in %s Regression, will auto disable it", GetName()); + sqrt_ = false; + } +} + +CUDARegressionHuberLoss::CUDARegressionHuberLoss(const std::vector& strs): +CUDARegressionL2loss(strs) {} + +CUDARegressionHuberLoss::~CUDARegressionHuberLoss() {} + +CUDARegressionFairLoss::CUDARegressionFairLoss(const Config& config): +CUDARegressionL2loss(config), c_(config.fair_c) {} + 
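/*
When sqrt_ is enabled, Init() above uploads trans_label_ as cuda_trans_label_ and points
cuda_labels_ at it, so the objective trains on sign(y) * sqrt(|y|); ConvertOutputCUDA later
maps raw predictions back with sign(x) * x * x. A minimal host-side sketch of that transform
pair (the helper names are illustrative, not part of this patch):

#include <cmath>

double SqrtTransformLabel(double y) {         // what each trans_label_ entry holds
  return y >= 0.0 ? std::sqrt(y) : -std::sqrt(-y);
}

double SqrtConvertOutput(double raw_score) {  // what the output-conversion kernel applies
  return raw_score >= 0.0 ? raw_score * raw_score : -(raw_score * raw_score);
}
*/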
+CUDARegressionFairLoss::CUDARegressionFairLoss(const std::vector& strs): +CUDARegressionL2loss(strs) {} + +CUDARegressionFairLoss::~CUDARegressionFairLoss() {} + +CUDARegressionPoissonLoss::CUDARegressionPoissonLoss(const Config& config): +CUDARegressionL2loss(config), max_delta_step_(config.poisson_max_delta_step) { + if (sqrt_) { + Log::Warning("Cannot use sqrt transform in %s Regression, will auto disable it", GetName()); + sqrt_ = false; + } +} + +CUDARegressionPoissonLoss::CUDARegressionPoissonLoss(const std::vector& strs): +CUDARegressionL2loss(strs) {} + +CUDARegressionPoissonLoss::~CUDARegressionPoissonLoss() {} + +void CUDARegressionPoissonLoss::Init(const Metadata& metadata, data_size_t num_data) { + CUDARegressionL2loss::Init(metadata, num_data); + AllocateCUDAMemoryOuter(&cuda_block_buffer_, static_cast(num_get_gradients_blocks_), __FILE__, __LINE__); + LaunchCheckLabelKernel(); +} + +double CUDARegressionPoissonLoss::LaunchCalcInitScoreKernel() const { + return Common::SafeLog(CUDARegressionL2loss::LaunchCalcInitScoreKernel()); +} + +CUDARegressionQuantileloss::CUDARegressionQuantileloss(const Config& config): +CUDARegressionL2loss(config), alpha_(config.alpha) { + CHECK(alpha_ > 0 && alpha_ < 1); +} + +CUDARegressionQuantileloss::CUDARegressionQuantileloss(const std::vector& strs): +CUDARegressionL2loss(strs) {} + +void CUDARegressionQuantileloss::Init(const Metadata& metadata, data_size_t num_data) { + CUDARegressionL2loss::Init(metadata, num_data); + AllocateCUDAMemoryOuter(&cuda_data_indices_buffer_, static_cast(num_data), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_percentile_result_, 1, __FILE__, __LINE__); + if (cuda_weights_ != nullptr) { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION + 1; + AllocateCUDAMemoryOuter(&cuda_weights_prefix_sum_, static_cast(num_data), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_weights_prefix_sum_buffer_, static_cast(num_blocks), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_weight_by_leaf_buffer_, static_cast(num_data_), __FILE__, __LINE__); + } + AllocateCUDAMemoryOuter(&cuda_residual_buffer_, static_cast(num_data_), __FILE__, __LINE__); +} + +CUDARegressionQuantileloss::~CUDARegressionQuantileloss() {} + +CUDARegressionMAPELOSS::CUDARegressionMAPELOSS(const Config& config): +CUDARegressionL1loss(config) {} + +CUDARegressionMAPELOSS::CUDARegressionMAPELOSS(const std::vector& strs): +CUDARegressionL1loss(strs) {} + +CUDARegressionMAPELOSS::~CUDARegressionMAPELOSS() {} + +void CUDARegressionMAPELOSS::Init(const Metadata& metadata, data_size_t num_data) { + CUDARegressionL1loss::Init(metadata, num_data); + if (cuda_weights_ == nullptr) { + // allocate buffer for weights when they are not allocated in L1 loss + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION + 1; + AllocateCUDAMemoryOuter(&cuda_weights_prefix_sum_, static_cast(num_data), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_weights_prefix_sum_buffer_, static_cast(num_blocks), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_weight_by_leaf_buffer_, static_cast(num_data_), __FILE__, __LINE__); + } + for (data_size_t i = 0; i < num_data_; ++i) { + if (std::fabs(label_[i]) < 1) { + Log::Warning( + "Some label values are < 1 in absolute value. 
MAPE is unstable with such values, " + "so LightGBM rounds them to 1.0 when calculating MAPE."); + break; + } + } + AllocateCUDAMemoryOuter(&cuda_label_weights_, static_cast(num_data_), __FILE__, __LINE__); + LaunchCalcLabelWeightKernel(); +} + +CUDARegressionGammaLoss::CUDARegressionGammaLoss(const Config& config): +CUDARegressionPoissonLoss(config) {} + +CUDARegressionGammaLoss::CUDARegressionGammaLoss(const std::vector& strs): +CUDARegressionPoissonLoss(strs) {} + +CUDARegressionGammaLoss::~CUDARegressionGammaLoss() {} + +CUDARegressionTweedieLoss::CUDARegressionTweedieLoss(const Config& config): +CUDARegressionPoissonLoss(config), rho_(config.tweedie_variance_power) {} + +CUDARegressionTweedieLoss::CUDARegressionTweedieLoss(const std::vector& strs): +CUDARegressionPoissonLoss(strs) {} + +CUDARegressionTweedieLoss::~CUDARegressionTweedieLoss() {} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/objective/cuda/cuda_regression_objective.cu b/src/objective/cuda/cuda_regression_objective.cu index 3bdb78057007..4ad35ab773ed 100644 --- a/src/objective/cuda/cuda_regression_objective.cu +++ b/src/objective/cuda/cuda_regression_objective.cu @@ -7,79 +7,613 @@ #ifdef USE_CUDA #include "cuda_regression_objective.hpp" +#include namespace LightGBM { -__global__ void CalcInitScoreKernel_1_Regression(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_boost_from_score) { - __shared__ label_t shared_label[CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION]; - const unsigned int tid = threadIdx.x; - const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION; - shared_label[tid] = 0.0f; - __syncthreads(); - for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION; ++j) { - if (i + j < num_data) { - shared_label[tid] += cuda_labels[i + j]; +double CUDARegressionL2loss::LaunchCalcInitScoreKernel() const { + double label_sum = 0.0f, weight_sum = 0.0f; + ReduceSumGlobal(cuda_labels_, static_cast(num_data_), cuda_block_buffer_); + CopyFromCUDADeviceToHostOuter(&label_sum, cuda_block_buffer_, 1, __FILE__, __LINE__); + if (cuda_weights_ == nullptr) { + weight_sum = static_cast(num_data_); + } else { + ReduceSumGlobal(cuda_weights_, static_cast(num_data_), cuda_block_buffer_); + CopyFromCUDADeviceToHostOuter(&weight_sum, cuda_block_buffer_, 1, __FILE__, __LINE__); + } + return label_sum / weight_sum; +} + +// TODO(shiyu1994): try to use global kernels as class methods +__global__ void ConvertOutputCUDAKernel_Regression(const bool sqrt, const data_size_t num_data, const double* input, double* output) { + const int data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (data_index < num_data) { + if (sqrt) { + const double sign = input[data_index] >= 0.0f ? 
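/*
LaunchCalcInitScoreKernel above reduces the labels (and, if present, the weights) with
ReduceSumGlobal and returns label_sum / weight_sum, falling back to num_data as the
denominator when the dataset has no weights. Host-side reference of the same computation
(illustrative; label_t is float in the default build):

double ReferenceL2InitScore(const float* labels, const float* weights, int num_data) {
  double label_sum = 0.0;
  double weight_sum = 0.0;
  for (int i = 0; i < num_data; ++i) {
    label_sum += labels[i];
    weight_sum += (weights == nullptr ? 1.0 : weights[i]);
  }
  return label_sum / weight_sum;  // the boost-from-score value copied back to the host
}
*/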
1 : -1; + output[data_index] = sign * input[data_index] * input[data_index]; + } else { + output[data_index] = input[data_index]; + } + } +} + +void CUDARegressionL2loss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { + const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + ConvertOutputCUDAKernel_Regression<<>>(sqrt_, num_data, input, output); +} + + +template +__global__ void GetGradientsKernel_RegressionL2(const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, const data_size_t num_data, + score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + if (data_index < num_data) { + if (!USE_WEIGHT) { + cuda_out_gradients[data_index] = static_cast(cuda_scores[data_index] - cuda_labels[data_index]); + cuda_out_hessians[data_index] = 1.0f; + } else { + const score_t weight = static_cast(cuda_weights[data_index]); + cuda_out_gradients[data_index] = static_cast(cuda_scores[data_index] - cuda_labels[data_index]) * weight; + cuda_out_hessians[data_index] = weight; + } + } +} + +void CUDARegressionL2loss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + if (cuda_weights_ == nullptr) { + GetGradientsKernel_RegressionL2<<>>(score, cuda_labels_, nullptr, num_data_, gradients, hessians); + } else { + GetGradientsKernel_RegressionL2<<>>(score, cuda_labels_, cuda_weights_, num_data_, gradients, hessians); + } +} + +double CUDARegressionL1loss::LaunchCalcInitScoreKernel() const { + const double alpha = 0.9f; + if (cuda_weights_ == nullptr) { + PercentileGlobal( + cuda_labels_, nullptr, cuda_data_indices_buffer_, nullptr, nullptr, alpha, num_data_, cuda_percentile_result_); + } else { + PercentileGlobal( + cuda_labels_, cuda_weights_, cuda_data_indices_buffer_, cuda_weights_prefix_sum_, cuda_weights_prefix_sum_buffer_, alpha, num_data_, cuda_percentile_result_); + } + label_t percentile_result = 0.0f; + CopyFromCUDADeviceToHostOuter(&percentile_result, cuda_percentile_result_, 1, __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + return static_cast(percentile_result); +} + +template +__global__ void GetGradientsKernel_RegressionL1(const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, const data_size_t num_data, + score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + if (data_index < num_data) { + if (!USE_WEIGHT) { + const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); + cuda_out_gradients[data_index] = static_cast((diff > 0.0f) - (diff < 0.0f)); + cuda_out_hessians[data_index] = 1.0f; + } else { + const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); + const score_t weight = static_cast(cuda_weights[data_index]); + cuda_out_gradients[data_index] = static_cast((diff > 0.0f) - (diff < 0.0f)) * weight; + cuda_out_hessians[data_index] = weight; } } +} + +void CUDARegressionL1loss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + if (cuda_weights_ == 
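/*
The L2 and L1 kernels above compute the usual pointwise derivatives: for L2, g = score - label
with unit hessian; for L1, g = sign(score - label); a sample weight, when present, scales both
the gradient and the hessian. Scalar sketch of what each thread computes (illustrative helper,
not part of this patch; pass weight = 1.0 for unweighted data):

void ReferencePointwiseGrad(bool l1, double score, double label, double weight,
                            double* grad, double* hess) {
  const double diff = score - label;
  if (l1) {
    const double sign = (diff > 0.0) - (diff < 0.0);
    *grad = sign * weight;
  } else {
    *grad = diff * weight;
  }
  *hess = weight;
}
*/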
nullptr) { + GetGradientsKernel_RegressionL1<<>>(score, cuda_labels_, nullptr, num_data_, gradients, hessians); + } else { + GetGradientsKernel_RegressionL1<<>>(score, cuda_labels_, cuda_weights_, num_data_, gradients, hessians); + } +} + +template +__global__ void RenewTreeOutputCUDAKernel_RegressionL1( + const double* score, + const label_t* label, + const label_t* weight, + double* residual_buffer, + label_t* weight_by_leaf, + double* weight_prefix_sum_buffer, + const data_size_t* data_indices_in_leaf, + const data_size_t* num_data_in_leaf, + const data_size_t* data_start_in_leaf, + data_size_t* data_indices_buffer, + double* leaf_value) { + const int leaf_index = static_cast(blockIdx.x); + const data_size_t data_start = data_start_in_leaf[leaf_index]; + const data_size_t num_data = num_data_in_leaf[leaf_index]; + data_size_t* data_indices_buffer_pointer = data_indices_buffer + data_start; + const label_t* weight_by_leaf_pointer = weight_by_leaf + data_start; + double* weight_prefix_sum_buffer_pointer = weight_prefix_sum_buffer + data_start; + const double* residual_buffer_pointer = residual_buffer + data_start; + const double alpha = 0.5f; + for (data_size_t inner_data_index = data_start + static_cast(threadIdx.x); inner_data_index < data_start + num_data; inner_data_index += static_cast(blockDim.x)) { + const data_size_t data_index = data_indices_in_leaf[inner_data_index]; + const label_t data_label = label[data_index]; + const double data_score = score[data_index]; + residual_buffer[inner_data_index] = static_cast(data_label) - data_score; + if (USE_WEIGHT) { + weight_by_leaf[inner_data_index] = weight[data_index]; + } + } __syncthreads(); - for (unsigned int s = 1; s < blockDim.x; s *= 2) { - if (tid % (2 * s) == 0 && (tid + s) < CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION) { - shared_label[tid] += shared_label[tid + s]; + // TODO(shiyu1994): replace this bitonic sort based percentile method with a more efficient one + const double renew_leaf_value = PercentileDevice( + residual_buffer_pointer, weight_by_leaf_pointer, data_indices_buffer_pointer, + weight_prefix_sum_buffer_pointer, alpha, num_data); + if (threadIdx.x == 0) { + leaf_value[leaf_index] = renew_leaf_value; + } +} + +void CUDARegressionL1loss::LaunchRenewTreeOutputCUDAKernel( + const double* score, + const data_size_t* data_indices_in_leaf, + const data_size_t* num_data_in_leaf, + const data_size_t* data_start_in_leaf, + const int num_leaves, + double* leaf_value) const { + if (cuda_weights_ == nullptr) { + RenewTreeOutputCUDAKernel_RegressionL1<<>>( + score, + cuda_labels_, + cuda_weights_, + cuda_residual_buffer_, + cuda_weight_by_leaf_buffer_, + cuda_weights_prefix_sum_, + data_indices_in_leaf, + num_data_in_leaf, + data_start_in_leaf, + cuda_data_indices_buffer_, + leaf_value); + } else { + RenewTreeOutputCUDAKernel_RegressionL1<<>>( + score, + cuda_labels_, + cuda_weights_, + cuda_residual_buffer_, + cuda_weight_by_leaf_buffer_, + cuda_weights_prefix_sum_, + data_indices_in_leaf, + num_data_in_leaf, + data_start_in_leaf, + cuda_data_indices_buffer_, + leaf_value); + } + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +template +__global__ void GetGradientsKernel_Huber(const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, const data_size_t num_data, + const double alpha, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + if (data_index < num_data) { + if (!USE_WEIGHT) { + const double diff = 
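/*
RenewTreeOutputCUDAKernel_RegressionL1 above runs one block per leaf: each block collects the
residuals label - score for the rows in its leaf and overwrites the leaf output with their
(weighted) median, obtained through PercentileDevice with alpha = 0.5. A host-side reference
of the unweighted case (illustrative; std::nth_element stands in for the in-block sort and
interpolation, so it only approximates the interpolated median for even-sized leaves):

#include <algorithm>
#include <vector>

double ReferenceRenewL1LeafValue(const std::vector<double>& scores,
                                 const std::vector<float>& labels,
                                 const std::vector<int>& data_indices_in_leaf) {
  std::vector<double> residuals;
  residuals.reserve(data_indices_in_leaf.size());
  for (int data_index : data_indices_in_leaf) {
    residuals.push_back(static_cast<double>(labels[data_index]) - scores[data_index]);
  }
  if (residuals.empty()) return 0.0;
  std::nth_element(residuals.begin(), residuals.begin() + residuals.size() / 2, residuals.end());
  return residuals[residuals.size() / 2];
}
*/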
cuda_scores[data_index] - static_cast(cuda_labels[data_index]); + if (fabs(diff) <= alpha) { + cuda_out_gradients[data_index] = static_cast(diff); + } else { + const score_t sign = static_cast((diff > 0.0f) - (diff < 0.0f)); + cuda_out_gradients[data_index] = static_cast(sign * alpha); + } + cuda_out_hessians[data_index] = 1.0f; + } else { + const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); + const score_t weight = static_cast(cuda_weights[data_index]); + if (fabs(diff) <= alpha) { + cuda_out_gradients[data_index] = static_cast(diff) * weight; + } else { + const score_t sign = static_cast((diff > 0.0f) - (diff < 0.0f)); + cuda_out_gradients[data_index] = static_cast(sign * alpha) * weight; + } + cuda_out_hessians[data_index] = weight; + } + } +} + +void CUDARegressionHuberLoss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + if (cuda_weights_ == nullptr) { + GetGradientsKernel_Huber<<>>(score, cuda_labels_, nullptr, num_data_, alpha_, gradients, hessians); + } else { + GetGradientsKernel_Huber<<>>(score, cuda_labels_, cuda_weights_, num_data_, alpha_, gradients, hessians); + } +} + +template +__global__ void GetGradientsKernel_Fair(const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, const data_size_t num_data, + const double c, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + if (data_index < num_data) { + if (!USE_WEIGHT) { + const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); + cuda_out_gradients[data_index] = static_cast(c * diff / (fabs(diff) + c)); + cuda_out_hessians[data_index] = static_cast(c * c / ((fabs(diff) + c) * (fabs(diff) + c))); + } else { + const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); + const score_t weight = static_cast(cuda_weights[data_index]); + cuda_out_gradients[data_index] = static_cast(c * diff / (fabs(diff) + c) * weight); + cuda_out_hessians[data_index] = static_cast(c * c / ((fabs(diff) + c) * (fabs(diff) + c)) * weight); + } + } +} + +void CUDARegressionFairLoss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + if (cuda_weights_ == nullptr) { + GetGradientsKernel_Fair<<>>(score, cuda_labels_, nullptr, num_data_, c_, gradients, hessians); + } else { + GetGradientsKernel_Fair<<>>(score, cuda_labels_, cuda_weights_, num_data_, c_, gradients, hessians); + } +} + +void CUDARegressionPoissonLoss::LaunchCheckLabelKernel() const { + ReduceSumGlobal(cuda_labels_, static_cast(num_data_), cuda_block_buffer_); + double label_sum = 0.0f; + CopyFromCUDADeviceToHostOuter(&label_sum, cuda_block_buffer_, 1, __FILE__, __LINE__); + + ReduceMinGlobal(cuda_labels_, static_cast(num_data_), cuda_block_buffer_); + double label_min = 0.0f; + CopyFromCUDADeviceToHostOuter(&label_min, cuda_block_buffer_, 1, __FILE__, __LINE__); + + if (label_min < 0.0f) { + Log::Fatal("[%s]: at least one target label is negative", GetName()); + } + if (label_sum == 0.0f) { + Log::Fatal("[%s]: sum of labels is zero", GetName()); + } +} + +template +__global__ void GetGradientsKernel_Poisson(const double* cuda_scores, const label_t* cuda_labels, const 
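/*
The Huber and Fair kernels above use the standard formulas: Huber keeps g = diff inside the
|diff| <= alpha band and clips to g = alpha * sign(diff) outside it, with unit hessian; Fair
uses g = c * diff / (|diff| + c) and h = c^2 / (|diff| + c)^2. Sample weights multiply both
outputs. Scalar sketch (illustrative helpers, not part of this patch):

#include <cmath>

void ReferenceHuberGrad(double diff, double alpha, double weight, double* grad, double* hess) {
  if (std::fabs(diff) <= alpha) {
    *grad = diff * weight;
  } else {
    *grad = ((diff > 0.0) - (diff < 0.0)) * alpha * weight;
  }
  *hess = weight;
}

void ReferenceFairGrad(double diff, double c, double weight, double* grad, double* hess) {
  const double denom = std::fabs(diff) + c;
  *grad = c * diff / denom * weight;
  *hess = c * c / (denom * denom) * weight;
}
*/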
label_t* cuda_weights, const data_size_t num_data, + const double max_delta_step, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + if (data_index < num_data) { + if (!USE_WEIGHT) { + cuda_out_gradients[data_index] = static_cast(exp(cuda_scores[data_index]) - cuda_labels[data_index]); + cuda_out_hessians[data_index] = static_cast(std::exp(cuda_scores[data_index] + max_delta_step)); + } else { + const score_t weight = static_cast(cuda_weights[data_index]); + cuda_out_gradients[data_index] = static_cast(exp(cuda_scores[data_index]) - cuda_labels[data_index]) * weight; + cuda_out_hessians[data_index] = static_cast(std::exp(cuda_scores[data_index] + max_delta_step)) * weight; } - __syncthreads(); } - if (tid == 0) { - atomicAdd_system(out_cuda_boost_from_score, shared_label[0]); +} + +void CUDARegressionPoissonLoss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + if (cuda_weights_ == nullptr) { + GetGradientsKernel_Poisson<<>>(score, cuda_labels_, nullptr, num_data_, max_delta_step_, gradients, hessians); + } else { + GetGradientsKernel_Poisson<<>>(score, cuda_labels_, cuda_weights_, num_data_, max_delta_step_, gradients, hessians); } } -__global__ void CalcInitScoreKernel_2_Regression(double* out_cuda_boost_from_score, const data_size_t num_data) { - const double suml = *out_cuda_boost_from_score; - const double sumw = static_cast(num_data); - const double init_score = suml / sumw; - *out_cuda_boost_from_score = init_score; +// TODO(shiyu1994): try to use global kernels as class methods +__global__ void ConvertOutputCUDAKernel_Regression_Poissson(const bool sqrt, const data_size_t num_data, const double* input, double* output) { + const int data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (data_index < num_data) { + output[data_index] = exp(input[data_index]); + } } -void CUDARegressionL2loss::LaunchCalcInitScoreKernel() const { - const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION; - const int num_blocks = (num_data_ + num_data_per_block - 1) / num_data_per_block; - CalcInitScoreKernel_1_Regression<<>>(cuda_labels_, num_data_, cuda_boost_from_score_); +void CUDARegressionPoissonLoss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { + const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + ConvertOutputCUDAKernel_Regression_Poissson<<>>(sqrt_, num_data, input, output); +} + +double CUDARegressionQuantileloss::LaunchCalcInitScoreKernel() const { + if (cuda_weights_ == nullptr) { + PercentileGlobal( + cuda_labels_, nullptr, cuda_data_indices_buffer_, nullptr, nullptr, alpha_, num_data_, cuda_percentile_result_); + } else { + PercentileGlobal( + cuda_labels_, cuda_weights_, cuda_data_indices_buffer_, cuda_weights_prefix_sum_, cuda_weights_prefix_sum_buffer_, alpha_, num_data_, cuda_percentile_result_); + } + label_t percentile_result = 0.0f; + CopyFromCUDADeviceToHostOuter(&percentile_result, cuda_percentile_result_, 1, __FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CalcInitScoreKernel_2_Regression<<<1, 1>>>(cuda_boost_from_score_, num_data_); + return static_cast(percentile_result); +} + +template +__global__ 
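/*
The Poisson kernel above works on log-scale scores: g = exp(score) - label and
h = exp(score + max_delta_step), each scaled by the sample weight, and LaunchCheckLabelKernel
refuses negative labels or an all-zero label sum before training starts. Predictions are
mapped back to the original scale with exp in the Poisson output-conversion kernel. Scalar
sketch (illustrative helpers, not part of this patch):

#include <cmath>

void ReferencePoissonGrad(double score, double label, double max_delta_step, double weight,
                          double* grad, double* hess) {
  *grad = (std::exp(score) - label) * weight;
  *hess = std::exp(score + max_delta_step) * weight;
}

double ReferencePoissonConvertOutput(double raw_score) { return std::exp(raw_score); }
*/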
void RenewTreeOutputCUDAKernel_RegressionQuantile( + const double* score, + const label_t* label, + const label_t* weight, + double* residual_buffer, + label_t* weight_by_leaf, + double* weight_prefix_sum_buffer, + const data_size_t* data_indices_in_leaf, + const data_size_t* num_data_in_leaf, + const data_size_t* data_start_in_leaf, + data_size_t* data_indices_buffer, + double* leaf_value, + const double alpha) { + const int leaf_index = static_cast(blockIdx.x); + const data_size_t data_start = data_start_in_leaf[leaf_index]; + const data_size_t num_data = num_data_in_leaf[leaf_index]; + data_size_t* data_indices_buffer_pointer = data_indices_buffer + data_start; + const label_t* weight_by_leaf_pointer = weight_by_leaf + data_start; + double* weight_prefix_sum_buffer_pointer = weight_prefix_sum_buffer + data_start; + const double* residual_buffer_pointer = residual_buffer + data_start; + for (data_size_t inner_data_index = data_start + static_cast(threadIdx.x); inner_data_index < data_start + num_data; inner_data_index += static_cast(blockDim.x)) { + const data_size_t data_index = data_indices_in_leaf[inner_data_index]; + const label_t data_label = label[data_index]; + const double data_score = score[data_index]; + residual_buffer[inner_data_index] = static_cast(data_label) - data_score; + if (USE_WEIGHT) { + weight_by_leaf[inner_data_index] = weight[data_index]; + } + } + __syncthreads(); + // TODO(shiyu1994): replace this bitonic sort based percentile method with a more efficient one + const double renew_leaf_value = PercentileDevice( + residual_buffer_pointer, weight_by_leaf_pointer, data_indices_buffer_pointer, + weight_prefix_sum_buffer_pointer, alpha, num_data); + if (threadIdx.x == 0) { + leaf_value[leaf_index] = renew_leaf_value; + } +} + +void CUDARegressionQuantileloss::LaunchRenewTreeOutputCUDAKernel( + const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf, + const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const { + if (cuda_weights_ == nullptr) { + RenewTreeOutputCUDAKernel_RegressionQuantile<<>>( + score, + cuda_labels_, + cuda_weights_, + cuda_residual_buffer_, + cuda_weight_by_leaf_buffer_, + cuda_weights_prefix_sum_, + data_indices_in_leaf, + num_data_in_leaf, + data_start_in_leaf, + cuda_data_indices_buffer_, + leaf_value, + alpha_); + } else { + RenewTreeOutputCUDAKernel_RegressionQuantile<<>>( + score, + cuda_labels_, + cuda_weights_, + cuda_residual_buffer_, + cuda_weight_by_leaf_buffer_, + cuda_weights_prefix_sum_, + data_indices_in_leaf, + num_data_in_leaf, + data_start_in_leaf, + cuda_data_indices_buffer_, + leaf_value, + alpha_); + } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } -__global__ void GetGradientsKernel_Regression(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, +template +__global__ void GetGradientsKernel_RegressionQuantile(const double* cuda_scores, const label_t* cuda_labels, + const label_t* cuda_weights, const data_size_t num_data, const double alpha, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); - // TODO(shiyu1994): consider sqrt_ if (data_index < num_data) { - cuda_out_gradients[data_index] = static_cast(cuda_scores[data_index] - cuda_labels[data_index]); - cuda_out_hessians[data_index] = 1.0f; + if (!USE_WEIGHT) { + const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); + if (diff >= 0.0f) { + 
cuda_out_gradients[data_index] = (1.0f - alpha); + } else { + cuda_out_gradients[data_index] = -alpha; + } + cuda_out_hessians[data_index] = 1.0f; + } else { + const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); + const score_t weight = static_cast(cuda_weights[data_index]); + if (diff >= 0.0f) { + cuda_out_gradients[data_index] = (1.0f - alpha) * weight; + } else { + cuda_out_gradients[data_index] = -alpha * weight; + } + cuda_out_hessians[data_index] = weight; + } } } -void CUDARegressionL2loss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { +void CUDARegressionQuantileloss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - GetGradientsKernel_Regression<<>>(score, cuda_labels_, num_data_, gradients, hessians); + if (cuda_weights_ == nullptr) { + GetGradientsKernel_RegressionQuantile<<>>(score, cuda_labels_, nullptr, num_data_, alpha_, gradients, hessians); + } else { + GetGradientsKernel_RegressionQuantile<<>>(score, cuda_labels_, cuda_weights_, num_data_, alpha_, gradients, hessians); + } } -// TODO(shiyu1994): try to use global kernels as class methods -__global__ void ConvertOutputCUDAKernel(const bool sqrt, const data_size_t num_data, const double* input, double* output) { - const int data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); +template +__global__ void CalcLabelWeightKernel( + const label_t* cuda_labels, + const label_t* cuda_weights, + const data_size_t num_data, + label_t* label_weights +) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (data_index < num_data) { - if (sqrt) { - const double sign = input[data_index] >= 0.0f ? 
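/*
The quantile kernel above is the pinball-loss gradient: g = 1 - alpha when score >= label and
g = -alpha otherwise, with unit hessian, both scaled by the sample weight when one is present.
Scalar sketch (illustrative helper, not part of this patch):

void ReferenceQuantileGrad(double score, double label, double alpha, double weight,
                           double* grad, double* hess) {
  const double diff = score - label;
  *grad = (diff >= 0.0 ? (1.0 - alpha) : -alpha) * weight;
  *hess = weight;
}
*/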
1 : -1; - output[data_index] = sign * input[data_index] * input[data_index]; + const label_t label = cuda_labels[data_index]; + if (!USE_WEIGHT) { + label_weights[data_index] = 1.0f / max(1.0f, fabs(label)); } else { - output[data_index] = input[data_index]; + const label_t weight = cuda_weights[data_index]; + label_weights[data_index] = 1.0f / max(1.0f, fabs(label)) * weight; } } } -void CUDARegressionL2loss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { - const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - ConvertOutputCUDAKernel<<>>(sqrt_, num_data, input, output); +void CUDARegressionMAPELOSS::LaunchCalcLabelWeightKernel() { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + if (cuda_weights_ == nullptr) { + CalcLabelWeightKernel<<>>(cuda_labels_, cuda_weights_, num_data_, cuda_label_weights_); + } else { + CalcLabelWeightKernel<<>>(cuda_labels_, cuda_weights_, num_data_, cuda_label_weights_); + } +} + +template +__global__ void GetGradientsKernel_RegressionMAPELOSS(const double* cuda_scores, const label_t* cuda_labels, + const label_t* cuda_weights, const label_t* cuda_label_weights, const data_size_t num_data, + score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + if (data_index < num_data) { + const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); + const label_t label_weight = cuda_label_weights[data_index]; + const double sign = static_cast((diff > 0) - (diff < 0)); + if (!USE_WEIGHT) { + cuda_out_gradients[data_index] = static_cast(sign * label_weight); + cuda_out_hessians[data_index] = 1.0f; + } else { + const score_t weight = static_cast(cuda_weights[data_index]); + cuda_out_gradients[data_index] = static_cast(sign * label_weight) * weight; + cuda_out_hessians[data_index] = weight; + } + } +} + +void CUDARegressionMAPELOSS::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + if (cuda_weights_ == nullptr) { + GetGradientsKernel_RegressionMAPELOSS<<>>(score, cuda_labels_, nullptr, cuda_label_weights_, num_data_, gradients, hessians); + } else { + GetGradientsKernel_RegressionMAPELOSS<<>>(score, cuda_labels_, cuda_weights_, cuda_label_weights_, num_data_, gradients, hessians); + } +} + +double CUDARegressionMAPELOSS::LaunchCalcInitScoreKernel() const { + PercentileGlobal( + cuda_labels_, cuda_label_weights_, cuda_data_indices_buffer_, + cuda_weights_prefix_sum_, cuda_weights_prefix_sum_buffer_, 0.5f, num_data_, cuda_percentile_result_); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + label_t percentile_result = 0.0f; + CopyFromCUDADeviceToHostOuter(&percentile_result, cuda_percentile_result_, 1, __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + return static_cast(percentile_result); +} + +template +__global__ void RenewTreeOutputCUDAKernel_RegressionMAPE( + const double* score, + const label_t* label, + const label_t* weight, + double* residual_buffer, + label_t* weight_by_leaf, + double* weight_prefix_sum_buffer, + const data_size_t* data_indices_in_leaf, + const data_size_t* num_data_in_leaf, + const data_size_t* data_start_in_leaf, + data_size_t* data_indices_buffer, + double* 
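/*
For MAPE, CalcLabelWeightKernel above precomputes label_weights[i] = 1 / max(1, |label_i|),
folding in the sample weight when the dataset is weighted, and the gradient kernel then uses
g = sign(score - label) * label_weight with hessian 1; the USE_WEIGHT branches additionally
scale by the sample weight. Scalar sketch of the unweighted case (illustrative helper, not
part of this patch):

#include <algorithm>
#include <cmath>

void ReferenceMAPEGrad(double score, double label, double* grad, double* hess) {
  const double label_weight = 1.0 / std::max(1.0, std::fabs(label));
  const double diff = score - label;
  *grad = ((diff > 0.0) - (diff < 0.0)) * label_weight;
  *hess = 1.0;
}
*/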
leaf_value) { + const int leaf_index = static_cast(blockIdx.x); + const data_size_t data_start = data_start_in_leaf[leaf_index]; + const data_size_t num_data = num_data_in_leaf[leaf_index]; + data_size_t* data_indices_buffer_pointer = data_indices_buffer + data_start; + const label_t* weight_by_leaf_pointer = weight_by_leaf + data_start; + double* weight_prefix_sum_buffer_pointer = weight_prefix_sum_buffer + data_start; + const double* residual_buffer_pointer = residual_buffer + data_start; + const double alpha = 0.5f; + for (data_size_t inner_data_index = data_start + static_cast(threadIdx.x); inner_data_index < data_start + num_data; inner_data_index += static_cast(blockDim.x)) { + const data_size_t data_index = data_indices_in_leaf[inner_data_index]; + const label_t data_label = label[data_index]; + const double data_score = score[data_index]; + residual_buffer[inner_data_index] = static_cast(data_label) - data_score; + if (USE_WEIGHT) { + weight_by_leaf[inner_data_index] = weight[data_index]; + } + } + __syncthreads(); + // TODO(shiyu1994): replace this bitonic sort based percentile method with a more efficient one + const double renew_leaf_value = PercentileDevice( + residual_buffer_pointer, weight_by_leaf_pointer, data_indices_buffer_pointer, + weight_prefix_sum_buffer_pointer, alpha, num_data); + if (threadIdx.x == 0) { + leaf_value[leaf_index] = renew_leaf_value; + } +} + +void CUDARegressionMAPELOSS::LaunchRenewTreeOutputCUDAKernel( + const double* score, + const data_size_t* data_indices_in_leaf, + const data_size_t* num_data_in_leaf, + const data_size_t* data_start_in_leaf, + const int num_leaves, + double* leaf_value) const { + Log::Warning("laucnhing RenewTreeOutputCUDAKernel_RegressionMAPE"); + RenewTreeOutputCUDAKernel_RegressionMAPE<<>>( + score, + cuda_labels_, + cuda_label_weights_, + cuda_residual_buffer_, + cuda_weight_by_leaf_buffer_, + cuda_weights_prefix_sum_, + data_indices_in_leaf, + num_data_in_leaf, + data_start_in_leaf, + cuda_data_indices_buffer_, + leaf_value); + PrintLastCUDAErrorOuter(__FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + +template +__global__ void GetGradientsKernel_Gamma(const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, const data_size_t num_data, + const double max_delta_step, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + if (data_index < num_data) { + if (!USE_WEIGHT) { + cuda_out_gradients[data_index] = static_cast(1.0 - cuda_labels[data_index] / exp(cuda_scores[data_index])); + cuda_out_hessians[data_index] = static_cast(cuda_labels[data_index] / exp(cuda_scores[data_index])); + } else { + const score_t weight = static_cast(cuda_weights[data_index]); + cuda_out_gradients[data_index] = static_cast(1.0 - cuda_labels[data_index] / exp(cuda_scores[data_index])) * weight; + cuda_out_hessians[data_index] = static_cast(cuda_labels[data_index] / exp(cuda_scores[data_index])) * weight; + } + } +} + +void CUDARegressionGammaLoss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + if (cuda_weights_ == nullptr) { + GetGradientsKernel_Gamma<<>>(score, cuda_labels_, nullptr, num_data_, max_delta_step_, gradients, hessians); + } else { + GetGradientsKernel_Gamma<<>>(score, cuda_labels_, cuda_weights_, num_data_, max_delta_step_, gradients, 
hessians); + } +} + +template +__global__ void GetGradientsKernel_Tweedie(const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, const data_size_t num_data, + const double rho, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + if (data_index < num_data) { + if (!USE_WEIGHT) { + cuda_out_gradients[data_index] = static_cast(-cuda_labels[data_index] * exp((1 - rho) * cuda_scores[data_index]) + exp((2 - rho) * cuda_scores[data_index])); + cuda_out_hessians[data_index] = static_cast(-cuda_labels[data_index] * (1 - rho) * exp((1 - rho) * cuda_scores[data_index]) + + (2 - rho) * exp((2 - rho) * cuda_scores[data_index])); + } else { + const score_t weight = static_cast(cuda_weights[data_index]); + cuda_out_gradients[data_index] = static_cast(-cuda_labels[data_index] * exp((1 - rho) * cuda_scores[data_index]) + + exp((2 - rho) * cuda_scores[data_index])) * weight; + cuda_out_hessians[data_index] = static_cast(-cuda_labels[data_index] * (1 - rho) * exp((1 - rho) * cuda_scores[data_index]) + + (2 - rho) * exp((2 - rho) * cuda_scores[data_index])) * weight; + } + } +} + +void CUDARegressionTweedieLoss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; + if (cuda_weights_ == nullptr) { + GetGradientsKernel_Tweedie<<>>(score, cuda_labels_, nullptr, num_data_, rho_, gradients, hessians); + } else { + GetGradientsKernel_Tweedie<<>>(score, cuda_labels_, cuda_weights_, num_data_, rho_, gradients, hessians); + } } } // namespace LightGBM diff --git a/src/objective/cuda/cuda_regression_objective.hpp b/src/objective/cuda/cuda_regression_objective.hpp index c1131706bfd7..e3b2d77f9ec8 100644 --- a/src/objective/cuda/cuda_regression_objective.hpp +++ b/src/objective/cuda/cuda_regression_objective.hpp @@ -10,8 +10,6 @@ #ifdef USE_CUDA #define GET_GRADIENTS_BLOCK_SIZE_REGRESSION (1024) -#define CALC_INIT_SCORE_BLOCK_SIZE_REGRESSION (1024) -#define NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_REGRESSION (6) #include #include "../regression_objective.hpp" @@ -30,9 +28,12 @@ class CUDARegressionL2loss : public CUDAObjectiveInterface, public RegressionL2l void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; + void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; + double BoostFromScore(int) const override; - void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; + void RenewTreeOutputCUDA(const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf, + const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const override; std::function GetCUDAConvertOutputFunc() const override { return [this] (data_size_t num_data, const double* input, double* output) { @@ -40,17 +41,229 @@ class CUDARegressionL2loss : public CUDAObjectiveInterface, public RegressionL2l }; } - private: - void LaunchCalcInitScoreKernel() const; + bool IsConstantHessian() const override { + if (cuda_weights_ == nullptr) { + return true; + } else { + return false; + } + } + + protected: + virtual double LaunchCalcInitScoreKernel() const; + + virtual void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; - void LaunchGetGradientsKernel(const 
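/*
The Gamma and Tweedie kernels above differentiate the corresponding deviances with respect to
the log-scale score: Gamma uses g = 1 - label / exp(score) and h = label / exp(score); Tweedie
uses g = -label * exp((1 - rho) * score) + exp((2 - rho) * score) together with its derivative
as the hessian. Sample weights scale both outputs. Scalar sketch (illustrative helpers, not
part of this patch):

#include <cmath>

void ReferenceGammaGrad(double score, double label, double weight, double* grad, double* hess) {
  const double exp_score = std::exp(score);
  *grad = (1.0 - label / exp_score) * weight;
  *hess = (label / exp_score) * weight;
}

void ReferenceTweedieGrad(double score, double label, double rho, double weight,
                          double* grad, double* hess) {
  const double e1 = std::exp((1.0 - rho) * score);
  const double e2 = std::exp((2.0 - rho) * score);
  *grad = (-label * e1 + e2) * weight;
  *hess = (-label * (1.0 - rho) * e1 + (2.0 - rho) * e2) * weight;
}
*/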
double* score, score_t* gradients, score_t* hessians) const; + virtual void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; - void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; + virtual void LaunchRenewTreeOutputCUDAKernel( + const double* /*score*/, const data_size_t* /*data_indices_in_leaf*/, const data_size_t* /*num_data_in_leaf*/, + const data_size_t* /*data_start_in_leaf*/, const int /*num_leaves*/, double* /*leaf_value*/) const {} const label_t* cuda_labels_; - // TODO(shiyu1994): add weighted gradients const label_t* cuda_weights_; - double* cuda_boost_from_score_; + label_t* cuda_trans_label_; + double* cuda_block_buffer_; + data_size_t num_get_gradients_blocks_; + data_size_t num_init_score_blocks_; +}; + +class CUDARegressionL1loss : public CUDARegressionL2loss { + public: + explicit CUDARegressionL1loss(const Config& config); + + explicit CUDARegressionL1loss(const std::vector& strs); + + ~CUDARegressionL1loss(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + const char* GetName() const override { + return "regression_l1"; + } + + bool IsRenewTreeOutput() const override { return true; } + + protected: + data_size_t* cuda_data_indices_buffer_; + mutable double* cuda_weights_prefix_sum_; + double* cuda_weights_prefix_sum_buffer_; + mutable double* cuda_residual_buffer_; + mutable label_t* cuda_weight_by_leaf_buffer_; + label_t* cuda_percentile_result_; + + double LaunchCalcInitScoreKernel() const override; + + void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; + + void LaunchRenewTreeOutputCUDAKernel( + const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf, + const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const override; +}; + +class CUDARegressionHuberLoss : public CUDARegressionL2loss { + public: + explicit CUDARegressionHuberLoss(const Config& config); + + explicit CUDARegressionHuberLoss(const std::vector& strs); + + ~CUDARegressionHuberLoss(); + + const char* GetName() const override { + return "huber"; + } + + private: + void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; + + const double alpha_ = 0.0f; +}; + +class CUDARegressionFairLoss : public CUDARegressionL2loss { + public: + explicit CUDARegressionFairLoss(const Config& config); + + explicit CUDARegressionFairLoss(const std::vector& strs); + + ~CUDARegressionFairLoss(); + + const char* GetName() const override { + return "fair"; + } + + bool IsConstantHessian() const override { + return false; + } + + private: + void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; + + const double c_ = 0.0f; +}; + +class CUDARegressionPoissonLoss : public CUDARegressionL2loss { + public: + explicit CUDARegressionPoissonLoss(const Config& config); + + explicit CUDARegressionPoissonLoss(const std::vector& strs); + + ~CUDARegressionPoissonLoss(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + double LaunchCalcInitScoreKernel() const override; + + bool IsConstantHessian() const override { + return false; + } + + const char* GetName() const override { + return "poisson"; + } + + protected: + void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; + + void 
LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const override; + + void LaunchCheckLabelKernel() const; + + const double max_delta_step_ = 0.0f; + mutable double* cuda_block_buffer_; +}; + +class CUDARegressionQuantileloss : public CUDARegressionL2loss { + public: + explicit CUDARegressionQuantileloss(const Config& config); + + explicit CUDARegressionQuantileloss(const std::vector& strs); + + ~CUDARegressionQuantileloss(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + const char* GetName() const override { + return "quantile"; + } + + bool IsRenewTreeOutput() const override { return true; } + + private: + void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; + + double LaunchCalcInitScoreKernel() const override; + + void LaunchRenewTreeOutputCUDAKernel( + const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf, + const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const override; + + const double alpha_ = 0.0f; + data_size_t* cuda_data_indices_buffer_; + mutable double* cuda_weights_prefix_sum_; + double* cuda_weights_prefix_sum_buffer_; + mutable double* cuda_residual_buffer_; + mutable label_t* cuda_weight_by_leaf_buffer_; + label_t* cuda_percentile_result_; +}; + +class CUDARegressionMAPELOSS : public CUDARegressionL1loss { + public: + explicit CUDARegressionMAPELOSS(const Config& config); + + explicit CUDARegressionMAPELOSS(const std::vector& strs); + + ~CUDARegressionMAPELOSS(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + bool IsRenewTreeOutput() const override { return true; } + + private: + void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; + + double LaunchCalcInitScoreKernel() const override; + + void LaunchRenewTreeOutputCUDAKernel( + const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf, + const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const override; + + void LaunchCalcLabelWeightKernel(); + + label_t* cuda_label_weights_; +}; + +class CUDARegressionGammaLoss : public CUDARegressionPoissonLoss { + public: + explicit CUDARegressionGammaLoss(const Config& config); + + explicit CUDARegressionGammaLoss(const std::vector& strs); + + ~CUDARegressionGammaLoss(); + + const char* GetName() const override { + return "gamma"; + } + + private: + void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; +}; + +class CUDARegressionTweedieLoss : public CUDARegressionPoissonLoss { + public: + explicit CUDARegressionTweedieLoss(const Config& config); + + explicit CUDARegressionTweedieLoss(const std::vector& strs); + + ~CUDARegressionTweedieLoss(); + + const char* GetName() const override { + return "tweedie"; + } + + private: + const double rho_ = 0.0f; + + void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; }; } // namespace LightGBM diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index 3ec74414405b..62e7d8c24149 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -23,12 +23,28 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& return new CUDABinaryLogloss(config); } else if (type == std::string("regression")) { return 
new CUDARegressionL2loss(config); + } else if (type == std::string("regression_l1")) { + return new CUDARegressionL1loss(config); + } else if (type == std::string("quantile")) { + return new CUDARegressionQuantileloss(config); + } else if (type == std::string("huber")) { + return new CUDARegressionHuberLoss(config); + } else if (type == std::string("fair")) { + return new CUDARegressionFairLoss(config); + } else if (type == std::string("poisson")) { + return new CUDARegressionFairLoss(config); } else if (type == std::string("lambdarank")) { return new CUDALambdarankNDCG(config); } else if (type == std::string("rank_xendcg")) { return new CUDARankXENDCG(config); } else if (type == std::string("multiclass")) { return new CUDAMulticlassSoftmax(config); + } else if (type == std::string("mape")) { + return new CUDARegressionMAPELOSS(config); + } else if (type == std::string("gamma")) { + return new CUDARegressionGammaLoss(config); + } else if (type == std::string("tweedie")) { + return new CUDARegressionTweedieLoss(config); } } else { if (type == std::string("regression")) { diff --git a/src/objective/regression_objective.hpp b/src/objective/regression_objective.hpp index bd25ec4e5029..4ac758309914 100644 --- a/src/objective/regression_objective.hpp +++ b/src/objective/regression_objective.hpp @@ -620,6 +620,7 @@ class RegressionMAPELOSS : public RegressionL1loss { #pragma omp parallel for schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { const double diff = score[i] - label_[i]; + // TODO(shiyu1994): sample weight should be considered in the gradient calculation gradients[i] = static_cast(Common::Sign(diff) * label_weight_[i]); hessians[i] = weights_[i]; } @@ -662,6 +663,7 @@ class RegressionMAPELOSS : public RegressionL1loss { } bool IsConstantHessian() const override { + // TODO(shiyu1994): true only when weights is constant return true; } diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index b18746be414b..604a4728a3fd 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -200,8 +200,8 @@ void CUDADataPartition::SplitInner( ++cur_num_leaves_; } -void CUDADataPartition::UpdateTrainScore(const double learning_rate, double* cuda_scores) { - LaunchAddPredictionToScoreKernel(learning_rate, cuda_scores); +void CUDADataPartition::UpdateTrainScore(const double* leaf_value, double* cuda_scores) { + LaunchAddPredictionToScoreKernel(leaf_value, cuda_scores); } void CUDADataPartition::CalcBlockDim(const data_size_t num_data_in_leaf, diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 309b0d57666b..8ee4d58712c5 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -1247,9 +1247,9 @@ void CUDADataPartition::LaunchSplitInnerKernel( *right_leaf_sum_of_hessians_ref = cpu_sum_hessians_info[1]; } -__global__ void AddPredictionToScoreKernel(const double* cuda_leaf_output, +__global__ void AddPredictionToScoreKernel( const data_size_t* num_data_in_leaf, const data_size_t* data_indices_in_leaf, - const data_size_t* leaf_data_start, const double learning_rate, double* cuda_scores, + const data_size_t* leaf_data_start, const double* leaf_value, double* cuda_scores, const int* cuda_data_index_to_leaf_index, const data_size_t num_data) { const unsigned int threadIdx_x = threadIdx.x; const unsigned int blockIdx_x = blockIdx.x; @@ -1257,16 +1257,16 @@ __global__ void 
AddPredictionToScoreKernel(const double* cuda_leaf_output, const int data_index = static_cast(blockIdx_x * blockDim_x + threadIdx_x); if (data_index < num_data) { const int leaf_index = cuda_data_index_to_leaf_index[data_index]; - const double leaf_prediction_value = cuda_leaf_output[leaf_index] * learning_rate; + const double leaf_prediction_value = leaf_value[leaf_index]; cuda_scores[data_index] += leaf_prediction_value; } } -void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double learning_rate, double* cuda_scores) { +void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double* leaf_value, double* cuda_scores) { global_timer.Start("CUDADataPartition::AddPredictionToScoreKernel"); const int num_blocks = (num_data_ + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; - AddPredictionToScoreKernel<<>>(cuda_leaf_output_, - cuda_leaf_num_data_, cuda_data_indices_, cuda_leaf_data_start_, learning_rate, cuda_scores, cuda_data_index_to_leaf_index_, num_data_); + AddPredictionToScoreKernel<<>>( + cuda_leaf_num_data_, cuda_data_indices_, cuda_leaf_data_start_, leaf_value, cuda_scores, cuda_data_index_to_leaf_index_, num_data_); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDADataPartition::AddPredictionToScoreKernel"); } diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index db22feb61d28..4551e8714c31 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -57,10 +57,14 @@ class CUDADataPartition { double* left_leaf_sum_of_hessians, double* right_leaf_sum_of_hessians); - void UpdateTrainScore(const double learning_rate, double* cuda_scores); + void UpdateTrainScore(const double* leaf_value, double* cuda_scores); const data_size_t* cuda_data_indices() const { return cuda_data_indices_; } + const data_size_t* cuda_leaf_num_data() const { return cuda_leaf_num_data_; } + + const data_size_t* cuda_leaf_data_start() const { return cuda_leaf_data_start_; } + private: void CalcBlockDim( const data_size_t num_data_in_leaf, @@ -199,7 +203,7 @@ class CUDADataPartition { const int num_blocks, const int block_size); - void LaunchAddPredictionToScoreKernel(const double learning_rate, double* cuda_scores); + void LaunchAddPredictionToScoreKernel(const double* leaf_value, double* cuda_scores); // Host memory diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index fc30731a92c7..613ba969acf6 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -105,7 +105,7 @@ class CUDAHistogramConstructor { /*! \brief aligned number of bins of the features whose histograms need to be fixed */ std::vector need_fix_histogram_features_num_bin_aligend_; /*! 
\brief minimum number of blocks allowed in the y dimension */ - const int min_grid_dim_y_ = 160; + const int min_grid_dim_y_ = 10; // CUDA memory, held by this object diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 6ca28afc1de7..a11400ae0ad7 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace LightGBM { @@ -78,8 +79,10 @@ void NewCUDATreeLearner::FindBestSplitsFromHistograms(const std::vector& void NewCUDATreeLearner::Split(Tree* /*tree*/, int /*best_leaf*/, int* /*left_leaf*/, int* /*right_leaf*/) {} -void NewCUDATreeLearner::AddPredictionToScore(const Tree* /*tree*/, double* out_score) const { - cuda_data_partition_->UpdateTrainScore(config_->learning_rate, out_score); +void NewCUDATreeLearner::AddPredictionToScore(const Tree* tree, double* out_score) const { + CHECK(tree->is_cuda_tree()); + const CUDATree* cuda_tree = reinterpret_cast(tree); + cuda_data_partition_->UpdateTrainScore(cuda_tree->cuda_leaf_value(), out_score); } Tree* NewCUDATreeLearner::Train(const score_t* gradients, @@ -217,6 +220,27 @@ void NewCUDATreeLearner::ResetTrainingData(const Dataset* /*train_data*/, void NewCUDATreeLearner::SetBaggingData(const Dataset* /*subset*/, const data_size_t* /*used_indices*/, data_size_t /*num_data*/) {} +void NewCUDATreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function /*residual_getter*/, + const double* score, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { + CHECK(tree->is_cuda_tree()); + CUDATree* cuda_tree = reinterpret_cast(tree); + std::vector host_leaf_values(cuda_tree->num_leaves(), 0.0f); + CopyFromCUDADeviceToHostOuter(host_leaf_values.data(), cuda_tree->cuda_leaf_value(), static_cast(cuda_tree->num_leaves()), __FILE__, __LINE__); + for (int leaf_index = 0; leaf_index < cuda_tree->num_leaves(); ++leaf_index) { + Log::Warning("before convert tree output, leaf_index = %d, leaf_value = %f", leaf_index, host_leaf_values[leaf_index]); + } + obj->RenewTreeOutputCUDA(score, + cuda_data_partition_->cuda_data_indices(), + cuda_data_partition_->cuda_leaf_num_data(), + cuda_data_partition_->cuda_leaf_data_start(), + tree->num_leaves(), + cuda_tree->cuda_leaf_value_ref()); + CopyFromCUDADeviceToHostOuter(host_leaf_values.data(), cuda_tree->cuda_leaf_value(), static_cast(cuda_tree->num_leaves()), __FILE__, __LINE__); + for (int leaf_index = 0; leaf_index < cuda_tree->num_leaves(); ++leaf_index) { + Log::Warning("after convert tree output, leaf_index = %d, leaf_value = %f", leaf_index, host_leaf_values[leaf_index]); + } +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index 79a85e3e7ad8..4ee40ffdb668 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -33,6 +33,9 @@ class NewCUDATreeLearner: public SerialTreeLearner { void AddPredictionToScore(const Tree* tree, double* out_score) const override; + void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, + const double* score, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override; + protected: void FindBestSplits(const Tree* tree) override; diff --git a/src/treelearner/serial_tree_learner.cpp 
b/src/treelearner/serial_tree_learner.cpp index 9a30e37a03dc..cbe46ac47a12 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -681,7 +681,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, } void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, - data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { + const double* score, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { if (obj != nullptr && obj->IsRenewTreeOutput()) { CHECK_LE(tree->num_leaves(), data_partition_->num_leaves()); const data_size_t* bag_mapper = nullptr; @@ -691,6 +691,9 @@ void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj } std::vector n_nozeroworker_perleaf(tree->num_leaves(), 1); int num_machines = Network::num_machines(); + for (int leaf_index = 0; leaf_index < tree->num_leaves(); ++leaf_index) { + Log::Warning("before converting tree output leaf_index = %d, leaf_output = %f", leaf_index, tree->LeafOutput(leaf_index)); + } #pragma omp parallel for schedule(static) for (int i = 0; i < tree->num_leaves(); ++i) { const double output = static_cast(tree->LeafOutput(i)); @@ -717,6 +720,9 @@ void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj tree->SetLeafOutput(i, outputs[i] / n_nozeroworker_perleaf[i]); } } + for (int leaf_index = 0; leaf_index < tree->num_leaves(); ++leaf_index) { + Log::Warning("after converting tree output leaf_index = %d, leaf_output = %f", leaf_index, tree->LeafOutput(leaf_index)); + } } } diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 6a903542bef9..3ae24537f539 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -113,7 +113,7 @@ class SerialTreeLearner: public TreeLearner { } void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, - data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override; + const double* score, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override; /*! 
\brief Get output of parent node, used for path smoothing */ double GetParentOutput(const Tree* tree, const LeafSplits* leaf_splits) const; From 951aa37a8c622efe08371788d14e55433f37c8e9 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 3 Sep 2021 06:43:28 +0000 Subject: [PATCH 061/166] fix cuda mape loss add cuda xentropy loss --- include/LightGBM/cuda/cuda_algorithms.hpp | 2 +- src/cuda/cuda_algorithms.cu | 29 ++++++------- .../cuda/cuda_regression_objective.cu | 15 +++---- .../cuda/cuda_xentropy_objective.cpp | 26 ++++++++++++ src/objective/cuda/cuda_xentropy_objective.cu | 42 +++++++++++++++++++ .../cuda/cuda_xentropy_objective.hpp | 35 ++++++++++++++++ src/objective/xentropy_objective.hpp | 2 +- .../cuda/new_cuda_tree_learner.cpp | 9 ---- src/treelearner/serial_tree_learner.cpp | 6 --- 9 files changed, 125 insertions(+), 41 deletions(-) create mode 100644 src/objective/cuda/cuda_xentropy_objective.cpp create mode 100644 src/objective/cuda/cuda_xentropy_objective.cu create mode 100644 src/objective/cuda/cuda_xentropy_objective.hpp diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 82c05bec324a..c4c0e1f1fce1 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -374,7 +374,7 @@ __device__ void PrefixSumDevice(const VAL_T* in_values, } shared_buffer[threadIdx.x] = thread_sum; __syncthreads(); - PrefixSum(shared_buffer, num_data); + PrefixSum(shared_buffer, blockDim.x); const REDUCE_VAL_T thread_base = shared_buffer[threadIdx.x]; for (INDEX_T index = start; index < end; ++index) { out_values[index] = thread_base + static_cast(in_values[sorted_indices[index]]); diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index d16817e14c08..572063b1f6a0 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -364,19 +364,19 @@ void BitonicArgSortGlobal(const label_t* values, da } -template +template __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, const int len) { - __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS]; - __shared__ INDEX_T shared_indices[BITONIC_SORT_NUM_ELEMENTS]; + __shared__ VAL_T shared_values[BLOCK_DIM]; + __shared__ INDEX_T shared_indices[BLOCK_DIM]; int len_to_shift = len - 1; int max_depth = 1; while (len_to_shift > 0) { len_to_shift >>= 1; ++max_depth; } - const int num_blocks = (len + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; + const int num_blocks = (len + BLOCK_DIM - 1) / BLOCK_DIM; for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS + static_cast(threadIdx.x); + const int this_index = block_index * BLOCK_DIM + static_cast(threadIdx.x); if (this_index < len) { shared_values[threadIdx.x] = values[this_index]; shared_indices[threadIdx.x] = this_index; @@ -384,7 +384,7 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons shared_indices[threadIdx.x] = len; } __syncthreads(); - for (int depth = max_depth - 1; depth > max_depth - 11; --depth) { + for (int depth = max_depth - 1; depth > max_depth - MAX_DEPTH; --depth) { const int segment_length = (1 << (max_depth - depth)); const int segment_index = this_index / segment_length; const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); @@ -436,13 +436,13 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons } __syncthreads(); } - for (int depth = max_depth - 11; depth >= 1; --depth) { + for (int depth = max_depth - MAX_DEPTH; depth >= 1; --depth) { const int segment_length = (1 << (max_depth - depth)); { const int num_total_segment = (len + segment_length - 1) / segment_length; const int half_segment_length = (segment_length >> 1); for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS + static_cast(threadIdx.x); + const int this_index = block_index * BLOCK_DIM + static_cast(threadIdx.x); const int segment_index = this_index / segment_length; const int half_segment_index = this_index / half_segment_length; const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); @@ -467,10 +467,10 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons } __syncthreads(); } - for (int inner_depth = depth + 1; inner_depth <= max_depth - 11; ++inner_depth) { + for (int inner_depth = depth + 1; inner_depth <= max_depth - MAX_DEPTH; ++inner_depth) { const int half_segment_length = (1 << (max_depth - inner_depth - 1)); for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS + static_cast(threadIdx.x); + const int this_index = block_index * BLOCK_DIM + static_cast(threadIdx.x); const int segment_index = this_index / segment_length; const int half_segment_index = this_index / half_segment_length; const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); @@ -491,7 +491,7 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons } } for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS + static_cast(threadIdx.x); + const int this_index = block_index * BLOCK_DIM + static_cast(threadIdx.x); const int segment_index = this_index / segment_length; const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); if (this_index < len) { @@ -502,7 +502,7 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons shared_indices[threadIdx.x] = len; } __syncthreads(); - for (int inner_depth = max_depth - 10; inner_depth < max_depth; ++inner_depth) { + for (int inner_depth = max_depth - MAX_DEPTH + 1; inner_depth < max_depth; ++inner_depth) { const int half_segment_length = (1 << (max_depth - inner_depth - 1)); const int half_segment_index = this_index / half_segment_length; if (half_segment_index % 2 == 0) { @@ -702,7 +702,7 @@ __global__ void BitonicArgSortItemsGlobalKernel(const double* scores, const data_size_t query_item_start = cuda_query_boundaries[query_index]; const data_size_t query_item_end = cuda_query_boundaries[query_index + 1]; const data_size_t num_items_in_query = query_item_end - query_item_start; - BitonicArgSortDevice(scores + query_item_start, + BitonicArgSortDevice(scores + query_item_start, out_indices + query_item_start, num_items_in_query); __syncthreads(); @@ -1329,8 +1329,8 @@ __device__ VAL_T PercentileDeviceInner(const VAL_T* values, if (len <= 1) { return values[0]; } - BitonicArgSortDevice512(values, indices, len); if (!USE_WEIGHT) { + BitonicArgSortDevice(values, indices, len); const double float_pos = (1.0f - alpha) * len; const INDEX_T pos = static_cast(float_pos); if (pos < 1) { @@ -1344,6 +1344,7 @@ __device__ VAL_T PercentileDeviceInner(const VAL_T* values, return static_cast(v1 - (v1 - v2) * bias); } } else { + BitonicArgSortDevice(values, indices, len); PrefixSumDevice(weights, indices, weights_prefix_sum, len); const REDUCE_WEIGHT_T threshold = weights_prefix_sum[len - 1] * (1.0f - alpha); __shared__ INDEX_T pos; diff --git a/src/objective/cuda/cuda_regression_objective.cu b/src/objective/cuda/cuda_regression_objective.cu index 4ad35ab773ed..4d288be64389 100644 --- a/src/objective/cuda/cuda_regression_objective.cu +++ b/src/objective/cuda/cuda_regression_objective.cu @@ -171,7 +171,7 @@ void CUDARegressionL1loss::LaunchRenewTreeOutputCUDAKernel( cuda_data_indices_buffer_, leaf_value); } else { - RenewTreeOutputCUDAKernel_RegressionL1<<>>( + RenewTreeOutputCUDAKernel_RegressionL1<<>>( score, cuda_labels_, cuda_weights_, @@ -378,7 +378,7 @@ void CUDARegressionQuantileloss::LaunchRenewTreeOutputCUDAKernel( leaf_value, alpha_); } else { - RenewTreeOutputCUDAKernel_RegressionQuantile<<>>( + RenewTreeOutputCUDAKernel_RegressionQuantile<<>>( score, cuda_labels_, cuda_weights_, @@ -499,7 +499,6 @@ double CUDARegressionMAPELOSS::LaunchCalcInitScoreKernel() const { return static_cast(percentile_result); } -template __global__ void RenewTreeOutputCUDAKernel_RegressionMAPE( const double* score, const label_t* label, @@ -525,13 +524,11 @@ __global__ void RenewTreeOutputCUDAKernel_RegressionMAPE( const label_t data_label = label[data_index]; const double data_score = score[data_index]; residual_buffer[inner_data_index] = static_cast(data_label) - data_score; - if (USE_WEIGHT) { - weight_by_leaf[inner_data_index] = weight[data_index]; - } + weight_by_leaf[inner_data_index] = weight[data_index]; } __syncthreads(); // TODO(shiyu1994): replace this bitonic sort based percentile method with a more efficient one - const double renew_leaf_value = PercentileDevice( + const double renew_leaf_value = PercentileDevice( residual_buffer_pointer, weight_by_leaf_pointer, data_indices_buffer_pointer, weight_prefix_sum_buffer_pointer, alpha, num_data); if (threadIdx.x == 0) { @@ -546,8 +543,7 @@ void 
CUDARegressionMAPELOSS::LaunchRenewTreeOutputCUDAKernel( const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const { - Log::Warning("laucnhing RenewTreeOutputCUDAKernel_RegressionMAPE"); - RenewTreeOutputCUDAKernel_RegressionMAPE<<>>( + RenewTreeOutputCUDAKernel_RegressionMAPE<<>>( score, cuda_labels_, cuda_label_weights_, @@ -559,7 +555,6 @@ void CUDARegressionMAPELOSS::LaunchRenewTreeOutputCUDAKernel( data_start_in_leaf, cuda_data_indices_buffer_, leaf_value); - PrintLastCUDAErrorOuter(__FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } diff --git a/src/objective/cuda/cuda_xentropy_objective.cpp b/src/objective/cuda/cuda_xentropy_objective.cpp new file mode 100644 index 000000000000..e2b1e3dee5d2 --- /dev/null +++ b/src/objective/cuda/cuda_xentropy_objective.cpp @@ -0,0 +1,26 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include "cuda_xentropy_objective.hpp" + +namespace LightGBM { + +CUDACrossEntropy::CUDACrossEntropy(const Config& config): CrossEntropy(config) {} + +CUDACrossEntropy::CUDACrossEntropy(const std::vector& strs): CrossEntropy(strs) {} + +CUDACrossEntropy::~CUDACrossEntropy() {} + +void CUDACrossEntropy::Init(const Metadata& metadata, data_size_t num_data) { + CrossEntropy::Init(metadata, num_data); + cuda_labels_ = metadata.cuda_metadata()->cuda_label(); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); +} + +void CUDACrossEntropy::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { + LaunchGetGradientsKernel(score, gradients, hessians); +} + +} // namespace LightGBM diff --git a/src/objective/cuda/cuda_xentropy_objective.cu b/src/objective/cuda/cuda_xentropy_objective.cu new file mode 100644 index 000000000000..c4c2e64e56a0 --- /dev/null +++ b/src/objective/cuda/cuda_xentropy_objective.cu @@ -0,0 +1,42 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
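The kernel that follows maps one CUDA thread to one data row: it turns the raw score into a probability with a sigmoid and writes that row's cross-entropy gradient and hessian. As a reference for the per-row math, a minimal host-side sketch is given here; it assumes the convention of the CPU CrossEntropy objective, in which the optional sample weight scales the whole (p - label) term and the hessian, and the helper name below is illustrative only, not part of the patch.

#include <cmath>

// Reference per-row cross-entropy gradient/hessian (host-side sketch, not the kernel).
inline void CrossEntropyGradientReference(double score, double label, double weight,
                                          double* gradient, double* hessian) {
  const double p = 1.0 / (1.0 + std::exp(-score));  // sigmoid of the raw model score
  *gradient = (p - label) * weight;                 // first derivative w.r.t. the score
  *hessian = p * (1.0 - p) * weight;                // second derivative w.r.t. the score
}

Passing weight = 1.0 gives the unweighted case handled by the kernel's USE_WEIGHT = false branch.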
+ */ + +#include "cuda_xentropy_objective.hpp" + +namespace LightGBM { + +template +__global__ void GetGradientsKernel_CrossEntropy( + const double* cuda_scores, + const label_t* cuda_labels, + const label_t* cuda_weights, + const data_size_t num_data, + score_t* cuda_out_gradients, + score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data) { + if (USE_WEIGHT) { + const double z = 1.0f / (1.0f + exp(-cuda_scores[data_index])); + const label_t weight = cuda_weights[data_index]; + cuda_out_gradients[data_index] = static_cast(z - cuda_labels[data_index] * weight); + cuda_out_hessians[data_index] = static_cast(z * (1.0f - z) * weight); + } else { + const double z = 1.0f / (1.0f + exp(-cuda_scores[data_index])); + cuda_out_gradients[data_index] = static_cast(z - cuda_labels[data_index]); + cuda_out_hessians[data_index] = static_cast(z * (1.0f - z)); + } + } +} + +void CUDACrossEntropy::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_XENTROPY - 1) / GET_GRADIENTS_BLOCK_SIZE_XENTROPY; + if (cuda_weights_ == nullptr) { + GetGradientsKernel_CrossEntropy<<>>(score, cuda_labels_, nullptr, num_data_, gradients, hessians); + } else { + GetGradientsKernel_CrossEntropy<<>>(score, cuda_labels_, cuda_weights_, num_data_, gradients, hessians); + } +} + +} // namespace LightGBM diff --git a/src/objective/cuda/cuda_xentropy_objective.hpp b/src/objective/cuda/cuda_xentropy_objective.hpp new file mode 100644 index 000000000000..05cd0e24c4ef --- /dev/null +++ b/src/objective/cuda/cuda_xentropy_objective.hpp @@ -0,0 +1,35 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ +#ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_XENTROPY_OBJECTIVE_HPP_ +#define LIGHTGBM_OBJECTIVE_CUDA_CUDA_XENTROPY_OBJECTIVE_HPP_ + +#include "../xentropy_objective.hpp" + +#define GET_GRADIENTS_BLOCK_SIZE_XENTROPY (1024) + +namespace LightGBM { + +class CUDACrossEntropy: public CrossEntropy { + public: + explicit CUDACrossEntropy(const Config& config); + + explicit CUDACrossEntropy(const std::vector& strs); + + ~CUDACrossEntropy(); + + virtual void Init(const Metadata& metadata, data_size_t num_data) override; + + virtual void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; + + private: + void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; + + const label_t* cuda_labels_; + const label_t* cuda_weights_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_XENTROPY_OBJECTIVE_HPP_ \ No newline at end of file diff --git a/src/objective/xentropy_objective.hpp b/src/objective/xentropy_objective.hpp index e5e6f9d14ea7..4ce73a847a47 100644 --- a/src/objective/xentropy_objective.hpp +++ b/src/objective/xentropy_objective.hpp @@ -136,7 +136,7 @@ class CrossEntropy: public ObjectiveFunction { return initscore; } - private: + protected: /*! \brief Number of data points */ data_size_t num_data_; /*! 
\brief Pointer for label */ diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index a11400ae0ad7..ad236210fe54 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -224,21 +224,12 @@ void NewCUDATreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* ob const double* score, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { CHECK(tree->is_cuda_tree()); CUDATree* cuda_tree = reinterpret_cast(tree); - std::vector host_leaf_values(cuda_tree->num_leaves(), 0.0f); - CopyFromCUDADeviceToHostOuter(host_leaf_values.data(), cuda_tree->cuda_leaf_value(), static_cast(cuda_tree->num_leaves()), __FILE__, __LINE__); - for (int leaf_index = 0; leaf_index < cuda_tree->num_leaves(); ++leaf_index) { - Log::Warning("before convert tree output, leaf_index = %d, leaf_value = %f", leaf_index, host_leaf_values[leaf_index]); - } obj->RenewTreeOutputCUDA(score, cuda_data_partition_->cuda_data_indices(), cuda_data_partition_->cuda_leaf_num_data(), cuda_data_partition_->cuda_leaf_data_start(), tree->num_leaves(), cuda_tree->cuda_leaf_value_ref()); - CopyFromCUDADeviceToHostOuter(host_leaf_values.data(), cuda_tree->cuda_leaf_value(), static_cast(cuda_tree->num_leaves()), __FILE__, __LINE__); - for (int leaf_index = 0; leaf_index < cuda_tree->num_leaves(); ++leaf_index) { - Log::Warning("after convert tree output, leaf_index = %d, leaf_value = %f", leaf_index, host_leaf_values[leaf_index]); - } } } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index cbe46ac47a12..5fd872d61eeb 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -691,9 +691,6 @@ void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj } std::vector n_nozeroworker_perleaf(tree->num_leaves(), 1); int num_machines = Network::num_machines(); - for (int leaf_index = 0; leaf_index < tree->num_leaves(); ++leaf_index) { - Log::Warning("before converting tree output leaf_index = %d, leaf_output = %f", leaf_index, tree->LeafOutput(leaf_index)); - } #pragma omp parallel for schedule(static) for (int i = 0; i < tree->num_leaves(); ++i) { const double output = static_cast(tree->LeafOutput(i)); @@ -720,9 +717,6 @@ void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj tree->SetLeafOutput(i, outputs[i] / n_nozeroworker_perleaf[i]); } } - for (int leaf_index = 0; leaf_index < tree->num_leaves(); ++leaf_index) { - Log::Warning("after converting tree output leaf_index = %d, leaf_output = %f", leaf_index, tree->LeafOutput(leaf_index)); - } } } From b50ce5b436c1d5a600e6f5575e978c807be783e2 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 3 Sep 2021 09:15:08 +0000 Subject: [PATCH 062/166] use template for different versions of BitonicArgSortDevice --- include/LightGBM/cuda/cuda_algorithms.hpp | 3 - src/cuda/cuda_algorithms.cu | 182 ++-------------------- 2 files changed, 9 insertions(+), 176 deletions(-) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index c4c0e1f1fce1..c0294ab81889 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -452,14 +452,11 @@ void PercentileGlobal(const VAL_T* values, BitonicArgSortGlobal(values, indices, len); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); if (USE_WEIGHT) { - Log::Warning("before prefix 
sum"); GlobalInclusiveArgPrefixSum(indices, weights, weights_prefix_sum, weights_prefix_sum_buffer, static_cast(len)); } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - Log::Warning("after prefix sum"); PercentileGlobalKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(values, weights, indices, weights_prefix_sum, alpha, len, cuda_out_value); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - Log::Warning("after percentile"); } } // namespace LightGBM diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 572063b1f6a0..1300f44e2aac 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -374,9 +374,9 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons len_to_shift >>= 1; ++max_depth; } - const int num_blocks = (len + BLOCK_DIM - 1) / BLOCK_DIM; + const int num_blocks = (len + static_cast(BLOCK_DIM) - 1) / static_cast(BLOCK_DIM); for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * BLOCK_DIM + static_cast(threadIdx.x); + const int this_index = block_index * static_cast(BLOCK_DIM) + static_cast(threadIdx.x); if (this_index < len) { shared_values[threadIdx.x] = values[this_index]; shared_indices[threadIdx.x] = this_index; @@ -384,7 +384,7 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons shared_indices[threadIdx.x] = len; } __syncthreads(); - for (int depth = max_depth - 1; depth > max_depth - MAX_DEPTH; --depth) { + for (int depth = max_depth - 1; depth > max_depth - static_cast(MAX_DEPTH); --depth) { const int segment_length = (1 << (max_depth - depth)); const int segment_index = this_index / segment_length; const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); @@ -436,13 +436,13 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons } __syncthreads(); } - for (int depth = max_depth - MAX_DEPTH; depth >= 1; --depth) { + for (int depth = max_depth - static_cast(MAX_DEPTH); depth >= 1; --depth) { const int segment_length = (1 << (max_depth - depth)); { const int num_total_segment = (len + segment_length - 1) / segment_length; const int half_segment_length = (segment_length >> 1); for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * BLOCK_DIM + static_cast(threadIdx.x); + const int this_index = block_index * static_cast(BLOCK_DIM) + static_cast(threadIdx.x); const int segment_index = this_index / segment_length; const int half_segment_index = this_index / half_segment_length; const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); @@ -467,10 +467,10 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons } __syncthreads(); } - for (int inner_depth = depth + 1; inner_depth <= max_depth - MAX_DEPTH; ++inner_depth) { + for (int inner_depth = depth + 1; inner_depth <= max_depth - static_cast(MAX_DEPTH); ++inner_depth) { const int half_segment_length = (1 << (max_depth - inner_depth - 1)); for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * BLOCK_DIM + static_cast(threadIdx.x); + const int this_index = block_index * static_cast(BLOCK_DIM) + static_cast(threadIdx.x); const int segment_index = this_index / segment_length; const int half_segment_index = this_index / half_segment_length; const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); @@ -491,7 +491,7 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons } } for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * BLOCK_DIM + static_cast(threadIdx.x); + const int this_index = block_index * static_cast(BLOCK_DIM) + static_cast(threadIdx.x); const int segment_index = this_index / segment_length; const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); if (this_index < len) { @@ -502,171 +502,7 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons shared_indices[threadIdx.x] = len; } __syncthreads(); - for (int inner_depth = max_depth - MAX_DEPTH + 1; inner_depth < max_depth; ++inner_depth) { - const int half_segment_length = (1 << (max_depth - inner_depth - 1)); - const int half_segment_index = this_index / half_segment_length; - if (half_segment_index % 2 == 0) { - const int other_index = static_cast(threadIdx.x) + half_segment_length; - const INDEX_T this_data_index = shared_indices[threadIdx.x]; - const INDEX_T other_data_index = shared_indices[other_index]; - const VAL_T this_value = shared_values[threadIdx.x]; - const VAL_T other_value = shared_values[other_index]; - if (other_data_index < len && (this_value > other_value) == ascending) { - shared_indices[threadIdx.x] = other_data_index; - shared_indices[other_index] = this_data_index; - shared_values[threadIdx.x] = other_value; - shared_values[other_index] = this_value; - } - } - __syncthreads(); - } - if (this_index < len) { - indices[this_index] = shared_indices[threadIdx.x]; - } - __syncthreads(); - } - } -} - -template -__device__ void BitonicArgSortDevice512(const VAL_T* values, INDEX_T* indices, const int len) { - __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS / 2]; - __shared__ INDEX_T shared_indices[BITONIC_SORT_NUM_ELEMENTS / 2]; - int len_to_shift = len - 1; - int max_depth = 1; - while (len_to_shift > 0) { - len_to_shift >>= 1; - ++max_depth; - } - const int num_blocks = (len + (BITONIC_SORT_NUM_ELEMENTS / 2) - 1) / (BITONIC_SORT_NUM_ELEMENTS / 2); - for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS / 2 + static_cast(threadIdx.x); - if (this_index < len) { - shared_values[threadIdx.x] = values[this_index]; - shared_indices[threadIdx.x] = this_index; - } else { - shared_indices[threadIdx.x] = len; - } - __syncthreads(); - for (int depth = max_depth - 1; depth > max_depth - 10; --depth) { - const int segment_length = (1 << (max_depth - depth)); - const int segment_index = this_index / segment_length; - const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); - { - const int half_segment_length = (segment_length >> 1); - const int half_segment_index = this_index / half_segment_length; - const int num_total_segment = (len + segment_length - 1) / segment_length; - const int offset = (segment_index == num_total_segment - 1 && ascending == ASCENDING) ? 
- (num_total_segment * segment_length - len) : 0; - if (half_segment_index % 2 == 0) { - const int segment_start = segment_index * segment_length; - if (this_index >= offset + segment_start) { - const int other_index = static_cast(threadIdx.x) + half_segment_length - offset; - const INDEX_T this_data_index = shared_indices[threadIdx.x]; - const INDEX_T other_data_index = shared_indices[other_index]; - const VAL_T this_value = shared_values[threadIdx.x]; - const VAL_T other_value = shared_values[other_index]; - if (other_data_index < len && (this_value > other_value) == ascending) { - shared_indices[threadIdx.x] = other_data_index; - shared_indices[other_index] = this_data_index; - shared_values[threadIdx.x] = other_value; - shared_values[other_index] = this_value; - } - } - } - __syncthreads(); - } - for (int inner_depth = depth + 1; inner_depth < max_depth; ++inner_depth) { - const int half_segment_length = (1 << (max_depth - inner_depth - 1)); - const int half_segment_index = this_index / half_segment_length; - if (half_segment_index % 2 == 0) { - const int other_index = static_cast(threadIdx.x) + half_segment_length; - const INDEX_T this_data_index = shared_indices[threadIdx.x]; - const INDEX_T other_data_index = shared_indices[other_index]; - const VAL_T this_value = shared_values[threadIdx.x]; - const VAL_T other_value = shared_values[other_index]; - if (other_data_index < len && (this_value > other_value) == ascending) { - shared_indices[threadIdx.x] = other_data_index; - shared_indices[other_index] = this_data_index; - shared_values[threadIdx.x] = other_value; - shared_values[other_index] = this_value; - } - } - __syncthreads(); - } - } - if (this_index < len) { - indices[this_index] = shared_indices[threadIdx.x]; - } - __syncthreads(); - } - for (int depth = max_depth - 10; depth >= 1; --depth) { - const int segment_length = (1 << (max_depth - depth)); - { - const int num_total_segment = (len + segment_length - 1) / segment_length; - const int half_segment_length = (segment_length >> 1); - for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS / 2 + static_cast(threadIdx.x); - const int segment_index = this_index / segment_length; - const int half_segment_index = this_index / half_segment_length; - const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); - const int offset = (segment_index == num_total_segment - 1 && ascending == ASCENDING) ? 
- (num_total_segment * segment_length - len) : 0; - if (half_segment_index % 2 == 0) { - const int segment_start = segment_index * segment_length; - if (this_index >= offset + segment_start) { - const int other_index = this_index + half_segment_length - offset; - if (other_index < len) { - const INDEX_T this_data_index = indices[this_index]; - const INDEX_T other_data_index = indices[other_index]; - const VAL_T this_value = values[this_data_index]; - const VAL_T other_value = values[other_data_index]; - if ((this_value > other_value) == ascending) { - indices[this_index] = other_data_index; - indices[other_index] = this_data_index; - } - } - } - } - } - __syncthreads(); - } - for (int inner_depth = depth + 1; inner_depth <= max_depth - 10; ++inner_depth) { - const int half_segment_length = (1 << (max_depth - inner_depth - 1)); - for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS / 2 + static_cast(threadIdx.x); - const int segment_index = this_index / segment_length; - const int half_segment_index = this_index / half_segment_length; - const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); - if (half_segment_index % 2 == 0) { - const int other_index = this_index + half_segment_length; - if (other_index < len) { - const INDEX_T this_data_index = indices[this_index]; - const INDEX_T other_data_index = indices[other_index]; - const VAL_T this_value = values[this_data_index]; - const VAL_T other_value = values[other_data_index]; - if ((this_value > other_value) == ascending) { - indices[this_index] = other_data_index; - indices[other_index] = this_data_index; - } - } - } - __syncthreads(); - } - } - for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * BITONIC_SORT_NUM_ELEMENTS / 2 + static_cast(threadIdx.x); - const int segment_index = this_index / segment_length; - const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); - if (this_index < len) { - const INDEX_T index = indices[this_index]; - shared_values[threadIdx.x] = values[index]; - shared_indices[threadIdx.x] = index; - } else { - shared_indices[threadIdx.x] = len; - } - __syncthreads(); - for (int inner_depth = max_depth - 9; inner_depth < max_depth; ++inner_depth) { + for (int inner_depth = max_depth - static_cast(MAX_DEPTH) + 1; inner_depth < max_depth; ++inner_depth) { const int half_segment_length = (1 << (max_depth - inner_depth - 1)); const int half_segment_index = this_index / half_segment_length; if (half_segment_index % 2 == 0) { From f51fd7092285f2330003febce227fee7b3390ee8 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 6 Sep 2021 14:54:32 +0000 Subject: [PATCH 063/166] add multiclass metrics --- include/LightGBM/cuda/cuda_algorithms.hpp | 36 ++ src/cuda/cuda_algorithms.cu | 24 ++ src/metric/cuda/cuda_multiclass_metric.cpp | 104 ++++++ src/metric/cuda/cuda_multiclass_metric.cu | 312 ++++++++++++++++++ src/metric/cuda/cuda_multiclass_metric.hpp | 125 +++++++ src/metric/cuda/cuda_xentropy_metric.cpp | 79 +++++ src/metric/cuda/cuda_xentropy_metric.cu | 149 +++++++++ src/metric/cuda/cuda_xentropy_metric.hpp | 77 +++++ src/metric/metric.cpp | 15 +- src/metric/multiclass_metric.hpp | 4 +- src/metric/xentropy_metric.hpp | 6 +- .../cuda/cuda_xentropy_objective.cpp | 29 ++ src/objective/cuda/cuda_xentropy_objective.cu | 78 +++++ .../cuda/cuda_xentropy_objective.hpp | 31 +- src/objective/objective_function.cpp | 18 +- src/objective/xentropy_objective.hpp | 2 +- 16 files changed, 1075 insertions(+), 14 deletions(-) create mode 100644 src/metric/cuda/cuda_multiclass_metric.cpp create mode 100644 src/metric/cuda/cuda_multiclass_metric.cu create mode 100644 src/metric/cuda/cuda_multiclass_metric.hpp create mode 100644 src/metric/cuda/cuda_xentropy_metric.cpp create mode 100644 src/metric/cuda/cuda_xentropy_metric.cu create mode 100644 src/metric/cuda/cuda_xentropy_metric.hpp diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index c0294ab81889..14acb2494c07 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -57,6 +57,9 @@ __device__ void ReduceSumConflictFree(T* values, size_t n) { template void ReduceSumGlobal(const VAL_T* values, size_t n, REDUCE_T* block_buffer); +template +void ReduceDotProductGlobal(const VAL_A_T* a, const VAL_B_T* b, size_t n, REDUCE_T* block_buffer); + template void ReduceMaxGlobal(const VAL_T* values, size_t n, REDUCE_T* block_buffer); @@ -77,6 +80,30 @@ void GlobalGenAUCPosNegSum(const label_t* labels, double* block_sum_pos_buffer, const data_size_t num_data); +template +__global__ void GlobalInclusivePrefixSumReduceBlockKernel(T* block_buffer, data_size_t num_blocks); + +template +__global__ void GlobalInclusivePrefixSumAddBlockBaseKernel(const T* block_buffer, T* values, data_size_t num_data); + +__global__ void GlobalGenAUCMarkKernel(const double* scores, + const data_size_t* sorted_indices, + data_size_t* mark_buffer, + data_size_t* block_mark_buffer, + uint16_t* block_mark_first_zero, + data_size_t num_data); + +__global__ void GlobalInclusivePrefixSumReduceBlockZeroOutKernel( + data_size_t* block_buffer, + const uint16_t* block_mark_first_zero, + data_size_t num_blocks); + +__global__ void GlobalInclusivePrefixSumAddBlockBaseGenAUCMarkKernel( + const data_size_t* block_buffer, + data_size_t* values, + const uint16_t* block_first_zero, + data_size_t num_data); + void 
GloblGenAUCMark(const double* scores, const data_size_t* sorted_indices, data_size_t* mark_buffer, @@ -178,6 +205,15 @@ void BitonicSortGlobal(VAL_T* values, const size_t len); template void BitonicArgSortGlobal(const VAL_T* values, INDEX_T* indices, const size_t len); +template +__global__ void BitonicArgSortGlobalKernel(const VAL_T* values, INDEX_T* indices, const int num_total_data); + +template +__global__ void BitonicArgCompareKernel(const VAL_T* values, INDEX_T* indices, const int half_segment_length, const int outer_segment_length, const int len); + +template +__global__ void BitonicArgSortMergeKernel(const VAL_T* values, INDEX_T* indices, const int segment_length, const int len); + void BitonicArgSortItemsGlobal(const double* values, const int num_queries, const data_size_t* cuda_query_boundaries, diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 1300f44e2aac..0d03d2f65eeb 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -1102,11 +1102,35 @@ void ReduceSumGlobalInner(const VAL_T* values, size_t n, REDUCE_T* block_buffer) BlockReduceSum<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks); } +template +__global__ void ReduceDotProductGlobalKernel(const VAL_A_T* a, const VAL_B_T* b, const data_size_t num_value, REDUCE_T* block_buffer) { + __shared__ REDUCE_T shared_buffer[32]; + const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + const REDUCE_T value = (data_index < num_value ? static_cast(a[data_index]) * static_cast(b[data_index]) : 0.0f); + const REDUCE_T reduce_value = ShuffleReduceSum(value, shared_buffer, blockDim.x); + if (threadIdx.x == 0) { + block_buffer[blockIdx.x] = reduce_value; + } +} + template <> void ReduceSumGlobal(const label_t* values, size_t n, double* block_buffer) { ReduceSumGlobalInner(values, n, block_buffer); } +template +void ReduceDotProductGlobalInner(const VAL_A_T* a, const VAL_B_T* b, size_t n, REDUCE_T* block_buffer) { + const data_size_t num_value = static_cast(n); + const data_size_t num_blocks = (num_value + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; + ReduceDotProductGlobalKernel<<>>(a, b, num_value, block_buffer); + BlockReduceSum<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks); +} + +template <> +void ReduceDotProductGlobal(const label_t* a, const label_t* b, size_t n, double* block_buffer) { + ReduceDotProductGlobalInner(a, b, n, block_buffer); +} + template __global__ void ReduceMaxGlobalKernel(const VAL_T* values, const data_size_t num_value, REDUCE_T* block_buffer) { __shared__ REDUCE_T shared_buffer[32]; diff --git a/src/metric/cuda/cuda_multiclass_metric.cpp b/src/metric/cuda/cuda_multiclass_metric.cpp new file mode 100644 index 000000000000..2a8dd0a1928e --- /dev/null +++ b/src/metric/cuda/cuda_multiclass_metric.cpp @@ -0,0 +1,104 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
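The pointwise multiclass metrics implemented in this file and its .cu counterpart share one evaluation shape: the objective's CUDA output conversion turns raw scores into per-class probabilities, each thread computes one row's (optionally weighted) loss, each block reduces its rows to a partial sum, and a final single-block kernel adds the partial sums before the host divides by the weight sum. A condensed sketch of that two-stage reduction follows; the kernel and buffer names in it are illustrative stand-ins rather than the helpers used here, and it assumes a power-of-two block size of at most 256.

// Stage 1: one partial sum per block (sketch only, simplified names).
__global__ void PointwiseLossPartialSums(const double* row_loss, int num_rows,
                                         double* block_sums) {
  __shared__ double shared[256];
  const int row = blockIdx.x * blockDim.x + threadIdx.x;
  shared[threadIdx.x] = (row < num_rows) ? row_loss[row] : 0.0;
  __syncthreads();
  for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {  // tree reduction in shared memory
    if (threadIdx.x < offset) shared[threadIdx.x] += shared[threadIdx.x + offset];
    __syncthreads();
  }
  if (threadIdx.x == 0) block_sums[blockIdx.x] = shared[0];
}
// Stage 2: a single-block launch applies the same reduction to block_sums, and the host
// divides the resulting total by the weight sum to obtain the reported metric value.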
+ */ + +#include "cuda_multiclass_metric.hpp" + +namespace LightGBM { + +template +CUDAMulticlassMetric::CUDAMulticlassMetric(const Config& config): MulticlassMetric(config) {} + +template +CUDAMulticlassMetric::~CUDAMulticlassMetric() {} + +template +void CUDAMulticlassMetric::Init(const Metadata& metadata, data_size_t num_data) { + cuda_label_ = metadata.cuda_metadata()->cuda_label(); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); + + const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_MULTICLASS_METRIC - 1) / EVAL_BLOCK_SIZE_MULTICLASS_METRIC; + AllocateCUDAMemoryOuter(&cuda_sum_loss_buffer_, static_cast(num_blocks), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_score_convert_buffer_, static_cast(num_data), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_sum_loss_, 1, __FILE__, __LINE__); +} + +template +std::vector CUDAMulticlassMetric::Eval(const double* score, const ObjectiveFunction* objective) const { + double sum_loss = 0.0f; + objective->GetCUDAConvertOutputFunc()(this->num_data_, score, cuda_score_convert_buffer_); + LaunchEvalKernel(cuda_score_convert_buffer_); + CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); + return std::vector(1, CUDAPointWiseLossCalculator::AverageLoss(sum_loss, this->sum_weights_)); +} + +CUDAMultiErrorMetric::CUDAMultiErrorMetric(const Config& config): CUDAMulticlassMetric(config) {} + +CUDAMultiSoftmaxLoglossMetric::CUDAMultiSoftmaxLoglossMetric(const Config& config): CUDAMulticlassMetric(config) {} + +CUDAAucMuMetric::CUDAAucMuMetric(const Config& config): AucMuMetric(config) {} + +CUDAAucMuMetric::~CUDAAucMuMetric() {} + +void CUDAAucMuMetric::Init(const Metadata& metadata, data_size_t num_data) { + AucMuMetric::Init(metadata, num_data); + std::vector class_start(num_class_, 0); + data_size_t max_class_size = 0; + int max_class_size_class = -1; + for (int i = 0; i < num_class_; ++i) { + const data_size_t this_class_size = class_sizes_[i]; + if (this_class_size > max_class_size) { + max_class_size = this_class_size; + max_class_size_class = i; + } + } + data_size_t second_max_class_size = 0; + for (int i = 0; i < num_class_; ++i) { + if (i != max_class_size_class) { + const data_size_t this_class_size = class_sizes_[i]; + if (this_class_size > second_max_class_size) { + second_max_class_size = this_class_size; + } + } + } + for (int i = 1; i < num_class_; ++i) { + class_start[i] += class_start[i - 1] + class_sizes_[i - 1]; + } + InitCUDAMemoryFromHostMemoryOuter(&cuda_class_start_, class_start.data(), class_start.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_class_size_, class_sizes_.data(), class_sizes_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_sorted_indices_, sorted_data_idx_.data(), sorted_data_idx_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_class_data_weights_, class_data_weights_.data(), class_data_weights_.size(), __FILE__, __LINE__); + const int num_class_pair = (num_class_ - 1) * num_class_ / 2; + max_pair_buffer_size_ = max_class_size + second_max_class_size; + const size_t total_pair_buffer_size = static_cast(max_pair_buffer_size_ * num_class_pair); + AllocateCUDAMemoryOuter(&cuda_dist_, total_pair_buffer_size, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_sorted_indices_by_dist_, total_pair_buffer_size, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_sum_pos_buffer_, total_pair_buffer_size, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_sum_neg_buffer_, total_pair_buffer_size, 
__FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_threshold_mask_, total_pair_buffer_size, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_reduce_block_buffer_, static_cast(num_class_pair), __FILE__, __LINE__); + + const int num_blocks = (max_pair_buffer_size_ + EVAL_BLOCK_SIZE_MULTICLASS_METRIC - 1) / EVAL_BLOCK_SIZE_MULTICLASS_METRIC; + AllocateCUDAMemoryOuter(&cuda_block_mark_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_block_mark_first_zero_, static_cast(num_blocks) + 1, __FILE__, __LINE__); + + const size_t curr_v_size = static_cast(num_class_pair * num_class_); + std::vector all_curr_v(curr_v_size, 0.0f); + for (int i = 0; i < num_class_ - 1; ++i) { + for (int j = i + 1; j < num_class_; ++j) { + const int i_p = num_class_ - 2 - i; + const int pair_index = i_p * (i_p + 1) / 2 + j - i - 1; + for (int k = 0; k < num_class_; ++k) { + all_curr_v[pair_index * num_class_ + k] = class_weights_[i][k] - class_weights_[j][k]; + } + } + } + InitCUDAMemoryFromHostMemoryOuter(&cuda_curr_v_, all_curr_v.data(), all_curr_v.size(), __FILE__, __LINE__); +} + +std::vector CUDAAucMuMetric::Eval(const double* score, const ObjectiveFunction*) const { + LaunchEvalKernel(score); +} + +} // namespace LightGBM diff --git a/src/metric/cuda/cuda_multiclass_metric.cu b/src/metric/cuda/cuda_multiclass_metric.cu new file mode 100644 index 000000000000..1c2e2cffb758 --- /dev/null +++ b/src/metric/cuda/cuda_multiclass_metric.cu @@ -0,0 +1,312 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include "cuda_multiclass_metric.hpp" +#include + +namespace LightGBM { + +template +__global__ void EvalKernel_MulticlassPointWiseLoss(const double* score, + const label_t* label, + const label_t* weights, + const data_size_t num_data, + const double sum_weight, + double* cuda_sum_loss_buffer, + const int num_classes, + const int multi_error_top_k) { + // assert that warpSize == 32 and maximum number of threads per block is 1024 + __shared__ double shared_buffer[32]; + const int data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const double* score_ptr = score + data_index * num_classes; + const double pointwise_loss = data_index < num_data ? + (USE_WEIGHT ? 
CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score_ptr, num_classes, multi_error_top_k) * weights[data_index] : + CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score_ptr, num_classes, multi_error_top_k)) : + 0.0f; + const double loss = ShuffleReduceSum(pointwise_loss, shared_buffer, blockDim.x); + if (threadIdx.x == 0) { + cuda_sum_loss_buffer[blockIdx.x] = loss; + } +} + +template +__global__ void ReduceLossKernel_Multiclass(const double* cuda_sum_loss_buffer, const data_size_t num_blocks, double* out_loss) { + __shared__ double shared_buffer[32]; + double thread_sum_loss = 0.0f; + for (int block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { + thread_sum_loss += cuda_sum_loss_buffer[block_index]; + } + const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, static_cast(num_blocks)); + if (threadIdx.x == 0) { + *out_loss = sum_loss; + } +} + +template +void CUDAMulticlassMetric::LaunchEvalKernelInner(const double* score) const { + const data_size_t num_blocks = (MulticlassMetric::num_data_ + EVAL_BLOCK_SIZE_MULTICLASS_METRIC - 1) / EVAL_BLOCK_SIZE_MULTICLASS_METRIC; + if (cuda_weights_ == nullptr) { + EvalKernel_MulticlassPointWiseLoss<<>>( + score, cuda_label_, cuda_weights_, + this->num_data_, + this->sum_weights_, + cuda_sum_loss_buffer_, + this->num_class_, + this->config_.multi_error_top_k); + } else { + EvalKernel_MulticlassPointWiseLoss<<>>( + score, cuda_label_, cuda_weights_, + this->num_data_, + this->sum_weights_, + cuda_sum_loss_buffer_, + this->num_class_, + this->config_.multi_error_top_k); + } + ReduceLossKernel_Multiclass<<<1, EVAL_BLOCK_SIZE_MULTICLASS_METRIC>>>(cuda_sum_loss_buffer_, num_blocks, cuda_sum_loss_); +} + +template <> +void CUDAMulticlassMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +template <> +void CUDAMulticlassMetric::LaunchEvalKernel(const double* score) const { + LaunchEvalKernelInner(score); +} + +__global__ void EvalKernel_AucMuInner( + const data_size_t* cuda_class_start, + const data_size_t* cuda_class_size, + const data_size_t* cuda_sorted_indices, + const double* cuda_class_data_weights, + double* cuda_dist, + data_size_t* cuda_sorted_indices_by_dist) { + +} + +__global__ void EvalKernel_AucMuWriteDist( + const data_size_t i_class_start, + const data_size_t i_class_size, + const data_size_t j_class_start, + const data_size_t j_class_size, + const data_size_t* cuda_sorted_indices, + const double* cuda_class_data_weights, + const double* cuda_curr_v, + const double* score, + const data_size_t max_pair_buffer_size, + const data_size_t num_data, + const int num_class, + double* cuda_dist) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + // put the dist of class j in the front + const data_size_t data_index_in_class = data_index < j_class_size ? data_index : data_index - j_class_size; + const data_size_t class_start = data_index < j_class_size ? j_class_size : i_class_start; + const data_size_t class_size = data_index < j_class_size ? 
j_class_size : i_class_size; + const data_size_t* sorted_indices_in_class = cuda_sorted_indices + class_start; + const data_size_t a = sorted_indices_in_class[data_index_in_class]; + double v_a = 0.0f; + for (int m = 0; m < num_class; ++m) { + v_a += cuda_curr_v[m] * score[num_data * m + a]; + } + const double t1 = cuda_curr_v[i] - cuda_curr_v[j]; + cuda_dist[data_index] = v_a * t1; +} + +__global__ void BitonicArgSortGlobal_AucMu( + const double* dist, + data_size_t* out_data_indices, + const data_size_t num_data) { + int max_depth = 1; + int len_to_shift = static_cast(num_data) - 1; + while (len_to_shift > 0) { + ++max_depth; + len_to_shift >>= 1; + } + const int num_blocks = (static_cast(num_data) + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; + BitonicArgSortGlobalKernel<<>>(dist, out_data_indices, static_cast(num_data)); + for (int depth = max_depth - 11; depth >= 1; --depth) { + const int segment_length = (1 << (max_depth - depth)); + int half_segment_length = (segment_length >> 1); + { + BitonicArgCompareKernel<<>>( + dist, out_data_indices, half_segment_length, segment_length, static_cast(num_data)); + half_segment_length >>= 1; + } + for (int inner_depth = depth + 1; inner_depth <= max_depth - 11; ++inner_depth) { + BitonicArgCompareKernel<<>>( + dist, out_data_indices, half_segment_length, segment_length, static_cast(num_data)); + half_segment_length >>= 1; + } + BitonicArgSortMergeKernel<<>>( + dist, out_data_indices, segment_length, static_cast(len)); + } +} + +template +__global__ void GenAucMuPosPrefixSumWithinBlock( + const data_size_t* sorted_data_indices_global, + const data_size_t* sorted_data_indices_two_class, + const data_size_t i_class_size, + const data_size_t j_class_size, + const data_size_t i_class_start, + const data_size_t j_class_start, + const label_t* cuda_weights, + double* sum_pos_buffer, + double* block_sum_pos_buffer) { + __shared__ double shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; + const data_size_t inner_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + double pos = 0.0f; + if (inner_data_index < j_class_size + i_class_size) { + const data_size_t data_index_two_class = sorted_data_indices[inner_data_index]; + const bool is_pos_class = (data_index_two_class < j_class_size); + if (USE_WEIGHT) { + const data_size_t data_index_one_class = (is_pos_class ? data_index_two_class : data_index_two_class - j_class_size); + const data_size_t data_index_global = (is_pos_class ? sorted_data_indices_global[j_class_start + data_index_one_class] : + sorted_data_indices_global[i_class_start + data_index_one_class]); + pos = ((is_pos_class == IS_POS) ? cuda_weights[data_index_global] : 0.0f); + } else { + pos = ((is_pos_class == IS_POS) ? 
1.0f : 0.0f); + } + } + shared_buffer[threadIdx.x] = pos; + __syncthreads(); + PrefixSum(shared_buffer, blockDim.x); + if (inner_data_index < j_class_size + i_class_size) { + sum_pos_buffer[inner_data_index] = shared_buffer[threadIdx.x + 1]; + } + if (threadIdx.x == 0) { + block_sum_pos_buffer[blockidx.x + 1] = shared_buffer[blockDim.x]; + } +} + +__global__ void GenAucMuPosPrefixSum( + const data_size_t* sorted_data_indices_global, + const data_size_t* sorted_data_indices_two_class, + const data_size_t i_class_size, + const data_size_t j_class_size, + const data_size_t i_class_start, + const data_size_t j_class_start, + const label_t* cuda_weights, + double* sum_pos_buffer, + double* block_sum_pos_buffer) { + const data_size_t num_data = i_class_size + j_class_size; + const int num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; + GenAucMuPosPrefixSumWithinBlock<<>>( + sorted_data_indices_global, + sorted_data_indices_two_class, + i_class_size, + j_class_size, + i_class_start, + j_class_start, + cuda_weights, + sum_pos_buffer, + block_sum_pos_buffer); + GlobalInclusivePrefixSumReduceBlockKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>( + block_sum_pos_buffer, num_blocks); + GlobalInclusivePrefixSumAddBlockBaseKernel<<>>( + block_sum_pos_buffer, sum_pos_buffer, num_data); +} + +__global__ void GenAucMuMark( + const double* dist, + const data_size_t* sorted_data_indices, + const data_size_t num_data, + data_size_t* threshold_mark, + data_size_t* block_mark_buffer, + uint16_t* block_mark_first_zero) { + const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; + GlobalGenAUCMarkKernel<<>>(dist, sorted_data_indices, threshold_mark, block_mark_buffer, block_mark_first_zero, num_data); + GlobalInclusivePrefixSumReduceBlockZeroOutKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>( + block_mark_buffer, block_mark_first_zero, num_blocks); + GlobalInclusivePrefixSumAddBlockBaseGenAUCMarkKernel<<>>( + block_mark_buffer, threshold_mark, block_mark_first_zero, num_data); +} + +template +__global__ void EvalKernel_AucMu( + const data_size_t* cuda_class_start, + const data_size_t* cuda_class_size, + const data_size_t* cuda_sorted_indices, + const double* cuda_class_data_weights, + const double* cuda_curr_v, + const double* score, + const data_size_t max_pair_buffer_size, + const data_size_t num_data, + const int num_class, + const label_t* cuda_weights, + double* cuda_dist, + data_size_t* cuda_sorted_indices_by_dist, + data_size_t* cuda_threshold_mark, + data_size_t* cuda_block_threshold_mark_buffer, + uint16_t* cuda_block_mark_first_zero, + double* sum_pos_buffer, + double* block_sum_pos_buffer) { + const int pair_index = static_cast(blockIdx.x); + const double index_2 = 2 * static_cast(pair_index); + const int sqrt_round = static_cast(sqrt(index_2)); + const int i_p = static_cast(sqrt(index_2 - static_cast(sqrt_round) + 1)); + const int j_p = pair_index - ((i_p + 1) * i_p / 2); + const int i = num_class - 2 - i_p; + const int j = j_p + i + 1; + const data_size_t num_data_in_pair = cuda_class_size[i] + cuda_class_size[j]; + const int num_blocks = (num_data_in_pair + EVAL_BLOCK_SIZE_MULTICLASS_METRIC - 1) / EVAL_BLOCK_SIZE_MULTICLASS_METRIC; + const data_size_t i_class_start = cuda_class_start[i]; + const data_size_t j_class_start = cuda_class_start[j]; + const data_size_t i_class_size = cuda_class_size[i]; + const data_size_t j_class_size = cuda_class_size[j]; + double* cuda_dist_ptr = cuda_dist + pair_index * 
max_pair_buffer_size; + data_size_t* cuda_sorted_indices_by_dist_ptr = cuda_sorted_indices_by_dist + pair_index * max_pair_buffer_size; + const double* cuda_curr_v_ptr = cuda_curr_v + pair_index * num_class; + cudaStream_t cuda_stream; + cudaStreamCreate(&cuda_stream); + EvalKernel_AucMuWriteDist<<>>( + i_class_start, + i_class_size, + j_class_start, + j_class_size, + cuda_sorted_indices, + cuda_class_data_weights, + cuda_curr_v_ptr, + score, + max_pair_buffer_size, + num_data, + num_class, + cuda_dist_ptr); + BitonicArgSortGlobal_AucMu<<<1, 1, 0, cuda_stream>>>( + cuda_dist_ptr, + cuda_sorted_indices_by_dist_ptr, + i_class_size + j_class_size); + GenAucMuPosPrefixSum<<<1, 1, 0, cuda_stream>>>( + cuda_sorted_indices, + cuda_sorted_indices_by_dist, + i_class_size, + j_class_size, + i_class_start, + j_class_start, + cuda_weights, + sum_pos_buffer, + block_sum_pos_buffer); + GenAucMuMark<<<1, 1, 0, cuda_stream>>>( + cuda_dist_ptr, + cuda_sorted_indices_by_dist_ptr, + cuda_threshold_mark, + cuda_block_threshold_mark_buffer, + cuda_block_mark_first_zero); + +} + +void CUDAAucMuMetric::LaunchEvalKernel(const double* score) const { + const int num_class_pair = (num_class_ - 1) * num_class_ / 2; + EvalKernel_AucMu<<>>( + cuda_class_start_, + cuda_class_size_, + cuda_sorted_indices, + cuda_class_data_weights_, + cuda_dist_, + cuda_sorted_indices_by_dist); +} + +} // namespace LightGBM diff --git a/src/metric/cuda/cuda_multiclass_metric.hpp b/src/metric/cuda/cuda_multiclass_metric.hpp new file mode 100644 index 000000000000..6d0c49c10837 --- /dev/null +++ b/src/metric/cuda/cuda_multiclass_metric.hpp @@ -0,0 +1,125 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
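The CUDAAucMuMetric declared in this header evaluates one class pair per CUDA block, so every per-pair buffer (cuda_dist_, cuda_curr_v_, and the sorting and prefix-sum scratch space) is addressed through a flattened pair index. A small standalone check of that flattening, using the same formula as Init and the evaluation kernel, is sketched below; it is illustrative only and not part of the patch.

#include <cstdio>

int main() {
  const int num_class = 3;  // 3 * 2 / 2 = 3 unordered class pairs
  for (int i = 0; i < num_class - 1; ++i) {
    for (int j = i + 1; j < num_class; ++j) {
      const int i_p = num_class - 2 - i;
      const int pair_index = i_p * (i_p + 1) / 2 + j - i - 1;
      std::printf("pair (%d, %d) -> slot %d\n", i, j, pair_index);
    }
  }
  return 0;  // prints (0, 1) -> 1, (0, 2) -> 2, (1, 2) -> 0
}

Each slot then holds num_class entries of cuda_curr_v_ and max_pair_buffer_size_ entries of cuda_dist_, which is why those allocations are sized by the number of class pairs.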
+ */ +#ifndef LIGHTGBM_METRIC_CUDA_CUDA_MULTICLASS_METRIC_HPP_ +#define LIGHTGBM_METRIC_CUDA_CUDA_MULTICLASS_METRIC_HPP_ + +#include "cuda_metric.hpp" +#include "../multiclass_metric.hpp" + +#define EVAL_BLOCK_SIZE_MULTICLASS_METRIC (1024) + +namespace LightGBM { + +template +class CUDAMulticlassMetric : public CUDAMetricInterface, public MulticlassMetric { + public: + explicit CUDAMulticlassMetric(const Config& config); + + ~CUDAMulticlassMetric(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + std::vector Eval(const double* score, const ObjectiveFunction* objective) const override; + + inline static double AverageLoss(double sum_loss, double sum_weights) { + // need sqrt the result for RMSE loss + return (sum_loss / sum_weights); + } + + inline static double LossOnPoint(label_t /*label*/, std::vector* /*score*/, const Config& /*config*/) { + Log::Fatal("Calling host LossOnPoint for a CUDA metric."); + return 0.0f; + } + + protected: + void LaunchEvalKernel(const double* score) const; + + void LaunchEvalKernelInner(const double* score) const; + + const label_t* cuda_label_; + const label_t* cuda_weights_; + double* cuda_score_convert_buffer_; + double* cuda_sum_loss_buffer_; + double* cuda_sum_loss_; +}; + +class CUDAMultiErrorMetric : public CUDAMulticlassMetric { + public: + explicit CUDAMultiErrorMetric(const Config& config); + + __device__ inline static double LossOnPointCUDA(label_t label, const double* score, const int num_classes, const int multi_error_top_k) { + const size_t k = static_cast(label); + int num_larger = 0; + for (int i = 0; i < num_classes; ++i) { + if (score[i] >= score[k]) ++num_larger; + if (num_larger > multi_error_top_k) return 1.0f; + } + return 0.0f; + } + + inline static const std::string Name(const Config& config) { + if (config.multi_error_top_k == 1) { + return "multi_error"; + } else { + return "multi_error@" + std::to_string(config.multi_error_top_k); + } + } +}; + +class CUDAMultiSoftmaxLoglossMetric : public CUDAMulticlassMetric { + public: + explicit CUDAMultiSoftmaxLoglossMetric(const Config& config); + + __device__ inline static double LossOnPointCUDA(label_t label, const double* score, const int /*num_classes*/, const int /*multi_error_top_k*/) { + size_t k = static_cast(label); + if (score[k] > kEpsilon) { + return static_cast(-log(score[k])); + } else { + return -log(kEpsilon); + } + } + + inline static const std::string Name(const Config& /*config*/) { + return "multi_logloss"; + } +}; + +class CUDAAucMuMetric : public CUDAMetricInterface, public AucMuMetric { + public: + explicit CUDAAucMuMetric(const Config& config); + + ~CUDAAucMuMetric(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + std::vector Eval(const double* score, const ObjectiveFunction*) const override; + + private: + void LaunchEvalKernel(const double* score) const; + + int num_class_pair_; + data_size_t max_pair_buffer_size_; + + data_size_t* cuda_class_start_; + data_size_t* cuda_class_size_; + data_size_t* cuda_sorted_indices_; + double* cuda_dist_; + double* cuda_class_data_weights_; + double* cuda_class_weights_; + data_size_t* cuda_sorted_indices_by_dist_; + double* cuda_curr_v_; + + double* cuda_sum_pos_buffer_; + double* cuda_sum_neg_buffer_; + data_size_t* cuda_threshold_mask_; + data_size_t* cuda_block_mark_buffer_; + uint16_t* cuda_block_mark_first_zero_; + + double* cuda_reduce_block_buffer_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_METRIC_CUDA_CUDA_MULTICLASS_METRIC_HPP_ diff --git 
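As a reading aid (not part of the patch), the two per-point losses declared in this header, the top-k error in CUDAMultiErrorMetric::LossOnPointCUDA and the truncated logloss in CUDAMultiSoftmaxLoglossMetric::LossOnPointCUDA, amount to the following plain C++; the epsilon default below is illustrative rather than LightGBM's kEpsilon.

#include <cmath>
#include <cstdio>
#include <vector>

// Top-k multi-class error: the point counts as an error (1.0) if more than
// top_k classes score at least as high as the true class (the true class
// itself is included in the count, as in the device code above).
double MultiErrorOnPoint(int label, const std::vector<double>& scores, int top_k) {
  int num_larger = 0;
  for (double s : scores) {
    if (s >= scores[label]) ++num_larger;
    if (num_larger > top_k) return 1.0;
  }
  return 0.0;
}

// Multi-class logloss on one point: -log of the predicted probability of the
// true class, with the probability clamped away from zero.
double MultiLoglossOnPoint(int label, const std::vector<double>& probs, double eps = 1e-15) {
  const double p = probs[label] > eps ? probs[label] : eps;
  return -std::log(p);
}

int main() {
  const std::vector<double> probs{0.2, 0.5, 0.3};
  std::printf("top-1 error: %f\n", MultiErrorOnPoint(2, probs, 1));  // 1.0: class 1 outscores class 2
  std::printf("top-2 error: %f\n", MultiErrorOnPoint(2, probs, 2));  // 0.0
  std::printf("logloss:     %f\n", MultiLoglossOnPoint(2, probs));   // -log(0.3)
  return 0;
}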
a/src/metric/cuda/cuda_xentropy_metric.cpp b/src/metric/cuda/cuda_xentropy_metric.cpp new file mode 100644 index 000000000000..68d83bbd3933 --- /dev/null +++ b/src/metric/cuda/cuda_xentropy_metric.cpp @@ -0,0 +1,79 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include "cuda_xentropy_metric.hpp" + +namespace LightGBM { + +CUDACrossEntropyMetric::CUDACrossEntropyMetric(const Config& config): CrossEntropyMetric(config) {} + +CUDACrossEntropyMetric::~CUDACrossEntropyMetric() {} + +void CUDACrossEntropyMetric::Init(const Metadata& metadata, data_size_t num_data) { + CrossEntropyMetric::Init(metadata, num_data); + cuda_label_ = metadata.cuda_metadata()->cuda_label(); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); + AllocateCUDAMemoryOuter(&cuda_score_convert_buffer_, static_cast(num_data_), __FILE__, __LINE__); + + const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_XENTROPY_METRIC - 1) / EVAL_BLOCK_SIZE_XENTROPY_METRIC; + AllocateCUDAMemoryOuter(&cuda_sum_loss_buffer_, static_cast(num_blocks), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_sum_loss_, 1, __FILE__, __LINE__); +} + +std::vector CUDACrossEntropyMetric::Eval(const double* score, const ObjectiveFunction* objective) const { + double sum_loss = 0.0f; + objective->GetCUDAConvertOutputFunc()(num_data_, score, cuda_score_convert_buffer_); + LaunchEvalKernel(score); + CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); + return std::vector(1, sum_loss / sum_weights_); +} + +CUDACrossEntropyLambdaMetric::CUDACrossEntropyLambdaMetric(const Config& config): CrossEntropyLambdaMetric(config) {} + +CUDACrossEntropyLambdaMetric::~CUDACrossEntropyLambdaMetric() {} + +void CUDACrossEntropyLambdaMetric::Init(const Metadata& metadata, data_size_t num_data) { + CrossEntropyLambdaMetric::Init(metadata, num_data); + cuda_label_ = metadata.cuda_metadata()->cuda_label(); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); + AllocateCUDAMemoryOuter(&cuda_score_convert_buffer_, static_cast(num_data_), __FILE__, __LINE__); + + const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_XENTROPY_METRIC - 1) / EVAL_BLOCK_SIZE_XENTROPY_METRIC; + AllocateCUDAMemoryOuter(&cuda_sum_loss_buffer_, static_cast(num_blocks), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_sum_loss_, 1, __FILE__, __LINE__); +} + +std::vector CUDACrossEntropyLambdaMetric::Eval(const double* score, const ObjectiveFunction* objective) const { + objective->GetCUDAConvertOutputFunc()(num_data_, score, cuda_score_convert_buffer_); + LaunchEvalKernel(score); + double sum_loss = 0.0f; + CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); + return std::vector(1, sum_loss / static_cast(num_data_)); +} + +CUDAKullbackLeiblerDivergence::CUDAKullbackLeiblerDivergence(const Config& config): KullbackLeiblerDivergence(config) {} + +CUDAKullbackLeiblerDivergence::~CUDAKullbackLeiblerDivergence() {} + +void CUDAKullbackLeiblerDivergence::Init(const Metadata& metadata, data_size_t num_data) { + KullbackLeiblerDivergence::Init(metadata, num_data); + cuda_label_ = metadata.cuda_metadata()->cuda_label(); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); + AllocateCUDAMemoryOuter(&cuda_score_convert_buffer_, static_cast(num_data_), __FILE__, __LINE__); + + const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_XENTROPY_METRIC - 1) / EVAL_BLOCK_SIZE_XENTROPY_METRIC; + 
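Each Eval in this file follows the same shape: convert raw scores to probabilities through the objective, reduce a weighted per-point loss on the device, copy a single double back to the host, and normalize by the weight sum (or by the data count for the lambda variant). A host-side C++ sketch of the cross-entropy case, using the same style of log-argument clamping as XentLossCUDA in the .cu file below; the 1e-12 constant and helper names here are illustrative.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Per-point binary cross entropy with the log argument clamped away from 0.
double XentLoss(double label, double prob, double eps = 1e-12) {
  const double p = std::max(prob, eps);
  const double q = std::max(1.0 - prob, eps);
  return -(label * std::log(p) + (1.0 - label) * std::log(q));
}

// Weighted cross-entropy metric: sum of weighted point losses divided by the
// total weight (weights default to 1 when none are given).
double CrossEntropyMetricValue(const std::vector<double>& probs,
                               const std::vector<double>& labels,
                               const std::vector<double>& weights) {
  double sum_loss = 0.0, sum_weight = 0.0;
  for (std::size_t i = 0; i < probs.size(); ++i) {
    const double w = weights.empty() ? 1.0 : weights[i];
    sum_loss += w * XentLoss(labels[i], probs[i]);
    sum_weight += w;
  }
  return sum_loss / sum_weight;
}

int main() {
  std::printf("%f\n", CrossEntropyMetricValue({0.9, 0.2, 0.6}, {1.0, 0.0, 1.0}, {}));
  return 0;
}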
AllocateCUDAMemoryOuter(&cuda_sum_loss_buffer_, static_cast(num_blocks), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_sum_loss_, 1, __FILE__, __LINE__); +} + +std::vector CUDAKullbackLeiblerDivergence::Eval(const double* score, const ObjectiveFunction* objective) const { + objective->GetCUDAConvertOutputFunc()(num_data_, score, cuda_score_convert_buffer_); + LaunchEvalKernel(score); + double sum_loss = 0.0f; + CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); + return std::vector(1, presum_label_entropy_ + sum_loss / sum_weights_); +} + +} // namespace LightGBM diff --git a/src/metric/cuda/cuda_xentropy_metric.cu b/src/metric/cuda/cuda_xentropy_metric.cu new file mode 100644 index 000000000000..05aa875c65fa --- /dev/null +++ b/src/metric/cuda/cuda_xentropy_metric.cu @@ -0,0 +1,149 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include "cuda_xentropy_metric.hpp" +#include + +namespace LightGBM { + +__device__ inline static double XentLossCUDA(label_t label, double prob) { + const double log_arg_epsilon = 1.0e-12; + double a = label; + if (prob > log_arg_epsilon) { + a *= log(prob); + } else { + a *= log(log_arg_epsilon); + } + double b = 1.0f - label; + if (1.0f - prob > log_arg_epsilon) { + b *= log(1.0f - prob); + } else { + b *= log(log_arg_epsilon); + } + return - (a + b); +} + +__device__ inline static double XentLambdaLossCUDA(label_t label, label_t weight, double hhat) { + return XentLossCUDA(label, 1.0f - exp(-weight * hhat)); +} + +template +__global__ void EvalKernel_CrossEntropy( + const double* score, + const label_t* cuda_label, + const label_t* cuda_weights, + const data_size_t num_data, + double* cuda_sum_loss_buffer) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + double point_loss = 0.0f; + __shared__ double shared_mem_buffer[32]; + if (data_index < num_data) { + const label_t label = cuda_label[data_index]; + if (!USE_WEIGHT) { + point_loss = XentLossCUDA(label, score[data_index]); + } else { + const label_t weight = cuda_weights[data_index]; + point_loss = XentLossCUDA(label, score[data_index]) * weight; + } + } + const double block_sum_loss = ShuffleReduceSum(point_loss, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + cuda_sum_loss_buffer[blockIdx.x] = block_sum_loss; + } +} + +__global__ void ReduceLossKernel_CrossEntropy(const double* cuda_sum_loss_buffer, const data_size_t num_blocks, double* out_loss) { + __shared__ double shared_buffer[32]; + double thread_sum_loss = 0.0f; + for (int block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { + thread_sum_loss += cuda_sum_loss_buffer[block_index]; + } + const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, static_cast(num_blocks)); + if (threadIdx.x == 0) { + *out_loss = sum_loss; + } +} + +void CUDACrossEntropyMetric::LaunchEvalKernel(const double* score) const { + const data_size_t num_blocks = (num_data_ + EVAL_BLOCK_SIZE_XENTROPY_METRIC - 1) / EVAL_BLOCK_SIZE_XENTROPY_METRIC; + if (cuda_weights_ == nullptr) { + EvalKernel_CrossEntropy<<>>(score, cuda_label_, cuda_weights_, num_data_, cuda_sum_loss_buffer_); + } else { + EvalKernel_CrossEntropy<<>>(score, cuda_label_, cuda_weights_, num_data_, cuda_sum_loss_buffer_); + } + ReduceLossKernel_CrossEntropy<<<1, EVAL_BLOCK_SIZE_XENTROPY_METRIC>>>(cuda_sum_loss_buffer_, 
num_blocks, cuda_sum_loss_); +} + +template +__global__ void EvalKernel_CrossEntropyLambda( + const double* score, + const label_t* cuda_label, + const label_t* cuda_weights, + const data_size_t num_data, + double* cuda_sum_loss_buffer) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + double point_loss = 0.0f; + __shared__ double shared_mem_buffer[32]; + if (data_index < num_data) { + const label_t label = cuda_label[data_index]; + if (!USE_WEIGHT) { + point_loss = XentLambdaLossCUDA(label, 1.0f, score[data_index]); + } else { + const label_t weight = cuda_weights[data_index]; + point_loss = XentLambdaLossCUDA(label, weight, score[data_index]); + } + } + const double block_sum_loss = ShuffleReduceSum(point_loss, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + cuda_sum_loss_buffer[blockIdx.x] = block_sum_loss; + } +} + +void CUDACrossEntropyLambdaMetric::LaunchEvalKernel(const double* score) const { + const data_size_t num_blocks = (num_data_ + EVAL_BLOCK_SIZE_XENTROPY_METRIC - 1) / EVAL_BLOCK_SIZE_XENTROPY_METRIC; + if (cuda_weights_ == nullptr) { + EvalKernel_CrossEntropyLambda<<>>(score, cuda_label_, cuda_weights_, num_data_, cuda_sum_loss_buffer_); + } else { + EvalKernel_CrossEntropyLambda<<>>(score, cuda_label_, cuda_weights_, num_data_, cuda_sum_loss_buffer_); + } + ReduceLossKernel_CrossEntropy<<<1, EVAL_BLOCK_SIZE_XENTROPY_METRIC>>>(cuda_sum_loss_buffer_, num_blocks, cuda_sum_loss_); +} + +template +__global__ void EvalKernel_KullbackLeiblerDivergence( + const double* score, + const label_t* cuda_label, + const label_t* cuda_weights, + const data_size_t num_data, + double* cuda_sum_loss_buffer) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + double point_loss = 0.0f; + __shared__ double shared_mem_buffer[32]; + if (data_index < num_data) { + const label_t label = cuda_label[data_index]; + if (!USE_WEIGHT) { + point_loss = XentLossCUDA(label, score[data_index]); + } else { + const label_t weight = cuda_weights[data_index]; + point_loss = XentLossCUDA(label, score[data_index]) * weight; + } + } + const double block_sum_loss = ShuffleReduceSum(point_loss, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + cuda_sum_loss_buffer[blockIdx.x] = block_sum_loss; + } +} + +void CUDAKullbackLeiblerDivergence::LaunchEvalKernel(const double* score) const { + const data_size_t num_blocks = (num_data_ + EVAL_BLOCK_SIZE_XENTROPY_METRIC - 1) / EVAL_BLOCK_SIZE_XENTROPY_METRIC; + if (cuda_weights_ == nullptr) { + EvalKernel_KullbackLeiblerDivergence<<>>(score, cuda_label_, cuda_weights_, num_data_, cuda_sum_loss_buffer_); + } else { + EvalKernel_KullbackLeiblerDivergence<<>>(score, cuda_label_, cuda_weights_, num_data_, cuda_sum_loss_buffer_); + } + ReduceLossKernel_CrossEntropy<<<1, EVAL_BLOCK_SIZE_XENTROPY_METRIC>>>(cuda_sum_loss_buffer_, num_blocks, cuda_sum_loss_); +} + +} // namespace LightGBM diff --git a/src/metric/cuda/cuda_xentropy_metric.hpp b/src/metric/cuda/cuda_xentropy_metric.hpp new file mode 100644 index 000000000000..7fbe6ae48de6 --- /dev/null +++ b/src/metric/cuda/cuda_xentropy_metric.hpp @@ -0,0 +1,77 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
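All three kernels in this file reduce per-point losses the same way: every block writes one partial sum into cuda_sum_loss_buffer_, and ReduceLossKernel_CrossEntropy then folds those partials into a single scalar. A serial C++ sketch of that two-stage reduction, with an illustrative block size rather than the CUDA launch configuration:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <numeric>
#include <vector>

// Stage 1: each "block" reduces its chunk of point losses to one partial sum,
// as EvalKernel_CrossEntropy does per block with ShuffleReduceSum.
// Stage 2: a single pass reduces the partial sums, as ReduceLossKernel_CrossEntropy does.
double TwoStageSum(const std::vector<double>& point_loss, std::size_t block_size) {
  const std::size_t num_blocks = (point_loss.size() + block_size - 1) / block_size;
  std::vector<double> block_sum(num_blocks, 0.0);
  for (std::size_t b = 0; b < num_blocks; ++b) {
    const std::size_t start = b * block_size;
    const std::size_t end = std::min(start + block_size, point_loss.size());
    block_sum[b] = std::accumulate(point_loss.begin() + start, point_loss.begin() + end, 0.0);
  }
  return std::accumulate(block_sum.begin(), block_sum.end(), 0.0);
}

int main() {
  std::printf("%f\n", TwoStageSum({0.1, 0.3, 0.2, 0.5, 0.4}, 2));  // 1.5
  return 0;
}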
+ */ +#ifndef LIGHTGBM_METRIC_CUDA_CUDA_XENTROPY_METRIC_HPP_ +#define LIGHTGBM_METRIC_CUDA_CUDA_XENTROPY_METRIC_HPP_ + +#include "cuda_metric.hpp" +#include "../xentropy_metric.hpp" + +#define EVAL_BLOCK_SIZE_XENTROPY_METRIC (1024) + +namespace LightGBM { + +class CUDACrossEntropyMetric : public CUDAMetricInterface, public CrossEntropyMetric { + public: + explicit CUDACrossEntropyMetric(const Config&); + + ~CUDACrossEntropyMetric(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + std::vector Eval(const double* score, const ObjectiveFunction* objective) const; + + private: + void LaunchEvalKernel(const double* score) const; + + const label_t* cuda_label_; + const label_t* cuda_weights_; + double* cuda_score_convert_buffer_; + double* cuda_sum_loss_buffer_; + double* cuda_sum_loss_; +}; + +class CUDACrossEntropyLambdaMetric : public CUDAMetricInterface, public CrossEntropyLambdaMetric { + public: + explicit CUDACrossEntropyLambdaMetric(const Config&); + + ~CUDACrossEntropyLambdaMetric(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + std::vector Eval(const double* score, const ObjectiveFunction* objective) const; + + private: + void LaunchEvalKernel(const double* score) const; + + const label_t* cuda_label_; + const label_t* cuda_weights_; + double* cuda_score_convert_buffer_; + double* cuda_sum_loss_buffer_; + double* cuda_sum_loss_; +}; + +class CUDAKullbackLeiblerDivergence : public CUDAMetricInterface, public KullbackLeiblerDivergence { + public: + explicit CUDAKullbackLeiblerDivergence(const Config&); + + ~CUDAKullbackLeiblerDivergence(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + std::vector Eval(const double* score, const ObjectiveFunction* objective) const; + + private: + void LaunchEvalKernel(const double* score) const; + + const label_t* cuda_label_; + const label_t* cuda_weights_; + double* cuda_score_convert_buffer_; + double* cuda_sum_loss_buffer_; + double* cuda_sum_loss_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_METRIC_CUDA_CUDA_XENTROPY_METRIC_HPP_ diff --git a/src/metric/metric.cpp b/src/metric/metric.cpp index b09b640a54f8..bd9b5833c187 100644 --- a/src/metric/metric.cpp +++ b/src/metric/metric.cpp @@ -13,6 +13,8 @@ #include "cuda/cuda_binary_metric.hpp" #include "cuda/cuda_regression_metric.hpp" +#include "cuda/cuda_multiclass_metric.hpp" +#include "cuda/cuda_xentropy_metric.hpp" namespace LightGBM { @@ -22,8 +24,6 @@ Metric* Metric::CreateMetric(const std::string& type, const Config& config) { return new CUDAL2Metric(config); } else if (type == std::string("rmse")) { return new CUDARMSEMetric(config); - } else if (type == std::string("rmse")) { - return new CUDARMSEMetric(config); } else if (type == std::string("l1")) { return new CUDAL1Metric(config); } else if (type == std::string("quantile")) { @@ -42,6 +42,14 @@ Metric* Metric::CreateMetric(const std::string& type, const Config& config) { return new CUDAAUCMetric(config); } else if (type == std::string("average_precision")) { return new CUDAAveragePrecisionMetric(config); + } else if (type == std::string("multi_error")) { + return new CUDAMultiErrorMetric(config); + } else if (type == std::string("cross_entropy")) { + return new CUDACrossEntropyMetric(config); + } else if (type == std::string("cross_entropy_lambda")) { + return new CUDACrossEntropyLambdaMetric(config); + } else if (type == std::string("kullback_leibler")) { + return new CUDAKullbackLeiblerDivergence(config); } else if (type == std::string("mape")) { 
return new CUDAMAPEMetric(config); } else if (type == std::string("gamma")) { @@ -50,8 +58,6 @@ Metric* Metric::CreateMetric(const std::string& type, const Config& config) { return new CUDAGammaDevianceMetric(config); } else if (type == std::string("tweedie")) { return new CUDATweedieMetric(config); - } else if (type == std::string("ndcg")) { - return new NDCGMetric(config); } } else { if (type == std::string("l2")) { @@ -102,6 +108,7 @@ Metric* Metric::CreateMetric(const std::string& type, const Config& config) { return new TweedieMetric(config); } } + Log::Fatal("Unknown metric type name: %s", type.c_str()); return nullptr; } diff --git a/src/metric/multiclass_metric.hpp b/src/metric/multiclass_metric.hpp index 8f39809a1a5d..7e64c990faa7 100644 --- a/src/metric/multiclass_metric.hpp +++ b/src/metric/multiclass_metric.hpp @@ -118,7 +118,7 @@ class MulticlassMetric: public Metric { return std::vector(1, loss); } - private: + protected: /*! \brief Number of data */ data_size_t num_data_; /*! \brief Pointer of label */ @@ -339,7 +339,7 @@ class AucMuMetric : public Metric { return std::vector(1, ans); } - private: + protected: /*! \brief Number of data*/ data_size_t num_data_; /*! \brief Pointer to label*/ diff --git a/src/metric/xentropy_metric.hpp b/src/metric/xentropy_metric.hpp index 241b0a856efe..49e4274cebc2 100644 --- a/src/metric/xentropy_metric.hpp +++ b/src/metric/xentropy_metric.hpp @@ -146,7 +146,7 @@ class CrossEntropyMetric : public Metric { return -1.0f; // negative means smaller loss is better, positive means larger loss is better } - private: + protected: /*! \brief Number of data points */ data_size_t num_data_; /*! \brief Pointer to label */ @@ -232,7 +232,7 @@ class CrossEntropyLambdaMetric : public Metric { return -1.0f; } - private: + protected: /*! \brief Number of data points */ data_size_t num_data_; /*! \brief Pointer to label */ @@ -338,7 +338,7 @@ class KullbackLeiblerDivergence : public Metric { return -1.0f; } - private: + protected: /*! \brief Number of data points */ data_size_t num_data_; /*! 
\brief Pointer to label */ diff --git a/src/objective/cuda/cuda_xentropy_objective.cpp b/src/objective/cuda/cuda_xentropy_objective.cpp index e2b1e3dee5d2..068d48a2efc5 100644 --- a/src/objective/cuda/cuda_xentropy_objective.cpp +++ b/src/objective/cuda/cuda_xentropy_objective.cpp @@ -15,12 +15,41 @@ CUDACrossEntropy::~CUDACrossEntropy() {} void CUDACrossEntropy::Init(const Metadata& metadata, data_size_t num_data) { CrossEntropy::Init(metadata, num_data); + const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_XENTROPY - 1) / GET_GRADIENTS_BLOCK_SIZE_XENTROPY; + AllocateCUDAMemoryOuter(&cuda_reduce_sum_buffer_, static_cast(num_blocks), __FILE__, __LINE__); cuda_labels_ = metadata.cuda_metadata()->cuda_label(); cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); } +double CUDACrossEntropy::BoostFromScore(int) const { + return LaunchCalcInitScoreKernel(); +} + void CUDACrossEntropy::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { LaunchGetGradientsKernel(score, gradients, hessians); } +CUDACrossEntropyLambda::CUDACrossEntropyLambda(const Config& config): CrossEntropyLambda(config) {} + +CUDACrossEntropyLambda::CUDACrossEntropyLambda(const std::vector& strs): CrossEntropyLambda(strs) {} + +CUDACrossEntropyLambda::~CUDACrossEntropyLambda() {} + +void CUDACrossEntropyLambda::Init(const Metadata& metadata, data_size_t num_data) { + CrossEntropyLambda::Init(metadata, num_data); + const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_XENTROPY - 1) / GET_GRADIENTS_BLOCK_SIZE_XENTROPY; + AllocateCUDAMemoryOuter(&cuda_reduce_sum_buffer_, static_cast(num_blocks), __FILE__, __LINE__); + cuda_labels_ = metadata.cuda_metadata()->cuda_label(); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); +} + +double CUDACrossEntropyLambda::BoostFromScore(int) const { + return LaunchCalcInitScoreKernel(); +} + +void CUDACrossEntropyLambda::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { + LaunchGetGradientsKernel(score, gradients, hessians); +} + + } // namespace LightGBM diff --git a/src/objective/cuda/cuda_xentropy_objective.cu b/src/objective/cuda/cuda_xentropy_objective.cu index c4c2e64e56a0..3813d0978a46 100644 --- a/src/objective/cuda/cuda_xentropy_objective.cu +++ b/src/objective/cuda/cuda_xentropy_objective.cu @@ -3,10 +3,30 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
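BoostFromScore above defers to LaunchCalcInitScoreKernel, which reduces the (weighted) label mean on the device and converts it to a raw initial score. A host-side C++ sketch of the two conversions, the logit of the clamped average label for cross_entropy and the inverse softplus for cross_entropy_lambda; the epsilon default is illustrative rather than LightGBM's kEpsilon.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Weighted mean of the labels; with no weights this is just the plain mean.
double WeightedLabelMean(const std::vector<double>& labels, const std::vector<double>& weights) {
  double suml = 0.0, sumw = 0.0;
  for (std::size_t i = 0; i < labels.size(); ++i) {
    const double w = weights.empty() ? 1.0 : weights[i];
    suml += w * labels[i];
    sumw += w;
  }
  return suml / sumw;
}

// cross_entropy: clamp the average label into (eps, 1 - eps) and return its logit.
double CrossEntropyInitScore(double pavg, double eps = 1e-15) {
  pavg = std::min(std::max(pavg, eps), 1.0 - eps);
  return std::log(pavg / (1.0 - pavg));
}

// cross_entropy_lambda: invert the softplus link, log(exp(havg) - 1).
double CrossEntropyLambdaInitScore(double havg) {
  return std::log(std::exp(havg) - 1.0);
}

int main() {
  const double pavg = WeightedLabelMean({1.0, 0.0, 1.0, 1.0}, {});
  std::printf("xent init score:        %f\n", CrossEntropyInitScore(pavg));
  std::printf("xent_lambda init score: %f\n", CrossEntropyLambdaInitScore(pavg));
  return 0;
}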
*/ +#include #include "cuda_xentropy_objective.hpp" namespace LightGBM { +double CUDACrossEntropy::LaunchCalcInitScoreKernel() const { + double suml = 0.0f; + double sumw = 0.0f; + if (cuda_weights_ == nullptr) { + sumw = static_cast(num_data_); + ReduceSumGlobal(cuda_labels_, static_cast(num_data_), cuda_reduce_sum_buffer_); + CopyFromCUDADeviceToHostOuter(&suml, cuda_reduce_sum_buffer_, 1, __FILE__, __LINE__); + } else { + ReduceDotProductGlobal(cuda_labels_, cuda_weights_, static_cast(num_data_), cuda_reduce_sum_buffer_); + CopyFromCUDADeviceToHostOuter(&suml, cuda_reduce_sum_buffer_, 1, __FILE__, __LINE__); + ReduceSumGlobal(cuda_weights_, static_cast(num_data_), cuda_reduce_sum_buffer_); + CopyFromCUDADeviceToHostOuter(&sumw, cuda_reduce_sum_buffer_, 1, __FILE__, __LINE__); + } + double pavg = suml / sumw; + pavg = std::min(pavg, 1.0 - kEpsilon); + pavg = std::max(pavg, kEpsilon); + return std::log(pavg / (1.0f - pavg)); +} + template __global__ void GetGradientsKernel_CrossEntropy( const double* cuda_scores, @@ -39,4 +59,62 @@ void CUDACrossEntropy::LaunchGetGradientsKernel(const double* score, score_t* gr } } +double CUDACrossEntropyLambda::LaunchCalcInitScoreKernel() const { + double suml = 0.0f; + double sumw = 0.0f; + if (cuda_weights_ == nullptr) { + sumw = static_cast(num_data_); + ReduceSumGlobal(cuda_labels_, static_cast(num_data_), cuda_reduce_sum_buffer_); + CopyFromCUDADeviceToHostOuter(&suml, cuda_reduce_sum_buffer_, 1, __FILE__, __LINE__); + } else { + ReduceDotProductGlobal(cuda_labels_, cuda_weights_, static_cast(num_data_), cuda_reduce_sum_buffer_); + CopyFromCUDADeviceToHostOuter(&suml, cuda_reduce_sum_buffer_, 1, __FILE__, __LINE__); + ReduceSumGlobal(cuda_weights_, static_cast(num_data_), cuda_reduce_sum_buffer_); + CopyFromCUDADeviceToHostOuter(&sumw, cuda_reduce_sum_buffer_, 1, __FILE__, __LINE__); + } + double havg = suml / sumw; + return std::log(std::exp(havg) - 1.0f); +} + +template +__global__ void GetGradientsKernel_CrossEntropyLambda( + const double* cuda_scores, + const label_t* cuda_labels, + const label_t* cuda_weights, + const data_size_t num_data, + score_t* cuda_out_gradients, + score_t* cuda_out_hessians) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data) { + if (USE_WEIGHT) { + const double w = static_cast(cuda_weights[data_index]); + const double y = static_cast(cuda_labels[data_index]); + const double epf = exp(cuda_scores[data_index]); + const double hhat = log(1.0f + epf); + const double z = 1.0f - exp(-w * hhat); + const double enf = 1.0f / epf; // = std::exp(-cuda_scores[data_index]); + cuda_out_gradients[data_index] = static_cast((1.0f - y / z) * w / (1.0f + enf)); + const double c = 1.0f / (1.0f - z); + double d = 1.0f + epf; + const double a = w * epf / (d * d); + d = c - 1.0f; + const double b = (c / (d * d) ) * (1.0f + w * epf - c); + cuda_out_hessians[data_index] = static_cast(a * (1.0f + y * b)); + } else { + const double z = 1.0f / (1.0f + exp(-cuda_scores[data_index])); + cuda_out_gradients[data_index] = static_cast(z - cuda_labels[data_index]); + cuda_out_hessians[data_index] = static_cast(z * (1.0f - z)); + } + } +} + +void CUDACrossEntropyLambda::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_XENTROPY - 1) / GET_GRADIENTS_BLOCK_SIZE_XENTROPY; + if (cuda_weights_ == nullptr) { + GetGradientsKernel_CrossEntropyLambda<<>>(score, cuda_labels_, nullptr, num_data_, 
gradients, hessians); + } else { + GetGradientsKernel_CrossEntropyLambda<<>>(score, cuda_labels_, cuda_weights_, num_data_, gradients, hessians); + } +} + } // namespace LightGBM diff --git a/src/objective/cuda/cuda_xentropy_objective.hpp b/src/objective/cuda/cuda_xentropy_objective.hpp index 05cd0e24c4ef..2e42c1f4cc72 100644 --- a/src/objective/cuda/cuda_xentropy_objective.hpp +++ b/src/objective/cuda/cuda_xentropy_objective.hpp @@ -21,15 +21,44 @@ class CUDACrossEntropy: public CrossEntropy { virtual void Init(const Metadata& metadata, data_size_t num_data) override; + double BoostFromScore(int) const override; + virtual void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; private: void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; + double LaunchCalcInitScoreKernel() const; + + const label_t* cuda_labels_; + const label_t* cuda_weights_; + double* cuda_reduce_sum_buffer_; +}; + +class CUDACrossEntropyLambda: public CrossEntropyLambda { + public: + explicit CUDACrossEntropyLambda(const Config& config); + + explicit CUDACrossEntropyLambda(const std::vector& strs); + + ~CUDACrossEntropyLambda(); + + virtual void Init(const Metadata& metadata, data_size_t num_data) override; + + double BoostFromScore(int) const override; + + virtual void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; + + private: + void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; + + double LaunchCalcInitScoreKernel() const; + const label_t* cuda_labels_; const label_t* cuda_weights_; + double* cuda_reduce_sum_buffer_; }; } // namespace LightGBM -#endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_XENTROPY_OBJECTIVE_HPP_ \ No newline at end of file +#endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_XENTROPY_OBJECTIVE_HPP_ diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index 62e7d8c24149..30d3d075b30e 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -14,14 +14,13 @@ #include "cuda/cuda_multiclass_objective.hpp" #include "cuda/cuda_regression_objective.hpp" #include "cuda/cuda_rank_objective.hpp" +#include "cuda/cuda_xentropy_objective.hpp" namespace LightGBM { ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& type, const Config& config) { if (config.device_type == std::string("cuda")) { - if (type == std::string("binary")) { - return new CUDABinaryLogloss(config); - } else if (type == std::string("regression")) { + if (type == std::string("regression")) { return new CUDARegressionL2loss(config); } else if (type == std::string("regression_l1")) { return new CUDARegressionL1loss(config); @@ -33,18 +32,30 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& return new CUDARegressionFairLoss(config); } else if (type == std::string("poisson")) { return new CUDARegressionFairLoss(config); + } else if (type == std::string("binary")) { + return new CUDABinaryLogloss(config); } else if (type == std::string("lambdarank")) { return new CUDALambdarankNDCG(config); } else if (type == std::string("rank_xendcg")) { return new CUDARankXENDCG(config); } else if (type == std::string("multiclass")) { return new CUDAMulticlassSoftmax(config); + } else if (type == std::string("multiclassova")) { + return new CUDAMulticlassOVA(config); + } else if (type == std::string("cross_entropy")) { + return new CUDACrossEntropy(config); + } else if (type == 
std::string("cross_entropy_lambda")) { + return new CUDACrossEntropyLambda(config); } else if (type == std::string("mape")) { return new CUDARegressionMAPELOSS(config); } else if (type == std::string("gamma")) { return new CUDARegressionGammaLoss(config); } else if (type == std::string("tweedie")) { return new CUDARegressionTweedieLoss(config); + } else if (type == std::string("custom")) { + // TODO(shiyu1994): when using customized objective function + // TODO(shiyu1994): we should copy gradients manually to GPU + return nullptr; } } else { if (type == std::string("regression")) { @@ -88,6 +99,7 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& } ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& str) { + // TODO(shiyu1994): consider the case for CUDA auto strs = Common::Split(str.c_str(), ' '); auto type = strs[0]; if (type == std::string("regression")) { diff --git a/src/objective/xentropy_objective.hpp b/src/objective/xentropy_objective.hpp index 4ce73a847a47..4416350b00a0 100644 --- a/src/objective/xentropy_objective.hpp +++ b/src/objective/xentropy_objective.hpp @@ -264,7 +264,7 @@ class CrossEntropyLambda: public ObjectiveFunction { return initscore; } - private: + protected: /*! \brief Number of data points */ data_size_t num_data_; /*! \brief Pointer for label */ From 35c742d6e0ce4dedd624e6c46ab923a046e359fe Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 7 Sep 2021 11:21:19 +0000 Subject: [PATCH 064/166] add ndcg metric --- include/LightGBM/cuda/cuda_algorithms.hpp | 7 + include/LightGBM/cuda/cuda_metadata.hpp | 2 + include/LightGBM/metric.h | 4 + src/cuda/cuda_algorithms.cu | 4 - src/metric/cuda/cuda_multiclass_metric.cpp | 20 +- src/metric/cuda/cuda_multiclass_metric.cu | 217 ++++++++++++++------ src/metric/cuda/cuda_multiclass_metric.hpp | 7 +- src/metric/cuda/cuda_rank_metric.cpp | 69 +++++++ src/metric/cuda/cuda_rank_metric.cu | 223 +++++++++++++++++++++ src/metric/cuda/cuda_rank_metric.hpp | 48 +++++ src/metric/rank_metric.hpp | 2 +- src/objective/cuda/cuda_rank_objective.hpp | 1 - 12 files changed, 533 insertions(+), 71 deletions(-) create mode 100644 src/metric/cuda/cuda_rank_metric.cpp create mode 100644 src/metric/cuda/cuda_rank_metric.cu create mode 100644 src/metric/cuda/cuda_rank_metric.hpp diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 14acb2494c07..ff28bbba201b 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -20,6 +20,10 @@ #define LOG_NUM_BANKS_DATA_PARTITION (4) #define GLOBAL_PREFIX_SUM_BLOCK_SIZE (1024) +#define BITONIC_SORT_NUM_ELEMENTS (1024) +#define BITONIC_SORT_DEPTH (11) +#define BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE (10) + #define CONFLICT_FREE_INDEX(n) \ ((n) + ((n) >> LOG_NUM_BANKS_DATA_PARTITION)) \ @@ -57,6 +61,9 @@ __device__ void ReduceSumConflictFree(T* values, size_t n) { template void ReduceSumGlobal(const VAL_T* values, size_t n, REDUCE_T* block_buffer); +template +__global__ void BlockReduceSum(T* block_buffer, const data_size_t num_blocks); + template void ReduceDotProductGlobal(const VAL_A_T* a, const VAL_B_T* b, size_t n, REDUCE_T* block_buffer); diff --git a/include/LightGBM/cuda/cuda_metadata.hpp b/include/LightGBM/cuda/cuda_metadata.hpp index 894147046b0c..db2044e494f4 100644 --- a/include/LightGBM/cuda/cuda_metadata.hpp +++ b/include/LightGBM/cuda/cuda_metadata.hpp @@ -31,6 +31,8 @@ class CUDAMetadata { const data_size_t* cuda_query_boundaries() const { return 
cuda_query_boundaries_; } + const label_t* cuda_query_weights() const { return cuda_query_weights_; } + private: label_t* cuda_label_; label_t* cuda_weights_; diff --git a/include/LightGBM/metric.h b/include/LightGBM/metric.h index 9d505d2768d1..818d6581fb89 100644 --- a/include/LightGBM/metric.h +++ b/include/LightGBM/metric.h @@ -135,6 +135,10 @@ class DCGCalculator { */ inline static double GetDiscount(data_size_t k) { return discount_[k]; } + inline static const std::vector& label_gain() { return label_gain_; } + + inline static const std::vector& discount() { return discount_; } + private: /*! \brief store gains for different label */ static std::vector label_gain_; diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 0d03d2f65eeb..894606f3b6ae 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -7,10 +7,6 @@ namespace LightGBM { -#define BITONIC_SORT_NUM_ELEMENTS (1024) -#define BITONIC_SORT_DEPTH (11) -#define BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE (10) - template __global__ void BitonicSortGlobalKernel(T* values, const int num_total_data) { const int thread_index = static_cast(threadIdx.x); diff --git a/src/metric/cuda/cuda_multiclass_metric.cpp b/src/metric/cuda/cuda_multiclass_metric.cpp index 2a8dd0a1928e..7943d2989b17 100644 --- a/src/metric/cuda/cuda_multiclass_metric.cpp +++ b/src/metric/cuda/cuda_multiclass_metric.cpp @@ -75,14 +75,15 @@ void CUDAAucMuMetric::Init(const Metadata& metadata, data_size_t num_data) { AllocateCUDAMemoryOuter(&cuda_dist_, total_pair_buffer_size, __FILE__, __LINE__); AllocateCUDAMemoryOuter(&cuda_sorted_indices_by_dist_, total_pair_buffer_size, __FILE__, __LINE__); AllocateCUDAMemoryOuter(&cuda_sum_pos_buffer_, total_pair_buffer_size, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_sum_neg_buffer_, total_pair_buffer_size, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_threshold_mask_, total_pair_buffer_size, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_reduce_block_buffer_, static_cast(num_class_pair), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_threshold_mark_, total_pair_buffer_size, __FILE__, __LINE__); const int num_blocks = (max_pair_buffer_size_ + EVAL_BLOCK_SIZE_MULTICLASS_METRIC - 1) / EVAL_BLOCK_SIZE_MULTICLASS_METRIC; - AllocateCUDAMemoryOuter(&cuda_block_mark_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_block_mark_first_zero_, static_cast(num_blocks) + 1, __FILE__, __LINE__); - + const size_t class_pair_block_buffer = static_cast(num_class_pair * (num_blocks + 1)); + AllocateCUDAMemoryOuter(&cuda_block_mark_buffer_, class_pair_block_buffer, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_block_mark_first_zero_, class_pair_block_buffer, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_reduce_block_buffer_, class_pair_block_buffer, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_reduce_block_buffer_, 0, class_pair_block_buffer, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_reduce_ans_buffer_, static_cast(num_class_pair), __FILE__, __LINE__); const size_t curr_v_size = static_cast(num_class_pair * num_class_); std::vector all_curr_v(curr_v_size, 0.0f); for (int i = 0; i < num_class_ - 1; ++i) { @@ -95,10 +96,17 @@ void CUDAAucMuMetric::Init(const Metadata& metadata, data_size_t num_data) { } } InitCUDAMemoryFromHostMemoryOuter(&cuda_curr_v_, all_curr_v.data(), all_curr_v.size(), __FILE__, __LINE__); + + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); + cuda_label_ = 
metadata.cuda_metadata()->cuda_label(); } std::vector CUDAAucMuMetric::Eval(const double* score, const ObjectiveFunction*) const { LaunchEvalKernel(score); + double ans = 0.0f; + const int num_class_pair = (num_class_ - 1) * num_class_ / 2; + CopyFromCUDADeviceToHostOuter(&ans, cuda_reduce_ans_buffer_, static_cast(num_class_pair), __FILE__, __LINE__); + return std::vector(1, ans / static_cast(num_class_pair)); } } // namespace LightGBM diff --git a/src/metric/cuda/cuda_multiclass_metric.cu b/src/metric/cuda/cuda_multiclass_metric.cu index 1c2e2cffb758..37d367082d29 100644 --- a/src/metric/cuda/cuda_multiclass_metric.cu +++ b/src/metric/cuda/cuda_multiclass_metric.cu @@ -77,16 +77,6 @@ void CUDAMulticlassMetric::LaunchEvalKernel(const LaunchEvalKernelInner(score); } -__global__ void EvalKernel_AucMuInner( - const data_size_t* cuda_class_start, - const data_size_t* cuda_class_size, - const data_size_t* cuda_sorted_indices, - const double* cuda_class_data_weights, - double* cuda_dist, - data_size_t* cuda_sorted_indices_by_dist) { - -} - __global__ void EvalKernel_AucMuWriteDist( const data_size_t i_class_start, const data_size_t i_class_size, @@ -99,20 +89,23 @@ __global__ void EvalKernel_AucMuWriteDist( const data_size_t max_pair_buffer_size, const data_size_t num_data, const int num_class, + const int i, + const int j, double* cuda_dist) { const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - // put the dist of class j in the front - const data_size_t data_index_in_class = data_index < j_class_size ? data_index : data_index - j_class_size; - const data_size_t class_start = data_index < j_class_size ? j_class_size : i_class_start; - const data_size_t class_size = data_index < j_class_size ? j_class_size : i_class_size; - const data_size_t* sorted_indices_in_class = cuda_sorted_indices + class_start; - const data_size_t a = sorted_indices_in_class[data_index_in_class]; - double v_a = 0.0f; - for (int m = 0; m < num_class; ++m) { - v_a += cuda_curr_v[m] * score[num_data * m + a]; + if (data_index < j_class_size + i_class_size) { + // put the dist of class j in the front + const data_size_t data_index_in_class = data_index < j_class_size ? data_index : data_index - j_class_size; + const data_size_t class_start = data_index < j_class_size ? 
j_class_start : i_class_start; + const data_size_t* sorted_indices_in_class = cuda_sorted_indices + class_start; + const data_size_t a = sorted_indices_in_class[data_index_in_class]; + double v_a = 0.0f; + for (int m = 0; m < num_class; ++m) { + v_a += cuda_curr_v[m] * score[num_data * m + a]; + } + const double t1 = cuda_curr_v[i] - cuda_curr_v[j]; + cuda_dist[data_index] = v_a * t1; } - const double t1 = cuda_curr_v[i] - cuda_curr_v[j]; - cuda_dist[data_index] = v_a * t1; } __global__ void BitonicArgSortGlobal_AucMu( @@ -131,17 +124,17 @@ __global__ void BitonicArgSortGlobal_AucMu( const int segment_length = (1 << (max_depth - depth)); int half_segment_length = (segment_length >> 1); { - BitonicArgCompareKernel<<>>( + BitonicArgCompareKernel<<>>( dist, out_data_indices, half_segment_length, segment_length, static_cast(num_data)); half_segment_length >>= 1; } for (int inner_depth = depth + 1; inner_depth <= max_depth - 11; ++inner_depth) { - BitonicArgCompareKernel<<>>( + BitonicArgCompareKernel<<>>( dist, out_data_indices, half_segment_length, segment_length, static_cast(num_data)); half_segment_length >>= 1; } - BitonicArgSortMergeKernel<<>>( - dist, out_data_indices, segment_length, static_cast(len)); + BitonicArgSortMergeKernel<<>>( + dist, out_data_indices, segment_length, static_cast(num_data)); } } @@ -160,7 +153,7 @@ __global__ void GenAucMuPosPrefixSumWithinBlock( const data_size_t inner_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); double pos = 0.0f; if (inner_data_index < j_class_size + i_class_size) { - const data_size_t data_index_two_class = sorted_data_indices[inner_data_index]; + const data_size_t data_index_two_class = sorted_data_indices_two_class[inner_data_index]; const bool is_pos_class = (data_index_two_class < j_class_size); if (USE_WEIGHT) { const data_size_t data_index_one_class = (is_pos_class ? 
data_index_two_class : data_index_two_class - j_class_size); @@ -178,10 +171,11 @@ __global__ void GenAucMuPosPrefixSumWithinBlock( sum_pos_buffer[inner_data_index] = shared_buffer[threadIdx.x + 1]; } if (threadIdx.x == 0) { - block_sum_pos_buffer[blockidx.x + 1] = shared_buffer[blockDim.x]; + block_sum_pos_buffer[blockIdx.x + 1] = shared_buffer[blockDim.x]; } } +template __global__ void GenAucMuPosPrefixSum( const data_size_t* sorted_data_indices_global, const data_size_t* sorted_data_indices_two_class, @@ -190,11 +184,11 @@ __global__ void GenAucMuPosPrefixSum( const data_size_t i_class_start, const data_size_t j_class_start, const label_t* cuda_weights, - double* sum_pos_buffer, - double* block_sum_pos_buffer) { + double* prefix_sum_result, + double* block_buffer) { const data_size_t num_data = i_class_size + j_class_size; const int num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; - GenAucMuPosPrefixSumWithinBlock<<>>( + GenAucMuPosPrefixSumWithinBlock<<>>( sorted_data_indices_global, sorted_data_indices_two_class, i_class_size, @@ -202,12 +196,12 @@ __global__ void GenAucMuPosPrefixSum( i_class_start, j_class_start, cuda_weights, - sum_pos_buffer, - block_sum_pos_buffer); + prefix_sum_result, + block_buffer); GlobalInclusivePrefixSumReduceBlockKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>( - block_sum_pos_buffer, num_blocks); + block_buffer, num_blocks); GlobalInclusivePrefixSumAddBlockBaseKernel<<>>( - block_sum_pos_buffer, sum_pos_buffer, num_data); + block_buffer, prefix_sum_result, num_data); } __global__ void GenAucMuMark( @@ -225,6 +219,53 @@ __global__ void GenAucMuMark( block_mark_buffer, threshold_mark, block_mark_first_zero, num_data); } +template +__global__ void CalcAucMuArea( + const double* block_sum_pos_buffer, + const data_size_t* sorted_data_indices_global, + const data_size_t* sorted_data_indices_two_class, + const data_size_t* threshold_mark, + const label_t* cuda_weights, + const data_size_t num_data, + const data_size_t i_class_start, + const data_size_t j_class_size, + double* block_buffer) { + __shared__ double shared_mem_buffer[32]; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + double area = 0.0f; + if (data_index < num_data) { + const data_size_t data_index_two_class = sorted_data_indices_two_class[data_index]; + if (data_index_two_class >= j_class_size) { + const data_size_t data_index_global = sorted_data_indices_global[i_class_start + data_index_two_class - j_class_size]; + const double num_j = block_sum_pos_buffer[data_index]; + if (USE_WEIGHT) { + const double curr_weight = static_cast(cuda_weights[data_index_global]); + if (threshold_mark[data_index] > 0) { + const data_size_t prev_data_index = data_index - threshold_mark[data_index] - 1; + const double prev_sum_pos = prev_data_index < 0 ? 0.0f : block_sum_pos_buffer[prev_data_index]; + const double num_curr_j = block_sum_pos_buffer[data_index] - prev_sum_pos; + area = curr_weight * (num_j - 0.5f * num_curr_j); + } else { + area = curr_weight * num_j; + } + } else { + if (threshold_mark[data_index] > 0) { + const data_size_t prev_data_index = data_index - threshold_mark[data_index] - 1; + const double prev_sum_pos = prev_data_index < 0 ? 
0.0f : block_sum_pos_buffer[prev_data_index]; + const double num_curr_j = block_sum_pos_buffer[data_index] - prev_sum_pos; + area = num_j - 0.5f * num_curr_j; + } else { + area = num_j; + } + } + } + } + const double block_area = ShuffleReduceSum(area, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + block_buffer[blockIdx.x] = block_area; + } +} + template __global__ void EvalKernel_AucMu( const data_size_t* cuda_class_start, @@ -243,25 +284,32 @@ __global__ void EvalKernel_AucMu( data_size_t* cuda_block_threshold_mark_buffer, uint16_t* cuda_block_mark_first_zero, double* sum_pos_buffer, - double* block_sum_pos_buffer) { + double* block_sum_pos_buffer, + double* reduce_ans_buffer) { const int pair_index = static_cast(blockIdx.x); - const double index_2 = 2 * static_cast(pair_index); + const double index_2 = 2.0f * static_cast(pair_index); const int sqrt_round = static_cast(sqrt(index_2)); - const int i_p = static_cast(sqrt(index_2 - static_cast(sqrt_round) + 1)); + const int i_p = (pair_index == 0) ? 0 : static_cast(sqrt(index_2 - static_cast(sqrt_round) + 1)); const int j_p = pair_index - ((i_p + 1) * i_p / 2); const int i = num_class - 2 - i_p; const int j = j_p + i + 1; - const data_size_t num_data_in_pair = cuda_class_size[i] + cuda_class_size[j]; + const data_size_t i_class_size = cuda_class_size[i]; + const data_size_t j_class_size = cuda_class_size[j]; + const data_size_t num_data_in_pair = i_class_size + j_class_size; const int num_blocks = (num_data_in_pair + EVAL_BLOCK_SIZE_MULTICLASS_METRIC - 1) / EVAL_BLOCK_SIZE_MULTICLASS_METRIC; + const int num_blocks_for_offset = (max_pair_buffer_size + EVAL_BLOCK_SIZE_MULTICLASS_METRIC - 1) / EVAL_BLOCK_SIZE_MULTICLASS_METRIC; const data_size_t i_class_start = cuda_class_start[i]; const data_size_t j_class_start = cuda_class_start[j]; - const data_size_t i_class_size = cuda_class_size[i]; - const data_size_t j_class_size = cuda_class_size[j]; double* cuda_dist_ptr = cuda_dist + pair_index * max_pair_buffer_size; data_size_t* cuda_sorted_indices_by_dist_ptr = cuda_sorted_indices_by_dist + pair_index * max_pair_buffer_size; const double* cuda_curr_v_ptr = cuda_curr_v + pair_index * num_class; + double* sum_pos_buffer_ptr = sum_pos_buffer + pair_index * max_pair_buffer_size; + double* block_sum_pos_buffer_ptr = block_sum_pos_buffer + pair_index * (num_blocks_for_offset + 1); + data_size_t* cuda_threshold_mark_ptr = cuda_threshold_mark + pair_index * max_pair_buffer_size; + data_size_t* cuda_block_threshold_mark_buffer_ptr = cuda_block_threshold_mark_buffer + pair_index * (num_blocks_for_offset + 1); + uint16_t* cuda_block_mark_first_zero_ptr = cuda_block_mark_first_zero + pair_index * (num_blocks_for_offset + 1); cudaStream_t cuda_stream; - cudaStreamCreate(&cuda_stream); + cudaStreamCreateWithFlags(&cuda_stream, cudaStreamNonBlocking); EvalKernel_AucMuWriteDist<<>>( i_class_start, i_class_size, @@ -274,39 +322,94 @@ __global__ void EvalKernel_AucMu( max_pair_buffer_size, num_data, num_class, + i, + j, cuda_dist_ptr); BitonicArgSortGlobal_AucMu<<<1, 1, 0, cuda_stream>>>( cuda_dist_ptr, cuda_sorted_indices_by_dist_ptr, - i_class_size + j_class_size); - GenAucMuPosPrefixSum<<<1, 1, 0, cuda_stream>>>( + num_data_in_pair); + GenAucMuPosPrefixSum<<<1, 1, 0, cuda_stream>>>( cuda_sorted_indices, - cuda_sorted_indices_by_dist, + cuda_sorted_indices_by_dist_ptr, i_class_size, j_class_size, i_class_start, j_class_start, cuda_weights, - sum_pos_buffer, - block_sum_pos_buffer); + sum_pos_buffer_ptr, + block_sum_pos_buffer_ptr); 
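The per-pair pipeline assembled here (projection to dist, argsort by dist, prefix sum of positive weight, tie marks, area accumulation) estimates a weighted two-class AUC in which tied projection values contribute half a pair. A brute-force O(n^2) C++ sketch of that quantity, useful as a reference for the sorted prefix-sum version; class orientation follows the sign of the projection, and all names here are illustrative.

#include <cstddef>
#include <cstdio>
#include <vector>

// Weighted AUC of the "positive" class (class j) against the "negative" class
// (class i), counting ties as half a concordant pair; this is the quantity each
// (i, j) block of EvalKernel_AucMu derives from the sorted prefix sums.
double PairwiseAUC(const std::vector<double>& pos_scores, const std::vector<double>& pos_weights,
                   const std::vector<double>& neg_scores, const std::vector<double>& neg_weights) {
  double area = 0.0, sum_pos = 0.0, sum_neg = 0.0;
  for (std::size_t p = 0; p < pos_scores.size(); ++p) sum_pos += pos_weights[p];
  for (std::size_t n = 0; n < neg_scores.size(); ++n) sum_neg += neg_weights[n];
  for (std::size_t n = 0; n < neg_scores.size(); ++n) {
    for (std::size_t p = 0; p < pos_scores.size(); ++p) {
      if (pos_scores[p] > neg_scores[n]) {
        area += neg_weights[n] * pos_weights[p];        // positive ranked above negative
      } else if (pos_scores[p] == neg_scores[n]) {
        area += 0.5 * neg_weights[n] * pos_weights[p];  // tie counts as half a pair
      }
    }
  }
  return area / (sum_pos * sum_neg);
}

int main() {
  // Unweighted example: pass weights of 1.
  std::printf("%f\n", PairwiseAUC({0.9, 0.8, 0.4}, {1.0, 1.0, 1.0}, {0.7, 0.3}, {1.0, 1.0}));  // 0.833333
  return 0;
}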
GenAucMuMark<<<1, 1, 0, cuda_stream>>>( cuda_dist_ptr, cuda_sorted_indices_by_dist_ptr, - cuda_threshold_mark, - cuda_block_threshold_mark_buffer, - cuda_block_mark_first_zero); - + num_data_in_pair, + cuda_threshold_mark_ptr, + cuda_block_threshold_mark_buffer_ptr, + cuda_block_mark_first_zero_ptr); + CalcAucMuArea<<<1, 1, 0, cuda_stream>>>( + block_sum_pos_buffer_ptr, + cuda_sorted_indices, + cuda_sorted_indices_by_dist_ptr, + cuda_threshold_mark_ptr, + cuda_weights, + num_data_in_pair, + i_class_start, + j_class_size, + block_sum_pos_buffer_ptr); + BlockReduceSum<<<1, EVAL_BLOCK_SIZE_MULTICLASS_METRIC, 0, cuda_stream>>>(block_sum_pos_buffer_ptr, num_blocks); + if (USE_WEIGHT) { + reduce_ans_buffer[pair_index] = block_sum_pos_buffer_ptr[0] / cuda_class_data_weights[i] / cuda_class_data_weights[j]; + } else { + reduce_ans_buffer[pair_index] = block_sum_pos_buffer_ptr[0] / static_cast(cuda_class_size[i]) / static_cast(cuda_class_size[j]); + } + cudaStreamDestroy(cuda_stream); } void CUDAAucMuMetric::LaunchEvalKernel(const double* score) const { const int num_class_pair = (num_class_ - 1) * num_class_ / 2; - EvalKernel_AucMu<<>>( - cuda_class_start_, - cuda_class_size_, - cuda_sorted_indices, - cuda_class_data_weights_, - cuda_dist_, - cuda_sorted_indices_by_dist); + if (cuda_weights_ == nullptr) { + EvalKernel_AucMu<<>>( + cuda_class_start_, + cuda_class_size_, + cuda_sorted_indices_, + cuda_class_data_weights_, + cuda_curr_v_, + score, + max_pair_buffer_size_, + num_data_, + num_class_, + cuda_weights_, + cuda_dist_, + cuda_sorted_indices_by_dist_, + cuda_threshold_mark_, + cuda_block_mark_buffer_, + cuda_block_mark_first_zero_, + cuda_sum_pos_buffer_, + cuda_reduce_block_buffer_, + cuda_reduce_ans_buffer_); + } else { + EvalKernel_AucMu<<>>( + cuda_class_start_, + cuda_class_size_, + cuda_sorted_indices_, + cuda_class_data_weights_, + cuda_curr_v_, + score, + max_pair_buffer_size_, + num_data_, + num_class_, + cuda_weights_, + cuda_dist_, + cuda_sorted_indices_by_dist_, + cuda_threshold_mark_, + cuda_block_mark_buffer_, + cuda_block_mark_first_zero_, + cuda_sum_pos_buffer_, + cuda_reduce_block_buffer_, + cuda_reduce_ans_buffer_); + } + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + BlockReduceSum<<<1, EVAL_BLOCK_SIZE_MULTICLASS_METRIC>>>(cuda_reduce_ans_buffer_, num_class_pair); } } // namespace LightGBM diff --git a/src/metric/cuda/cuda_multiclass_metric.hpp b/src/metric/cuda/cuda_multiclass_metric.hpp index 6d0c49c10837..36c8476e8f72 100644 --- a/src/metric/cuda/cuda_multiclass_metric.hpp +++ b/src/metric/cuda/cuda_multiclass_metric.hpp @@ -99,6 +99,9 @@ class CUDAAucMuMetric : public CUDAMetricInterface, public AucMuMetric { private: void LaunchEvalKernel(const double* score) const; + const label_t* cuda_label_; + const label_t* cuda_weights_; + int num_class_pair_; data_size_t max_pair_buffer_size_; @@ -112,12 +115,12 @@ class CUDAAucMuMetric : public CUDAMetricInterface, public AucMuMetric { double* cuda_curr_v_; double* cuda_sum_pos_buffer_; - double* cuda_sum_neg_buffer_; - data_size_t* cuda_threshold_mask_; + data_size_t* cuda_threshold_mark_; data_size_t* cuda_block_mark_buffer_; uint16_t* cuda_block_mark_first_zero_; double* cuda_reduce_block_buffer_; + double* cuda_reduce_ans_buffer_; }; } // namespace LightGBM diff --git a/src/metric/cuda/cuda_rank_metric.cpp b/src/metric/cuda/cuda_rank_metric.cpp new file mode 100644 index 000000000000..ca10bfe9ef81 --- /dev/null +++ b/src/metric/cuda/cuda_rank_metric.cpp @@ -0,0 +1,69 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. 
All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include "cuda_rank_metric.hpp" + +namespace LightGBM { + +CUDANDCGMetric::CUDANDCGMetric(const Config& config): NDCGMetric(config) {} + +CUDANDCGMetric::~CUDANDCGMetric() {} + +void CUDANDCGMetric::Init(const Metadata& metadata, data_size_t num_data) { + NDCGMetric::Init(metadata, num_data); + cuda_label_ = metadata.cuda_metadata()->cuda_label(); + cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); + cuda_query_boundaries_ = metadata.cuda_metadata()->cuda_query_boundaries(); + cuda_query_weights_ = metadata.cuda_metadata()->cuda_query_weights(); + const int num_threads = OMP_NUM_THREADS(); + std::vector thread_max_num_items_in_query(num_threads); + Threading::For(0, num_queries_, 1, + [this, &thread_max_num_items_in_query] (int thread_index, data_size_t start, data_size_t end) { + for (data_size_t query_index = start; query_index < end; ++query_index) { + const data_size_t query_item_count = query_boundaries_[query_index + 1] - query_boundaries_[query_index]; + if (query_item_count > thread_max_num_items_in_query[thread_index]) { + thread_max_num_items_in_query[thread_index] = query_item_count; + } + } + }); + max_items_in_query_ = 0; + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + if (thread_max_num_items_in_query[thread_index] > max_items_in_query_) { + max_items_in_query_ = thread_max_num_items_in_query[thread_index]; + } + } + max_items_in_query_aligned_ = 1; + --max_items_in_query_; + while (max_items_in_query_ > 0) { + max_items_in_query_ >>= 1; + max_items_in_query_aligned_ <<= 1; + } + num_eval_ = static_cast(eval_at_.size()); + InitCUDAMemoryFromHostMemoryOuter(&cuda_eval_at_, eval_at_.data(), eval_at_.size(), __FILE__, __LINE__); + const size_t total_inverse_max_dcg_items = static_cast(num_queries_ * num_eval_); + std::vector flatten_inverse_max_dcgs(total_inverse_max_dcg_items, 0.0f); + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (data_size_t query_index = 0; query_index < num_queries_; ++query_index) { + OMP_LOOP_EX_BEGIN(); + for (data_size_t eval_index = 0; eval_index < num_eval_; ++eval_index) { + flatten_inverse_max_dcgs[query_index * num_eval_ + eval_index] = inverse_max_dcgs_[query_index][eval_index]; + } + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + InitCUDAMemoryFromHostMemoryOuter(&cuda_inverse_max_dcgs_, flatten_inverse_max_dcgs.data(), flatten_inverse_max_dcgs.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_label_gain_, DCGCalculator::label_gain().data(), DCGCalculator::label_gain().size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_discount_, DCGCalculator::discount().data(), DCGCalculator::discount().size(), __FILE__, __LINE__); + const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK_METRIC - 1) / NUM_QUERY_PER_BLOCK_METRIC; + AllocateCUDAMemoryOuter(&cuda_block_dcg_buffer_, static_cast(num_blocks * num_eval_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_item_indices_buffer_, static_cast(num_data_), __FILE__, __LINE__); +} + +std::vector CUDANDCGMetric::Eval(const double* score, const ObjectiveFunction*) const { + LaunchEvalKernel(score); +} + +} // namespace LightGBM diff --git a/src/metric/cuda/cuda_rank_metric.cu b/src/metric/cuda/cuda_rank_metric.cu new file mode 100644 index 000000000000..2aad1b7c6a85 --- /dev/null +++ b/src/metric/cuda/cuda_rank_metric.cu @@ -0,0 +1,223 @@ +/*! 
+ * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include "cuda_rank_metric.hpp" +#include + +namespace LightGBM { + +template +__global__ void EvalKernel_NDCG_SharedMemory( + const double* score, + const label_t* label, + const label_t* query_weights, + const data_size_t* query_boundareis, + const data_size_t num_queries, + const data_size_t* eval_at, + const data_size_t num_eval, + const double* inverse_max_dcgs, + const double* label_gains, + const double* discount, + const data_size_t max_items_in_query, + double* block_ndcg_buffer) { + __shared__ uint16_t shared_item_indices[SHARED_MEMORY_SIZE]; + __shared__ score_t shared_item_scores[SHARED_MEMORY_SIZE]; + __shared__ double shared_eval_result[MAX_NUM_EVAL]; + __shared__ data_size_t shared_eval_at[MAX_NUM_EVAL]; + __shared__ double shared_shuffle_buffer[32]; + for (data_size_t eval_index = 0; eval_index < num_eval; eval_index += static_cast(blockDim.x)) { + shared_eval_at[eval_index] = eval_at[eval_index]; + shared_eval_result[eval_index] = 0.0f; + } + __syncthreads(); + const data_size_t start_query_index = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK_METRIC; + const data_size_t end_query_index = min(start_query_index + NUM_QUERY_PER_BLOCK_METRIC, num_queries); + for (data_size_t query_index = start_query_index; query_index < end_query_index; ++query_index) { + const data_size_t item_start_index = query_boundareis[query_index]; + const data_size_t item_end_index = query_boundareis[query_index + 1]; + const data_size_t num_items = item_end_index - item_start_index; + const double* score_ptr = score + item_start_index; + const label_t* label_ptr = label + item_start_index; + const double* inverse_max_dcgs_ptr = inverse_max_dcgs + query_index * num_eval; + for (data_size_t item_index = static_cast(threadIdx.x); item_index < num_items; item_index += static_cast(blockDim.x)) { + shared_item_scores[item_index] = static_cast(score_ptr[item_index]); + } + __syncthreads(); + if (MAX_ITEM_GREATER_THAN_1024) { + if (num_items > 1024) { + BitonicArgSort_2048(shared_item_scores, shared_item_indices); + } else { + BitonicArgSort_1024(shared_item_scores, shared_item_indices, static_cast(num_items)); + } + } else { + BitonicArgSort_1024(shared_item_scores, shared_item_indices, static_cast(num_items)); + } + __syncthreads(); + double thread_eval = 0.0f; + data_size_t item_index = static_cast(threadIdx.x); + for (data_size_t eval_index = 0; eval_index < num_eval; ++eval_index) { + data_size_t cur_eval_pos = min(num_items, shared_eval_at[eval_index]); + const double* discount_ptr = discount + eval_index * max_items_in_query; + for (; item_index < cur_eval_pos; item_index += static_cast(blockDim.x)) { + thread_eval += label_ptr[shared_item_indices[item_index]] * discount_ptr[item_index]; + } + __syncthreads(); + double block_eval = ShuffleReduceSum(thread_eval, shared_shuffle_buffer, blockDim.x); + if (USE_QUERY_WEIGHT) { + block_eval *= static_cast(query_weights[query_index]); + } + if (threadIdx.x == 0) { + shared_eval_result[eval_index] += block_eval * inverse_max_dcgs_ptr[eval_index]; + } + } + __syncthreads(); + } + for (data_size_t eval_index = static_cast(threadIdx.x); eval_index < num_eval; eval_index += static_cast(blockDim.x)) { + block_ndcg_buffer[eval_index * gridDim.x + blockIdx.x] = shared_eval_result[eval_index]; + } +} + +template +__global__ void EvalKernel_NDCG_GlobalMemory( + const double* score, + const 
label_t* label, + const label_t* query_weights, + const data_size_t* query_boundareis, + const data_size_t num_queries, + const data_size_t* eval_at, + const data_size_t num_eval, + const double* inverse_max_dcgs, + const double* label_gains, + const double* discount, + const data_size_t max_items_in_query, + double* block_ndcg_buffer, + const data_size_t* cuda_item_indices_buffer) { + __shared__ double shared_eval_result[MAX_NUM_EVAL]; + __shared__ data_size_t shared_eval_at[MAX_NUM_EVAL]; + __shared__ double shared_shuffle_buffer[32]; + for (data_size_t eval_index = 0; eval_index < num_eval; eval_index += static_cast(blockDim.x)) { + shared_eval_at[eval_index] = eval_at[eval_index]; + shared_eval_result[eval_index] = 0.0f; + } + __syncthreads(); + const data_size_t start_query_index = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK_METRIC; + const data_size_t end_query_index = min(start_query_index + NUM_QUERY_PER_BLOCK_METRIC, num_queries); + for (data_size_t query_index = start_query_index; query_index < end_query_index; ++query_index) { + const data_size_t item_start_index = query_boundareis[query_index]; + const data_size_t item_end_index = query_boundareis[query_index + 1]; + const data_size_t num_items = item_end_index - item_start_index; + const label_t* label_ptr = label + item_start_index; + const double* inverse_max_dcgs_ptr = inverse_max_dcgs + query_index * num_eval; + const data_size_t* sorted_item_indices_ptr = cuda_item_indices_buffer + item_start_index; + double thread_eval = 0.0f; + data_size_t item_index = static_cast(threadIdx.x); + for (data_size_t eval_index = 0; eval_index < num_eval; ++eval_index) { + data_size_t cur_eval_pos = min(num_items, shared_eval_at[eval_index]); + const double* discount_ptr = discount + eval_index * max_items_in_query; + for (; item_index < cur_eval_pos; item_index += static_cast(blockDim.x)) { + thread_eval += label_ptr[sorted_item_indices_ptr[item_index]] * discount_ptr[item_index]; + } + __syncthreads(); + double block_eval = ShuffleReduceSum(thread_eval, shared_shuffle_buffer, blockDim.x); + if (USE_QUERY_WEIGHT) { + block_eval *= static_cast(query_weights[query_index]); + } + if (threadIdx.x == 0) { + shared_eval_result[eval_index] += block_eval * inverse_max_dcgs_ptr[eval_index]; + } + } + __syncthreads(); + } + for (data_size_t eval_index = static_cast(threadIdx.x); eval_index < num_eval; eval_index += static_cast(blockDim.x)) { + block_ndcg_buffer[eval_index * gridDim.x + blockIdx.x] = shared_eval_result[eval_index]; + } +} + +#define EvalKernel_NDCG_ARGS \ + score, \ + cuda_label_, \ + cuda_query_weights_, \ + cuda_query_boundaries_, \ + num_queries_, \ + cuda_eval_at_, \ + num_eval_, \ + cuda_inverse_max_dcgs_, \ + cuda_label_gain_, \ + cuda_discount_, \ + max_items_in_query_, \ + cuda_block_dcg_buffer_ + +void CUDANDCGMetric::LaunchEvalKernel(const double* score) const { + const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK_METRIC - 1) / NUM_QUERY_PER_BLOCK_METRIC; + if (cuda_query_weights_ == nullptr) { + if (max_items_in_query_aligned_ <= 1024) { + if (num_eval_ <= 32) { + EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); + } else if (num_eval_ <= 256) { + EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); + } else if (num_eval_ <= 1024) { + EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); + } else { + Log::Fatal("Number of eval_at %d exceeds the maximum %d for NDCG metric in CUDA version.", num_eval_, 1024); + } + } else if (max_items_in_query_aligned_ <= 2048) { + if (num_eval_ <= 32) { + 
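// Purely as an illustration (template-parameter order and launch values here are
// assumptions, not confirmed by this excerpt): a specialization of the shared-memory
// kernel for unweighted queries, a 2048-item sort buffer and up to 32 eval positions
// would be launched along the lines of
//   EvalKernel_NDCG_SharedMemory<false, true, 2048, 32>
//       <<<num_blocks, EVAL_BLOCK_SIZE_RANK_METRIC>>>(EvalKernel_NDCG_ARGS);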
EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); + } else if (num_eval_ <= 256) { + EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); + } else if (num_eval_ <= 1024) { + EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); + } else { + Log::Fatal("Number of eval_at %d exceeds the maximum %d for NDCG metric in CUDA version.", num_eval_, 1024); + } + } else { + BitonicArgSortItemsGlobal(score, num_queries_, cuda_query_boundaries_, cuda_item_indices_buffer_); + if (num_eval_ <= 32) { + EvalKernel_NDCG_GlobalMemory<<>>(EvalKernel_NDCG_ARGS, cuda_item_indices_buffer_); + } else if (num_eval_ <= 256) { + EvalKernel_NDCG_GlobalMemory<<>>(EvalKernel_NDCG_ARGS, cuda_item_indices_buffer_); + } else if (num_eval_ <= 1024) { + EvalKernel_NDCG_GlobalMemory<<>>(EvalKernel_NDCG_ARGS, cuda_item_indices_buffer_); + } else { + Log::Fatal("Number of eval_at %d exceeds the maximum %d for NDCG metric in CUDA version.", num_eval_, 1024); + } + } + } else { + if (max_items_in_query_aligned_ <= 1024) { + if (num_eval_ <= 32) { + EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); + } else if (num_eval_ <= 256) { + EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); + } else if (num_eval_ <= 1024) { + EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); + } else { + Log::Fatal("Number of eval_at %d exceeds the maximum %d for NDCG metric in CUDA version.", num_eval_, 1024); + } + } else if (max_items_in_query_aligned_ <= 2048) { + if (num_eval_ <= 32) { + EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); + } else if (num_eval_ <= 256) { + EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); + } else if (num_eval_ <= 1024) { + EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); + } else { + Log::Fatal("Number of eval_at %d exceeds the maximum %d for NDCG metric in CUDA version.", num_eval_, 1024); + } + } else { + BitonicArgSortItemsGlobal(score, num_queries_, cuda_query_boundaries_, cuda_item_indices_buffer_); + if (num_eval_ <= 32) { + EvalKernel_NDCG_GlobalMemory<<>>(EvalKernel_NDCG_ARGS, cuda_item_indices_buffer_); + } else if (num_eval_ <= 256) { + EvalKernel_NDCG_GlobalMemory<<>>(EvalKernel_NDCG_ARGS, cuda_item_indices_buffer_); + } else if (num_eval_ <= 1024) { + EvalKernel_NDCG_GlobalMemory<<>>(EvalKernel_NDCG_ARGS, cuda_item_indices_buffer_); + } else { + Log::Fatal("Number of eval_at %d exceeds the maximum %d for NDCG metric in CUDA version.", num_eval_, 1024); + } + } + } +} + +} // namespace LightGBM diff --git a/src/metric/cuda/cuda_rank_metric.hpp b/src/metric/cuda/cuda_rank_metric.hpp new file mode 100644 index 000000000000..d862e07af41d --- /dev/null +++ b/src/metric/cuda/cuda_rank_metric.hpp @@ -0,0 +1,48 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
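A short note on the dispatch above: whether query weights exist is decided once on the host (cuda_query_weights_ == nullptr) and is evidently baked into the kernel as a compile-time flag, so the per-query weighting branch costs nothing when weights are absent. A hedged sketch of that pattern, with illustrative names only:

  // Compile-time dispatch on an optional per-query weight (sketch, not LightGBM's API).
  template <bool USE_QUERY_WEIGHT>
  __global__ void ExampleWeightedKernel(const float* query_weights, double* out) {
    double value = 1.0;
    if (USE_QUERY_WEIGHT) {  // resolved at compile time; the dead branch is removed
      value *= static_cast<double>(query_weights[blockIdx.x]);
    }
    if (threadIdx.x == 0) out[blockIdx.x] = value;
  }
  // host side: weights == nullptr ? ExampleWeightedKernel<false><<<g, b>>>(nullptr, out)
  //                               : ExampleWeightedKernel<true><<<g, b>>>(weights, out);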
+ */ +#ifndef LIGHTGBM_METRIC_CUDA_CUDA_RANK_METRIC_HPP_ +#define LIGHTGBM_METRIC_CUDA_CUDA_RANK_METRIC_HPP_ + +#include "cuda_metric.hpp" +#include "../rank_metric.hpp" + +#define EVAL_BLOCK_SIZE_RANK_METRIC (1024) +#define NUM_QUERY_PER_BLOCK_METRIC (10) +#define MAX_RANK_LABEL_METRIC (32) + +namespace LightGBM { + +class CUDANDCGMetric : public CUDAMetricInterface, public NDCGMetric { + public: + explicit CUDANDCGMetric(const Config& config); + + ~CUDANDCGMetric(); + + void Init(const Metadata& metadata, data_size_t num_data) override; + + std::vector Eval(const double* score, const ObjectiveFunction*) const override; + + private: + void LaunchEvalKernel(const double* score) const; + + const label_t* cuda_label_; + const label_t* cuda_weights_; + const data_size_t* cuda_query_boundaries_; + const label_t* cuda_query_weights_; + const double* cuda_block_reduce_buffer_; + data_size_t* cuda_eval_at_; + double* cuda_inverse_max_dcgs_; + double* cuda_label_gain_; + double* cuda_discount_; + double* cuda_block_dcg_buffer_; + data_size_t* cuda_item_indices_buffer_; + int max_items_in_query_aligned_; + int max_items_in_query_; + int num_eval_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_METRIC_CUDA_CUDA_RANK_METRIC_HPP_ diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp index 58804f415278..cf9a45d2230b 100644 --- a/src/metric/rank_metric.hpp +++ b/src/metric/rank_metric.hpp @@ -143,7 +143,7 @@ class NDCGMetric:public Metric { return result; } - private: + protected: /*! \brief Number of data */ data_size_t num_data_; /*! \brief Pointer of label */ diff --git a/src/objective/cuda/cuda_rank_objective.hpp b/src/objective/cuda/cuda_rank_objective.hpp index b35152c423ba..8343ddeb4750 100644 --- a/src/objective/cuda/cuda_rank_objective.hpp +++ b/src/objective/cuda/cuda_rank_objective.hpp @@ -46,7 +46,6 @@ class CUDALambdarankNDCG : public CUDAObjectiveInterface, public LambdarankNDCG const data_size_t* cuda_query_boundaries_; // Host memory - label_t max_label_; int max_items_in_query_aligned_; }; From 510d8782a43f530da23380d12c6e5adbe1bed21a Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 10 Sep 2021 03:55:36 +0000 Subject: [PATCH 065/166] fix cross entropy objectives and metrics --- include/LightGBM/cuda/cuda_column_data.hpp | 2 +- include/LightGBM/cuda/cuda_metadata.hpp | 2 +- include/LightGBM/cuda/cuda_row_data.hpp | 2 +- include/LightGBM/cuda/cuda_tree.hpp | 2 +- include/LightGBM/dataset.h | 3 +- src/boosting/gbdt.cpp | 2 + src/io/cuda/cuda_column_data.cpp | 7 +++- src/io/cuda/cuda_metadata.cpp | 8 +++- src/io/cuda/cuda_row_data.cpp | 8 +++- src/io/cuda/cuda_tree.cpp | 7 +++- src/io/dataset.cpp | 6 ++- src/io/metadata.cpp | 4 +- src/main.cpp | 10 +---- src/metric/cuda/cuda_multiclass_metric.cpp | 5 ++- src/metric/cuda/cuda_multiclass_metric.cu | 12 +++--- src/metric/cuda/cuda_multiclass_metric.hpp | 22 +++++++--- src/metric/cuda/cuda_xentropy_metric.cpp | 6 +-- src/metric/metric.cpp | 2 + .../cuda/cuda_multiclass_objective.cpp | 42 +++++++++++++++++++ .../cuda/cuda_multiclass_objective.cu | 32 +++++++------- .../cuda/cuda_xentropy_objective.cpp | 7 ++++ src/objective/cuda/cuda_xentropy_objective.cu | 24 +++++++++++ .../cuda/cuda_xentropy_objective.hpp | 25 ++++++++++- src/objective/multiclass_objective.hpp | 6 --- .../cuda/cuda_histogram_constructor.cpp | 8 ++-- .../cuda/cuda_histogram_constructor.hpp | 5 ++- .../cuda/new_cuda_tree_learner.cpp | 14 ++++--- 27 files changed, 201 insertions(+), 72 deletions(-) diff --git a/include/LightGBM/cuda/cuda_column_data.hpp 
b/include/LightGBM/cuda/cuda_column_data.hpp index 84db2a0bee61..efffd9dcae20 100644 --- a/include/LightGBM/cuda/cuda_column_data.hpp +++ b/include/LightGBM/cuda/cuda_column_data.hpp @@ -17,7 +17,7 @@ namespace LightGBM { class CUDAColumnData { public: - CUDAColumnData(const data_size_t num_data); + CUDAColumnData(const data_size_t num_data, const int gpu_device_id); ~CUDAColumnData(); diff --git a/include/LightGBM/cuda/cuda_metadata.hpp b/include/LightGBM/cuda/cuda_metadata.hpp index db2044e494f4..13767c96b5d5 100644 --- a/include/LightGBM/cuda/cuda_metadata.hpp +++ b/include/LightGBM/cuda/cuda_metadata.hpp @@ -14,7 +14,7 @@ namespace LightGBM { class CUDAMetadata { public: - CUDAMetadata(); + CUDAMetadata(const int gpu_device_id); ~CUDAMetadata(); diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index 1ea850d5b2f8..9e86ab3b634b 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -21,7 +21,7 @@ namespace LightGBM { class CUDARowData { public: CUDARowData(const Dataset* train_data, - const TrainingShareStates* train_share_state); + const TrainingShareStates* train_share_state, const int gpu_device_id); void Init(const Dataset* train_data, TrainingShareStates* train_share_state); diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index b00b1406c1d1..b874758c81e2 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -31,7 +31,7 @@ class CUDATree : public Tree { * \param track_branch_features Whether to keep track of ancestors of leaf nodes * \param is_linear Whether the tree has linear models at each leaf */ - explicit CUDATree(int max_leaves, bool track_branch_features, bool is_linear); + explicit CUDATree(int max_leaves, bool track_branch_features, bool is_linear, const int gpu_device_id); explicit CUDATree(const Tree* host_tree); diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 73c096819852..92dd613751ae 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -214,7 +214,7 @@ class Metadata { CUDAMetadata* cuda_metadata() const { return cuda_metadata_.get(); } - void CreateCUDAMetadata(); + void CreateCUDAMetadata(const int gpu_device_id); private: /*! \brief Load initial scores from file */ @@ -760,6 +760,7 @@ class Dataset { std::vector numeric_feature_map_; int num_numeric_features_; std::string device_type_; + int gpu_device_id_; std::unique_ptr cuda_column_data_; }; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index a1c13d7b27fc..221770be68d8 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -67,6 +67,8 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective if (config_->device_type == std::string("cuda")) { LGBM_config_::current_learner = use_cuda_learner; + const int gpu_device_id = config_->gpu_device_id >= 0 ? 
config_->gpu_device_id : 0; + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); } // load forced_splits file diff --git a/src/io/cuda/cuda_column_data.cpp b/src/io/cuda/cuda_column_data.cpp index 39b02ed6916a..73e0dca6252d 100644 --- a/src/io/cuda/cuda_column_data.cpp +++ b/src/io/cuda/cuda_column_data.cpp @@ -7,9 +7,14 @@ namespace LightGBM { -CUDAColumnData::CUDAColumnData(const data_size_t num_data) { +CUDAColumnData::CUDAColumnData(const data_size_t num_data, const int gpu_device_id) { num_threads_ = OMP_NUM_THREADS(); num_data_ = num_data; + if (gpu_device_id >= 0) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); + } else { + CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + } } CUDAColumnData::~CUDAColumnData() {} diff --git a/src/io/cuda/cuda_metadata.cpp b/src/io/cuda/cuda_metadata.cpp index eb3ee6f09f37..781a5d7a5797 100644 --- a/src/io/cuda/cuda_metadata.cpp +++ b/src/io/cuda/cuda_metadata.cpp @@ -7,7 +7,13 @@ namespace LightGBM { -CUDAMetadata::CUDAMetadata() {} +CUDAMetadata::CUDAMetadata(const int gpu_device_id) { + if (gpu_device_id >= 0) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); + } else { + CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + } +} CUDAMetadata::~CUDAMetadata() {} diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index 6fecde8f149a..c909c1a371c4 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -8,12 +8,18 @@ namespace LightGBM { CUDARowData::CUDARowData(const Dataset* train_data, - const TrainingShareStates* train_share_state) { + const TrainingShareStates* train_share_state, + const int gpu_device_id) { num_threads_ = OMP_NUM_THREADS(); num_data_ = train_data->num_data(); num_total_bin_ = static_cast(train_share_state->feature_hist_offsets().back()); num_feature_group_ = train_data->num_feature_groups(); num_feature_ = train_data->num_features(); + if (gpu_device_id >= 0) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); + } else { + CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + } } void CUDARowData::Init(const Dataset* train_data, TrainingShareStates* train_share_state) { diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index 9e5a9e87ebe4..bf768540394c 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -7,10 +7,15 @@ namespace LightGBM { -CUDATree::CUDATree(int max_leaves, bool track_branch_features, bool is_linear): +CUDATree::CUDATree(int max_leaves, bool track_branch_features, bool is_linear, const int gpu_device_id): Tree(max_leaves, track_branch_features, is_linear), num_threads_per_block_add_prediction_to_score_(1024) { is_cuda_tree_ = true; + if (gpu_device_id >= 0) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); + } else { + CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + } InitCUDAMemory(); } diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index aa21cfa319d6..2a48ed98303c 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -426,6 +426,7 @@ void Dataset::Construct(std::vector>* bin_mappers, } } device_type_ = io_config.device_type; + gpu_device_id_ = io_config.gpu_device_id; } void Dataset::FinishLoad() { @@ -439,7 +440,7 @@ void Dataset::FinishLoad() { } if (device_type_ == std::string("cuda")) { CreateCUDAColumnData(); - metadata_.CreateCUDAMetadata(); + metadata_.CreateCUDAMetadata(gpu_device_id_); } else { cuda_column_data_.reset(nullptr); } @@ -775,6 +776,7 @@ void Dataset::CreateValid(const Dataset* dataset) { real_feature_idx_ = dataset->real_feature_idx_; forced_bin_bounds_ = dataset->forced_bin_bounds_; device_type_ = 
dataset->device_type_; + gpu_device_id_ = dataset->gpu_device_id_; } void Dataset::ReSize(data_size_t num_data) { @@ -1499,7 +1501,7 @@ const void* Dataset::GetColWiseData( } void Dataset::CreateCUDAColumnData() { - cuda_column_data_.reset(new CUDAColumnData(num_data_)); + cuda_column_data_.reset(new CUDAColumnData(num_data_, gpu_device_id_)); int num_columns = 0; std::vector column_data; std::vector column_bin_iterator; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 8a90c9d9627f..cb8fb3ad064c 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -473,8 +473,8 @@ void Metadata::LoadQueryWeights() { } } -void Metadata::CreateCUDAMetadata() { - cuda_metadata_.reset(new CUDAMetadata()); +void Metadata::CreateCUDAMetadata(const int gpu_device_id) { + cuda_metadata_.reset(new CUDAMetadata(gpu_device_id)); cuda_metadata_->Init(label_, weights_, query_boundaries_, query_weights_, init_score_, queries_); } diff --git a/src/main.cpp b/src/main.cpp index b2cb1f6804f4..8034da826811 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -8,16 +8,10 @@ #include "network/linkers.h" -int main(int /*argc*/, char** /*argv*/) { +int main(int argc, char** argv) { bool success = false; - const std::string config_str = std::string("config=train.conf"); - char* argv = new char[config_str.size() + 1]; - for (size_t i = 0; i < config_str.size(); ++i) { - argv[i] = config_str[i]; - } - argv[config_str.size()] = '\0'; try { - LightGBM::Application app(2, &argv - 1); + LightGBM::Application app(argc, argv); app.Run(); #ifdef USE_MPI diff --git a/src/metric/cuda/cuda_multiclass_metric.cpp b/src/metric/cuda/cuda_multiclass_metric.cpp index 7943d2989b17..960b89674dbb 100644 --- a/src/metric/cuda/cuda_multiclass_metric.cpp +++ b/src/metric/cuda/cuda_multiclass_metric.cpp @@ -15,12 +15,13 @@ CUDAMulticlassMetric::~CUDAMulticlassMetric() {} template void CUDAMulticlassMetric::Init(const Metadata& metadata, data_size_t num_data) { + MulticlassMetric::Init(metadata, num_data); cuda_label_ = metadata.cuda_metadata()->cuda_label(); cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); - + const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_MULTICLASS_METRIC - 1) / EVAL_BLOCK_SIZE_MULTICLASS_METRIC; AllocateCUDAMemoryOuter(&cuda_sum_loss_buffer_, static_cast(num_blocks), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_score_convert_buffer_, static_cast(num_data), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_score_convert_buffer_, static_cast(num_data * this->num_class_), __FILE__, __LINE__); AllocateCUDAMemoryOuter(&cuda_sum_loss_, 1, __FILE__, __LINE__); } diff --git a/src/metric/cuda/cuda_multiclass_metric.cu b/src/metric/cuda/cuda_multiclass_metric.cu index 37d367082d29..23ec424a6e12 100644 --- a/src/metric/cuda/cuda_multiclass_metric.cu +++ b/src/metric/cuda/cuda_multiclass_metric.cu @@ -20,11 +20,12 @@ __global__ void EvalKernel_MulticlassPointWiseLoss(const double* score, // assert that warpSize == 32 and maximum number of threads per block is 1024 __shared__ double shared_buffer[32]; const int data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const double* score_ptr = score + data_index * num_classes; - const double pointwise_loss = data_index < num_data ? - (USE_WEIGHT ? 
CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score_ptr, num_classes, multi_error_top_k) * weights[data_index] : - CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score_ptr, num_classes, multi_error_top_k)) : - 0.0f; + double pointwise_loss = 0.0f; + if (data_index < num_data) { + pointwise_loss = (USE_WEIGHT ? + CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score, data_index, num_data, num_classes, multi_error_top_k) * weights[data_index] : + CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score, data_index, num_data, num_classes, multi_error_top_k)); + } const double loss = ShuffleReduceSum(pointwise_loss, shared_buffer, blockDim.x); if (threadIdx.x == 0) { cuda_sum_loss_buffer[blockIdx.x] = loss; @@ -408,7 +409,6 @@ void CUDAAucMuMetric::LaunchEvalKernel(const double* score) const { cuda_reduce_block_buffer_, cuda_reduce_ans_buffer_); } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); BlockReduceSum<<<1, EVAL_BLOCK_SIZE_MULTICLASS_METRIC>>>(cuda_reduce_ans_buffer_, num_class_pair); } diff --git a/src/metric/cuda/cuda_multiclass_metric.hpp b/src/metric/cuda/cuda_multiclass_metric.hpp index 36c8476e8f72..21f61fb7102d 100644 --- a/src/metric/cuda/cuda_multiclass_metric.hpp +++ b/src/metric/cuda/cuda_multiclass_metric.hpp @@ -49,11 +49,19 @@ class CUDAMultiErrorMetric : public CUDAMulticlassMetric { public: explicit CUDAMultiErrorMetric(const Config& config); - __device__ inline static double LossOnPointCUDA(label_t label, const double* score, const int num_classes, const int multi_error_top_k) { + __device__ inline static double LossOnPointCUDA( + label_t label, + const double* score, + const data_size_t data_index, + const data_size_t num_data, + const int num_classes, + const int multi_error_top_k) { const size_t k = static_cast(label); + const double true_class_score = score[k * num_data + data_index]; int num_larger = 0; for (int i = 0; i < num_classes; ++i) { - if (score[i] >= score[k]) ++num_larger; + const double this_class_score = score[i * num_data + data_index]; + if (this_class_score >= true_class_score) ++num_larger; if (num_larger > multi_error_top_k) return 1.0f; } return 0.0f; @@ -72,10 +80,14 @@ class CUDAMultiSoftmaxLoglossMetric : public CUDAMulticlassMetric(label); - if (score[k] > kEpsilon) { - return static_cast(-log(score[k])); + const double point_score = score[k * num_data + data_index]; + if (point_score > kEpsilon) { + return static_cast(-log(point_score)); } else { return -log(kEpsilon); } diff --git a/src/metric/cuda/cuda_xentropy_metric.cpp b/src/metric/cuda/cuda_xentropy_metric.cpp index 68d83bbd3933..728b6743dd5c 100644 --- a/src/metric/cuda/cuda_xentropy_metric.cpp +++ b/src/metric/cuda/cuda_xentropy_metric.cpp @@ -25,7 +25,7 @@ void CUDACrossEntropyMetric::Init(const Metadata& metadata, data_size_t num_data std::vector CUDACrossEntropyMetric::Eval(const double* score, const ObjectiveFunction* objective) const { double sum_loss = 0.0f; objective->GetCUDAConvertOutputFunc()(num_data_, score, cuda_score_convert_buffer_); - LaunchEvalKernel(score); + LaunchEvalKernel(cuda_score_convert_buffer_); CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); return std::vector(1, sum_loss / sum_weights_); } @@ -47,7 +47,7 @@ void CUDACrossEntropyLambdaMetric::Init(const Metadata& metadata, data_size_t nu std::vector CUDACrossEntropyLambdaMetric::Eval(const double* score, const ObjectiveFunction* objective) const { objective->GetCUDAConvertOutputFunc()(num_data_, score, 
cuda_score_convert_buffer_); - LaunchEvalKernel(score); + LaunchEvalKernel(cuda_score_convert_buffer_); double sum_loss = 0.0f; CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); return std::vector(1, sum_loss / static_cast(num_data_)); @@ -70,7 +70,7 @@ void CUDAKullbackLeiblerDivergence::Init(const Metadata& metadata, data_size_t n std::vector CUDAKullbackLeiblerDivergence::Eval(const double* score, const ObjectiveFunction* objective) const { objective->GetCUDAConvertOutputFunc()(num_data_, score, cuda_score_convert_buffer_); - LaunchEvalKernel(score); + LaunchEvalKernel(cuda_score_convert_buffer_); double sum_loss = 0.0f; CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); return std::vector(1, presum_label_entropy_ + sum_loss / sum_weights_); diff --git a/src/metric/metric.cpp b/src/metric/metric.cpp index bd9b5833c187..ffc4832d7a2b 100644 --- a/src/metric/metric.cpp +++ b/src/metric/metric.cpp @@ -42,6 +42,8 @@ Metric* Metric::CreateMetric(const std::string& type, const Config& config) { return new CUDAAUCMetric(config); } else if (type == std::string("average_precision")) { return new CUDAAveragePrecisionMetric(config); + } else if (type == std::string("multi_logloss")) { + return new CUDAMultiSoftmaxLoglossMetric(config); } else if (type == std::string("multi_error")) { return new CUDAMultiErrorMetric(config); } else if (type == std::string("cross_entropy")) { diff --git a/src/objective/cuda/cuda_multiclass_objective.cpp b/src/objective/cuda/cuda_multiclass_objective.cpp index 9d68ad051736..3fd303411a9d 100644 --- a/src/objective/cuda/cuda_multiclass_objective.cpp +++ b/src/objective/cuda/cuda_multiclass_objective.cpp @@ -25,6 +25,48 @@ void CUDAMulticlassSoftmax::Init(const Metadata& metadata, data_size_t num_data) void CUDAMulticlassSoftmax::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { LaunchGetGradientsKernel(score, gradients, hessians); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + for (int class_index = 0; class_index < num_class_; ++class_index) { + std::vector host_gradients(num_data_, 0.0f); + std::vector host_hessians(num_data_, 0.0f); + const size_t offset = static_cast(class_index * num_data_); + CopyFromCUDADeviceToHostOuter(host_gradients.data(), gradients + offset, static_cast(num_data_), __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(host_hessians.data(), hessians + offset, static_cast(num_data_), __FILE__, __LINE__); + const int num_threads = OMP_NUM_THREADS(); + std::vector thread_abs_max_gradient(num_threads, 0.0f); + std::vector thread_abs_max_hessian(num_threads, 0.0f); + std::vector thread_abs_min_hessian(num_threads, std::numeric_limits::infinity()); + Threading::For(0, num_data_, 512, + [&thread_abs_max_gradient, &thread_abs_max_hessian, &thread_abs_min_hessian, &host_gradients, &host_hessians] (int thread_index, data_size_t start, data_size_t end) { + for (data_size_t index = start; index < end; ++index) { + const score_t gradient = host_gradients[index]; + const score_t hessian = host_hessians[index]; + if (std::fabs(gradient) > std::fabs(thread_abs_max_gradient[thread_index])) { + thread_abs_max_gradient[thread_index] = gradient; + } + if (std::fabs(hessian) > std::fabs(thread_abs_max_hessian[thread_index])) { + thread_abs_max_hessian[thread_index] = hessian; + } + if (std::fabs(hessian) < std::fabs(thread_abs_min_hessian[thread_index])) { + thread_abs_min_hessian[thread_index] = hessian; + } + } + }); + double max_abs_gradient = 0.0f; + double 
max_abs_hessian = 0.0f; + double min_abs_hessian = std::numeric_limits::infinity(); + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + if (std::fabs(thread_abs_max_gradient[thread_index]) > std::fabs(max_abs_gradient)) { + max_abs_gradient = thread_abs_max_gradient[thread_index]; + } + if (std::fabs(thread_abs_max_hessian[thread_index] > std::fabs(max_abs_hessian))) { + max_abs_hessian = thread_abs_max_hessian[thread_index]; + } + if (std::fabs(thread_abs_min_hessian[thread_index] < std::fabs(min_abs_hessian))) { + min_abs_hessian = thread_abs_min_hessian[thread_index]; + } + } + Log::Warning("class %d max_abs_gradient = %f, max_abs_hessian = %f", class_index, max_abs_gradient, max_abs_hessian); + } } void CUDAMulticlassSoftmax::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { diff --git a/src/objective/cuda/cuda_multiclass_objective.cu b/src/objective/cuda/cuda_multiclass_objective.cu index e00842b32eb2..98fc1d0f460b 100644 --- a/src/objective/cuda/cuda_multiclass_objective.cu +++ b/src/objective/cuda/cuda_multiclass_objective.cu @@ -21,21 +21,6 @@ __device__ void SoftmaxCUDA(double* softmax_buffer, int len) { } } -__device__ void SoftmaxCUDA(const double* input, int len, double* output) { - double wmax = input[0]; - for (int i = 1; i < len; ++i) { - wmax = max(input[i], wmax); - } - double wsum = 0.0f; - for (int i = 0; i < len; ++i) { - output[i] = exp(input[i] - wmax); - wsum += output[i]; - } - for (int i = 0; i < len; ++i) { - output[i] /= static_cast(wsum); - } -} - template __global__ void GetGradientsKernel_MulticlassSoftmax( const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, @@ -46,6 +31,10 @@ __global__ void GetGradientsKernel_MulticlassSoftmax( const data_size_t offset = data_index * num_class; double* softmax_result = cuda_softmax_buffer + offset; for (int k = 0; k < num_class; ++k) { + const double point_score = cuda_scores[k * num_data + data_index]; + if (isnan(point_score)) { + printf("error find nan %f in score ==================================================\n", point_score); + } softmax_result[k] = cuda_scores[k * num_data + data_index]; } SoftmaxCUDA(softmax_result, num_class); @@ -90,11 +79,18 @@ void CUDAMulticlassSoftmax::LaunchGetGradientsKernel(const double* scores, score } __global__ void ConvertOutputCUDAKernel_MulticlassSoftmax( - const int num_class, const data_size_t num_data, const double* input, double* output) { + const int num_class, const data_size_t num_data, const double* input, double* cuda_softmax_buffer, double* output) { const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (data_index < num_data) { const data_size_t offset = data_index * num_class; - SoftmaxCUDA(input + offset, num_class, output + offset); + double* cuda_softmax_buffer_ptr = cuda_softmax_buffer + offset; + for (int class_index = 0; class_index < num_class; ++class_index) { + cuda_softmax_buffer_ptr[class_index] = input[class_index * num_data + data_index]; + } + SoftmaxCUDA(cuda_softmax_buffer_ptr, num_class); + for (int class_index = 0; class_index < num_class; ++class_index) { + output[class_index * num_data + data_index] = cuda_softmax_buffer_ptr[class_index]; + } } } @@ -102,7 +98,7 @@ void CUDAMulticlassSoftmax::LaunchConvertOutputCUDAKernel( const data_size_t num_data, const double* input, double* output) const { const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_MULTICLASS - 1) / GET_GRADIENTS_BLOCK_SIZE_MULTICLASS; 
ConvertOutputCUDAKernel_MulticlassSoftmax<<>>( - num_class_, num_data, input, output); + num_class_, num_data, input, cuda_softmax_buffer_, output); } } // namespace LightGBM diff --git a/src/objective/cuda/cuda_xentropy_objective.cpp b/src/objective/cuda/cuda_xentropy_objective.cpp index 068d48a2efc5..d9247736ff43 100644 --- a/src/objective/cuda/cuda_xentropy_objective.cpp +++ b/src/objective/cuda/cuda_xentropy_objective.cpp @@ -29,6 +29,10 @@ void CUDACrossEntropy::GetGradients(const double* score, score_t* gradients, sco LaunchGetGradientsKernel(score, gradients, hessians); } +void CUDACrossEntropy::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { + LaunchConvertOutputCUDAKernel(num_data, input, output); +} + CUDACrossEntropyLambda::CUDACrossEntropyLambda(const Config& config): CrossEntropyLambda(config) {} CUDACrossEntropyLambda::CUDACrossEntropyLambda(const std::vector& strs): CrossEntropyLambda(strs) {} @@ -51,5 +55,8 @@ void CUDACrossEntropyLambda::GetGradients(const double* score, score_t* gradient LaunchGetGradientsKernel(score, gradients, hessians); } +void CUDACrossEntropyLambda::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { + LaunchConvertOutputCUDAKernel(num_data, input, output); +} } // namespace LightGBM diff --git a/src/objective/cuda/cuda_xentropy_objective.cu b/src/objective/cuda/cuda_xentropy_objective.cu index 3813d0978a46..3595877470a4 100644 --- a/src/objective/cuda/cuda_xentropy_objective.cu +++ b/src/objective/cuda/cuda_xentropy_objective.cu @@ -59,6 +59,18 @@ void CUDACrossEntropy::LaunchGetGradientsKernel(const double* score, score_t* gr } } +__global__ void ConvertOutputCUDAKernel_CrossEntropy(const data_size_t num_data, const double* input, double* output) { + const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (data_index < num_data) { + output[data_index] = 1.0f / (1.0f + exp(-input[data_index])); + } +} + +void CUDACrossEntropy::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { + const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_XENTROPY - 1) / GET_GRADIENTS_BLOCK_SIZE_XENTROPY; + ConvertOutputCUDAKernel_CrossEntropy<<>>(num_data, input, output); +} + double CUDACrossEntropyLambda::LaunchCalcInitScoreKernel() const { double suml = 0.0f; double sumw = 0.0f; @@ -117,4 +129,16 @@ void CUDACrossEntropyLambda::LaunchGetGradientsKernel(const double* score, score } } +__global__ void ConvertOutputCUDAKernel_CUDACrossEntropyLambda(const data_size_t num_data, const double* input, double* output) { + const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (data_index < num_data) { + output[data_index] = log(1.0f + exp(input[data_index])); + } +} + +void CUDACrossEntropyLambda::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { + const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_XENTROPY - 1) / GET_GRADIENTS_BLOCK_SIZE_XENTROPY; + ConvertOutputCUDAKernel_CUDACrossEntropyLambda<<>>(num_data, input, output); +} + } // namespace LightGBM diff --git a/src/objective/cuda/cuda_xentropy_objective.hpp b/src/objective/cuda/cuda_xentropy_objective.hpp index 2e42c1f4cc72..39e47bb5fa5a 100644 --- a/src/objective/cuda/cuda_xentropy_objective.hpp +++ b/src/objective/cuda/cuda_xentropy_objective.hpp @@ -5,13 +5,14 @@ #ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_XENTROPY_OBJECTIVE_HPP_ #define 
LIGHTGBM_OBJECTIVE_CUDA_CUDA_XENTROPY_OBJECTIVE_HPP_ +#include #include "../xentropy_objective.hpp" #define GET_GRADIENTS_BLOCK_SIZE_XENTROPY (1024) namespace LightGBM { -class CUDACrossEntropy: public CrossEntropy { +class CUDACrossEntropy: public CUDAObjectiveInterface, public CrossEntropy { public: explicit CUDACrossEntropy(const Config& config); @@ -25,17 +26,27 @@ class CUDACrossEntropy: public CrossEntropy { virtual void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; + void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; + + std::function GetCUDAConvertOutputFunc() const override { + return [this] (data_size_t num_data, const double* input, double* output) { + ConvertOutputCUDA(num_data, input, output); + }; + } + private: void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; double LaunchCalcInitScoreKernel() const; + void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; + const label_t* cuda_labels_; const label_t* cuda_weights_; double* cuda_reduce_sum_buffer_; }; -class CUDACrossEntropyLambda: public CrossEntropyLambda { +class CUDACrossEntropyLambda: public CUDAObjectiveInterface, public CrossEntropyLambda { public: explicit CUDACrossEntropyLambda(const Config& config); @@ -49,11 +60,21 @@ class CUDACrossEntropyLambda: public CrossEntropyLambda { virtual void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; + void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; + + std::function GetCUDAConvertOutputFunc() const override { + return [this] (data_size_t num_data, const double* input, double* output) { + ConvertOutputCUDA(num_data, input, output); + }; + } + private: void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; double LaunchCalcInitScoreKernel() const; + void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; + const label_t* cuda_labels_; const label_t* cuda_weights_; double* cuda_reduce_sum_buffer_; diff --git a/src/objective/multiclass_objective.hpp b/src/objective/multiclass_objective.hpp index c20fc43083cf..5379caec1199 100644 --- a/src/objective/multiclass_objective.hpp +++ b/src/objective/multiclass_objective.hpp @@ -127,12 +127,6 @@ class MulticlassSoftmax: public ObjectiveFunction { } } } - for (int i = 0; i < 100; ++i) { - Log::Warning("class 0 data %d gradient %f hessian %f", i, gradients[i], hessians[i]); - } - for (int i = 0; i < 100; ++i) { - Log::Warning("class 1 data %d gradient %f hessian %f", i, gradients[i + num_data_], hessians[i + num_data_]); - } } void ConvertOutput(const double* input, double* output) const override { diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 87dc92b51660..fbd8c6f99e99 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -16,14 +16,16 @@ CUDAHistogramConstructor::CUDAHistogramConstructor( const int num_threads, const std::vector& feature_hist_offsets, const int min_data_in_leaf, - const double min_sum_hessian_in_leaf): + const double min_sum_hessian_in_leaf, + const int gpu_device_id): num_data_(train_data->num_data()), num_features_(train_data->num_features()), num_leaves_(num_leaves), num_threads_(num_threads), 
num_feature_groups_(train_data->num_feature_groups()), min_data_in_leaf_(min_data_in_leaf), - min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf) { + min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf), + gpu_device_id_(gpu_device_id) { int offset = 0; for (int group_id = 0; group_id < train_data->num_feature_groups(); ++group_id) { offset += train_data->FeatureGroupNumBin(group_id); @@ -73,7 +75,7 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStat InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_most_freq_bins_, feature_most_freq_bins_.data(), feature_most_freq_bins_.size(), __FILE__, __LINE__); - cuda_row_data_.reset(new CUDARowData(train_data, share_state)); + cuda_row_data_.reset(new CUDARowData(train_data, share_state, gpu_device_id_)); cuda_row_data_->Init(train_data, share_state); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 613ba969acf6..25a0f3ff686a 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -38,7 +38,8 @@ class CUDAHistogramConstructor { const int num_threads, const std::vector& feature_hist_offsets, const int min_data_in_leaf, - const double min_sum_hessian_in_leaf); + const double min_sum_hessian_in_leaf, + const int gpu_device_id); void Init(const Dataset* train_data, TrainingShareStates* share_state); @@ -132,6 +133,8 @@ class CUDAHistogramConstructor { const score_t* cuda_gradients_; /*! \brief hessians on CUDA */ const score_t* cuda_hessians_; + + const int gpu_device_id_; }; } // namespace LightGBM diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index ad236210fe54..2addbdd75435 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -23,14 +23,15 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia // use the first gpu by now SerialTreeLearner::Init(train_data, is_constant_hessian); num_threads_ = OMP_NUM_THREADS(); - CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + const int gpu_device_id = config_->gpu_device_id >= 0 ? 
config_->gpu_device_id : 0; + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_)); cuda_smaller_leaf_splits_->Init(); cuda_larger_leaf_splits_.reset(new CUDALeafSplits(num_data_)); cuda_larger_leaf_splits_->Init(); cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, this->config_->num_leaves, num_threads_, share_state_->feature_hist_offsets(), - config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf)); + config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf, config_->gpu_device_id)); cuda_histogram_constructor_->Init(train_data_, share_state_.get()); cuda_data_partition_.reset(new CUDADataPartition( @@ -100,7 +101,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, double find_best_split_from_all_leaves_time = 0.0f; double split_data_indices_time = 0.0f; const bool track_branch_features = !(config_->interaction_constraints_vector.empty()); - std::unique_ptr tree(new CUDATree(config_->num_leaves, track_branch_features, config_->linear_tree)); + std::unique_ptr tree(new CUDATree(config_->num_leaves, track_branch_features, config_->linear_tree, config_->gpu_device_id)); for (int i = 0; i < config_->num_leaves - 1; ++i) { global_timer.Start("NewCUDATreeLearner::ConstructHistogramForLeaf"); auto start = std::chrono::steady_clock::now(); @@ -204,13 +205,16 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, SynchronizeCUDADeviceOuter(__FILE__, __LINE__); const auto end = std::chrono::steady_clock::now(); const double duration = (static_cast>(end - start)).count(); - Log::Warning("Train time %f", duration); + /*Log::Warning("Train time %f", duration); Log::Warning("before train time %f", static_cast>(before_train_end - before_train_start).count()); Log::Warning("construct histogram time %f", construct_histogram_time); Log::Warning("find best split time %f", find_best_split_time); Log::Warning("find best split time from all leaves %f", find_best_split_from_all_leaves_time); - Log::Warning("split data indices time %f", split_data_indices_time); + Log::Warning("split data indices time %f", split_data_indices_time);*/ tree->ToHost(); + /*for (int leaf_index = 0; leaf_index < tree->num_leaves(); ++leaf_index) { + Log::Warning("tree->LeafOutput(%d) = %f", leaf_index, tree->LeafOutput(leaf_index)); + }*/ return tree.release(); } From 95f4612f180103f952992172e06dc0fb8dcb8e19 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 10 Sep 2021 12:49:29 +0000 Subject: [PATCH 066/166] fix cross entropy and ndcg metrics --- src/boosting/cuda/cuda_score_updater.cpp | 20 ++- src/metric/cuda/cuda_binary_metric.cu | 2 +- src/metric/cuda/cuda_multiclass_metric.cu | 3 +- src/metric/cuda/cuda_multiclass_metric.hpp | 1 - src/metric/cuda/cuda_rank_metric.cpp | 4 + src/metric/cuda/cuda_rank_metric.cu | 114 ++++++++++++------ src/metric/cuda/cuda_rank_metric.hpp | 2 +- src/metric/cuda/cuda_regression_metric.cu | 2 +- src/metric/cuda/cuda_xentropy_metric.cu | 2 +- src/metric/metric.cpp | 3 + .../cuda/cuda_multiclass_objective.cpp | 2 +- src/objective/multiclass_objective.hpp | 40 ++++++ .../cuda/new_cuda_tree_learner.cpp | 11 +- src/treelearner/serial_tree_learner.cpp | 8 ++ 14 files changed, 167 insertions(+), 47 deletions(-) diff --git a/src/boosting/cuda/cuda_score_updater.cpp b/src/boosting/cuda/cuda_score_updater.cpp index a05f8edb3d10..10026955bbe2 100644 --- a/src/boosting/cuda/cuda_score_updater.cpp +++ b/src/boosting/cuda/cuda_score_updater.cpp @@ -27,7 +27,6 @@ CUDAScoreUpdater::CUDAScoreUpdater(const 
Dataset* data, int num_tree_per_iterati } void CUDAScoreUpdater::InitCUDA(const size_t total_size) { - Log::Warning("allocating cuda_score_ memory with size %d", total_size); AllocateCUDAMemoryOuter(&cuda_score_, total_size, __FILE__, __LINE__); } @@ -51,6 +50,25 @@ inline void CUDAScoreUpdater::AddScore(const TreeLearner* tree_learner, const Tr Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); const size_t offset = static_cast(num_data_) * cur_tree_id; tree_learner->AddPredictionToScore(tree, cuda_score_ + offset); + std::vector class_train_score(num_data_, 0.0f); + CopyFromCUDADeviceToHostOuter(class_train_score.data(), cuda_score_ + offset, num_data_, __FILE__, __LINE__); + const int num_threads = OMP_NUM_THREADS(); + std::vector thread_max_abs_train_score(num_threads, 0.0f); + Threading::For(0, num_data_, 512, + [&thread_max_abs_train_score, &class_train_score] (int thread_index, data_size_t start, data_size_t end) { + for (data_size_t data_index = start; data_index < end; ++data_index) { + if (std::fabs(class_train_score[data_index]) > std::fabs(thread_max_abs_train_score[thread_index])) { + thread_max_abs_train_score[thread_index] = class_train_score[data_index]; + } + } + }); + double max_abs_train_score = 0.0f; + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + if (std::fabs(thread_max_abs_train_score[thread_index]) > std::fabs(max_abs_train_score)) { + max_abs_train_score = thread_max_abs_train_score[thread_index]; + } + } + Log::Warning("class %d max_abs_train_score = %f", cur_tree_id, max_abs_train_score); } inline void CUDAScoreUpdater::AddScore(const Tree* tree, const data_size_t* data_indices, diff --git a/src/metric/cuda/cuda_binary_metric.cu b/src/metric/cuda/cuda_binary_metric.cu index b15df092d48a..75b8a2e5697d 100644 --- a/src/metric/cuda/cuda_binary_metric.cu +++ b/src/metric/cuda/cuda_binary_metric.cu @@ -36,7 +36,7 @@ __global__ void ReduceLossKernel(const double* cuda_sum_loss_buffer, const data_ for (int block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { thread_sum_loss += cuda_sum_loss_buffer[block_index]; } - const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, static_cast(num_blocks)); + const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, blockDim.x); if (threadIdx.x == 0) { *out_loss = sum_loss; } diff --git a/src/metric/cuda/cuda_multiclass_metric.cu b/src/metric/cuda/cuda_multiclass_metric.cu index 23ec424a6e12..621c139a1db7 100644 --- a/src/metric/cuda/cuda_multiclass_metric.cu +++ b/src/metric/cuda/cuda_multiclass_metric.cu @@ -39,7 +39,7 @@ __global__ void ReduceLossKernel_Multiclass(const double* cuda_sum_loss_buffer, for (int block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { thread_sum_loss += cuda_sum_loss_buffer[block_index]; } - const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, static_cast(num_blocks)); + const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, blockDim.x); if (threadIdx.x == 0) { *out_loss = sum_loss; } @@ -48,6 +48,7 @@ __global__ void ReduceLossKernel_Multiclass(const double* cuda_sum_loss_buffer, template void CUDAMulticlassMetric::LaunchEvalKernelInner(const double* score) const { const data_size_t num_blocks = (MulticlassMetric::num_data_ + EVAL_BLOCK_SIZE_MULTICLASS_METRIC - 1) / EVAL_BLOCK_SIZE_MULTICLASS_METRIC; + Log::Warning("num_blocks = %d", num_blocks); if (cuda_weights_ == nullptr) 
{ EvalKernel_MulticlassPointWiseLoss<<>>( score, cuda_label_, cuda_weights_, diff --git a/src/metric/cuda/cuda_multiclass_metric.hpp b/src/metric/cuda/cuda_multiclass_metric.hpp index 21f61fb7102d..ddbe5b057346 100644 --- a/src/metric/cuda/cuda_multiclass_metric.hpp +++ b/src/metric/cuda/cuda_multiclass_metric.hpp @@ -24,7 +24,6 @@ class CUDAMulticlassMetric : public CUDAMetricInterface, public MulticlassMetric std::vector Eval(const double* score, const ObjectiveFunction* objective) const override; inline static double AverageLoss(double sum_loss, double sum_weights) { - // need sqrt the result for RMSE loss return (sum_loss / sum_weights); } diff --git a/src/metric/cuda/cuda_rank_metric.cpp b/src/metric/cuda/cuda_rank_metric.cpp index ca10bfe9ef81..06ac6d9b5368 100644 --- a/src/metric/cuda/cuda_rank_metric.cpp +++ b/src/metric/cuda/cuda_rank_metric.cpp @@ -60,10 +60,14 @@ void CUDANDCGMetric::Init(const Metadata& metadata, data_size_t num_data) { const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK_METRIC - 1) / NUM_QUERY_PER_BLOCK_METRIC; AllocateCUDAMemoryOuter(&cuda_block_dcg_buffer_, static_cast(num_blocks * num_eval_), __FILE__, __LINE__); AllocateCUDAMemoryOuter(&cuda_item_indices_buffer_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_ndcg_result_, static_cast(num_eval_), __FILE__, __LINE__); } std::vector CUDANDCGMetric::Eval(const double* score, const ObjectiveFunction*) const { LaunchEvalKernel(score); + std::vector result(num_eval_, 0.0f); + CopyFromCUDADeviceToHostOuter(result.data(), cuda_ndcg_result_, static_cast(num_eval_), __FILE__, __LINE__); + return result; } } // namespace LightGBM diff --git a/src/metric/cuda/cuda_rank_metric.cu b/src/metric/cuda/cuda_rank_metric.cu index 2aad1b7c6a85..c93acf59276c 100644 --- a/src/metric/cuda/cuda_rank_metric.cu +++ b/src/metric/cuda/cuda_rank_metric.cu @@ -20,14 +20,13 @@ __global__ void EvalKernel_NDCG_SharedMemory( const double* inverse_max_dcgs, const double* label_gains, const double* discount, - const data_size_t max_items_in_query, double* block_ndcg_buffer) { __shared__ uint16_t shared_item_indices[SHARED_MEMORY_SIZE]; __shared__ score_t shared_item_scores[SHARED_MEMORY_SIZE]; __shared__ double shared_eval_result[MAX_NUM_EVAL]; __shared__ data_size_t shared_eval_at[MAX_NUM_EVAL]; __shared__ double shared_shuffle_buffer[32]; - for (data_size_t eval_index = 0; eval_index < num_eval; eval_index += static_cast(blockDim.x)) { + for (data_size_t eval_index = static_cast(threadIdx.x); eval_index < num_eval; eval_index += static_cast(blockDim.x)) { shared_eval_at[eval_index] = eval_at[eval_index]; shared_eval_result[eval_index] = 0.0f; } @@ -35,44 +34,59 @@ __global__ void EvalKernel_NDCG_SharedMemory( const data_size_t start_query_index = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK_METRIC; const data_size_t end_query_index = min(start_query_index + NUM_QUERY_PER_BLOCK_METRIC, num_queries); for (data_size_t query_index = start_query_index; query_index < end_query_index; ++query_index) { - const data_size_t item_start_index = query_boundareis[query_index]; - const data_size_t item_end_index = query_boundareis[query_index + 1]; - const data_size_t num_items = item_end_index - item_start_index; - const double* score_ptr = score + item_start_index; - const label_t* label_ptr = label + item_start_index; const double* inverse_max_dcgs_ptr = inverse_max_dcgs + query_index * num_eval; - for (data_size_t item_index = static_cast(threadIdx.x); item_index < num_items; item_index += static_cast(blockDim.x)) 
{ - shared_item_scores[item_index] = static_cast(score_ptr[item_index]); - } - __syncthreads(); - if (MAX_ITEM_GREATER_THAN_1024) { - if (num_items > 1024) { - BitonicArgSort_2048(shared_item_scores, shared_item_indices); - } else { - BitonicArgSort_1024(shared_item_scores, shared_item_indices, static_cast(num_items)); + if (inverse_max_dcgs_ptr[0] < 0.0f) { + for (data_size_t eval_index = static_cast(threadIdx.x); eval_index < num_eval; eval_index += static_cast(blockDim.x)) { + shared_eval_result[eval_index] += 1.0f; } } else { - BitonicArgSort_1024(shared_item_scores, shared_item_indices, static_cast(num_items)); - } - __syncthreads(); - double thread_eval = 0.0f; - data_size_t item_index = static_cast(threadIdx.x); - for (data_size_t eval_index = 0; eval_index < num_eval; ++eval_index) { - data_size_t cur_eval_pos = min(num_items, shared_eval_at[eval_index]); - const double* discount_ptr = discount + eval_index * max_items_in_query; - for (; item_index < cur_eval_pos; item_index += static_cast(blockDim.x)) { - thread_eval += label_ptr[shared_item_indices[item_index]] * discount_ptr[item_index]; + const data_size_t item_start_index = query_boundareis[query_index]; + const data_size_t item_end_index = query_boundareis[query_index + 1]; + const data_size_t num_items = item_end_index - item_start_index; + const double* score_ptr = score + item_start_index; + const label_t* label_ptr = label + item_start_index; + for (data_size_t item_index = static_cast(threadIdx.x); item_index < num_items; item_index += static_cast(blockDim.x)) { + shared_item_scores[item_index] = static_cast(score_ptr[item_index]); + shared_item_indices[item_index] = item_index; + } + for (data_size_t item_index = num_items + static_cast(threadIdx.x); item_index < SHARED_MEMORY_SIZE; item_index += static_cast(blockDim.x)) { + shared_item_scores[item_index] = kMinScore; + shared_item_indices[item_index] = item_index; } __syncthreads(); - double block_eval = ShuffleReduceSum(thread_eval, shared_shuffle_buffer, blockDim.x); - if (USE_QUERY_WEIGHT) { - block_eval *= static_cast(query_weights[query_index]); + if (MAX_ITEM_GREATER_THAN_1024) { + if (num_items > 1024) { + for (data_size_t item_index = num_items + static_cast(threadIdx.x); item_index < SHARED_MEMORY_SIZE; item_index += static_cast(blockDim.x)) { + shared_item_scores[item_index] = kMinScore; + } + __syncthreads(); + BitonicArgSort_2048(shared_item_scores, shared_item_indices); + } else { + BitonicArgSort_1024(shared_item_scores, shared_item_indices, static_cast(num_items)); + } + } else { + BitonicArgSort_1024(shared_item_scores, shared_item_indices, static_cast(num_items)); } - if (threadIdx.x == 0) { - shared_eval_result[eval_index] += block_eval * inverse_max_dcgs_ptr[eval_index]; + __syncthreads(); + double thread_eval = 0.0f; + data_size_t item_index = static_cast(threadIdx.x); + for (data_size_t eval_index = 0; eval_index < num_eval; ++eval_index) { + data_size_t cur_eval_pos = min(num_items, shared_eval_at[eval_index]); + for (; item_index < cur_eval_pos; item_index += static_cast(blockDim.x)) { + const int data_label = static_cast(label_ptr[shared_item_indices[item_index]]); + thread_eval += label_gains[data_label] * discount[item_index]; + } + __syncthreads(); + double block_eval = ShuffleReduceSum(thread_eval, shared_shuffle_buffer, blockDim.x); + if (USE_QUERY_WEIGHT) { + block_eval *= static_cast(query_weights[query_index]); + } + if (threadIdx.x == 0) { + shared_eval_result[eval_index] += block_eval * inverse_max_dcgs_ptr[eval_index]; + } } + 
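// (descriptive note: the barrier just below keeps the whole block in step before the next
//  query's scores and indices overwrite the shared-memory buffers that were read above)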
__syncthreads(); } - __syncthreads(); } for (data_size_t eval_index = static_cast(threadIdx.x); eval_index < num_eval; eval_index += static_cast(blockDim.x)) { block_ndcg_buffer[eval_index * gridDim.x + blockIdx.x] = shared_eval_result[eval_index]; @@ -91,7 +105,6 @@ __global__ void EvalKernel_NDCG_GlobalMemory( const double* inverse_max_dcgs, const double* label_gains, const double* discount, - const data_size_t max_items_in_query, double* block_ndcg_buffer, const data_size_t* cuda_item_indices_buffer) { __shared__ double shared_eval_result[MAX_NUM_EVAL]; @@ -115,9 +128,13 @@ __global__ void EvalKernel_NDCG_GlobalMemory( data_size_t item_index = static_cast(threadIdx.x); for (data_size_t eval_index = 0; eval_index < num_eval; ++eval_index) { data_size_t cur_eval_pos = min(num_items, shared_eval_at[eval_index]); - const double* discount_ptr = discount + eval_index * max_items_in_query; for (; item_index < cur_eval_pos; item_index += static_cast(blockDim.x)) { - thread_eval += label_ptr[sorted_item_indices_ptr[item_index]] * discount_ptr[item_index]; + const uint16_t sorted_item_index = sorted_item_indices_ptr[item_index]; + if (static_cast(sorted_item_index) >= num_items) { + printf("error sorted_item_index = %d, num_items = %d\n", sorted_item_index, num_items); + } + const int data_label = static_cast(label_ptr[sorted_item_indices_ptr[item_index]]); + thread_eval += label_gains[data_label] * discount[item_index]; } __syncthreads(); double block_eval = ShuffleReduceSum(thread_eval, shared_shuffle_buffer, blockDim.x); @@ -135,6 +152,26 @@ __global__ void EvalKernel_NDCG_GlobalMemory( } } +__global__ void ReduceNDCGFromBlocks( + const double* block_ndcg_buffer, + const data_size_t num_eval, + const int num_blocks, + double* ndcg_result, + const double sum_query_weights) { + __shared__ double shared_mem_buffer[32]; + const data_size_t eval_index = static_cast(blockIdx.x); + const double* block_ndcg_buffer_ptr = block_ndcg_buffer + eval_index * num_blocks; + double thread_sum = 0.0f; + for (data_size_t block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { + thread_sum += block_ndcg_buffer_ptr[block_index]; + } + __syncthreads(); + const double block_sum = ShuffleReduceSum(thread_sum, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + ndcg_result[eval_index] = block_sum / sum_query_weights; + } +} + #define EvalKernel_NDCG_ARGS \ score, \ cuda_label_, \ @@ -146,7 +183,6 @@ __global__ void EvalKernel_NDCG_GlobalMemory( cuda_inverse_max_dcgs_, \ cuda_label_gain_, \ cuda_discount_, \ - max_items_in_query_, \ cuda_block_dcg_buffer_ void CUDANDCGMetric::LaunchEvalKernel(const double* score) const { @@ -218,6 +254,12 @@ void CUDANDCGMetric::LaunchEvalKernel(const double* score) const { } } } + ReduceNDCGFromBlocks<<>>( + cuda_block_dcg_buffer_, + num_eval_, + num_blocks, + cuda_ndcg_result_, + sum_query_weights_); } } // namespace LightGBM diff --git a/src/metric/cuda/cuda_rank_metric.hpp b/src/metric/cuda/cuda_rank_metric.hpp index d862e07af41d..5058fe9117b3 100644 --- a/src/metric/cuda/cuda_rank_metric.hpp +++ b/src/metric/cuda/cuda_rank_metric.hpp @@ -31,12 +31,12 @@ class CUDANDCGMetric : public CUDAMetricInterface, public NDCGMetric { const label_t* cuda_weights_; const data_size_t* cuda_query_boundaries_; const label_t* cuda_query_weights_; - const double* cuda_block_reduce_buffer_; data_size_t* cuda_eval_at_; double* cuda_inverse_max_dcgs_; double* cuda_label_gain_; double* cuda_discount_; double* cuda_block_dcg_buffer_; + double* 
cuda_ndcg_result_; data_size_t* cuda_item_indices_buffer_; int max_items_in_query_aligned_; int max_items_in_query_; diff --git a/src/metric/cuda/cuda_regression_metric.cu b/src/metric/cuda/cuda_regression_metric.cu index 2b5a0f12ed0f..f66ce058a766 100644 --- a/src/metric/cuda/cuda_regression_metric.cu +++ b/src/metric/cuda/cuda_regression_metric.cu @@ -39,7 +39,7 @@ __global__ void ReduceLossKernel_Regression(const double* cuda_sum_loss_buffer, for (int block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { thread_sum_loss += cuda_sum_loss_buffer[block_index]; } - const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, static_cast(num_blocks)); + const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, blockDim.x); if (threadIdx.x == 0) { *out_loss = sum_loss; } diff --git a/src/metric/cuda/cuda_xentropy_metric.cu b/src/metric/cuda/cuda_xentropy_metric.cu index 05aa875c65fa..793efa92e421 100644 --- a/src/metric/cuda/cuda_xentropy_metric.cu +++ b/src/metric/cuda/cuda_xentropy_metric.cu @@ -60,7 +60,7 @@ __global__ void ReduceLossKernel_CrossEntropy(const double* cuda_sum_loss_buffer for (int block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { thread_sum_loss += cuda_sum_loss_buffer[block_index]; } - const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, static_cast(num_blocks)); + const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, blockDim.x); if (threadIdx.x == 0) { *out_loss = sum_loss; } diff --git a/src/metric/metric.cpp b/src/metric/metric.cpp index ffc4832d7a2b..f9fb8c7efe19 100644 --- a/src/metric/metric.cpp +++ b/src/metric/metric.cpp @@ -15,6 +15,7 @@ #include "cuda/cuda_regression_metric.hpp" #include "cuda/cuda_multiclass_metric.hpp" #include "cuda/cuda_xentropy_metric.hpp" +#include "cuda/cuda_rank_metric.hpp" namespace LightGBM { @@ -46,6 +47,8 @@ Metric* Metric::CreateMetric(const std::string& type, const Config& config) { return new CUDAMultiSoftmaxLoglossMetric(config); } else if (type == std::string("multi_error")) { return new CUDAMultiErrorMetric(config); + } else if (type == std::string("ndcg")) { + return new CUDANDCGMetric(config); } else if (type == std::string("cross_entropy")) { return new CUDACrossEntropyMetric(config); } else if (type == std::string("cross_entropy_lambda")) { diff --git a/src/objective/cuda/cuda_multiclass_objective.cpp b/src/objective/cuda/cuda_multiclass_objective.cpp index 3fd303411a9d..764c81ae0eb6 100644 --- a/src/objective/cuda/cuda_multiclass_objective.cpp +++ b/src/objective/cuda/cuda_multiclass_objective.cpp @@ -65,7 +65,7 @@ void CUDAMulticlassSoftmax::GetGradients(const double* score, score_t* gradients min_abs_hessian = thread_abs_min_hessian[thread_index]; } } - Log::Warning("class %d max_abs_gradient = %f, max_abs_hessian = %f", class_index, max_abs_gradient, max_abs_hessian); + Log::Warning("class %d max_abs_gradient = %f, max_abs_hessian = %f, min_abs_hessian = %f", class_index, max_abs_gradient, max_abs_hessian, min_abs_hessian); } } diff --git a/src/objective/multiclass_objective.hpp b/src/objective/multiclass_objective.hpp index 5379caec1199..07db1dfeb963 100644 --- a/src/objective/multiclass_objective.hpp +++ b/src/objective/multiclass_objective.hpp @@ -127,6 +127,46 @@ class MulticlassSoftmax: public ObjectiveFunction { } } } + for (int class_index = 0; class_index < num_class_; ++class_index) { + const size_t offset = static_cast(class_index * 
num_data_); + const score_t* host_gradients_ptr = gradients + offset; + const score_t* host_hessians_ptr = hessians + offset; + const int num_threads = OMP_NUM_THREADS(); + std::vector thread_abs_max_gradient(num_threads, 0.0f); + std::vector thread_abs_max_hessian(num_threads, 0.0f); + std::vector thread_abs_min_hessian(num_threads, std::numeric_limits::infinity()); + Threading::For(0, num_data_, 512, + [&thread_abs_max_gradient, &thread_abs_max_hessian, &thread_abs_min_hessian, host_gradients_ptr, host_hessians_ptr] (int thread_index, data_size_t start, data_size_t end) { + for (data_size_t index = start; index < end; ++index) { + const score_t gradient = host_gradients_ptr[index]; + const score_t hessian = host_hessians_ptr[index]; + if (std::fabs(gradient) > std::fabs(thread_abs_max_gradient[thread_index])) { + thread_abs_max_gradient[thread_index] = gradient; + } + if (std::fabs(hessian) > std::fabs(thread_abs_max_hessian[thread_index])) { + thread_abs_max_hessian[thread_index] = hessian; + } + if (std::fabs(hessian) < std::fabs(thread_abs_min_hessian[thread_index])) { + thread_abs_min_hessian[thread_index] = hessian; + } + } + }); + double max_abs_gradient = 0.0f; + double max_abs_hessian = 0.0f; + double min_abs_hessian = std::numeric_limits::infinity(); + for (int thread_index = 0; thread_index < num_threads; ++thread_index) { + if (std::fabs(thread_abs_max_gradient[thread_index]) > std::fabs(max_abs_gradient)) { + max_abs_gradient = thread_abs_max_gradient[thread_index]; + } + if (std::fabs(thread_abs_max_hessian[thread_index]) > std::fabs(max_abs_hessian)) { + max_abs_hessian = thread_abs_max_hessian[thread_index]; + } + if (std::fabs(thread_abs_min_hessian[thread_index]) < std::fabs(min_abs_hessian)) { + min_abs_hessian = thread_abs_min_hessian[thread_index]; + } + } + Log::Warning("class %d max_abs_gradient = %f, max_abs_hessian = %f, min_abs_hessian = %f", class_index, max_abs_gradient, max_abs_hessian, min_abs_hessian); + } } void ConvertOutput(const double* input, double* output) const override { diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 2addbdd75435..6335ba98a519 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -212,9 +212,14 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, Log::Warning("find best split time from all leaves %f", find_best_split_from_all_leaves_time); Log::Warning("split data indices time %f", split_data_indices_time);*/ tree->ToHost(); - /*for (int leaf_index = 0; leaf_index < tree->num_leaves(); ++leaf_index) { - Log::Warning("tree->LeafOutput(%d) = %f", leaf_index, tree->LeafOutput(leaf_index)); - }*/ + double max_abs_leaf_output = 0.0f; + for (int leaf_index = 0; leaf_index < tree->num_leaves(); ++leaf_index) { + //Log::Warning("leaf_index %d leaf_value %f", leaf_index, tree->LeafOutput(leaf_index)); + if (std::fabs(tree->LeafOutput(leaf_index)) > std::fabs(max_abs_leaf_output)) { + max_abs_leaf_output = tree->LeafOutput(leaf_index); + } + } + Log::Warning("max_abs_leaf_output = %f", max_abs_leaf_output); return tree.release(); } diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 5fd872d61eeb..28be19d1b714 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -205,6 +205,14 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians } Log::Debug("Trained a tree with leaves = %d and 
depth = %d", tree->num_leaves(), cur_depth); + double max_abs_leaf_output = 0.0f; + for (int leaf_index = 0; leaf_index < tree->num_leaves(); ++leaf_index) { + Log::Warning("leaf_index %d leaf_value %f", leaf_index, tree->LeafOutput(leaf_index)); + if (std::fabs(tree->LeafOutput(leaf_index)) > std::fabs(max_abs_leaf_output)) { + max_abs_leaf_output = tree->LeafOutput(leaf_index); + } + } + Log::Warning("max_abs_leaf_output = %f", max_abs_leaf_output); return tree.release(); } From bb997d01bdcad970d304ce8131267278901df561 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 10 Sep 2021 13:36:46 +0000 Subject: [PATCH 067/166] add support for customized objective in CUDA --- src/boosting/gbdt.cpp | 12 ++++++++++-- src/boosting/gbdt.h | 6 +++++- src/metric/cuda/cuda_binary_metric.cpp | 4 +++- src/metric/cuda/cuda_multiclass_metric.cpp | 4 +++- src/metric/cuda/cuda_regression_metric.cpp | 4 +++- src/metric/cuda/cuda_xentropy_metric.cpp | 12 +++++++++--- 6 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 221770be68d8..0cb1a7db49fb 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -392,6 +392,14 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { Boosting(); gradients = gradients_pointer_; hessians = hessians_pointer_; + } else { + if (config_->device_type == std::string("cuda")) { + const size_t total_size = static_cast(num_data_ * num_class_); + CopyFromHostToCUDADeviceOuter(gradients_pointer_, gradients, total_size, __FILE__, __LINE__); + CopyFromHostToCUDADeviceOuter(hessians_pointer_, hessians, total_size, __FILE__, __LINE__); + gradients = gradients_pointer_; + hessians = hessians_pointer_; + } } // bagging logic Bagging(iter_); @@ -527,12 +535,12 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { } std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const { - /*if (config_->device_type == std::string("cuda")) { + if (config_->device_type == std::string("cuda")) { std::vector tmp_score(num_data * num_class_, 0.0f); CopyFromCUDADeviceToHostOuter(tmp_score.data(), score, static_cast(num_data * num_class_), __FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); return metric->Eval(tmp_score.data(), objective_function_); - } else*/ { + } else { return metric->Eval(score, objective_function_); } } diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 756dbe7dd18d..1103b08b7bca 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -401,7 +401,11 @@ class GBDT : public GBDTBase { int num_tree_per_iteration() const override { return num_tree_per_iteration_; } virtual std::function GetCUDAConvertOutputFunc() const { - return objective_function_->GetCUDAConvertOutputFunc(); + if (objective_function_ != nullptr) { + return objective_function_->GetCUDAConvertOutputFunc(); + } else { + return [] (data_size_t, const double*, double*) {}; + } } protected: diff --git a/src/metric/cuda/cuda_binary_metric.cpp b/src/metric/cuda/cuda_binary_metric.cpp index df4d4f975369..efec22a74560 100644 --- a/src/metric/cuda/cuda_binary_metric.cpp +++ b/src/metric/cuda/cuda_binary_metric.cpp @@ -29,7 +29,9 @@ void CUDABinaryMetric::Init(const Metadata& metadat template std::vector CUDABinaryMetric::Eval(const double* score, const ObjectiveFunction* objective) const { double sum_loss = 0.0f; - objective->GetCUDAConvertOutputFunc()(this->num_data_, score, cuda_score_convert_buffer_); + if (objective != nullptr) { + 
objective->GetCUDAConvertOutputFunc()(this->num_data_, score, cuda_score_convert_buffer_); + } LaunchEvalKernel(cuda_score_convert_buffer_); CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); return std::vector(1, sum_loss / this->sum_weights_); diff --git a/src/metric/cuda/cuda_multiclass_metric.cpp b/src/metric/cuda/cuda_multiclass_metric.cpp index 960b89674dbb..136c8f6de848 100644 --- a/src/metric/cuda/cuda_multiclass_metric.cpp +++ b/src/metric/cuda/cuda_multiclass_metric.cpp @@ -28,7 +28,9 @@ void CUDAMulticlassMetric::Init(const Metadata& met template std::vector CUDAMulticlassMetric::Eval(const double* score, const ObjectiveFunction* objective) const { double sum_loss = 0.0f; - objective->GetCUDAConvertOutputFunc()(this->num_data_, score, cuda_score_convert_buffer_); + if (objective != nullptr) { + objective->GetCUDAConvertOutputFunc()(this->num_data_, score, cuda_score_convert_buffer_); + } LaunchEvalKernel(cuda_score_convert_buffer_); CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); return std::vector(1, CUDAPointWiseLossCalculator::AverageLoss(sum_loss, this->sum_weights_)); diff --git a/src/metric/cuda/cuda_regression_metric.cpp b/src/metric/cuda/cuda_regression_metric.cpp index 325f6e214351..91af8b4c1e64 100644 --- a/src/metric/cuda/cuda_regression_metric.cpp +++ b/src/metric/cuda/cuda_regression_metric.cpp @@ -28,7 +28,9 @@ void CUDARegressionMetric::Init(const Metadata& met template std::vector CUDARegressionMetric::Eval(const double* score, const ObjectiveFunction* objective) const { double sum_loss = 0.0f; - objective->GetCUDAConvertOutputFunc()(this->num_data_, score, cuda_score_convert_buffer_); + if (objective != nullptr) { + objective->GetCUDAConvertOutputFunc()(this->num_data_, score, cuda_score_convert_buffer_); + } LaunchEvalKernel(cuda_score_convert_buffer_); CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); return std::vector(1, CUDAPointWiseLossCalculator::AverageLoss(sum_loss, this->sum_weights_)); diff --git a/src/metric/cuda/cuda_xentropy_metric.cpp b/src/metric/cuda/cuda_xentropy_metric.cpp index 728b6743dd5c..7b7d69ace57d 100644 --- a/src/metric/cuda/cuda_xentropy_metric.cpp +++ b/src/metric/cuda/cuda_xentropy_metric.cpp @@ -24,7 +24,9 @@ void CUDACrossEntropyMetric::Init(const Metadata& metadata, data_size_t num_data std::vector CUDACrossEntropyMetric::Eval(const double* score, const ObjectiveFunction* objective) const { double sum_loss = 0.0f; - objective->GetCUDAConvertOutputFunc()(num_data_, score, cuda_score_convert_buffer_); + if (objective != nullptr) { + objective->GetCUDAConvertOutputFunc()(num_data_, score, cuda_score_convert_buffer_); + } LaunchEvalKernel(cuda_score_convert_buffer_); CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); return std::vector(1, sum_loss / sum_weights_); @@ -46,7 +48,9 @@ void CUDACrossEntropyLambdaMetric::Init(const Metadata& metadata, data_size_t nu } std::vector CUDACrossEntropyLambdaMetric::Eval(const double* score, const ObjectiveFunction* objective) const { - objective->GetCUDAConvertOutputFunc()(num_data_, score, cuda_score_convert_buffer_); + if (objective != nullptr) { + objective->GetCUDAConvertOutputFunc()(num_data_, score, cuda_score_convert_buffer_); + } LaunchEvalKernel(cuda_score_convert_buffer_); double sum_loss = 0.0f; CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); @@ -69,7 +73,9 @@ void CUDAKullbackLeiblerDivergence::Init(const Metadata& metadata, 
data_size_t n } std::vector CUDAKullbackLeiblerDivergence::Eval(const double* score, const ObjectiveFunction* objective) const { - objective->GetCUDAConvertOutputFunc()(num_data_, score, cuda_score_convert_buffer_); + if (objective != nullptr) { + objective->GetCUDAConvertOutputFunc()(num_data_, score, cuda_score_convert_buffer_); + } LaunchEvalKernel(cuda_score_convert_buffer_); double sum_loss = 0.0f; CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); From 17b78d13de89ab6ed60ec6bba6f98ba7a221b200 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 10 Sep 2021 14:51:09 +0000 Subject: [PATCH 068/166] complete multiclass ova for CUDA --- src/boosting/gbdt.cpp | 6 +++++ src/objective/cuda/cuda_binary_objective.cpp | 16 ++++++++++++-- src/objective/cuda/cuda_binary_objective.cu | 18 +++++++++++++++ src/objective/cuda/cuda_binary_objective.hpp | 7 +++++- .../cuda/cuda_multiclass_objective.cpp | 22 ++++++++++++++++++- .../cuda/cuda_multiclass_objective.hpp | 18 +++++++++++++++ 6 files changed, 83 insertions(+), 4 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 0cb1a7db49fb..a1a04e0fa40e 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -124,6 +124,12 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective gradients_pointer_ = gradients_.data(); hessians_pointer_ = hessians_.data(); } + } else { + if (config_->device_type == std::string("cuda")) { + size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + AllocateCUDAMemoryOuter(&gradients_pointer_, total_size, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&hessians_pointer_, total_size, __FILE__, __LINE__); + } } // get max feature index max_feature_idx_ = train_data_->num_total_features() - 1; diff --git a/src/objective/cuda/cuda_binary_objective.cpp b/src/objective/cuda/cuda_binary_objective.cpp index 51e8e24436ad..a2465caf2ad9 100644 --- a/src/objective/cuda/cuda_binary_objective.cpp +++ b/src/objective/cuda/cuda_binary_objective.cpp @@ -10,8 +10,11 @@ namespace LightGBM { +CUDABinaryLogloss::CUDABinaryLogloss(const Config& config): +BinaryLogloss(config), ova_class_id_(-1) {} + CUDABinaryLogloss::CUDABinaryLogloss(const Config& config, const int ova_class_id): -BinaryLogloss(config), ova_class_id_(ova_class_id) {} +BinaryLogloss(config, [ova_class_id](label_t label) { return static_cast(label) == ova_class_id; }), ova_class_id_(ova_class_id) {} CUDABinaryLogloss::CUDABinaryLogloss(const std::vector& strs): BinaryLogloss(strs) {} @@ -19,7 +22,15 @@ CUDABinaryLogloss::~CUDABinaryLogloss() {} void CUDABinaryLogloss::Init(const Metadata& metadata, data_size_t num_data) { BinaryLogloss::Init(metadata, num_data); - cuda_label_ = metadata.cuda_metadata()->cuda_label(); + if (ova_class_id_ == -1) { + cuda_label_ = metadata.cuda_metadata()->cuda_label(); + cuda_ova_label_ = nullptr; + } else { + Log::Warning("converting cuda labels with ova_class_id_ = %d", ova_class_id_); + InitCUDAMemoryFromHostMemoryOuter(&cuda_ova_label_, metadata.cuda_metadata()->cuda_label(), static_cast(num_data), __FILE__, __LINE__); + LaunchResetOVACUDALableKernel(); + cuda_label_ = cuda_ova_label_; + } cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); AllocateCUDAMemoryOuter(&cuda_boost_from_score_, 1, __FILE__, __LINE__); SetCUDAMemoryOuter(cuda_boost_from_score_, 0, 1, __FILE__, __LINE__); @@ -40,6 +51,7 @@ double CUDABinaryLogloss::BoostFromScore(int) const { SynchronizeCUDADeviceOuter(__FILE__, __LINE__); double boost_from_score = 0.0f; 
CopyFromCUDADeviceToHostOuter(&boost_from_score, cuda_boost_from_score_, 1, __FILE__, __LINE__); + Log::Warning("boost_from_score = %f", boost_from_score); return boost_from_score; } diff --git a/src/objective/cuda/cuda_binary_objective.cu b/src/objective/cuda/cuda_binary_objective.cu index 590e20ea8c00..f29dacbe1ca0 100644 --- a/src/objective/cuda/cuda_binary_objective.cu +++ b/src/objective/cuda/cuda_binary_objective.cu @@ -142,6 +142,24 @@ void CUDABinaryLogloss::LaunchConvertOutputCUDAKernel(const data_size_t num_data ConvertOutputCUDAKernel_BinaryLogloss<<>>(sigmoid_, num_data, input, output); } +__global__ void ResetOVACUDALableKernel( + const int ova_class_id, + const data_size_t num_data, + label_t* cuda_label) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data) { + const int int_label = static_cast(cuda_label[data_index]); + cuda_label[data_index] = (int_label == ova_class_id ? 1.0f : 0.0f); + } +} + +void CUDABinaryLogloss::LaunchResetOVACUDALableKernel() const { + Log::Warning("before LaunchResetOVACUDALableKernel, ova_class_id = %d", ova_class_id_); + const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_BINARY - 1) / GET_GRADIENTS_BLOCK_SIZE_BINARY; + ResetOVACUDALableKernel<<>>(ova_class_id_, num_data_, cuda_ova_label_); + Log::Warning("after LaunchResetOVACUDALableKernel"); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/objective/cuda/cuda_binary_objective.hpp b/src/objective/cuda/cuda_binary_objective.hpp index 7cec5637e73d..f7397593353f 100644 --- a/src/objective/cuda/cuda_binary_objective.hpp +++ b/src/objective/cuda/cuda_binary_objective.hpp @@ -20,7 +20,9 @@ namespace LightGBM { class CUDABinaryLogloss : public CUDAObjectiveInterface, public BinaryLogloss { public: - explicit CUDABinaryLogloss(const Config& config, const int ova_class_id = -1); + explicit CUDABinaryLogloss(const Config& config); + + explicit CUDABinaryLogloss(const Config& config, const int ova_class_id); explicit CUDABinaryLogloss(const std::vector& strs); @@ -47,8 +49,11 @@ class CUDABinaryLogloss : public CUDAObjectiveInterface, public BinaryLogloss { void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; + void LaunchResetOVACUDALableKernel() const; + // CUDA memory, held by other objects const label_t* cuda_label_; + label_t* cuda_ova_label_; const label_t* cuda_weights_; // CUDA memory, held by this object diff --git a/src/objective/cuda/cuda_multiclass_objective.cpp b/src/objective/cuda/cuda_multiclass_objective.cpp index 764c81ae0eb6..5a5e5d1be99d 100644 --- a/src/objective/cuda/cuda_multiclass_objective.cpp +++ b/src/objective/cuda/cuda_multiclass_objective.cpp @@ -76,7 +76,7 @@ void CUDAMulticlassSoftmax::ConvertOutputCUDA(const data_size_t num_data, const CUDAMulticlassOVA::CUDAMulticlassOVA(const Config& config) { num_class_ = config.num_class; for (int i = 0; i < num_class_; ++i) { - binary_loss_.emplace_back(new CUDABinaryLogloss(config, i)); + cuda_binary_loss_.emplace_back(new CUDABinaryLogloss(config, i)); } sigmoid_ = config.sigmoid; } @@ -85,4 +85,24 @@ CUDAMulticlassOVA::CUDAMulticlassOVA(const std::vector& strs): Mult CUDAMulticlassOVA::~CUDAMulticlassOVA() {} +void CUDAMulticlassOVA::Init(const Metadata& metadata, data_size_t num_data) { + num_data_ = num_data; + for (int i = 0; i < num_class_; ++i) { + cuda_binary_loss_[i]->Init(metadata, num_data); + } +} + +void CUDAMulticlassOVA::GetGradients(const double* score, score_t* gradients, 
score_t* hessians) const { + for (int i = 0; i < num_class_; ++i) { + int64_t offset = static_cast(num_data_) * i; + cuda_binary_loss_[i]->GetGradients(score + offset, gradients + offset, hessians + offset); + } +} + +void CUDAMulticlassOVA::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { + for (int i = 0; i < num_class_; ++i) { + cuda_binary_loss_[i]->ConvertOutputCUDA(num_data, input + i * num_data, output + i * num_data); + } +} + } // namespace LightGBM diff --git a/src/objective/cuda/cuda_multiclass_objective.hpp b/src/objective/cuda/cuda_multiclass_objective.hpp index d6e75c5f9129..994fdcb0a696 100644 --- a/src/objective/cuda/cuda_multiclass_objective.hpp +++ b/src/objective/cuda/cuda_multiclass_objective.hpp @@ -54,7 +54,25 @@ class CUDAMulticlassOVA: public CUDAObjectiveInterface, public MulticlassOVA { explicit CUDAMulticlassOVA(const std::vector& strs); + void Init(const Metadata& metadata, data_size_t num_data) override; + + void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; + + void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; + + double BoostFromScore(int class_id) const override { + Log::Warning("BoostFromScore class_id = %d", class_id); + return cuda_binary_loss_[class_id]->BoostFromScore(0); + } + + bool ClassNeedTrain(int class_id) const override { + return cuda_binary_loss_[class_id]->ClassNeedTrain(0); + } + ~CUDAMulticlassOVA(); + + private: + std::vector> cuda_binary_loss_; }; } // namespace LightGBM From 8537b8c4205e5d00af03ac0f199a36b571e3e059 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 13 Sep 2021 03:24:47 +0000 Subject: [PATCH 069/166] separate cuda tree learner --- src/boosting/gbdt.cpp | 29 +- src/boosting/gbdt.h | 4 + src/c_api.cpp | 19 +- src/metric/cuda/cuda_binary_metric.cpp | 116 ---- src/metric/cuda/cuda_binary_metric.cu | 108 --- src/metric/cuda/cuda_binary_metric.hpp | 134 ---- src/metric/cuda/cuda_metric.hpp | 19 - src/metric/cuda/cuda_multiclass_metric.cpp | 115 --- src/metric/cuda/cuda_multiclass_metric.cu | 416 ----------- src/metric/cuda/cuda_multiclass_metric.hpp | 139 ---- src/metric/cuda/cuda_rank_metric.cpp | 73 -- src/metric/cuda/cuda_rank_metric.cu | 265 ------- src/metric/cuda/cuda_rank_metric.hpp | 48 -- src/metric/cuda/cuda_regression_metric.cpp | 61 -- src/metric/cuda/cuda_regression_metric.cu | 128 ---- src/metric/cuda/cuda_regression_metric.hpp | 258 ------- src/metric/cuda/cuda_xentropy_metric.cpp | 85 --- src/metric/cuda/cuda_xentropy_metric.cu | 149 ---- src/metric/cuda/cuda_xentropy_metric.hpp | 77 --- src/metric/metric.cpp | 144 ++-- src/objective/cuda/cuda_binary_objective.cpp | 64 -- src/objective/cuda/cuda_binary_objective.cu | 165 ----- src/objective/cuda/cuda_binary_objective.hpp | 68 -- .../cuda/cuda_multiclass_objective.cpp | 108 --- .../cuda/cuda_multiclass_objective.cu | 104 --- .../cuda/cuda_multiclass_objective.hpp | 80 --- .../cuda/cuda_objective_function.cpp | 12 - src/objective/cuda/cuda_rank_objective.cpp | 102 --- src/objective/cuda/cuda_rank_objective.cu | 652 ------------------ src/objective/cuda/cuda_rank_objective.hpp | 79 --- .../cuda/cuda_regression_objective.cpp | 193 ------ .../cuda/cuda_regression_objective.cu | 616 ----------------- .../cuda/cuda_regression_objective.hpp | 272 -------- .../cuda/cuda_xentropy_objective.cpp | 62 -- src/objective/cuda/cuda_xentropy_objective.cu | 144 ---- .../cuda/cuda_xentropy_objective.hpp | 85 --- 
src/objective/objective_function.cpp | 114 +-- 37 files changed, 105 insertions(+), 5202 deletions(-) delete mode 100644 src/metric/cuda/cuda_binary_metric.cpp delete mode 100644 src/metric/cuda/cuda_binary_metric.cu delete mode 100644 src/metric/cuda/cuda_binary_metric.hpp delete mode 100644 src/metric/cuda/cuda_metric.hpp delete mode 100644 src/metric/cuda/cuda_multiclass_metric.cpp delete mode 100644 src/metric/cuda/cuda_multiclass_metric.cu delete mode 100644 src/metric/cuda/cuda_multiclass_metric.hpp delete mode 100644 src/metric/cuda/cuda_rank_metric.cpp delete mode 100644 src/metric/cuda/cuda_rank_metric.cu delete mode 100644 src/metric/cuda/cuda_rank_metric.hpp delete mode 100644 src/metric/cuda/cuda_regression_metric.cpp delete mode 100644 src/metric/cuda/cuda_regression_metric.cu delete mode 100644 src/metric/cuda/cuda_regression_metric.hpp delete mode 100644 src/metric/cuda/cuda_xentropy_metric.cpp delete mode 100644 src/metric/cuda/cuda_xentropy_metric.cu delete mode 100644 src/metric/cuda/cuda_xentropy_metric.hpp delete mode 100644 src/objective/cuda/cuda_binary_objective.cpp delete mode 100644 src/objective/cuda/cuda_binary_objective.cu delete mode 100644 src/objective/cuda/cuda_binary_objective.hpp delete mode 100644 src/objective/cuda/cuda_multiclass_objective.cpp delete mode 100644 src/objective/cuda/cuda_multiclass_objective.cu delete mode 100644 src/objective/cuda/cuda_multiclass_objective.hpp delete mode 100644 src/objective/cuda/cuda_objective_function.cpp delete mode 100644 src/objective/cuda/cuda_rank_objective.cpp delete mode 100644 src/objective/cuda/cuda_rank_objective.cu delete mode 100644 src/objective/cuda/cuda_rank_objective.hpp delete mode 100644 src/objective/cuda/cuda_regression_objective.cpp delete mode 100644 src/objective/cuda/cuda_regression_objective.cu delete mode 100644 src/objective/cuda/cuda_regression_objective.hpp delete mode 100644 src/objective/cuda/cuda_xentropy_objective.cpp delete mode 100644 src/objective/cuda/cuda_xentropy_objective.cu delete mode 100644 src/objective/cuda/cuda_xentropy_objective.hpp diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 7fedc2355eb3..c0695b74bcde 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -396,16 +396,15 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true); } Boosting(); + } + if (config_->device_type == std::string("cuda")) { + const size_t total_size = static_cast(num_data_ * num_class_); + CopyFromHostToCUDADeviceOuter(gradients_pointer_, gradients, total_size, __FILE__, __LINE__); + CopyFromHostToCUDADeviceOuter(hessians_pointer_, hessians, total_size, __FILE__, __LINE__); + } + if (gradients == nullptr || hessians == nullptr) { gradients = gradients_pointer_; hessians = hessians_pointer_; - } else { - if (config_->device_type == std::string("cuda")) { - const size_t total_size = static_cast(num_data_ * num_class_); - CopyFromHostToCUDADeviceOuter(gradients_pointer_, gradients, total_size, __FILE__, __LINE__); - CopyFromHostToCUDADeviceOuter(hessians_pointer_, hessians, total_size, __FILE__, __LINE__); - gradients = gradients_pointer_; - hessians = hessians_pointer_; - } } // bagging logic Bagging(iter_); @@ -542,10 +541,10 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const { if (config_->device_type == std::string("cuda")) { - std::vector 
tmp_score(num_data * num_class_, 0.0f); - CopyFromCUDADeviceToHostOuter(tmp_score.data(), score, static_cast(num_data * num_class_), __FILE__, __LINE__); + metric_temp_score_.resize(num_data * num_class_, 0.0f); + CopyFromCUDADeviceToHostOuter(metric_temp_score_.data(), score, static_cast(num_data * num_class_), __FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - return metric->Eval(tmp_score.data(), objective_function_); + return metric->Eval(metric_temp_score_.data(), objective_function_); } else { return metric->Eval(score, objective_function_); } @@ -637,7 +636,13 @@ std::vector GBDT::GetEvalAt(int data_idx) const { /*! \brief Get training scores result */ const double* GBDT::GetTrainingScore(int64_t* out_len) { *out_len = static_cast(train_score_updater_->num_data()) * num_class_; - return train_score_updater_->score(); + if (config_->device_type == std::string("cpu")) { + return train_score_updater_->score(); + } else if (config_->device_type == std::string("cuda")) { + training_temp_score_.resize(*out_len); + CopyFromCUDADeviceToHostOuter(training_temp_score_.data(), train_score_updater_->score(), *out_len, __FILE__, __LINE__); + return training_temp_score_.data(); + } } void GBDT::PredictContrib(const double* features, double* output) const { diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 410e505a9d47..1ed115caf201 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -549,6 +549,10 @@ class GBDT : public GBDTBase { ParallelPartitionRunner bagging_runner_; Json forced_splits_json_; bool linear_tree_; + /*! \brief temporary storage on CPU for the evaluation of metric when CUDA tree learner is used */ + mutable std::vector metric_temp_score_; + /*! \brief temporary storage on CPU for training data when CUDA tree learner is used */ + std::vector training_temp_score_; }; } // namespace LightGBM diff --git a/src/c_api.cpp b/src/c_api.cpp index 03535a5cee2e..d81ea3ef5220 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -420,14 +420,8 @@ class Booster { } else { is_raw_score = false; } - - if (config.device_type == "cuda") { - return new CUDAPredictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, - config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); - } else { - return new Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, - config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); - } + return new Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, + config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); } void Predict(int start_iteration, int num_iteration, int predict_type, int nrow, int ncol, @@ -707,13 +701,8 @@ class Booster { is_raw_score = false; } std::unique_ptr predictor; - if (config.device_type == std::string("cuda")) { - predictor.reset(new CUDAPredictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, - config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin)); - } else { - predictor.reset(new Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, - config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin)); - } + predictor.reset(new Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, 
predict_contrib, + config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin)); bool bool_data_has_header = data_has_header > 0 ? true : false; predictor->Predict(data_filename, result_filename, bool_data_has_header, config.predict_disable_shape_check, config.precise_float_parser); diff --git a/src/metric/cuda/cuda_binary_metric.cpp b/src/metric/cuda/cuda_binary_metric.cpp deleted file mode 100644 index efec22a74560..000000000000 --- a/src/metric/cuda/cuda_binary_metric.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - -#include "cuda_binary_metric.hpp" - -namespace LightGBM { - -template -CUDABinaryMetric::CUDABinaryMetric(const Config& config): BinaryMetric(config) {} - -template -CUDABinaryMetric::~CUDABinaryMetric() {} - -template -void CUDABinaryMetric::Init(const Metadata& metadata, data_size_t num_data) { - BinaryMetric::Init(metadata, num_data); - - cuda_label_ = metadata.cuda_metadata()->cuda_label(); - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); - - const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_BINARY_METRIC - 1) / EVAL_BLOCK_SIZE_BINARY_METRIC; - AllocateCUDAMemoryOuter(&cuda_sum_loss_buffer_, static_cast(num_blocks), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_score_convert_buffer_, static_cast(num_data), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_sum_loss_, 1, __FILE__, __LINE__); -} - -template -std::vector CUDABinaryMetric::Eval(const double* score, const ObjectiveFunction* objective) const { - double sum_loss = 0.0f; - if (objective != nullptr) { - objective->GetCUDAConvertOutputFunc()(this->num_data_, score, cuda_score_convert_buffer_); - } - LaunchEvalKernel(cuda_score_convert_buffer_); - CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); - return std::vector(1, sum_loss / this->sum_weights_); -} - -CUDABinaryLoglossMetric::CUDABinaryLoglossMetric(const Config& config): CUDABinaryMetric(config) {} - -CUDABinaryErrorMetric::CUDABinaryErrorMetric(const Config& config) : CUDABinaryMetric(config) {} - -CUDAAUCMetric::CUDAAUCMetric(const Config& config): AUCMetric(config) {} - -CUDAAUCMetric::~CUDAAUCMetric() {} - -void CUDAAUCMetric::Init(const Metadata& metadata, data_size_t num_data) { - AUCMetric::Init(metadata, num_data); - AllocateCUDAMemoryOuter(&cuda_indices_buffer_, static_cast(num_data), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_sum_pos_buffer_, static_cast(num_data), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_threshold_mark_, static_cast(num_data), __FILE__, __LINE__); - const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_BINARY_METRIC - 1) / EVAL_BLOCK_SIZE_BINARY_METRIC; - AllocateCUDAMemoryOuter(&cuda_block_sum_pos_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_block_sum_pos_buffer_, 0, 1, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_block_threshold_mark_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_block_threshold_mark_buffer_, 0, 1, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_block_mark_first_zero_, static_cast(num_blocks) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_block_mark_first_zero_, 0, 1, __FILE__, __LINE__); - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); - cuda_label_ = metadata.cuda_metadata()->cuda_label(); - if (cuda_weights_ != nullptr) { - 
AllocateCUDAMemoryOuter(&cuda_block_sum_neg_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_block_sum_neg_buffer_, 0, 1, __FILE__, __LINE__); - } -} - -std::vector CUDAAUCMetric::Eval(const double* score, const ObjectiveFunction*) const { - LaunchEvalKernel(score); - double total_area = 0.0f, sum_pos = 0.0f; - CopyFromCUDADeviceToHostOuter(&total_area, cuda_block_sum_pos_buffer_, 1, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(&sum_pos, cuda_sum_pos_buffer_ + static_cast(num_data_ - 1), 1, __FILE__, __LINE__); - if (sum_pos != sum_weights_ && sum_pos > 0.0f) { - return std::vector(1, total_area / (sum_pos * (sum_weights_ - sum_pos))); - } else { - return std::vector(1, 1.0f); - } -} - -CUDAAveragePrecisionMetric::CUDAAveragePrecisionMetric(const Config& config): AveragePrecisionMetric(config) {} - -CUDAAveragePrecisionMetric::~CUDAAveragePrecisionMetric() {} - -void CUDAAveragePrecisionMetric::Init(const Metadata& metadata, data_size_t num_data) { - AveragePrecisionMetric::Init(metadata, num_data); - AllocateCUDAMemoryOuter(&cuda_indices_buffer_, static_cast(num_data), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_sum_pos_buffer_, static_cast(num_data), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_threshold_mark_, static_cast(num_data), __FILE__, __LINE__); - const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_BINARY_METRIC - 1) / EVAL_BLOCK_SIZE_BINARY_METRIC; - AllocateCUDAMemoryOuter(&cuda_block_sum_pos_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_block_sum_pos_buffer_, 0, 1, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_block_threshold_mark_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_block_threshold_mark_buffer_, 0, 1, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_block_mark_first_zero_, static_cast(num_blocks) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_block_mark_first_zero_, 0, 1, __FILE__, __LINE__); - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); - cuda_label_ = metadata.cuda_metadata()->cuda_label(); - if (cuda_weights_ != nullptr) { - AllocateCUDAMemoryOuter(&cuda_block_sum_neg_buffer_, static_cast(num_blocks) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_block_sum_neg_buffer_, 0, 1, __FILE__, __LINE__); - } -} - -std::vector CUDAAveragePrecisionMetric::Eval(const double* score, const ObjectiveFunction*) const { - LaunchEvalKernel(score); - double total_area = 0.0f, sum_pos = 0.0f; - CopyFromCUDADeviceToHostOuter(&total_area, cuda_block_sum_pos_buffer_, 1, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(&sum_pos, cuda_sum_pos_buffer_ + static_cast(num_data_ - 1), 1, __FILE__, __LINE__); - if (sum_pos != sum_weights_ && sum_pos > 0.0f) { - return std::vector(1, total_area / sum_pos); - } else { - return std::vector(1, 1.0f); - } -} - -} // namespace LightGBM diff --git a/src/metric/cuda/cuda_binary_metric.cu b/src/metric/cuda/cuda_binary_metric.cu deleted file mode 100644 index 75b8a2e5697d..000000000000 --- a/src/metric/cuda/cuda_binary_metric.cu +++ /dev/null @@ -1,108 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
- */ - -#include -#include "cuda_binary_metric.hpp" - -#include - -namespace LightGBM { - -template -__global__ void EvalKernel_BinaryPointWiseLoss(const double* score, - const label_t* label, - const label_t* weights, - const data_size_t num_data, - const double sum_weight, - double* cuda_sum_loss_buffer) { - // assert that warpSize == 32 and maximum number of threads per block is 1024 - __shared__ double shared_buffer[32]; - const int data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const double pointwise_loss = data_index < num_data ? - (USE_WEIGHT ? CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score[data_index]) * weights[data_index] : - CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score[data_index])) : - 0.0f; - const double loss = ShuffleReduceSum(pointwise_loss, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - cuda_sum_loss_buffer[blockIdx.x] = loss; - } -} - -__global__ void ReduceLossKernel(const double* cuda_sum_loss_buffer, const data_size_t num_blocks, double* out_loss) { - __shared__ double shared_buffer[32]; - double thread_sum_loss = 0.0f; - for (int block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { - thread_sum_loss += cuda_sum_loss_buffer[block_index]; - } - const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - *out_loss = sum_loss; - } -} - -template -void CUDABinaryMetric::LaunchEvalKernelInner(const double* score) const { - const data_size_t num_blocks = (BinaryMetric::num_data_ + EVAL_BLOCK_SIZE_BINARY_METRIC - 1) / EVAL_BLOCK_SIZE_BINARY_METRIC; - if (cuda_weights_ == nullptr) { - EvalKernel_BinaryPointWiseLoss<<>>( - score, cuda_label_, cuda_weights_, - this->num_data_, - this->sum_weights_, - cuda_sum_loss_buffer_); - } else { - EvalKernel_BinaryPointWiseLoss<<>>( - score, cuda_label_, cuda_weights_, - this->num_data_, - this->sum_weights_, - cuda_sum_loss_buffer_); - } - ReduceLossKernel<<<1, EVAL_BLOCK_SIZE_BINARY_METRIC>>>(cuda_sum_loss_buffer_, num_blocks, cuda_sum_loss_); -} - -template <> -void CUDABinaryMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -template <> -void CUDABinaryMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -void CUDAAUCMetric::LaunchEvalKernel(const double* score) const { - BitonicArgSortGlobal(score, cuda_indices_buffer_, static_cast(num_data_)); - SetCUDAMemoryOuter(cuda_block_sum_pos_buffer_, 0, 1, __FILE__, __LINE__); - if (cuda_weights_ == nullptr) { - GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_pos_buffer_, cuda_block_sum_pos_buffer_, num_data_); - } else { - GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_pos_buffer_, cuda_block_sum_pos_buffer_, num_data_); - GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_neg_buffer_, cuda_block_sum_neg_buffer_, num_data_); - } - GloblGenAUCMark(score, cuda_indices_buffer_, cuda_threshold_mark_, cuda_block_threshold_mark_buffer_, cuda_block_mark_first_zero_, num_data_); - if (cuda_weights_ == nullptr) { - GlobalCalcAUC(cuda_sum_pos_buffer_, nullptr, cuda_threshold_mark_, num_data_, cuda_block_sum_pos_buffer_); - } else { - GlobalCalcAUC(cuda_sum_pos_buffer_, cuda_sum_neg_buffer_, cuda_threshold_mark_, num_data_, cuda_block_sum_pos_buffer_); - } -} - -void CUDAAveragePrecisionMetric::LaunchEvalKernel(const double* score) const { - 
BitonicArgSortGlobal(score, cuda_indices_buffer_, static_cast(num_data_)); - SetCUDAMemoryOuter(cuda_block_sum_pos_buffer_, 0, 1, __FILE__, __LINE__); - if (cuda_weights_ == nullptr) { - GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_pos_buffer_, cuda_block_sum_pos_buffer_, num_data_); - } else { - GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_pos_buffer_, cuda_block_sum_pos_buffer_, num_data_); - GlobalGenAUCPosNegSum(cuda_label_, cuda_weights_, cuda_indices_buffer_, cuda_sum_neg_buffer_, cuda_block_sum_neg_buffer_, num_data_); - } - GloblGenAUCMark(score, cuda_indices_buffer_, cuda_threshold_mark_, cuda_block_threshold_mark_buffer_, cuda_block_mark_first_zero_, num_data_); - if (cuda_weights_ == nullptr) { - GlobalCalcAveragePrecision(cuda_sum_pos_buffer_, nullptr, cuda_threshold_mark_, num_data_, cuda_block_sum_pos_buffer_); - } else { - GlobalCalcAveragePrecision(cuda_sum_pos_buffer_, cuda_sum_neg_buffer_, cuda_threshold_mark_, num_data_, cuda_block_sum_pos_buffer_); - } -} - -} // namespace LightGBM diff --git a/src/metric/cuda/cuda_binary_metric.hpp b/src/metric/cuda/cuda_binary_metric.hpp deleted file mode 100644 index f28ee59092c6..000000000000 --- a/src/metric/cuda/cuda_binary_metric.hpp +++ /dev/null @@ -1,134 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifndef LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_ -#define LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_ - -#include "cuda_metric.hpp" -#include "../binary_metric.hpp" - -#define EVAL_BLOCK_SIZE_BINARY_METRIC (1024) - -namespace LightGBM { - -template -class CUDABinaryMetric : public CUDAMetricInterface, public BinaryMetric { - public: - explicit CUDABinaryMetric(const Config& config); - - ~CUDABinaryMetric(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - std::vector Eval(const double* score, const ObjectiveFunction* objective) const override; - - inline static double LossOnPoint(label_t /*label*/, double /*score*/) { - Log::Fatal("Calling host LossOnPoint for a CUDA metric."); - return 0.0f; - } - - protected: - void LaunchEvalKernel(const double* score) const; - - void LaunchEvalKernelInner(const double* score) const; - - const label_t* cuda_label_; - const label_t* cuda_weights_; - double* cuda_sum_loss_buffer_; - double* cuda_score_convert_buffer_; - double* cuda_sum_loss_; -}; - -class CUDABinaryLoglossMetric : public CUDABinaryMetric { - public: - explicit CUDABinaryLoglossMetric(const Config& config); - - __device__ inline static double LossOnPointCUDA(label_t label, double prob) { - if (label <= 0) { - if (1.0f - prob > kEpsilon) { - return -log(1.0f - prob); - } - } else { - if (prob > kEpsilon) { - return -log(prob); - } - } - return -log(kEpsilon); - } - - inline static const char* Name() { - return "binary_logloss"; - } -}; - -class CUDABinaryErrorMetric: public CUDABinaryMetric { - public: - explicit CUDABinaryErrorMetric(const Config& config); - - __device__ inline static double LossOnPointCUDA(label_t label, double prob) { - if (prob <= 0.5f) { - return label > 0; - } else { - return label <= 0; - } - } - - inline static const char* Name() { - return "binary_error"; - } -}; - -class CUDAAUCMetric : public CUDAMetricInterface, public AUCMetric { - public: - CUDAAUCMetric(const Config& config); - - ~CUDAAUCMetric(); - - void Init(const Metadata& metadata, data_size_t num_data) 
override; - - std::vector Eval(const double* score, const ObjectiveFunction*) const override; - - private: - void LaunchEvalKernel(const double* score) const; - - data_size_t* cuda_indices_buffer_; - double* cuda_sum_pos_buffer_; - double* cuda_block_sum_pos_buffer_; - double* cuda_sum_neg_buffer_; - double* cuda_block_sum_neg_buffer_; - data_size_t* cuda_threshold_mark_; - data_size_t* cuda_block_threshold_mark_buffer_; - uint16_t* cuda_block_mark_first_zero_; - const label_t* cuda_label_; - const label_t* cuda_weights_; -}; - -class CUDAAveragePrecisionMetric : public CUDAMetricInterface, public AveragePrecisionMetric { - public: - explicit CUDAAveragePrecisionMetric(const Config&); - - ~CUDAAveragePrecisionMetric(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - std::vector Eval(const double* score, const ObjectiveFunction*) const override; - - private: - void LaunchEvalKernel(const double* score) const; - - data_size_t* cuda_indices_buffer_; - double* cuda_sum_pos_buffer_; - double* cuda_block_sum_pos_buffer_; - double* cuda_sum_neg_buffer_; - double* cuda_block_sum_neg_buffer_; - data_size_t* cuda_threshold_mark_; - data_size_t* cuda_block_threshold_mark_buffer_; - uint16_t* cuda_block_mark_first_zero_; - const label_t* cuda_label_; - const label_t* cuda_weights_; -}; - -} // namespace LightGBM - -#endif // LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_ diff --git a/src/metric/cuda/cuda_metric.hpp b/src/metric/cuda/cuda_metric.hpp deleted file mode 100644 index 56fb79d572fa..000000000000 --- a/src/metric/cuda/cuda_metric.hpp +++ /dev/null @@ -1,19 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifndef LIGHTGBM_METRIC_CUDA_CUDA_METRIC_HPP_ -#define LIGHTGBM_METRIC_CUDA_CUDA_METRIC_HPP_ - -#include -#include - -namespace LightGBM { - -class CUDAMetricInterface { - -}; - -} // namespace LightGBM - -#endif // LIGHTGBM_METRIC_CUDA_CUDA_METRIC_HPP_ diff --git a/src/metric/cuda/cuda_multiclass_metric.cpp b/src/metric/cuda/cuda_multiclass_metric.cpp deleted file mode 100644 index 136c8f6de848..000000000000 --- a/src/metric/cuda/cuda_multiclass_metric.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
- */ - -#include "cuda_multiclass_metric.hpp" - -namespace LightGBM { - -template -CUDAMulticlassMetric::CUDAMulticlassMetric(const Config& config): MulticlassMetric(config) {} - -template -CUDAMulticlassMetric::~CUDAMulticlassMetric() {} - -template -void CUDAMulticlassMetric::Init(const Metadata& metadata, data_size_t num_data) { - MulticlassMetric::Init(metadata, num_data); - cuda_label_ = metadata.cuda_metadata()->cuda_label(); - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); - - const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_MULTICLASS_METRIC - 1) / EVAL_BLOCK_SIZE_MULTICLASS_METRIC; - AllocateCUDAMemoryOuter(&cuda_sum_loss_buffer_, static_cast(num_blocks), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_score_convert_buffer_, static_cast(num_data * this->num_class_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_sum_loss_, 1, __FILE__, __LINE__); -} - -template -std::vector CUDAMulticlassMetric::Eval(const double* score, const ObjectiveFunction* objective) const { - double sum_loss = 0.0f; - if (objective != nullptr) { - objective->GetCUDAConvertOutputFunc()(this->num_data_, score, cuda_score_convert_buffer_); - } - LaunchEvalKernel(cuda_score_convert_buffer_); - CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); - return std::vector(1, CUDAPointWiseLossCalculator::AverageLoss(sum_loss, this->sum_weights_)); -} - -CUDAMultiErrorMetric::CUDAMultiErrorMetric(const Config& config): CUDAMulticlassMetric(config) {} - -CUDAMultiSoftmaxLoglossMetric::CUDAMultiSoftmaxLoglossMetric(const Config& config): CUDAMulticlassMetric(config) {} - -CUDAAucMuMetric::CUDAAucMuMetric(const Config& config): AucMuMetric(config) {} - -CUDAAucMuMetric::~CUDAAucMuMetric() {} - -void CUDAAucMuMetric::Init(const Metadata& metadata, data_size_t num_data) { - AucMuMetric::Init(metadata, num_data); - std::vector class_start(num_class_, 0); - data_size_t max_class_size = 0; - int max_class_size_class = -1; - for (int i = 0; i < num_class_; ++i) { - const data_size_t this_class_size = class_sizes_[i]; - if (this_class_size > max_class_size) { - max_class_size = this_class_size; - max_class_size_class = i; - } - } - data_size_t second_max_class_size = 0; - for (int i = 0; i < num_class_; ++i) { - if (i != max_class_size_class) { - const data_size_t this_class_size = class_sizes_[i]; - if (this_class_size > second_max_class_size) { - second_max_class_size = this_class_size; - } - } - } - for (int i = 1; i < num_class_; ++i) { - class_start[i] += class_start[i - 1] + class_sizes_[i - 1]; - } - InitCUDAMemoryFromHostMemoryOuter(&cuda_class_start_, class_start.data(), class_start.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_class_size_, class_sizes_.data(), class_sizes_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_sorted_indices_, sorted_data_idx_.data(), sorted_data_idx_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_class_data_weights_, class_data_weights_.data(), class_data_weights_.size(), __FILE__, __LINE__); - const int num_class_pair = (num_class_ - 1) * num_class_ / 2; - max_pair_buffer_size_ = max_class_size + second_max_class_size; - const size_t total_pair_buffer_size = static_cast(max_pair_buffer_size_ * num_class_pair); - AllocateCUDAMemoryOuter(&cuda_dist_, total_pair_buffer_size, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_sorted_indices_by_dist_, total_pair_buffer_size, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_sum_pos_buffer_, 
total_pair_buffer_size, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_threshold_mark_, total_pair_buffer_size, __FILE__, __LINE__); - - const int num_blocks = (max_pair_buffer_size_ + EVAL_BLOCK_SIZE_MULTICLASS_METRIC - 1) / EVAL_BLOCK_SIZE_MULTICLASS_METRIC; - const size_t class_pair_block_buffer = static_cast(num_class_pair * (num_blocks + 1)); - AllocateCUDAMemoryOuter(&cuda_block_mark_buffer_, class_pair_block_buffer, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_block_mark_first_zero_, class_pair_block_buffer, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_reduce_block_buffer_, class_pair_block_buffer, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_reduce_block_buffer_, 0, class_pair_block_buffer, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_reduce_ans_buffer_, static_cast(num_class_pair), __FILE__, __LINE__); - const size_t curr_v_size = static_cast(num_class_pair * num_class_); - std::vector all_curr_v(curr_v_size, 0.0f); - for (int i = 0; i < num_class_ - 1; ++i) { - for (int j = i + 1; j < num_class_; ++j) { - const int i_p = num_class_ - 2 - i; - const int pair_index = i_p * (i_p + 1) / 2 + j - i - 1; - for (int k = 0; k < num_class_; ++k) { - all_curr_v[pair_index * num_class_ + k] = class_weights_[i][k] - class_weights_[j][k]; - } - } - } - InitCUDAMemoryFromHostMemoryOuter(&cuda_curr_v_, all_curr_v.data(), all_curr_v.size(), __FILE__, __LINE__); - - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); - cuda_label_ = metadata.cuda_metadata()->cuda_label(); -} - -std::vector CUDAAucMuMetric::Eval(const double* score, const ObjectiveFunction*) const { - LaunchEvalKernel(score); - double ans = 0.0f; - const int num_class_pair = (num_class_ - 1) * num_class_ / 2; - CopyFromCUDADeviceToHostOuter(&ans, cuda_reduce_ans_buffer_, static_cast(num_class_pair), __FILE__, __LINE__); - return std::vector(1, ans / static_cast(num_class_pair)); -} - -} // namespace LightGBM diff --git a/src/metric/cuda/cuda_multiclass_metric.cu b/src/metric/cuda/cuda_multiclass_metric.cu deleted file mode 100644 index 621c139a1db7..000000000000 --- a/src/metric/cuda/cuda_multiclass_metric.cu +++ /dev/null @@ -1,416 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - -#include "cuda_multiclass_metric.hpp" -#include - -namespace LightGBM { - -template -__global__ void EvalKernel_MulticlassPointWiseLoss(const double* score, - const label_t* label, - const label_t* weights, - const data_size_t num_data, - const double sum_weight, - double* cuda_sum_loss_buffer, - const int num_classes, - const int multi_error_top_k) { - // assert that warpSize == 32 and maximum number of threads per block is 1024 - __shared__ double shared_buffer[32]; - const int data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - double pointwise_loss = 0.0f; - if (data_index < num_data) { - pointwise_loss = (USE_WEIGHT ? 
- CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score, data_index, num_data, num_classes, multi_error_top_k) * weights[data_index] : - CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score, data_index, num_data, num_classes, multi_error_top_k)); - } - const double loss = ShuffleReduceSum(pointwise_loss, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - cuda_sum_loss_buffer[blockIdx.x] = loss; - } -} - -template -__global__ void ReduceLossKernel_Multiclass(const double* cuda_sum_loss_buffer, const data_size_t num_blocks, double* out_loss) { - __shared__ double shared_buffer[32]; - double thread_sum_loss = 0.0f; - for (int block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { - thread_sum_loss += cuda_sum_loss_buffer[block_index]; - } - const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - *out_loss = sum_loss; - } -} - -template -void CUDAMulticlassMetric::LaunchEvalKernelInner(const double* score) const { - const data_size_t num_blocks = (MulticlassMetric::num_data_ + EVAL_BLOCK_SIZE_MULTICLASS_METRIC - 1) / EVAL_BLOCK_SIZE_MULTICLASS_METRIC; - Log::Warning("num_blocks = %d", num_blocks); - if (cuda_weights_ == nullptr) { - EvalKernel_MulticlassPointWiseLoss<<>>( - score, cuda_label_, cuda_weights_, - this->num_data_, - this->sum_weights_, - cuda_sum_loss_buffer_, - this->num_class_, - this->config_.multi_error_top_k); - } else { - EvalKernel_MulticlassPointWiseLoss<<>>( - score, cuda_label_, cuda_weights_, - this->num_data_, - this->sum_weights_, - cuda_sum_loss_buffer_, - this->num_class_, - this->config_.multi_error_top_k); - } - ReduceLossKernel_Multiclass<<<1, EVAL_BLOCK_SIZE_MULTICLASS_METRIC>>>(cuda_sum_loss_buffer_, num_blocks, cuda_sum_loss_); -} - -template <> -void CUDAMulticlassMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -template <> -void CUDAMulticlassMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -__global__ void EvalKernel_AucMuWriteDist( - const data_size_t i_class_start, - const data_size_t i_class_size, - const data_size_t j_class_start, - const data_size_t j_class_size, - const data_size_t* cuda_sorted_indices, - const double* cuda_class_data_weights, - const double* cuda_curr_v, - const double* score, - const data_size_t max_pair_buffer_size, - const data_size_t num_data, - const int num_class, - const int i, - const int j, - double* cuda_dist) { - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (data_index < j_class_size + i_class_size) { - // put the dist of class j in the front - const data_size_t data_index_in_class = data_index < j_class_size ? data_index : data_index - j_class_size; - const data_size_t class_start = data_index < j_class_size ? 
j_class_start : i_class_start; - const data_size_t* sorted_indices_in_class = cuda_sorted_indices + class_start; - const data_size_t a = sorted_indices_in_class[data_index_in_class]; - double v_a = 0.0f; - for (int m = 0; m < num_class; ++m) { - v_a += cuda_curr_v[m] * score[num_data * m + a]; - } - const double t1 = cuda_curr_v[i] - cuda_curr_v[j]; - cuda_dist[data_index] = v_a * t1; - } -} - -__global__ void BitonicArgSortGlobal_AucMu( - const double* dist, - data_size_t* out_data_indices, - const data_size_t num_data) { - int max_depth = 1; - int len_to_shift = static_cast(num_data) - 1; - while (len_to_shift > 0) { - ++max_depth; - len_to_shift >>= 1; - } - const int num_blocks = (static_cast(num_data) + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; - BitonicArgSortGlobalKernel<<>>(dist, out_data_indices, static_cast(num_data)); - for (int depth = max_depth - 11; depth >= 1; --depth) { - const int segment_length = (1 << (max_depth - depth)); - int half_segment_length = (segment_length >> 1); - { - BitonicArgCompareKernel<<>>( - dist, out_data_indices, half_segment_length, segment_length, static_cast(num_data)); - half_segment_length >>= 1; - } - for (int inner_depth = depth + 1; inner_depth <= max_depth - 11; ++inner_depth) { - BitonicArgCompareKernel<<>>( - dist, out_data_indices, half_segment_length, segment_length, static_cast(num_data)); - half_segment_length >>= 1; - } - BitonicArgSortMergeKernel<<>>( - dist, out_data_indices, segment_length, static_cast(num_data)); - } -} - -template -__global__ void GenAucMuPosPrefixSumWithinBlock( - const data_size_t* sorted_data_indices_global, - const data_size_t* sorted_data_indices_two_class, - const data_size_t i_class_size, - const data_size_t j_class_size, - const data_size_t i_class_start, - const data_size_t j_class_start, - const label_t* cuda_weights, - double* sum_pos_buffer, - double* block_sum_pos_buffer) { - __shared__ double shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; - const data_size_t inner_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - double pos = 0.0f; - if (inner_data_index < j_class_size + i_class_size) { - const data_size_t data_index_two_class = sorted_data_indices_two_class[inner_data_index]; - const bool is_pos_class = (data_index_two_class < j_class_size); - if (USE_WEIGHT) { - const data_size_t data_index_one_class = (is_pos_class ? data_index_two_class : data_index_two_class - j_class_size); - const data_size_t data_index_global = (is_pos_class ? sorted_data_indices_global[j_class_start + data_index_one_class] : - sorted_data_indices_global[i_class_start + data_index_one_class]); - pos = ((is_pos_class == IS_POS) ? cuda_weights[data_index_global] : 0.0f); - } else { - pos = ((is_pos_class == IS_POS) ? 
1.0f : 0.0f); - } - } - shared_buffer[threadIdx.x] = pos; - __syncthreads(); - PrefixSum(shared_buffer, blockDim.x); - if (inner_data_index < j_class_size + i_class_size) { - sum_pos_buffer[inner_data_index] = shared_buffer[threadIdx.x + 1]; - } - if (threadIdx.x == 0) { - block_sum_pos_buffer[blockIdx.x + 1] = shared_buffer[blockDim.x]; - } -} - -template -__global__ void GenAucMuPosPrefixSum( - const data_size_t* sorted_data_indices_global, - const data_size_t* sorted_data_indices_two_class, - const data_size_t i_class_size, - const data_size_t j_class_size, - const data_size_t i_class_start, - const data_size_t j_class_start, - const label_t* cuda_weights, - double* prefix_sum_result, - double* block_buffer) { - const data_size_t num_data = i_class_size + j_class_size; - const int num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; - GenAucMuPosPrefixSumWithinBlock<<>>( - sorted_data_indices_global, - sorted_data_indices_two_class, - i_class_size, - j_class_size, - i_class_start, - j_class_start, - cuda_weights, - prefix_sum_result, - block_buffer); - GlobalInclusivePrefixSumReduceBlockKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>( - block_buffer, num_blocks); - GlobalInclusivePrefixSumAddBlockBaseKernel<<>>( - block_buffer, prefix_sum_result, num_data); -} - -__global__ void GenAucMuMark( - const double* dist, - const data_size_t* sorted_data_indices, - const data_size_t num_data, - data_size_t* threshold_mark, - data_size_t* block_mark_buffer, - uint16_t* block_mark_first_zero) { - const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; - GlobalGenAUCMarkKernel<<>>(dist, sorted_data_indices, threshold_mark, block_mark_buffer, block_mark_first_zero, num_data); - GlobalInclusivePrefixSumReduceBlockZeroOutKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>( - block_mark_buffer, block_mark_first_zero, num_blocks); - GlobalInclusivePrefixSumAddBlockBaseGenAUCMarkKernel<<>>( - block_mark_buffer, threshold_mark, block_mark_first_zero, num_data); -} - -template -__global__ void CalcAucMuArea( - const double* block_sum_pos_buffer, - const data_size_t* sorted_data_indices_global, - const data_size_t* sorted_data_indices_two_class, - const data_size_t* threshold_mark, - const label_t* cuda_weights, - const data_size_t num_data, - const data_size_t i_class_start, - const data_size_t j_class_size, - double* block_buffer) { - __shared__ double shared_mem_buffer[32]; - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - double area = 0.0f; - if (data_index < num_data) { - const data_size_t data_index_two_class = sorted_data_indices_two_class[data_index]; - if (data_index_two_class >= j_class_size) { - const data_size_t data_index_global = sorted_data_indices_global[i_class_start + data_index_two_class - j_class_size]; - const double num_j = block_sum_pos_buffer[data_index]; - if (USE_WEIGHT) { - const double curr_weight = static_cast(cuda_weights[data_index_global]); - if (threshold_mark[data_index] > 0) { - const data_size_t prev_data_index = data_index - threshold_mark[data_index] - 1; - const double prev_sum_pos = prev_data_index < 0 ? 
0.0f : block_sum_pos_buffer[prev_data_index]; - const double num_curr_j = block_sum_pos_buffer[data_index] - prev_sum_pos; - area = curr_weight * (num_j - 0.5f * num_curr_j); - } else { - area = curr_weight * num_j; - } - } else { - if (threshold_mark[data_index] > 0) { - const data_size_t prev_data_index = data_index - threshold_mark[data_index] - 1; - const double prev_sum_pos = prev_data_index < 0 ? 0.0f : block_sum_pos_buffer[prev_data_index]; - const double num_curr_j = block_sum_pos_buffer[data_index] - prev_sum_pos; - area = num_j - 0.5f * num_curr_j; - } else { - area = num_j; - } - } - } - } - const double block_area = ShuffleReduceSum(area, shared_mem_buffer, blockDim.x); - if (threadIdx.x == 0) { - block_buffer[blockIdx.x] = block_area; - } -} - -template -__global__ void EvalKernel_AucMu( - const data_size_t* cuda_class_start, - const data_size_t* cuda_class_size, - const data_size_t* cuda_sorted_indices, - const double* cuda_class_data_weights, - const double* cuda_curr_v, - const double* score, - const data_size_t max_pair_buffer_size, - const data_size_t num_data, - const int num_class, - const label_t* cuda_weights, - double* cuda_dist, - data_size_t* cuda_sorted_indices_by_dist, - data_size_t* cuda_threshold_mark, - data_size_t* cuda_block_threshold_mark_buffer, - uint16_t* cuda_block_mark_first_zero, - double* sum_pos_buffer, - double* block_sum_pos_buffer, - double* reduce_ans_buffer) { - const int pair_index = static_cast(blockIdx.x); - const double index_2 = 2.0f * static_cast(pair_index); - const int sqrt_round = static_cast(sqrt(index_2)); - const int i_p = (pair_index == 0) ? 0 : static_cast(sqrt(index_2 - static_cast(sqrt_round) + 1)); - const int j_p = pair_index - ((i_p + 1) * i_p / 2); - const int i = num_class - 2 - i_p; - const int j = j_p + i + 1; - const data_size_t i_class_size = cuda_class_size[i]; - const data_size_t j_class_size = cuda_class_size[j]; - const data_size_t num_data_in_pair = i_class_size + j_class_size; - const int num_blocks = (num_data_in_pair + EVAL_BLOCK_SIZE_MULTICLASS_METRIC - 1) / EVAL_BLOCK_SIZE_MULTICLASS_METRIC; - const int num_blocks_for_offset = (max_pair_buffer_size + EVAL_BLOCK_SIZE_MULTICLASS_METRIC - 1) / EVAL_BLOCK_SIZE_MULTICLASS_METRIC; - const data_size_t i_class_start = cuda_class_start[i]; - const data_size_t j_class_start = cuda_class_start[j]; - double* cuda_dist_ptr = cuda_dist + pair_index * max_pair_buffer_size; - data_size_t* cuda_sorted_indices_by_dist_ptr = cuda_sorted_indices_by_dist + pair_index * max_pair_buffer_size; - const double* cuda_curr_v_ptr = cuda_curr_v + pair_index * num_class; - double* sum_pos_buffer_ptr = sum_pos_buffer + pair_index * max_pair_buffer_size; - double* block_sum_pos_buffer_ptr = block_sum_pos_buffer + pair_index * (num_blocks_for_offset + 1); - data_size_t* cuda_threshold_mark_ptr = cuda_threshold_mark + pair_index * max_pair_buffer_size; - data_size_t* cuda_block_threshold_mark_buffer_ptr = cuda_block_threshold_mark_buffer + pair_index * (num_blocks_for_offset + 1); - uint16_t* cuda_block_mark_first_zero_ptr = cuda_block_mark_first_zero + pair_index * (num_blocks_for_offset + 1); - cudaStream_t cuda_stream; - cudaStreamCreateWithFlags(&cuda_stream, cudaStreamNonBlocking); - EvalKernel_AucMuWriteDist<<>>( - i_class_start, - i_class_size, - j_class_start, - j_class_size, - cuda_sorted_indices, - cuda_class_data_weights, - cuda_curr_v_ptr, - score, - max_pair_buffer_size, - num_data, - num_class, - i, - j, - cuda_dist_ptr); - BitonicArgSortGlobal_AucMu<<<1, 1, 0, cuda_stream>>>( - 
cuda_dist_ptr, - cuda_sorted_indices_by_dist_ptr, - num_data_in_pair); - GenAucMuPosPrefixSum<<<1, 1, 0, cuda_stream>>>( - cuda_sorted_indices, - cuda_sorted_indices_by_dist_ptr, - i_class_size, - j_class_size, - i_class_start, - j_class_start, - cuda_weights, - sum_pos_buffer_ptr, - block_sum_pos_buffer_ptr); - GenAucMuMark<<<1, 1, 0, cuda_stream>>>( - cuda_dist_ptr, - cuda_sorted_indices_by_dist_ptr, - num_data_in_pair, - cuda_threshold_mark_ptr, - cuda_block_threshold_mark_buffer_ptr, - cuda_block_mark_first_zero_ptr); - CalcAucMuArea<<<1, 1, 0, cuda_stream>>>( - block_sum_pos_buffer_ptr, - cuda_sorted_indices, - cuda_sorted_indices_by_dist_ptr, - cuda_threshold_mark_ptr, - cuda_weights, - num_data_in_pair, - i_class_start, - j_class_size, - block_sum_pos_buffer_ptr); - BlockReduceSum<<<1, EVAL_BLOCK_SIZE_MULTICLASS_METRIC, 0, cuda_stream>>>(block_sum_pos_buffer_ptr, num_blocks); - if (USE_WEIGHT) { - reduce_ans_buffer[pair_index] = block_sum_pos_buffer_ptr[0] / cuda_class_data_weights[i] / cuda_class_data_weights[j]; - } else { - reduce_ans_buffer[pair_index] = block_sum_pos_buffer_ptr[0] / static_cast(cuda_class_size[i]) / static_cast(cuda_class_size[j]); - } - cudaStreamDestroy(cuda_stream); -} - -void CUDAAucMuMetric::LaunchEvalKernel(const double* score) const { - const int num_class_pair = (num_class_ - 1) * num_class_ / 2; - if (cuda_weights_ == nullptr) { - EvalKernel_AucMu<<>>( - cuda_class_start_, - cuda_class_size_, - cuda_sorted_indices_, - cuda_class_data_weights_, - cuda_curr_v_, - score, - max_pair_buffer_size_, - num_data_, - num_class_, - cuda_weights_, - cuda_dist_, - cuda_sorted_indices_by_dist_, - cuda_threshold_mark_, - cuda_block_mark_buffer_, - cuda_block_mark_first_zero_, - cuda_sum_pos_buffer_, - cuda_reduce_block_buffer_, - cuda_reduce_ans_buffer_); - } else { - EvalKernel_AucMu<<>>( - cuda_class_start_, - cuda_class_size_, - cuda_sorted_indices_, - cuda_class_data_weights_, - cuda_curr_v_, - score, - max_pair_buffer_size_, - num_data_, - num_class_, - cuda_weights_, - cuda_dist_, - cuda_sorted_indices_by_dist_, - cuda_threshold_mark_, - cuda_block_mark_buffer_, - cuda_block_mark_first_zero_, - cuda_sum_pos_buffer_, - cuda_reduce_block_buffer_, - cuda_reduce_ans_buffer_); - } - BlockReduceSum<<<1, EVAL_BLOCK_SIZE_MULTICLASS_METRIC>>>(cuda_reduce_ans_buffer_, num_class_pair); -} - -} // namespace LightGBM diff --git a/src/metric/cuda/cuda_multiclass_metric.hpp b/src/metric/cuda/cuda_multiclass_metric.hpp deleted file mode 100644 index ddbe5b057346..000000000000 --- a/src/metric/cuda/cuda_multiclass_metric.hpp +++ /dev/null @@ -1,139 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
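For reference, the AUC-mu buffers above are indexed by class pair: Init enumerates pairs (i, j) with i < j through i_p = num_class - 2 - i and pair_index = i_p * (i_p + 1) / 2 + j - i - 1, and EvalKernel_AucMu recovers (i, j) from blockIdx.x with a closed-form decode. Below is a hedged host-side restatement of that mapping; the helper names are illustrative and the loop-based decode is only a test reference, not the kernel's formula.

#include <utility>

// Illustrative helpers, not part of the patch.
inline int AucMuPairIndexRef(int i, int j, int num_class) {
  // same enumeration as CUDAAucMuMetric::Init: i < j, reversed triangular index
  const int i_p = num_class - 2 - i;
  return i_p * (i_p + 1) / 2 + j - i - 1;
}

inline std::pair<int, int> AucMuDecodePairIndexRef(int pair_index, int num_class) {
  // plain loop-based inverse, usable to cross-check the closed-form decode
  // in EvalKernel_AucMu on the host
  for (int i = 0; i + 1 < num_class; ++i) {
    for (int j = i + 1; j < num_class; ++j) {
      if (AucMuPairIndexRef(i, j, num_class) == pair_index) {
        return std::make_pair(i, j);
      }
    }
  }
  return std::make_pair(-1, -1);  // pair_index out of range
}

GenAucMuPosPrefixSum above chains three kernels into a device-wide inclusive prefix sum of the positive-class weights: a per-block inclusive scan that also writes each block's total into a side buffer, a scan over those block totals, and a pass that adds each block's base offset back onto its elements. The sketch below covers the last two phases, assuming phase one already wrote block b's total into block_sums[b + 1] with block_sums[0] == 0; the kernel names are illustrative, and the real reduce kernel scans the totals in parallel rather than with a single thread.

__global__ void ScanBlockTotalsSketch(double* block_sums, int num_blocks) {
  // single-thread inclusive scan: afterwards block_sums[b] holds the sum of
  // all elements belonging to blocks 0..b-1, i.e. block b's base offset
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    for (int b = 1; b <= num_blocks; ++b) {
      block_sums[b] += block_sums[b - 1];
    }
  }
}

__global__ void AddBlockBaseSketch(const double* block_sums, double* values, int num_data) {
  const int index = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
  if (index < num_data) {
    values[index] += block_sums[blockIdx.x];  // shift the per-block scan into a global one
  }
}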
- */ -#ifndef LIGHTGBM_METRIC_CUDA_CUDA_MULTICLASS_METRIC_HPP_ -#define LIGHTGBM_METRIC_CUDA_CUDA_MULTICLASS_METRIC_HPP_ - -#include "cuda_metric.hpp" -#include "../multiclass_metric.hpp" - -#define EVAL_BLOCK_SIZE_MULTICLASS_METRIC (1024) - -namespace LightGBM { - -template -class CUDAMulticlassMetric : public CUDAMetricInterface, public MulticlassMetric { - public: - explicit CUDAMulticlassMetric(const Config& config); - - ~CUDAMulticlassMetric(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - std::vector Eval(const double* score, const ObjectiveFunction* objective) const override; - - inline static double AverageLoss(double sum_loss, double sum_weights) { - return (sum_loss / sum_weights); - } - - inline static double LossOnPoint(label_t /*label*/, std::vector* /*score*/, const Config& /*config*/) { - Log::Fatal("Calling host LossOnPoint for a CUDA metric."); - return 0.0f; - } - - protected: - void LaunchEvalKernel(const double* score) const; - - void LaunchEvalKernelInner(const double* score) const; - - const label_t* cuda_label_; - const label_t* cuda_weights_; - double* cuda_score_convert_buffer_; - double* cuda_sum_loss_buffer_; - double* cuda_sum_loss_; -}; - -class CUDAMultiErrorMetric : public CUDAMulticlassMetric { - public: - explicit CUDAMultiErrorMetric(const Config& config); - - __device__ inline static double LossOnPointCUDA( - label_t label, - const double* score, - const data_size_t data_index, - const data_size_t num_data, - const int num_classes, - const int multi_error_top_k) { - const size_t k = static_cast(label); - const double true_class_score = score[k * num_data + data_index]; - int num_larger = 0; - for (int i = 0; i < num_classes; ++i) { - const double this_class_score = score[i * num_data + data_index]; - if (this_class_score >= true_class_score) ++num_larger; - if (num_larger > multi_error_top_k) return 1.0f; - } - return 0.0f; - } - - inline static const std::string Name(const Config& config) { - if (config.multi_error_top_k == 1) { - return "multi_error"; - } else { - return "multi_error@" + std::to_string(config.multi_error_top_k); - } - } -}; - -class CUDAMultiSoftmaxLoglossMetric : public CUDAMulticlassMetric { - public: - explicit CUDAMultiSoftmaxLoglossMetric(const Config& config); - - __device__ inline static double LossOnPointCUDA(label_t label, const double* score, - const data_size_t data_index, - const data_size_t num_data, - const int /*num_classes*/, const int /*multi_error_top_k*/) { - size_t k = static_cast(label); - const double point_score = score[k * num_data + data_index]; - if (point_score > kEpsilon) { - return static_cast(-log(point_score)); - } else { - return -log(kEpsilon); - } - } - - inline static const std::string Name(const Config& /*config*/) { - return "multi_logloss"; - } -}; - -class CUDAAucMuMetric : public CUDAMetricInterface, public AucMuMetric { - public: - explicit CUDAAucMuMetric(const Config& config); - - ~CUDAAucMuMetric(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - std::vector Eval(const double* score, const ObjectiveFunction*) const override; - - private: - void LaunchEvalKernel(const double* score) const; - - const label_t* cuda_label_; - const label_t* cuda_weights_; - - int num_class_pair_; - data_size_t max_pair_buffer_size_; - - data_size_t* cuda_class_start_; - data_size_t* cuda_class_size_; - data_size_t* cuda_sorted_indices_; - double* cuda_dist_; - double* cuda_class_data_weights_; - double* cuda_class_weights_; - data_size_t* 
cuda_sorted_indices_by_dist_; - double* cuda_curr_v_; - - double* cuda_sum_pos_buffer_; - data_size_t* cuda_threshold_mark_; - data_size_t* cuda_block_mark_buffer_; - uint16_t* cuda_block_mark_first_zero_; - - double* cuda_reduce_block_buffer_; - double* cuda_reduce_ans_buffer_; -}; - -} // namespace LightGBM - -#endif // LIGHTGBM_METRIC_CUDA_CUDA_MULTICLASS_METRIC_HPP_ diff --git a/src/metric/cuda/cuda_rank_metric.cpp b/src/metric/cuda/cuda_rank_metric.cpp deleted file mode 100644 index 06ac6d9b5368..000000000000 --- a/src/metric/cuda/cuda_rank_metric.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - -#include "cuda_rank_metric.hpp" - -namespace LightGBM { - -CUDANDCGMetric::CUDANDCGMetric(const Config& config): NDCGMetric(config) {} - -CUDANDCGMetric::~CUDANDCGMetric() {} - -void CUDANDCGMetric::Init(const Metadata& metadata, data_size_t num_data) { - NDCGMetric::Init(metadata, num_data); - cuda_label_ = metadata.cuda_metadata()->cuda_label(); - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); - cuda_query_boundaries_ = metadata.cuda_metadata()->cuda_query_boundaries(); - cuda_query_weights_ = metadata.cuda_metadata()->cuda_query_weights(); - const int num_threads = OMP_NUM_THREADS(); - std::vector thread_max_num_items_in_query(num_threads); - Threading::For(0, num_queries_, 1, - [this, &thread_max_num_items_in_query] (int thread_index, data_size_t start, data_size_t end) { - for (data_size_t query_index = start; query_index < end; ++query_index) { - const data_size_t query_item_count = query_boundaries_[query_index + 1] - query_boundaries_[query_index]; - if (query_item_count > thread_max_num_items_in_query[thread_index]) { - thread_max_num_items_in_query[thread_index] = query_item_count; - } - } - }); - max_items_in_query_ = 0; - for (int thread_index = 0; thread_index < num_threads; ++thread_index) { - if (thread_max_num_items_in_query[thread_index] > max_items_in_query_) { - max_items_in_query_ = thread_max_num_items_in_query[thread_index]; - } - } - max_items_in_query_aligned_ = 1; - --max_items_in_query_; - while (max_items_in_query_ > 0) { - max_items_in_query_ >>= 1; - max_items_in_query_aligned_ <<= 1; - } - num_eval_ = static_cast(eval_at_.size()); - InitCUDAMemoryFromHostMemoryOuter(&cuda_eval_at_, eval_at_.data(), eval_at_.size(), __FILE__, __LINE__); - const size_t total_inverse_max_dcg_items = static_cast(num_queries_ * num_eval_); - std::vector flatten_inverse_max_dcgs(total_inverse_max_dcg_items, 0.0f); - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) num_threads(num_threads) - for (data_size_t query_index = 0; query_index < num_queries_; ++query_index) { - OMP_LOOP_EX_BEGIN(); - for (data_size_t eval_index = 0; eval_index < num_eval_; ++eval_index) { - flatten_inverse_max_dcgs[query_index * num_eval_ + eval_index] = inverse_max_dcgs_[query_index][eval_index]; - } - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - InitCUDAMemoryFromHostMemoryOuter(&cuda_inverse_max_dcgs_, flatten_inverse_max_dcgs.data(), flatten_inverse_max_dcgs.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_label_gain_, DCGCalculator::label_gain().data(), DCGCalculator::label_gain().size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_discount_, DCGCalculator::discount().data(), DCGCalculator::discount().size(), __FILE__, __LINE__); - const int num_blocks = (num_queries_ + 
NUM_QUERY_PER_BLOCK_METRIC - 1) / NUM_QUERY_PER_BLOCK_METRIC; - AllocateCUDAMemoryOuter(&cuda_block_dcg_buffer_, static_cast(num_blocks * num_eval_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_item_indices_buffer_, static_cast(num_data_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_ndcg_result_, static_cast(num_eval_), __FILE__, __LINE__); -} - -std::vector CUDANDCGMetric::Eval(const double* score, const ObjectiveFunction*) const { - LaunchEvalKernel(score); - std::vector result(num_eval_, 0.0f); - CopyFromCUDADeviceToHostOuter(result.data(), cuda_ndcg_result_, static_cast(num_eval_), __FILE__, __LINE__); - return result; -} - -} // namespace LightGBM diff --git a/src/metric/cuda/cuda_rank_metric.cu b/src/metric/cuda/cuda_rank_metric.cu deleted file mode 100644 index c93acf59276c..000000000000 --- a/src/metric/cuda/cuda_rank_metric.cu +++ /dev/null @@ -1,265 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - -#include "cuda_rank_metric.hpp" -#include - -namespace LightGBM { - -template -__global__ void EvalKernel_NDCG_SharedMemory( - const double* score, - const label_t* label, - const label_t* query_weights, - const data_size_t* query_boundareis, - const data_size_t num_queries, - const data_size_t* eval_at, - const data_size_t num_eval, - const double* inverse_max_dcgs, - const double* label_gains, - const double* discount, - double* block_ndcg_buffer) { - __shared__ uint16_t shared_item_indices[SHARED_MEMORY_SIZE]; - __shared__ score_t shared_item_scores[SHARED_MEMORY_SIZE]; - __shared__ double shared_eval_result[MAX_NUM_EVAL]; - __shared__ data_size_t shared_eval_at[MAX_NUM_EVAL]; - __shared__ double shared_shuffle_buffer[32]; - for (data_size_t eval_index = static_cast(threadIdx.x); eval_index < num_eval; eval_index += static_cast(blockDim.x)) { - shared_eval_at[eval_index] = eval_at[eval_index]; - shared_eval_result[eval_index] = 0.0f; - } - __syncthreads(); - const data_size_t start_query_index = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK_METRIC; - const data_size_t end_query_index = min(start_query_index + NUM_QUERY_PER_BLOCK_METRIC, num_queries); - for (data_size_t query_index = start_query_index; query_index < end_query_index; ++query_index) { - const double* inverse_max_dcgs_ptr = inverse_max_dcgs + query_index * num_eval; - if (inverse_max_dcgs_ptr[0] < 0.0f) { - for (data_size_t eval_index = static_cast(threadIdx.x); eval_index < num_eval; eval_index += static_cast(blockDim.x)) { - shared_eval_result[eval_index] += 1.0f; - } - } else { - const data_size_t item_start_index = query_boundareis[query_index]; - const data_size_t item_end_index = query_boundareis[query_index + 1]; - const data_size_t num_items = item_end_index - item_start_index; - const double* score_ptr = score + item_start_index; - const label_t* label_ptr = label + item_start_index; - for (data_size_t item_index = static_cast(threadIdx.x); item_index < num_items; item_index += static_cast(blockDim.x)) { - shared_item_scores[item_index] = static_cast(score_ptr[item_index]); - shared_item_indices[item_index] = item_index; - } - for (data_size_t item_index = num_items + static_cast(threadIdx.x); item_index < SHARED_MEMORY_SIZE; item_index += static_cast(blockDim.x)) { - shared_item_scores[item_index] = kMinScore; - shared_item_indices[item_index] = item_index; - } - __syncthreads(); - if (MAX_ITEM_GREATER_THAN_1024) { - if (num_items > 1024) { - for (data_size_t 
item_index = num_items + static_cast(threadIdx.x); item_index < SHARED_MEMORY_SIZE; item_index += static_cast(blockDim.x)) { - shared_item_scores[item_index] = kMinScore; - } - __syncthreads(); - BitonicArgSort_2048(shared_item_scores, shared_item_indices); - } else { - BitonicArgSort_1024(shared_item_scores, shared_item_indices, static_cast(num_items)); - } - } else { - BitonicArgSort_1024(shared_item_scores, shared_item_indices, static_cast(num_items)); - } - __syncthreads(); - double thread_eval = 0.0f; - data_size_t item_index = static_cast(threadIdx.x); - for (data_size_t eval_index = 0; eval_index < num_eval; ++eval_index) { - data_size_t cur_eval_pos = min(num_items, shared_eval_at[eval_index]); - for (; item_index < cur_eval_pos; item_index += static_cast(blockDim.x)) { - const int data_label = static_cast(label_ptr[shared_item_indices[item_index]]); - thread_eval += label_gains[data_label] * discount[item_index]; - } - __syncthreads(); - double block_eval = ShuffleReduceSum(thread_eval, shared_shuffle_buffer, blockDim.x); - if (USE_QUERY_WEIGHT) { - block_eval *= static_cast(query_weights[query_index]); - } - if (threadIdx.x == 0) { - shared_eval_result[eval_index] += block_eval * inverse_max_dcgs_ptr[eval_index]; - } - } - __syncthreads(); - } - } - for (data_size_t eval_index = static_cast(threadIdx.x); eval_index < num_eval; eval_index += static_cast(blockDim.x)) { - block_ndcg_buffer[eval_index * gridDim.x + blockIdx.x] = shared_eval_result[eval_index]; - } -} - -template -__global__ void EvalKernel_NDCG_GlobalMemory( - const double* score, - const label_t* label, - const label_t* query_weights, - const data_size_t* query_boundareis, - const data_size_t num_queries, - const data_size_t* eval_at, - const data_size_t num_eval, - const double* inverse_max_dcgs, - const double* label_gains, - const double* discount, - double* block_ndcg_buffer, - const data_size_t* cuda_item_indices_buffer) { - __shared__ double shared_eval_result[MAX_NUM_EVAL]; - __shared__ data_size_t shared_eval_at[MAX_NUM_EVAL]; - __shared__ double shared_shuffle_buffer[32]; - for (data_size_t eval_index = 0; eval_index < num_eval; eval_index += static_cast(blockDim.x)) { - shared_eval_at[eval_index] = eval_at[eval_index]; - shared_eval_result[eval_index] = 0.0f; - } - __syncthreads(); - const data_size_t start_query_index = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK_METRIC; - const data_size_t end_query_index = min(start_query_index + NUM_QUERY_PER_BLOCK_METRIC, num_queries); - for (data_size_t query_index = start_query_index; query_index < end_query_index; ++query_index) { - const data_size_t item_start_index = query_boundareis[query_index]; - const data_size_t item_end_index = query_boundareis[query_index + 1]; - const data_size_t num_items = item_end_index - item_start_index; - const label_t* label_ptr = label + item_start_index; - const double* inverse_max_dcgs_ptr = inverse_max_dcgs + query_index * num_eval; - const data_size_t* sorted_item_indices_ptr = cuda_item_indices_buffer + item_start_index; - double thread_eval = 0.0f; - data_size_t item_index = static_cast(threadIdx.x); - for (data_size_t eval_index = 0; eval_index < num_eval; ++eval_index) { - data_size_t cur_eval_pos = min(num_items, shared_eval_at[eval_index]); - for (; item_index < cur_eval_pos; item_index += static_cast(blockDim.x)) { - const uint16_t sorted_item_index = sorted_item_indices_ptr[item_index]; - if (static_cast(sorted_item_index) >= num_items) { - printf("error sorted_item_index = %d, num_items = %d\n", sorted_item_index, 
num_items); - } - const int data_label = static_cast(label_ptr[sorted_item_indices_ptr[item_index]]); - thread_eval += label_gains[data_label] * discount[item_index]; - } - __syncthreads(); - double block_eval = ShuffleReduceSum(thread_eval, shared_shuffle_buffer, blockDim.x); - if (USE_QUERY_WEIGHT) { - block_eval *= static_cast(query_weights[query_index]); - } - if (threadIdx.x == 0) { - shared_eval_result[eval_index] += block_eval * inverse_max_dcgs_ptr[eval_index]; - } - } - __syncthreads(); - } - for (data_size_t eval_index = static_cast(threadIdx.x); eval_index < num_eval; eval_index += static_cast(blockDim.x)) { - block_ndcg_buffer[eval_index * gridDim.x + blockIdx.x] = shared_eval_result[eval_index]; - } -} - -__global__ void ReduceNDCGFromBlocks( - const double* block_ndcg_buffer, - const data_size_t num_eval, - const int num_blocks, - double* ndcg_result, - const double sum_query_weights) { - __shared__ double shared_mem_buffer[32]; - const data_size_t eval_index = static_cast(blockIdx.x); - const double* block_ndcg_buffer_ptr = block_ndcg_buffer + eval_index * num_blocks; - double thread_sum = 0.0f; - for (data_size_t block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { - thread_sum += block_ndcg_buffer_ptr[block_index]; - } - __syncthreads(); - const double block_sum = ShuffleReduceSum(thread_sum, shared_mem_buffer, blockDim.x); - if (threadIdx.x == 0) { - ndcg_result[eval_index] = block_sum / sum_query_weights; - } -} - -#define EvalKernel_NDCG_ARGS \ - score, \ - cuda_label_, \ - cuda_query_weights_, \ - cuda_query_boundaries_, \ - num_queries_, \ - cuda_eval_at_, \ - num_eval_, \ - cuda_inverse_max_dcgs_, \ - cuda_label_gain_, \ - cuda_discount_, \ - cuda_block_dcg_buffer_ - -void CUDANDCGMetric::LaunchEvalKernel(const double* score) const { - const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK_METRIC - 1) / NUM_QUERY_PER_BLOCK_METRIC; - if (cuda_query_weights_ == nullptr) { - if (max_items_in_query_aligned_ <= 1024) { - if (num_eval_ <= 32) { - EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); - } else if (num_eval_ <= 256) { - EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); - } else if (num_eval_ <= 1024) { - EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); - } else { - Log::Fatal("Number of eval_at %d exceeds the maximum %d for NDCG metric in CUDA version.", num_eval_, 1024); - } - } else if (max_items_in_query_aligned_ <= 2048) { - if (num_eval_ <= 32) { - EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); - } else if (num_eval_ <= 256) { - EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); - } else if (num_eval_ <= 1024) { - EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); - } else { - Log::Fatal("Number of eval_at %d exceeds the maximum %d for NDCG metric in CUDA version.", num_eval_, 1024); - } - } else { - BitonicArgSortItemsGlobal(score, num_queries_, cuda_query_boundaries_, cuda_item_indices_buffer_); - if (num_eval_ <= 32) { - EvalKernel_NDCG_GlobalMemory<<>>(EvalKernel_NDCG_ARGS, cuda_item_indices_buffer_); - } else if (num_eval_ <= 256) { - EvalKernel_NDCG_GlobalMemory<<>>(EvalKernel_NDCG_ARGS, cuda_item_indices_buffer_); - } else if (num_eval_ <= 1024) { - EvalKernel_NDCG_GlobalMemory<<>>(EvalKernel_NDCG_ARGS, cuda_item_indices_buffer_); - } else { - Log::Fatal("Number of eval_at %d exceeds the maximum %d for NDCG metric in CUDA version.", num_eval_, 1024); - } - } - } else { - if (max_items_in_query_aligned_ <= 1024) { - if (num_eval_ <= 32) { - 
EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); - } else if (num_eval_ <= 256) { - EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); - } else if (num_eval_ <= 1024) { - EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); - } else { - Log::Fatal("Number of eval_at %d exceeds the maximum %d for NDCG metric in CUDA version.", num_eval_, 1024); - } - } else if (max_items_in_query_aligned_ <= 2048) { - if (num_eval_ <= 32) { - EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); - } else if (num_eval_ <= 256) { - EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); - } else if (num_eval_ <= 1024) { - EvalKernel_NDCG_SharedMemory<<>>(EvalKernel_NDCG_ARGS); - } else { - Log::Fatal("Number of eval_at %d exceeds the maximum %d for NDCG metric in CUDA version.", num_eval_, 1024); - } - } else { - BitonicArgSortItemsGlobal(score, num_queries_, cuda_query_boundaries_, cuda_item_indices_buffer_); - if (num_eval_ <= 32) { - EvalKernel_NDCG_GlobalMemory<<>>(EvalKernel_NDCG_ARGS, cuda_item_indices_buffer_); - } else if (num_eval_ <= 256) { - EvalKernel_NDCG_GlobalMemory<<>>(EvalKernel_NDCG_ARGS, cuda_item_indices_buffer_); - } else if (num_eval_ <= 1024) { - EvalKernel_NDCG_GlobalMemory<<>>(EvalKernel_NDCG_ARGS, cuda_item_indices_buffer_); - } else { - Log::Fatal("Number of eval_at %d exceeds the maximum %d for NDCG metric in CUDA version.", num_eval_, 1024); - } - } - } - ReduceNDCGFromBlocks<<>>( - cuda_block_dcg_buffer_, - num_eval_, - num_blocks, - cuda_ndcg_result_, - sum_query_weights_); -} - -} // namespace LightGBM diff --git a/src/metric/cuda/cuda_rank_metric.hpp b/src/metric/cuda/cuda_rank_metric.hpp deleted file mode 100644 index 5058fe9117b3..000000000000 --- a/src/metric/cuda/cuda_rank_metric.hpp +++ /dev/null @@ -1,48 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifndef LIGHTGBM_METRIC_CUDA_CUDA_RANK_METRIC_HPP_ -#define LIGHTGBM_METRIC_CUDA_CUDA_RANK_METRIC_HPP_ - -#include "cuda_metric.hpp" -#include "../rank_metric.hpp" - -#define EVAL_BLOCK_SIZE_RANK_METRIC (1024) -#define NUM_QUERY_PER_BLOCK_METRIC (10) -#define MAX_RANK_LABEL_METRIC (32) - -namespace LightGBM { - -class CUDANDCGMetric : public CUDAMetricInterface, public NDCGMetric { - public: - explicit CUDANDCGMetric(const Config& config); - - ~CUDANDCGMetric(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - std::vector Eval(const double* score, const ObjectiveFunction*) const override; - - private: - void LaunchEvalKernel(const double* score) const; - - const label_t* cuda_label_; - const label_t* cuda_weights_; - const data_size_t* cuda_query_boundaries_; - const label_t* cuda_query_weights_; - data_size_t* cuda_eval_at_; - double* cuda_inverse_max_dcgs_; - double* cuda_label_gain_; - double* cuda_discount_; - double* cuda_block_dcg_buffer_; - double* cuda_ndcg_result_; - data_size_t* cuda_item_indices_buffer_; - int max_items_in_query_aligned_; - int max_items_in_query_; - int num_eval_; -}; - -} // namespace LightGBM - -#endif // LIGHTGBM_METRIC_CUDA_CUDA_RANK_METRIC_HPP_ diff --git a/src/metric/cuda/cuda_regression_metric.cpp b/src/metric/cuda/cuda_regression_metric.cpp deleted file mode 100644 index 91af8b4c1e64..000000000000 --- a/src/metric/cuda/cuda_regression_metric.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. 
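CUDANDCGMetric::Init above rounds the largest query size up to a power of two so that the shared-memory paths (bitonic argsort over 1024 or 2048 slots) can be selected before falling back to the global-memory sort. A host-side restatement of that alignment step, with an illustrative helper name:

// Illustrative helper, not part of the patch.
inline int AlignToPowerOfTwoRef(int max_items_in_query) {
  int aligned = 1;
  --max_items_in_query;
  while (max_items_in_query > 0) {
    max_items_in_query >>= 1;
    aligned <<= 1;
  }
  return aligned;  // e.g. 1000 -> 1024, 1024 -> 1024, 1025 -> 2048
}

Each (query, eval_at) pair in the kernels then accumulates label_gain[label] * discount[rank] over the score-sorted items up to the cutoff and scales by the precomputed 1 / maxDCG. Below is a hedged host reference for that quantity on a single query, assuming LightGBM's default gains and discounts (2^label - 1 and 1 / log2(rank + 2)); queries with no relevant items count as 1, matching the inverse_max_dcgs < 0 branch above. The function name is illustrative.

#include <algorithm>
#include <cmath>
#include <functional>
#include <numeric>
#include <vector>

// Illustrative reference, not part of the patch.
inline double NDCGAtKRef(const std::vector<double>& scores,
                         const std::vector<int>& labels, int k) {
  std::vector<int> order(scores.size());
  std::iota(order.begin(), order.end(), 0);
  // sort item indices by descending score, mirroring the bitonic argsort
  std::stable_sort(order.begin(), order.end(),
                   [&scores](int a, int b) { return scores[a] > scores[b]; });
  std::vector<int> ideal(labels);
  std::sort(ideal.begin(), ideal.end(), std::greater<int>());
  const int cutoff = std::min<int>(k, static_cast<int>(order.size()));
  double dcg = 0.0;
  double max_dcg = 0.0;
  for (int rank = 0; rank < cutoff; ++rank) {
    const double discount = 1.0 / std::log2(rank + 2.0);
    dcg += (std::pow(2.0, labels[order[rank]]) - 1.0) * discount;
    max_dcg += (std::pow(2.0, ideal[rank]) - 1.0) * discount;
  }
  return max_dcg > 0.0 ? dcg / max_dcg : 1.0;
}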
See LICENSE file in the project root for license information. - */ - -#include "cuda_regression_metric.hpp" - -namespace LightGBM { - -template -CUDARegressionMetric::CUDARegressionMetric(const Config& config): RegressionMetric(config) {} - -template -CUDARegressionMetric::~CUDARegressionMetric() {} - -template -void CUDARegressionMetric::Init(const Metadata& metadata, data_size_t num_data) { - RegressionMetric::Init(metadata, num_data); - cuda_label_ = metadata.cuda_metadata()->cuda_label(); - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); - - const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_REGRESSION_METRIC - 1) / EVAL_BLOCK_SIZE_REGRESSION_METRIC; - AllocateCUDAMemoryOuter(&cuda_sum_loss_buffer_, static_cast(num_blocks), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_score_convert_buffer_, static_cast(num_data), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_sum_loss_, 1, __FILE__, __LINE__); -} - -template -std::vector CUDARegressionMetric::Eval(const double* score, const ObjectiveFunction* objective) const { - double sum_loss = 0.0f; - if (objective != nullptr) { - objective->GetCUDAConvertOutputFunc()(this->num_data_, score, cuda_score_convert_buffer_); - } - LaunchEvalKernel(cuda_score_convert_buffer_); - CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); - return std::vector(1, CUDAPointWiseLossCalculator::AverageLoss(sum_loss, this->sum_weights_)); -} - -CUDARMSEMetric::CUDARMSEMetric(const Config& config): CUDARegressionMetric(config) {} - -CUDAL2Metric::CUDAL2Metric(const Config& config): CUDARegressionMetric(config) {} - -CUDAL1Metric::CUDAL1Metric(const Config& config): CUDARegressionMetric(config) {} - -CUDAQuantileMetric::CUDAQuantileMetric(const Config& config): CUDARegressionMetric(config) {} - -CUDAHuberLossMetric::CUDAHuberLossMetric(const Config& config): CUDARegressionMetric(config) {} - -CUDAFairLossMetric::CUDAFairLossMetric(const Config& config): CUDARegressionMetric(config) {} - -CUDAPoissonMetric::CUDAPoissonMetric(const Config& config): CUDARegressionMetric(config) {} - -CUDAMAPEMetric::CUDAMAPEMetric(const Config& config): CUDARegressionMetric(config) {} - -CUDAGammaMetric::CUDAGammaMetric(const Config& config): CUDARegressionMetric(config) {} - -CUDAGammaDevianceMetric::CUDAGammaDevianceMetric(const Config& config): CUDARegressionMetric(config) {} - -CUDATweedieMetric::CUDATweedieMetric(const Config& config): CUDARegressionMetric(config) {} - -} // namespace LightGBM diff --git a/src/metric/cuda/cuda_regression_metric.cu b/src/metric/cuda/cuda_regression_metric.cu deleted file mode 100644 index f66ce058a766..000000000000 --- a/src/metric/cuda/cuda_regression_metric.cu +++ /dev/null @@ -1,128 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - - -#include -#include "cuda_regression_metric.hpp" - -namespace LightGBM { - -template -__global__ void EvalKernel_RegressionPointWiseLoss(const double* score, - const label_t* label, - const label_t* weights, - const data_size_t num_data, - const double sum_weight, - double* cuda_sum_loss_buffer, - const double alpha, - const double fair_c, - const double tweedie_variance_power) { - // assert that warpSize == 32 and maximum number of threads per block is 1024 - __shared__ double shared_buffer[32]; - const int data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const double pointwise_loss = data_index < num_data ? 
- (USE_WEIGHT ? CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score[data_index], alpha, fair_c, tweedie_variance_power) * weights[data_index] : - CUDAPointWiseLossCalculator::LossOnPointCUDA(label[data_index], score[data_index], alpha, fair_c, tweedie_variance_power)) : - 0.0f; - const double loss = ShuffleReduceSum(pointwise_loss, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - cuda_sum_loss_buffer[blockIdx.x] = loss; - } -} - -template -__global__ void ReduceLossKernel_Regression(const double* cuda_sum_loss_buffer, const data_size_t num_blocks, double* out_loss) { - __shared__ double shared_buffer[32]; - double thread_sum_loss = 0.0f; - for (int block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { - thread_sum_loss += cuda_sum_loss_buffer[block_index]; - } - const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - *out_loss = sum_loss; - } -} - -template -void CUDARegressionMetric::LaunchEvalKernelInner(const double* score) const { - const data_size_t num_blocks = (RegressionMetric::num_data_ + EVAL_BLOCK_SIZE_REGRESSION_METRIC - 1) / EVAL_BLOCK_SIZE_REGRESSION_METRIC; - if (cuda_weights_ == nullptr) { - EvalKernel_RegressionPointWiseLoss<<>>( - score, cuda_label_, cuda_weights_, - this->num_data_, - this->sum_weights_, - cuda_sum_loss_buffer_, - this->config_.alpha, - this->config_.fair_c, - this->config_.tweedie_variance_power); - } else { - EvalKernel_RegressionPointWiseLoss<<>>( - score, cuda_label_, cuda_weights_, - this->num_data_, - this->sum_weights_, - cuda_sum_loss_buffer_, - this->config_.alpha, - this->config_.fair_c, - this->config_.tweedie_variance_power); - } - ReduceLossKernel_Regression<<<1, EVAL_BLOCK_SIZE_REGRESSION_METRIC>>>(cuda_sum_loss_buffer_, num_blocks, cuda_sum_loss_); -} - -template <> -void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -template <> -void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -template <> -void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -template <> -void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -template <> -void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -template <> -void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -template <> -void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -template <> -void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -template <> -void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -template <> -void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -template <> -void CUDARegressionMetric::LaunchEvalKernel(const double* score) const { - LaunchEvalKernelInner(score); -} - -} // namespace LightGBM diff --git a/src/metric/cuda/cuda_regression_metric.hpp b/src/metric/cuda/cuda_regression_metric.hpp deleted file mode 100644 index fe82d2739daf..000000000000 --- a/src/metric/cuda/cuda_regression_metric.hpp +++ /dev/null @@ -1,258 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. 
All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifndef LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_ -#define LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_ - -#include "cuda_metric.hpp" -#include "../regression_metric.hpp" - -#define EVAL_BLOCK_SIZE_REGRESSION_METRIC (1024) - -namespace LightGBM { - -// TODO(shiyu1994): merge CUDARegressionMetric and CUDABinaryLossMetric into CUDAPointWiseMetric -template -class CUDARegressionMetric : public CUDAMetricInterface, public RegressionMetric { - public: - explicit CUDARegressionMetric(const Config& config); - - ~CUDARegressionMetric(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - std::vector Eval(const double* score, const ObjectiveFunction* objective) const override; - - inline static double AverageLoss(double sum_loss, double sum_weights) { - // need sqrt the result for RMSE loss - return (sum_loss / sum_weights); - } - - inline static double LossOnPoint(label_t /*label*/, double /*score*/, const Config& /*config*/) { - Log::Fatal("Calling host LossOnPoint for a CUDA metric."); - return 0.0f; - } - - protected: - void LaunchEvalKernel(const double* score) const; - - void LaunchEvalKernelInner(const double* score) const; - - __device__ inline static double SafeLogCUDA(const double x) { - if (x > 0) { - return log(x); - } else { - return -INFINITY; - } - } - - const label_t* cuda_label_; - const label_t* cuda_weights_; - double* cuda_score_convert_buffer_; - double* cuda_sum_loss_buffer_; - double* cuda_sum_loss_; -}; - -class CUDARMSEMetric : public CUDARegressionMetric { - public: - explicit CUDARMSEMetric(const Config& config); - - __device__ inline static double LossOnPointCUDA(label_t label, double score, - const double /*alpha*/, const double /*fair_c*/, const double /*tweedie_variance_power*/) { - return (score - label) * (score - label); - } - - inline static double AverageLoss(double sum_loss, double sum_weights) { - // need sqrt the result for RMSE loss - return std::sqrt(sum_loss / sum_weights); - } - - inline static const char* Name() { - return "rmse"; - } -}; - -class CUDAL2Metric : public CUDARegressionMetric { - public: - explicit CUDAL2Metric(const Config& config); - - __device__ inline static double LossOnPointCUDA(label_t label, double score, - const double /*alpha*/, const double /*fair_c*/, const double /*tweedie_variance_power*/) { - return (score - label)*(score - label); - } - - inline static const char* Name() { - return "l2"; - } -}; - -class CUDAQuantileMetric : public CUDARegressionMetric { - public: - explicit CUDAQuantileMetric(const Config& config); - - __device__ inline static double LossOnPointCUDA(label_t label, double score, - const double alpha, const double /*fair_c*/, const double /*tweedie_variance_power*/) { - double delta = label - score; - if (delta < 0) { - return (alpha - 1.0f) * delta; - } else { - return alpha * delta; - } - } - - inline static const char* Name() { - return "quantile"; - } -}; - -class CUDAL1Metric : public CUDARegressionMetric { - public: - explicit CUDAL1Metric(const Config& config); - - __device__ inline static double LossOnPointCUDA(label_t label, double score, - const double /*alpha*/, const double /*fair_c*/, const double /*tweedie_variance_power*/) { - return fabs(score - label); - } - - inline static const char* Name() { - return "l1"; - } -}; - -class CUDAHuberLossMetric : public CUDARegressionMetric { - public: - explicit CUDAHuberLossMetric(const Config& 
config); - - __device__ inline static double LossOnPointCUDA(label_t label, double score, - const double alpha, const double /*fair_c*/, const double /*tweedie_variance_power*/) { - const double diff = score - label; - if (fabs(diff) <= alpha) { - return 0.5f * diff * diff; - } else { - return alpha * (fabs(diff) - 0.5f * alpha); - } - } - - inline static const char* Name() { - return "huber"; - } -}; - -class CUDAFairLossMetric: public CUDARegressionMetric { - public: - explicit CUDAFairLossMetric(const Config& config); - - __device__ inline static double LossOnPointCUDA(label_t label, double score, - const double /*alpha*/, const double fair_c, const double /*tweedie_variance_power*/) { - const double x = fabs(score - label); - const double c = fair_c; - return c * x - c * c * log(1.0f + x / c); - } - - inline static const char* Name() { - return "fair"; - } -}; - -class CUDAPoissonMetric: public CUDARegressionMetric { - public: - explicit CUDAPoissonMetric(const Config& config); - - __device__ inline static double LossOnPointCUDA(label_t label, double score, - const double /*alpha*/, const double /*fair_c*/, const double /*tweedie_variance_power*/) { - const double eps = 1e-10f; - if (score < eps) { - score = eps; - } - return score - label * log(score); - } - - inline static const char* Name() { - return "poisson"; - } -}; - -class CUDAMAPEMetric : public CUDARegressionMetric { - public: - explicit CUDAMAPEMetric(const Config& config); - - __device__ inline static double LossOnPointCUDA(label_t label, double score, - const double /*alpha*/, const double /*fair_c*/, const double /*tweedie_variance_power*/) { - return fabs((label - score)) / fmax(1.0f, fabs(label)); - } - inline static const char* Name() { - return "mape"; - } -}; - -class CUDAGammaMetric : public CUDARegressionMetric { - public: - explicit CUDAGammaMetric(const Config& config); - - __device__ inline static double LossOnPointCUDA(label_t label, double score, - const double /*alpha*/, const double /*fair_c*/, const double /*tweedie_variance_power*/) { - const double psi = 1.0; - const double theta = -1.0 / score; - const double a = psi; - const double b = -SafeLogCUDA(-theta); - const double c = 1. 
/ psi * SafeLogCUDA(label / psi) - SafeLogCUDA(label) - 0; // 0 = std::lgamma(1.0 / psi) = std::lgamma(1.0); - return -((label * theta - b) / a + c); - } - inline static const char* Name() { - return "gamma"; - } - - inline static void CheckLabel(label_t label) { - CHECK_GT(label, 0); - } -}; - -class CUDAGammaDevianceMetric : public CUDARegressionMetric { - public: - explicit CUDAGammaDevianceMetric(const Config& config); - - __device__ inline static double LossOnPointCUDA(label_t label, double score, - const double /*alpha*/, const double /*fair_c*/, const double /*tweedie_variance_power*/) { - const double epsilon = 1.0e-9; - const double tmp = label / (score + epsilon); - return tmp - SafeLogCUDA(tmp) - 1; - } - - inline static const char* Name() { - return "gamma_deviance"; - } - - inline static double AverageLoss(double sum_loss, double) { - return sum_loss * 2; - } - - inline static void CheckLabel(label_t label) { - CHECK_GT(label, 0); - } -}; - -class CUDATweedieMetric : public CUDARegressionMetric { - public: - explicit CUDATweedieMetric(const Config& config); - - __device__ inline static double LossOnPointCUDA(label_t label, double score, - const double /*alpha*/, const double /*fair_c*/, const double tweedie_variance_power) { - const double rho = tweedie_variance_power; - const double eps = 1e-10f; - if (score < eps) { - score = eps; - } - const double a = label * exp((1 - rho) * log(score)) / (1 - rho); - const double b = exp((2 - rho) * log(score)) / (2 - rho); - return -a + b; - } - - inline static const char* Name() { - return "tweedie"; - } -}; - -} // namespace LightGBM - -#endif // LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_ diff --git a/src/metric/cuda/cuda_xentropy_metric.cpp b/src/metric/cuda/cuda_xentropy_metric.cpp deleted file mode 100644 index 7b7d69ace57d..000000000000 --- a/src/metric/cuda/cuda_xentropy_metric.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
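The LossOnPointCUDA bodies above are independent per-row formulas, so they are easy to spot-check against a CPU reference. Below is a hedged host-side restatement of two of them, the Huber loss (quadratic within alpha of the label, linear beyond) and the Fair loss (c*x - c^2 * log(1 + x/c) for the absolute residual x); the helper names are illustrative and not part of the patch.

#include <cmath>

// Illustrative host references, not part of the patch.
inline double HuberLossRef(double label, double score, double alpha) {
  const double diff = score - label;
  if (std::fabs(diff) <= alpha) {
    return 0.5 * diff * diff;
  }
  return alpha * (std::fabs(diff) - 0.5 * alpha);
}

inline double FairLossRef(double label, double score, double fair_c) {
  const double x = std::fabs(score - label);
  return fair_c * x - fair_c * fair_c * std::log(1.0 + x / fair_c);
}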
- */ - -#include "cuda_xentropy_metric.hpp" - -namespace LightGBM { - -CUDACrossEntropyMetric::CUDACrossEntropyMetric(const Config& config): CrossEntropyMetric(config) {} - -CUDACrossEntropyMetric::~CUDACrossEntropyMetric() {} - -void CUDACrossEntropyMetric::Init(const Metadata& metadata, data_size_t num_data) { - CrossEntropyMetric::Init(metadata, num_data); - cuda_label_ = metadata.cuda_metadata()->cuda_label(); - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); - AllocateCUDAMemoryOuter(&cuda_score_convert_buffer_, static_cast(num_data_), __FILE__, __LINE__); - - const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_XENTROPY_METRIC - 1) / EVAL_BLOCK_SIZE_XENTROPY_METRIC; - AllocateCUDAMemoryOuter(&cuda_sum_loss_buffer_, static_cast(num_blocks), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_sum_loss_, 1, __FILE__, __LINE__); -} - -std::vector CUDACrossEntropyMetric::Eval(const double* score, const ObjectiveFunction* objective) const { - double sum_loss = 0.0f; - if (objective != nullptr) { - objective->GetCUDAConvertOutputFunc()(num_data_, score, cuda_score_convert_buffer_); - } - LaunchEvalKernel(cuda_score_convert_buffer_); - CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); - return std::vector(1, sum_loss / sum_weights_); -} - -CUDACrossEntropyLambdaMetric::CUDACrossEntropyLambdaMetric(const Config& config): CrossEntropyLambdaMetric(config) {} - -CUDACrossEntropyLambdaMetric::~CUDACrossEntropyLambdaMetric() {} - -void CUDACrossEntropyLambdaMetric::Init(const Metadata& metadata, data_size_t num_data) { - CrossEntropyLambdaMetric::Init(metadata, num_data); - cuda_label_ = metadata.cuda_metadata()->cuda_label(); - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); - AllocateCUDAMemoryOuter(&cuda_score_convert_buffer_, static_cast(num_data_), __FILE__, __LINE__); - - const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_XENTROPY_METRIC - 1) / EVAL_BLOCK_SIZE_XENTROPY_METRIC; - AllocateCUDAMemoryOuter(&cuda_sum_loss_buffer_, static_cast(num_blocks), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_sum_loss_, 1, __FILE__, __LINE__); -} - -std::vector CUDACrossEntropyLambdaMetric::Eval(const double* score, const ObjectiveFunction* objective) const { - if (objective != nullptr) { - objective->GetCUDAConvertOutputFunc()(num_data_, score, cuda_score_convert_buffer_); - } - LaunchEvalKernel(cuda_score_convert_buffer_); - double sum_loss = 0.0f; - CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); - return std::vector(1, sum_loss / static_cast(num_data_)); -} - -CUDAKullbackLeiblerDivergence::CUDAKullbackLeiblerDivergence(const Config& config): KullbackLeiblerDivergence(config) {} - -CUDAKullbackLeiblerDivergence::~CUDAKullbackLeiblerDivergence() {} - -void CUDAKullbackLeiblerDivergence::Init(const Metadata& metadata, data_size_t num_data) { - KullbackLeiblerDivergence::Init(metadata, num_data); - cuda_label_ = metadata.cuda_metadata()->cuda_label(); - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); - AllocateCUDAMemoryOuter(&cuda_score_convert_buffer_, static_cast(num_data_), __FILE__, __LINE__); - - const data_size_t num_blocks = (num_data + EVAL_BLOCK_SIZE_XENTROPY_METRIC - 1) / EVAL_BLOCK_SIZE_XENTROPY_METRIC; - AllocateCUDAMemoryOuter(&cuda_sum_loss_buffer_, static_cast(num_blocks), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_sum_loss_, 1, __FILE__, __LINE__); -} - -std::vector CUDAKullbackLeiblerDivergence::Eval(const double* score, const ObjectiveFunction* objective) const 
{ - if (objective != nullptr) { - objective->GetCUDAConvertOutputFunc()(num_data_, score, cuda_score_convert_buffer_); - } - LaunchEvalKernel(cuda_score_convert_buffer_); - double sum_loss = 0.0f; - CopyFromCUDADeviceToHostOuter(&sum_loss, cuda_sum_loss_, 1, __FILE__, __LINE__); - return std::vector(1, presum_label_entropy_ + sum_loss / sum_weights_); -} - -} // namespace LightGBM diff --git a/src/metric/cuda/cuda_xentropy_metric.cu b/src/metric/cuda/cuda_xentropy_metric.cu deleted file mode 100644 index 793efa92e421..000000000000 --- a/src/metric/cuda/cuda_xentropy_metric.cu +++ /dev/null @@ -1,149 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - -#include "cuda_xentropy_metric.hpp" -#include - -namespace LightGBM { - -__device__ inline static double XentLossCUDA(label_t label, double prob) { - const double log_arg_epsilon = 1.0e-12; - double a = label; - if (prob > log_arg_epsilon) { - a *= log(prob); - } else { - a *= log(log_arg_epsilon); - } - double b = 1.0f - label; - if (1.0f - prob > log_arg_epsilon) { - b *= log(1.0f - prob); - } else { - b *= log(log_arg_epsilon); - } - return - (a + b); -} - -__device__ inline static double XentLambdaLossCUDA(label_t label, label_t weight, double hhat) { - return XentLossCUDA(label, 1.0f - exp(-weight * hhat)); -} - -template -__global__ void EvalKernel_CrossEntropy( - const double* score, - const label_t* cuda_label, - const label_t* cuda_weights, - const data_size_t num_data, - double* cuda_sum_loss_buffer) { - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - double point_loss = 0.0f; - __shared__ double shared_mem_buffer[32]; - if (data_index < num_data) { - const label_t label = cuda_label[data_index]; - if (!USE_WEIGHT) { - point_loss = XentLossCUDA(label, score[data_index]); - } else { - const label_t weight = cuda_weights[data_index]; - point_loss = XentLossCUDA(label, score[data_index]) * weight; - } - } - const double block_sum_loss = ShuffleReduceSum(point_loss, shared_mem_buffer, blockDim.x); - if (threadIdx.x == 0) { - cuda_sum_loss_buffer[blockIdx.x] = block_sum_loss; - } -} - -__global__ void ReduceLossKernel_CrossEntropy(const double* cuda_sum_loss_buffer, const data_size_t num_blocks, double* out_loss) { - __shared__ double shared_buffer[32]; - double thread_sum_loss = 0.0f; - for (int block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { - thread_sum_loss += cuda_sum_loss_buffer[block_index]; - } - const double sum_loss = ShuffleReduceSum(thread_sum_loss, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - *out_loss = sum_loss; - } -} - -void CUDACrossEntropyMetric::LaunchEvalKernel(const double* score) const { - const data_size_t num_blocks = (num_data_ + EVAL_BLOCK_SIZE_XENTROPY_METRIC - 1) / EVAL_BLOCK_SIZE_XENTROPY_METRIC; - if (cuda_weights_ == nullptr) { - EvalKernel_CrossEntropy<<>>(score, cuda_label_, cuda_weights_, num_data_, cuda_sum_loss_buffer_); - } else { - EvalKernel_CrossEntropy<<>>(score, cuda_label_, cuda_weights_, num_data_, cuda_sum_loss_buffer_); - } - ReduceLossKernel_CrossEntropy<<<1, EVAL_BLOCK_SIZE_XENTROPY_METRIC>>>(cuda_sum_loss_buffer_, num_blocks, cuda_sum_loss_); -} - -template -__global__ void EvalKernel_CrossEntropyLambda( - const double* score, - const label_t* cuda_label, - const label_t* cuda_weights, - const data_size_t num_data, - double* cuda_sum_loss_buffer) { - 
const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - double point_loss = 0.0f; - __shared__ double shared_mem_buffer[32]; - if (data_index < num_data) { - const label_t label = cuda_label[data_index]; - if (!USE_WEIGHT) { - point_loss = XentLambdaLossCUDA(label, 1.0f, score[data_index]); - } else { - const label_t weight = cuda_weights[data_index]; - point_loss = XentLambdaLossCUDA(label, weight, score[data_index]); - } - } - const double block_sum_loss = ShuffleReduceSum(point_loss, shared_mem_buffer, blockDim.x); - if (threadIdx.x == 0) { - cuda_sum_loss_buffer[blockIdx.x] = block_sum_loss; - } -} - -void CUDACrossEntropyLambdaMetric::LaunchEvalKernel(const double* score) const { - const data_size_t num_blocks = (num_data_ + EVAL_BLOCK_SIZE_XENTROPY_METRIC - 1) / EVAL_BLOCK_SIZE_XENTROPY_METRIC; - if (cuda_weights_ == nullptr) { - EvalKernel_CrossEntropyLambda<<>>(score, cuda_label_, cuda_weights_, num_data_, cuda_sum_loss_buffer_); - } else { - EvalKernel_CrossEntropyLambda<<>>(score, cuda_label_, cuda_weights_, num_data_, cuda_sum_loss_buffer_); - } - ReduceLossKernel_CrossEntropy<<<1, EVAL_BLOCK_SIZE_XENTROPY_METRIC>>>(cuda_sum_loss_buffer_, num_blocks, cuda_sum_loss_); -} - -template -__global__ void EvalKernel_KullbackLeiblerDivergence( - const double* score, - const label_t* cuda_label, - const label_t* cuda_weights, - const data_size_t num_data, - double* cuda_sum_loss_buffer) { - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - double point_loss = 0.0f; - __shared__ double shared_mem_buffer[32]; - if (data_index < num_data) { - const label_t label = cuda_label[data_index]; - if (!USE_WEIGHT) { - point_loss = XentLossCUDA(label, score[data_index]); - } else { - const label_t weight = cuda_weights[data_index]; - point_loss = XentLossCUDA(label, score[data_index]) * weight; - } - } - const double block_sum_loss = ShuffleReduceSum(point_loss, shared_mem_buffer, blockDim.x); - if (threadIdx.x == 0) { - cuda_sum_loss_buffer[blockIdx.x] = block_sum_loss; - } -} - -void CUDAKullbackLeiblerDivergence::LaunchEvalKernel(const double* score) const { - const data_size_t num_blocks = (num_data_ + EVAL_BLOCK_SIZE_XENTROPY_METRIC - 1) / EVAL_BLOCK_SIZE_XENTROPY_METRIC; - if (cuda_weights_ == nullptr) { - EvalKernel_KullbackLeiblerDivergence<<>>(score, cuda_label_, cuda_weights_, num_data_, cuda_sum_loss_buffer_); - } else { - EvalKernel_KullbackLeiblerDivergence<<>>(score, cuda_label_, cuda_weights_, num_data_, cuda_sum_loss_buffer_); - } - ReduceLossKernel_CrossEntropy<<<1, EVAL_BLOCK_SIZE_XENTROPY_METRIC>>>(cuda_sum_loss_buffer_, num_blocks, cuda_sum_loss_); -} - -} // namespace LightGBM diff --git a/src/metric/cuda/cuda_xentropy_metric.hpp b/src/metric/cuda/cuda_xentropy_metric.hpp deleted file mode 100644 index 7fbe6ae48de6..000000000000 --- a/src/metric/cuda/cuda_xentropy_metric.hpp +++ /dev/null @@ -1,77 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
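The cross-entropy kernels above clamp both log arguments to 1e-12 before taking logs, and the lambda variant first maps the raw output hhat through 1 - exp(-weight * hhat). A hedged host-side restatement of those two point losses, with illustrative helper names:

#include <algorithm>
#include <cmath>

// Illustrative host references, not part of the patch.
inline double XentLossRef(double label, double prob) {
  const double log_arg_epsilon = 1.0e-12;
  const double a = label * std::log(std::max(prob, log_arg_epsilon));
  const double b = (1.0 - label) * std::log(std::max(1.0 - prob, log_arg_epsilon));
  return -(a + b);
}

inline double XentLambdaLossRef(double label, double weight, double hhat) {
  // same transform as XentLambdaLossCUDA above
  return XentLossRef(label, 1.0 - std::exp(-weight * hhat));
}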
- */ -#ifndef LIGHTGBM_METRIC_CUDA_CUDA_XENTROPY_METRIC_HPP_ -#define LIGHTGBM_METRIC_CUDA_CUDA_XENTROPY_METRIC_HPP_ - -#include "cuda_metric.hpp" -#include "../xentropy_metric.hpp" - -#define EVAL_BLOCK_SIZE_XENTROPY_METRIC (1024) - -namespace LightGBM { - -class CUDACrossEntropyMetric : public CUDAMetricInterface, public CrossEntropyMetric { - public: - explicit CUDACrossEntropyMetric(const Config&); - - ~CUDACrossEntropyMetric(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - std::vector Eval(const double* score, const ObjectiveFunction* objective) const; - - private: - void LaunchEvalKernel(const double* score) const; - - const label_t* cuda_label_; - const label_t* cuda_weights_; - double* cuda_score_convert_buffer_; - double* cuda_sum_loss_buffer_; - double* cuda_sum_loss_; -}; - -class CUDACrossEntropyLambdaMetric : public CUDAMetricInterface, public CrossEntropyLambdaMetric { - public: - explicit CUDACrossEntropyLambdaMetric(const Config&); - - ~CUDACrossEntropyLambdaMetric(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - std::vector Eval(const double* score, const ObjectiveFunction* objective) const; - - private: - void LaunchEvalKernel(const double* score) const; - - const label_t* cuda_label_; - const label_t* cuda_weights_; - double* cuda_score_convert_buffer_; - double* cuda_sum_loss_buffer_; - double* cuda_sum_loss_; -}; - -class CUDAKullbackLeiblerDivergence : public CUDAMetricInterface, public KullbackLeiblerDivergence { - public: - explicit CUDAKullbackLeiblerDivergence(const Config&); - - ~CUDAKullbackLeiblerDivergence(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - std::vector Eval(const double* score, const ObjectiveFunction* objective) const; - - private: - void LaunchEvalKernel(const double* score) const; - - const label_t* cuda_label_; - const label_t* cuda_weights_; - double* cuda_score_convert_buffer_; - double* cuda_sum_loss_buffer_; - double* cuda_sum_loss_; -}; - -} // namespace LightGBM - -#endif // LIGHTGBM_METRIC_CUDA_CUDA_XENTROPY_METRIC_HPP_ diff --git a/src/metric/metric.cpp b/src/metric/metric.cpp index f9fb8c7efe19..321399405aa8 100644 --- a/src/metric/metric.cpp +++ b/src/metric/metric.cpp @@ -11,107 +11,55 @@ #include "regression_metric.hpp" #include "xentropy_metric.hpp" -#include "cuda/cuda_binary_metric.hpp" -#include "cuda/cuda_regression_metric.hpp" -#include "cuda/cuda_multiclass_metric.hpp" -#include "cuda/cuda_xentropy_metric.hpp" -#include "cuda/cuda_rank_metric.hpp" - namespace LightGBM { Metric* Metric::CreateMetric(const std::string& type, const Config& config) { - if (config.device_type == std::string("cuda")) { - if (type == std::string("l2")) { - return new CUDAL2Metric(config); - } else if (type == std::string("rmse")) { - return new CUDARMSEMetric(config); - } else if (type == std::string("l1")) { - return new CUDAL1Metric(config); - } else if (type == std::string("quantile")) { - return new CUDAQuantileMetric(config); - } else if (type == std::string("huber")) { - return new CUDAHuberLossMetric(config); - } else if (type == std::string("fair")) { - return new CUDAFairLossMetric(config); - } else if (type == std::string("poisson")) { - return new CUDAPoissonMetric(config); - } else if (type == std::string("binary_logloss")) { - return new CUDABinaryLoglossMetric(config); - } else if (type == std::string("binary_error")) { - return new CUDABinaryErrorMetric(config); - } else if (type == std::string("auc")) { - return new CUDAAUCMetric(config); - } 
else if (type == std::string("average_precision")) { - return new CUDAAveragePrecisionMetric(config); - } else if (type == std::string("multi_logloss")) { - return new CUDAMultiSoftmaxLoglossMetric(config); - } else if (type == std::string("multi_error")) { - return new CUDAMultiErrorMetric(config); - } else if (type == std::string("ndcg")) { - return new CUDANDCGMetric(config); - } else if (type == std::string("cross_entropy")) { - return new CUDACrossEntropyMetric(config); - } else if (type == std::string("cross_entropy_lambda")) { - return new CUDACrossEntropyLambdaMetric(config); - } else if (type == std::string("kullback_leibler")) { - return new CUDAKullbackLeiblerDivergence(config); - } else if (type == std::string("mape")) { - return new CUDAMAPEMetric(config); - } else if (type == std::string("gamma")) { - return new CUDAGammaMetric(config); - } else if (type == std::string("gamma_deviance")) { - return new CUDAGammaDevianceMetric(config); - } else if (type == std::string("tweedie")) { - return new CUDATweedieMetric(config); - } - } else { - if (type == std::string("l2")) { - return new L2Metric(config); - } else if (type == std::string("rmse")) { - return new RMSEMetric(config); - } else if (type == std::string("l1")) { - return new L1Metric(config); - } else if (type == std::string("quantile")) { - return new QuantileMetric(config); - } else if (type == std::string("huber")) { - return new HuberLossMetric(config); - } else if (type == std::string("fair")) { - return new FairLossMetric(config); - } else if (type == std::string("poisson")) { - return new PoissonMetric(config); - } else if (type == std::string("binary_logloss")) { - return new BinaryLoglossMetric(config); - } else if (type == std::string("binary_error")) { - return new BinaryErrorMetric(config); - } else if (type == std::string("auc")) { - return new AUCMetric(config); - } else if (type == std::string("average_precision")) { - return new AveragePrecisionMetric(config); - } else if (type == std::string("auc_mu")) { - return new AucMuMetric(config); - } else if (type == std::string("ndcg")) { - return new NDCGMetric(config); - } else if (type == std::string("map")) { - return new MapMetric(config); - } else if (type == std::string("multi_logloss")) { - return new MultiSoftmaxLoglossMetric(config); - } else if (type == std::string("multi_error")) { - return new MultiErrorMetric(config); - } else if (type == std::string("cross_entropy")) { - return new CrossEntropyMetric(config); - } else if (type == std::string("cross_entropy_lambda")) { - return new CrossEntropyLambdaMetric(config); - } else if (type == std::string("kullback_leibler")) { - return new KullbackLeiblerDivergence(config); - } else if (type == std::string("mape")) { - return new MAPEMetric(config); - } else if (type == std::string("gamma")) { - return new GammaMetric(config); - } else if (type == std::string("gamma_deviance")) { - return new GammaDevianceMetric(config); - } else if (type == std::string("tweedie")) { - return new TweedieMetric(config); - } + if (type == std::string("l2")) { + return new L2Metric(config); + } else if (type == std::string("rmse")) { + return new RMSEMetric(config); + } else if (type == std::string("l1")) { + return new L1Metric(config); + } else if (type == std::string("quantile")) { + return new QuantileMetric(config); + } else if (type == std::string("huber")) { + return new HuberLossMetric(config); + } else if (type == std::string("fair")) { + return new FairLossMetric(config); + } else if (type == std::string("poisson")) 
{ + return new PoissonMetric(config); + } else if (type == std::string("binary_logloss")) { + return new BinaryLoglossMetric(config); + } else if (type == std::string("binary_error")) { + return new BinaryErrorMetric(config); + } else if (type == std::string("auc")) { + return new AUCMetric(config); + } else if (type == std::string("average_precision")) { + return new AveragePrecisionMetric(config); + } else if (type == std::string("auc_mu")) { + return new AucMuMetric(config); + } else if (type == std::string("ndcg")) { + return new NDCGMetric(config); + } else if (type == std::string("map")) { + return new MapMetric(config); + } else if (type == std::string("multi_logloss")) { + return new MultiSoftmaxLoglossMetric(config); + } else if (type == std::string("multi_error")) { + return new MultiErrorMetric(config); + } else if (type == std::string("cross_entropy")) { + return new CrossEntropyMetric(config); + } else if (type == std::string("cross_entropy_lambda")) { + return new CrossEntropyLambdaMetric(config); + } else if (type == std::string("kullback_leibler")) { + return new KullbackLeiblerDivergence(config); + } else if (type == std::string("mape")) { + return new MAPEMetric(config); + } else if (type == std::string("gamma")) { + return new GammaMetric(config); + } else if (type == std::string("gamma_deviance")) { + return new GammaDevianceMetric(config); + } else if (type == std::string("tweedie")) { + return new TweedieMetric(config); } Log::Fatal("Unknown metric type name: %s", type.c_str()); return nullptr; diff --git a/src/objective/cuda/cuda_binary_objective.cpp b/src/objective/cuda/cuda_binary_objective.cpp deleted file mode 100644 index a2465caf2ad9..000000000000 --- a/src/objective/cuda/cuda_binary_objective.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
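CreateMetric above dispatches on the metric name with a long if/else chain (the CUDA branch being removed by this patch). The sketch below is not LightGBM's code; it only illustrates the equivalent registry-style dispatch with a couple of hypothetical metric types, which reduces adding a metric to a single map entry.

#include <functional>
#include <memory>
#include <stdexcept>
#include <string>
#include <unordered_map>

// Hypothetical stand-ins for the real Config / Metric types.
struct Config {};
struct Metric { virtual ~Metric() = default; };
struct L2Metric : Metric { explicit L2Metric(const Config&) {} };
struct AUCMetric : Metric { explicit AUCMetric(const Config&) {} };

using MetricFactory = std::function<std::unique_ptr<Metric>(const Config&)>;

// Name -> constructor registry; each metric is one entry instead of one branch.
const std::unordered_map<std::string, MetricFactory>& Registry() {
  static const std::unordered_map<std::string, MetricFactory> registry = {
    {"l2",  [](const Config& c) { return std::unique_ptr<Metric>(new L2Metric(c)); }},
    {"auc", [](const Config& c) { return std::unique_ptr<Metric>(new AUCMetric(c)); }},
  };
  return registry;
}

std::unique_ptr<Metric> CreateMetricByName(const std::string& type, const Config& config) {
  const auto it = Registry().find(type);
  if (it == Registry().end()) {
    throw std::runtime_error("Unknown metric type name: " + type);
  }
  return it->second(config);
}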
- */ - -#ifdef USE_CUDA - -#include "cuda_binary_objective.hpp" - -namespace LightGBM { - -CUDABinaryLogloss::CUDABinaryLogloss(const Config& config): -BinaryLogloss(config), ova_class_id_(-1) {} - -CUDABinaryLogloss::CUDABinaryLogloss(const Config& config, const int ova_class_id): -BinaryLogloss(config, [ova_class_id](label_t label) { return static_cast(label) == ova_class_id; }), ova_class_id_(ova_class_id) {} - -CUDABinaryLogloss::CUDABinaryLogloss(const std::vector& strs): BinaryLogloss(strs) {} - -CUDABinaryLogloss::~CUDABinaryLogloss() {} - -void CUDABinaryLogloss::Init(const Metadata& metadata, data_size_t num_data) { - BinaryLogloss::Init(metadata, num_data); - if (ova_class_id_ == -1) { - cuda_label_ = metadata.cuda_metadata()->cuda_label(); - cuda_ova_label_ = nullptr; - } else { - Log::Warning("converting cuda labels with ova_class_id_ = %d", ova_class_id_); - InitCUDAMemoryFromHostMemoryOuter(&cuda_ova_label_, metadata.cuda_metadata()->cuda_label(), static_cast(num_data), __FILE__, __LINE__); - LaunchResetOVACUDALableKernel(); - cuda_label_ = cuda_ova_label_; - } - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); - AllocateCUDAMemoryOuter(&cuda_boost_from_score_, 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_boost_from_score_, 0, 1, __FILE__, __LINE__); - if (label_weights_[0] != 1.0f || label_weights_[1] != 1.0f) { - InitCUDAMemoryFromHostMemoryOuter(&cuda_label_weights_, label_weights_, 2, __FILE__, __LINE__); - } else { - cuda_label_weights_ = nullptr; - } -} - -void CUDABinaryLogloss::GetGradients(const double* scores, score_t* gradients, score_t* hessians) const { - LaunchGetGradientsKernel(scores, gradients, hessians); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -double CUDABinaryLogloss::BoostFromScore(int) const { - LaunchBoostFromScoreKernel(); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - double boost_from_score = 0.0f; - CopyFromCUDADeviceToHostOuter(&boost_from_score, cuda_boost_from_score_, 1, __FILE__, __LINE__); - Log::Warning("boost_from_score = %f", boost_from_score); - return boost_from_score; -} - -void CUDABinaryLogloss::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { - LaunchConvertOutputCUDAKernel(num_data, input, output); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_binary_objective.cu b/src/objective/cuda/cuda_binary_objective.cu deleted file mode 100644 index f29dacbe1ca0..000000000000 --- a/src/objective/cuda/cuda_binary_objective.cu +++ /dev/null @@ -1,165 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
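CUDABinaryLogloss::BoostFromScore above reduces the labels on the device and then applies the same transform as the CPU objective: the initial raw score is the log-odds of the average label divided by the sigmoid parameter. A minimal host-side reference of the unweighted computation performed by the two BoostFromScore kernels (hypothetical helper name):

#include <cmath>
#include <cstdio>
#include <vector>

// Unweighted reference: suml = sum of labels, pavg = suml / num_data,
// init_score = log(pavg / (1 - pavg)) / sigmoid.
double BinaryBoostFromScoreReference(const std::vector<float>& labels, double sigmoid) {
  double suml = 0.0;
  for (float label : labels) suml += label;
  const double pavg = suml / static_cast<double>(labels.size());
  return std::log(pavg / (1.0 - pavg)) / sigmoid;
}

int main() {
  const std::vector<float> labels = {1, 0, 0, 1, 1, 0, 1, 0};  // half positives -> log-odds 0
  printf("init score = %f\n", BinaryBoostFromScoreReference(labels, 1.0));
  return 0;
}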
- */ - -#ifdef USE_CUDA - -#include "cuda_binary_objective.hpp" - -namespace LightGBM { - -__global__ void BoostFromScoreKernel_1_BinaryLogloss(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_init_score) { - __shared__ label_t shared_label[CALC_INIT_SCORE_BLOCK_SIZE_BINARY]; - const unsigned int tid = threadIdx.x; - const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY; - shared_label[tid] = 0.0f; - __syncthreads(); - for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY; ++j) { - if (i + j < num_data) { - shared_label[tid] += cuda_labels[i + j]; - } - } - __syncthreads(); - for (unsigned int s = 1; s < blockDim.x; s *= 2) { - if (tid % (2 * s) == 0 && (tid + s) < CALC_INIT_SCORE_BLOCK_SIZE_BINARY) { - shared_label[tid] += shared_label[tid + s]; - } - __syncthreads(); - } - if (tid == 0) { - atomicAdd_system(out_cuda_init_score, shared_label[0]); - } -} - -__global__ void BoostFromScoreKernel_2_BinaryLogloss(double* out_cuda_init_score, const data_size_t num_data, const double sigmoid) { - const double suml = *out_cuda_init_score; - const double sumw = static_cast(num_data); - const double pavg = suml / sumw; - const double init_score = log(pavg / (1.0f - pavg)) / sigmoid; - *out_cuda_init_score = init_score; -} - -void CUDABinaryLogloss::LaunchBoostFromScoreKernel() const { - const data_size_t num_data_per_block = CALC_INIT_SCORE_BLOCK_SIZE_BINARY * NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY; - const int num_blocks = (num_data_ + num_data_per_block - 1) / num_data_per_block; - BoostFromScoreKernel_1_BinaryLogloss<<>>(cuda_label_, num_data_, cuda_boost_from_score_); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - BoostFromScoreKernel_2_BinaryLogloss<<<1, 1>>>(cuda_boost_from_score_, num_data_, sigmoid_); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -template -__global__ void GetGradientsKernel_BinaryLogloss(const double* cuda_scores, const label_t* cuda_labels, - const double* cuda_label_weights, const label_t* cuda_weights, const int ova_class_id, - const double sigmoid, const data_size_t num_data, - score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); - if (data_index < num_data) { - const label_t cuda_label = static_cast(cuda_labels[data_index]); - const int label = IS_OVA ? (cuda_label == ova_class_id ? 1 : -1) : (cuda_label > 0 ? 
1 : -1); - const double response = -label * sigmoid / (1.0f + exp(label * sigmoid * cuda_scores[data_index])); - const double abs_response = fabs(response); - if (!USE_WEIGHT) { - if (USE_LABEL_WEIGHT) { - const double label_weight = cuda_label_weights[label]; - cuda_out_gradients[data_index] = static_cast(response * label_weight); - cuda_out_hessians[data_index] = static_cast(abs_response * (sigmoid - abs_response) * label_weight); - } else { - cuda_out_gradients[data_index] = static_cast(response); - cuda_out_hessians[data_index] = static_cast(abs_response * (sigmoid - abs_response)); - } - } else { - const double sample_weight = cuda_weights[data_index]; - if (USE_LABEL_WEIGHT) { - const double label_weight = cuda_label_weights[label]; - cuda_out_gradients[data_index] = static_cast(response * label_weight * sample_weight); - cuda_out_hessians[data_index] = static_cast(abs_response * (sigmoid - abs_response) * label_weight * sample_weight); - } else { - cuda_out_gradients[data_index] = static_cast(response * sample_weight); - cuda_out_hessians[data_index] = static_cast(abs_response * (sigmoid - abs_response) * sample_weight); - } - } - } -} - -#define GetGradientsKernel_BinaryLogloss_ARGS \ - scores, \ - cuda_label_, \ - cuda_label_weights_, \ - cuda_weights_, \ - ova_class_id_, \ - sigmoid_, \ - num_data_, \ - gradients, \ - hessians - -void CUDABinaryLogloss::LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_BINARY - 1) / GET_GRADIENTS_BLOCK_SIZE_BINARY; - if (ova_class_id_ == -1) { - if (cuda_label_weights_ == nullptr) { - if (cuda_weights_ == nullptr) { - GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); - } else { - GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); - } - } else { - if (cuda_weights_ == nullptr) { - GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); - } else { - GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); - } - } - } else { - if (cuda_label_weights_ == nullptr) { - if (cuda_weights_ == nullptr) { - GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); - } else { - GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); - } - } else { - if (cuda_weights_ == nullptr) { - GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); - } else { - GetGradientsKernel_BinaryLogloss<<>>(GetGradientsKernel_BinaryLogloss_ARGS); - } - } - } -} - -#undef GetGradientsKernel_BinaryLogloss_ARGS - -__global__ void ConvertOutputCUDAKernel_BinaryLogloss(const double sigmoid, const data_size_t num_data, const double* input, double* output) { - const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (data_index < num_data) { - output[data_index] = 1.0f / (1.0f + exp(-sigmoid * input[data_index])); - } -} - -void CUDABinaryLogloss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { - const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_BINARY - 1) / GET_GRADIENTS_BLOCK_SIZE_BINARY; - ConvertOutputCUDAKernel_BinaryLogloss<<>>(sigmoid_, num_data, input, output); -} - -__global__ void ResetOVACUDALableKernel( - const int ova_class_id, - const data_size_t num_data, - label_t* cuda_label) { - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (data_index < num_data) { - const int int_label = 
static_cast(cuda_label[data_index]); - cuda_label[data_index] == (int_label == ova_class_id ? 1.0f : 0.0f); - } -} - -void CUDABinaryLogloss::LaunchResetOVACUDALableKernel() const { - Log::Warning("before LaunchResetOVACUDALableKernel, ova_class_id = %d", ova_class_id_); - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_BINARY - 1) / GET_GRADIENTS_BLOCK_SIZE_BINARY; - ResetOVACUDALableKernel<<>>(ova_class_id_, num_data_, cuda_ova_label_); - Log::Warning("after LaunchResetOVACUDALableKernel"); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_binary_objective.hpp b/src/objective/cuda/cuda_binary_objective.hpp deleted file mode 100644 index f7397593353f..000000000000 --- a/src/objective/cuda/cuda_binary_objective.hpp +++ /dev/null @@ -1,68 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_ -#define LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_ - -#ifdef USE_CUDA - -#define GET_GRADIENTS_BLOCK_SIZE_BINARY (1024) -#define CALC_INIT_SCORE_BLOCK_SIZE_BINARY (1024) -#define NUM_DATA_THREAD_ADD_CALC_INIT_SCORE_BINARY (6) - -#include -#include "../binary_objective.hpp" - -namespace LightGBM { - -class CUDABinaryLogloss : public CUDAObjectiveInterface, public BinaryLogloss { - public: - explicit CUDABinaryLogloss(const Config& config); - - explicit CUDABinaryLogloss(const Config& config, const int ova_class_id); - - explicit CUDABinaryLogloss(const std::vector& strs); - - ~CUDABinaryLogloss(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - void GetGradients(const double* scores, score_t* gradients, score_t* hessians) const override; - - double BoostFromScore(int) const override; - - void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; - - std::function GetCUDAConvertOutputFunc() const override { - return [this] (data_size_t num_data, const double* input, double* output) { - ConvertOutputCUDA(num_data, input, output); - }; - } - - private: - void LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const; - - void LaunchBoostFromScoreKernel() const; - - void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; - - void LaunchResetOVACUDALableKernel() const; - - // CUDA memory, held by other objects - const label_t* cuda_label_; - label_t* cuda_ova_label_; - const label_t* cuda_weights_; - - // CUDA memory, held by this object - double* cuda_boost_from_score_; - double* cuda_label_weights_; - const int ova_class_id_ = -1; -}; - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_ diff --git a/src/objective/cuda/cuda_multiclass_objective.cpp b/src/objective/cuda/cuda_multiclass_objective.cpp deleted file mode 100644 index 5a5e5d1be99d..000000000000 --- a/src/objective/cuda/cuda_multiclass_objective.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
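GetGradientsKernel_BinaryLogloss in the .cu file above evaluates the usual sigmoid cross-entropy gradient and hessian per data point. A host reference for the unweighted, equal-label-weight case (hypothetical helper name) that mirrors the kernel's arithmetic:

#include <cmath>
#include <cstdio>

// Mirrors the per-point arithmetic of GetGradientsKernel_BinaryLogloss for the
// unweighted case: labels are mapped to {-1, +1}, then
//   response = -label * sigmoid / (1 + exp(label * sigmoid * score))
//   gradient = response
//   hessian  = |response| * (sigmoid - |response|)
void BinaryLoglossPointGradient(float raw_label, double score, double sigmoid,
                                double* gradient, double* hessian) {
  const int label = raw_label > 0 ? 1 : -1;
  const double response = -label * sigmoid / (1.0 + std::exp(label * sigmoid * score));
  const double abs_response = std::fabs(response);
  *gradient = response;
  *hessian = abs_response * (sigmoid - abs_response);
}

int main() {
  double g = 0.0, h = 0.0;
  BinaryLoglossPointGradient(1.0f, 0.0, 1.0, &g, &h);  // p = 0.5 -> gradient -0.5, hessian 0.25
  printf("gradient = %f, hessian = %f\n", g, h);
  return 0;
}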
- */ -#include "cuda_multiclass_objective.hpp" - -namespace LightGBM { - -CUDAMulticlassSoftmax::CUDAMulticlassSoftmax(const Config& config): MulticlassSoftmax(config) {} - -CUDAMulticlassSoftmax::CUDAMulticlassSoftmax(const std::vector& strs): MulticlassSoftmax(strs) {} - -CUDAMulticlassSoftmax::~CUDAMulticlassSoftmax() {} - -void CUDAMulticlassSoftmax::Init(const Metadata& metadata, data_size_t num_data) { - MulticlassSoftmax::Init(metadata, num_data); - cuda_label_ = metadata.cuda_metadata()->cuda_label(); - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); - AllocateCUDAMemoryOuter(&cuda_boost_from_score_, num_class_, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_softmax_buffer_, static_cast(num_data) * static_cast(num_class_), __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_boost_from_score_, 0, num_class_, __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -void CUDAMulticlassSoftmax::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { - LaunchGetGradientsKernel(score, gradients, hessians); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - for (int class_index = 0; class_index < num_class_; ++class_index) { - std::vector host_gradients(num_data_, 0.0f); - std::vector host_hessians(num_data_, 0.0f); - const size_t offset = static_cast(class_index * num_data_); - CopyFromCUDADeviceToHostOuter(host_gradients.data(), gradients + offset, static_cast(num_data_), __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(host_hessians.data(), hessians + offset, static_cast(num_data_), __FILE__, __LINE__); - const int num_threads = OMP_NUM_THREADS(); - std::vector thread_abs_max_gradient(num_threads, 0.0f); - std::vector thread_abs_max_hessian(num_threads, 0.0f); - std::vector thread_abs_min_hessian(num_threads, std::numeric_limits::infinity()); - Threading::For(0, num_data_, 512, - [&thread_abs_max_gradient, &thread_abs_max_hessian, &thread_abs_min_hessian, &host_gradients, &host_hessians] (int thread_index, data_size_t start, data_size_t end) { - for (data_size_t index = start; index < end; ++index) { - const score_t gradient = host_gradients[index]; - const score_t hessian = host_hessians[index]; - if (std::fabs(gradient) > std::fabs(thread_abs_max_gradient[thread_index])) { - thread_abs_max_gradient[thread_index] = gradient; - } - if (std::fabs(hessian) > std::fabs(thread_abs_max_hessian[thread_index])) { - thread_abs_max_hessian[thread_index] = hessian; - } - if (std::fabs(hessian) < std::fabs(thread_abs_min_hessian[thread_index])) { - thread_abs_min_hessian[thread_index] = hessian; - } - } - }); - double max_abs_gradient = 0.0f; - double max_abs_hessian = 0.0f; - double min_abs_hessian = std::numeric_limits::infinity(); - for (int thread_index = 0; thread_index < num_threads; ++thread_index) { - if (std::fabs(thread_abs_max_gradient[thread_index]) > std::fabs(max_abs_gradient)) { - max_abs_gradient = thread_abs_max_gradient[thread_index]; - } - if (std::fabs(thread_abs_max_hessian[thread_index] > std::fabs(max_abs_hessian))) { - max_abs_hessian = thread_abs_max_hessian[thread_index]; - } - if (std::fabs(thread_abs_min_hessian[thread_index] < std::fabs(min_abs_hessian))) { - min_abs_hessian = thread_abs_min_hessian[thread_index]; - } - } - Log::Warning("class %d max_abs_gradient = %f, max_abs_hessian = %f, min_abs_hessian = %f", class_index, max_abs_gradient, max_abs_hessian, min_abs_hessian); - } -} - -void CUDAMulticlassSoftmax::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { - 
LaunchConvertOutputCUDAKernel(num_data, input, output); -} - -CUDAMulticlassOVA::CUDAMulticlassOVA(const Config& config) { - num_class_ = config.num_class; - for (int i = 0; i < num_class_; ++i) { - cuda_binary_loss_.emplace_back(new CUDABinaryLogloss(config, i)); - } - sigmoid_ = config.sigmoid; -} - -CUDAMulticlassOVA::CUDAMulticlassOVA(const std::vector& strs): MulticlassOVA(strs) {} - -CUDAMulticlassOVA::~CUDAMulticlassOVA() {} - -void CUDAMulticlassOVA::Init(const Metadata& metadata, data_size_t num_data) { - num_data_ = num_data; - for (int i = 0; i < num_class_; ++i) { - cuda_binary_loss_[i]->Init(metadata, num_data); - } -} - -void CUDAMulticlassOVA::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { - for (int i = 0; i < num_class_; ++i) { - int64_t offset = static_cast(num_data_) * i; - cuda_binary_loss_[i]->GetGradients(score + offset, gradients + offset, hessians + offset); - } -} - -void CUDAMulticlassOVA::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { - for (int i = 0; i < num_class_; ++i) { - cuda_binary_loss_[i]->ConvertOutputCUDA(num_data, input + i * num_data, output + i * num_data); - } -} - -} // namespace LightGBM diff --git a/src/objective/cuda/cuda_multiclass_objective.cu b/src/objective/cuda/cuda_multiclass_objective.cu deleted file mode 100644 index 98fc1d0f460b..000000000000 --- a/src/objective/cuda/cuda_multiclass_objective.cu +++ /dev/null @@ -1,104 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#include "cuda_multiclass_objective.hpp" - -namespace LightGBM { - -__device__ void SoftmaxCUDA(double* softmax_buffer, int len) { - double wmax = softmax_buffer[0]; - for (int i = 1; i < len; ++i) { - wmax = max(softmax_buffer[i], wmax); - } - double wsum = 0.0f; - for (int i = 0; i < len; ++i) { - softmax_buffer[i] = exp(softmax_buffer[i] - wmax); - wsum += softmax_buffer[i]; - } - for (int i = 0; i < len; ++i) { - softmax_buffer[i] /= static_cast(wsum); - } -} - -template -__global__ void GetGradientsKernel_MulticlassSoftmax( - const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, - const double factor, const int num_class, const data_size_t num_data, - double* cuda_softmax_buffer, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (data_index < num_data) { - const data_size_t offset = data_index * num_class; - double* softmax_result = cuda_softmax_buffer + offset; - for (int k = 0; k < num_class; ++k) { - const double point_score = cuda_scores[k * num_data + data_index]; - if (isnan(point_score)) { - printf("error find nan %f in score ==================================================\n", point_score); - } - softmax_result[k] = cuda_scores[k * num_data + data_index]; - } - SoftmaxCUDA(softmax_result, num_class); - if (!USE_WEIGHT) { - for (int k = 0; k < num_class; ++k) { - const double p = softmax_result[k]; - size_t idx = static_cast(num_data) * k + data_index; - if (static_cast(cuda_labels[data_index]) == k) { - cuda_out_gradients[idx] = static_cast(p - 1.0f); - } else { - cuda_out_gradients[idx] = static_cast(p); - } - cuda_out_hessians[idx] = static_cast(factor * p * (1.0f - p)); - } - } else { - for (int k = 0; k < num_class; ++k) { - const double p = softmax_result[k]; - const double weight = cuda_weights[data_index]; - size_t 
idx = static_cast(num_data) * k + data_index; - if (static_cast(cuda_labels[data_index]) == k) { - cuda_out_gradients[idx] = static_cast((p - 1.0f) * weight); - } else { - cuda_out_gradients[idx] = static_cast(p * weight); - } - cuda_out_hessians[idx] = static_cast((factor * p * (1.0f - p)) * weight); - } - } - } -} - -void CUDAMulticlassSoftmax::LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_MULTICLASS - 1) / GET_GRADIENTS_BLOCK_SIZE_MULTICLASS; - if (cuda_weights_ == nullptr) { - GetGradientsKernel_MulticlassSoftmax<<>>( - scores, cuda_label_, cuda_weights_, factor_, num_class_, num_data_, - cuda_softmax_buffer_, gradients, hessians); - } else { - GetGradientsKernel_MulticlassSoftmax<<>>( - scores, cuda_label_, cuda_weights_, factor_, num_class_, num_data_, - cuda_softmax_buffer_, gradients, hessians); - } -} - -__global__ void ConvertOutputCUDAKernel_MulticlassSoftmax( - const int num_class, const data_size_t num_data, const double* input, double* cuda_softmax_buffer, double* output) { - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (data_index < num_data) { - const data_size_t offset = data_index * num_class; - double* cuda_softmax_buffer_ptr = cuda_softmax_buffer + offset; - for (int class_index = 0; class_index < num_class; ++class_index) { - cuda_softmax_buffer_ptr[class_index] = input[class_index * num_data + data_index]; - } - SoftmaxCUDA(cuda_softmax_buffer_ptr, num_class); - for (int class_index = 0; class_index < num_class; ++class_index) { - output[class_index * num_data + data_index] = cuda_softmax_buffer_ptr[class_index]; - } - } -} - -void CUDAMulticlassSoftmax::LaunchConvertOutputCUDAKernel( - const data_size_t num_data, const double* input, double* output) const { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_MULTICLASS - 1) / GET_GRADIENTS_BLOCK_SIZE_MULTICLASS; - ConvertOutputCUDAKernel_MulticlassSoftmax<<>>( - num_class_, num_data, input, cuda_softmax_buffer_, output); -} - -} // namespace LightGBM diff --git a/src/objective/cuda/cuda_multiclass_objective.hpp b/src/objective/cuda/cuda_multiclass_objective.hpp deleted file mode 100644 index 994fdcb0a696..000000000000 --- a/src/objective/cuda/cuda_multiclass_objective.hpp +++ /dev/null @@ -1,80 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
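GetGradientsKernel_MulticlassSoftmax above computes a numerically stable softmax per row (scores are stored class-major, so class k of row i sits at k * num_data + i) and then the standard softmax cross-entropy gradient p_k - 1{k == label}, with hessian factor * p_k * (1 - p_k). A host reference for a single unweighted row (hypothetical helper name; factor is whatever factor_ the base MulticlassSoftmax provides):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Stable softmax followed by the per-class gradient/hessian of multiclass
// cross entropy for one data point, mirroring the unweighted branch of the kernel.
void MulticlassSoftmaxPointGradient(const std::vector<double>& scores, int label,
                                    double factor, std::vector<double>* gradients,
                                    std::vector<double>* hessians) {
  const int num_class = static_cast<int>(scores.size());
  std::vector<double> prob(scores);
  const double wmax = *std::max_element(prob.begin(), prob.end());
  double wsum = 0.0;
  for (double& p : prob) { p = std::exp(p - wmax); wsum += p; }
  for (double& p : prob) p /= wsum;
  gradients->resize(num_class);
  hessians->resize(num_class);
  for (int k = 0; k < num_class; ++k) {
    const double p = prob[k];
    (*gradients)[k] = (k == label) ? (p - 1.0) : p;
    (*hessians)[k] = factor * p * (1.0 - p);
  }
}

int main() {
  std::vector<double> gradients, hessians;
  MulticlassSoftmaxPointGradient({0.2, 1.5, -0.3}, /*label=*/1, /*factor=*/1.5, &gradients, &hessians);
  for (size_t k = 0; k < gradients.size(); ++k) {
    printf("class %zu: gradient = %f, hessian = %f\n", k, gradients[k], hessians[k]);
  }
  return 0;
}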
- */ -#ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_ -#define LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_ - -#include -#include "cuda_binary_objective.hpp" -#include "../multiclass_objective.hpp" - -#define GET_GRADIENTS_BLOCK_SIZE_MULTICLASS (1024) - -namespace LightGBM { - -class CUDAMulticlassSoftmax: public CUDAObjectiveInterface, public MulticlassSoftmax { - public: - explicit CUDAMulticlassSoftmax(const Config& config); - - explicit CUDAMulticlassSoftmax(const std::vector& strs); - - ~CUDAMulticlassSoftmax(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; - - void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; - - std::function GetCUDAConvertOutputFunc() const override { - return [this] (data_size_t num_data, const double* input, double* output) { - ConvertOutputCUDA(num_data, input, output); - }; - } - - private: - void LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const; - - void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; - - // CUDA memory, held by other objects - const label_t* cuda_label_; - // TODO(shiyu1994): add weighted gradients - const label_t* cuda_weights_; - - // CUDA memory, held by this object - double* cuda_boost_from_score_; - double* cuda_softmax_buffer_; -}; - -class CUDAMulticlassOVA: public CUDAObjectiveInterface, public MulticlassOVA { - public: - explicit CUDAMulticlassOVA(const Config& config); - - explicit CUDAMulticlassOVA(const std::vector& strs); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; - - void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; - - double BoostFromScore(int class_id) const override { - Log::Warning("BoostFromScore class_id = %d", class_id); - return cuda_binary_loss_[class_id]->BoostFromScore(0); - } - - bool ClassNeedTrain(int class_id) const override { - return cuda_binary_loss_[class_id]->ClassNeedTrain(0); - } - - ~CUDAMulticlassOVA(); - - private: - std::vector> cuda_binary_loss_; -}; - -} // namespace LightGBM - -#endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_ diff --git a/src/objective/cuda/cuda_objective_function.cpp b/src/objective/cuda/cuda_objective_function.cpp deleted file mode 100644 index 733586f7d441..000000000000 --- a/src/objective/cuda/cuda_objective_function.cpp +++ /dev/null @@ -1,12 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#include -#include "cuda_binary_objective.hpp" - -namespace LightGBM { - -} // namespace LightGBM diff --git a/src/objective/cuda/cuda_rank_objective.cpp b/src/objective/cuda/cuda_rank_objective.cpp deleted file mode 100644 index 12143543f190..000000000000 --- a/src/objective/cuda/cuda_rank_objective.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
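CUDAMulticlassOVA declared above reuses one CUDABinaryLogloss per class: scores, gradients and hessians are laid out class-major, so each binary objective receives a contiguous slice starting at num_data * class_id. A small host sketch of that slicing (hypothetical example values):

#include <cstdio>
#include <vector>

// Class-major layout: entry for (class k, data point i) lives at k * num_data + i,
// so the per-class slice handed to a binary objective is contiguous.
int main() {
  const int num_class = 3;
  const int num_data = 4;
  std::vector<double> scores(static_cast<size_t>(num_class) * num_data, 0.0);
  for (int k = 0; k < num_class; ++k) {
    for (int i = 0; i < num_data; ++i) {
      scores[static_cast<size_t>(k) * num_data + i] = 10.0 * k + i;  // mark (class, point)
    }
  }
  for (int k = 0; k < num_class; ++k) {
    const double* class_slice = scores.data() + static_cast<size_t>(k) * num_data;
    printf("class %d slice starts with %.0f and holds %d contiguous entries\n",
           k, class_slice[0], num_data);
  }
  return 0;
}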
- */ - -#ifdef USE_CUDA - -#include "cuda_rank_objective.hpp" - -namespace LightGBM { - -CUDALambdarankNDCG::CUDALambdarankNDCG(const Config& config): -LambdarankNDCG(config) {} - -CUDALambdarankNDCG::CUDALambdarankNDCG(const std::vector& strs): LambdarankNDCG(strs) {} - -void CUDALambdarankNDCG::Init(const Metadata& metadata, data_size_t num_data) { - const int num_threads = OMP_NUM_THREADS(); - LambdarankNDCG::Init(metadata, num_data); - - std::vector thread_max_num_items_in_query(num_threads); - Threading::For(0, num_queries_, 1, - [this, &thread_max_num_items_in_query] (int thread_index, data_size_t start, data_size_t end) { - for (data_size_t query_index = start; query_index < end; ++query_index) { - const data_size_t query_item_count = query_boundaries_[query_index + 1] - query_boundaries_[query_index]; - if (query_item_count > thread_max_num_items_in_query[thread_index]) { - thread_max_num_items_in_query[thread_index] = query_item_count; - } - } - }); - data_size_t max_items_in_query = 0; - for (int thread_index = 0; thread_index < num_threads; ++thread_index) { - if (thread_max_num_items_in_query[thread_index] > max_items_in_query) { - max_items_in_query = thread_max_num_items_in_query[thread_index]; - } - } - max_items_in_query_aligned_ = 1; - --max_items_in_query; - while (max_items_in_query > 0) { - max_items_in_query >>= 1; - max_items_in_query_aligned_ <<= 1; - } - if (max_items_in_query_aligned_ > 2048) { - AllocateCUDAMemoryOuter(&cuda_item_indices_buffer_, - static_cast(metadata.query_boundaries()[metadata.num_queries()]), - __FILE__, __LINE__); - } - cuda_labels_ = metadata.cuda_metadata()->cuda_label(); - cuda_query_boundaries_ = metadata.cuda_metadata()->cuda_query_boundaries(); - AllocateCUDAMemoryOuter(&cuda_lambdas_, num_data_, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_inverse_max_dcgs_, num_queries_, __FILE__, __LINE__); - LaunchCalcInverseMaxDCGKernel(); -} - -void CUDALambdarankNDCG::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { - LaunchGetGradientsKernel(score, gradients, hessians); -} - -CUDARankXENDCG::CUDARankXENDCG(const Config& config): CUDALambdarankNDCG(config) {} - -CUDARankXENDCG::CUDARankXENDCG(const std::vector& strs): CUDALambdarankNDCG(strs) {} - -CUDARankXENDCG::~CUDARankXENDCG() {} - -void CUDARankXENDCG::Init(const Metadata& metadata, data_size_t num_data) { - CUDALambdarankNDCG::Init(metadata, num_data); - for (data_size_t i = 0; i < num_queries_; ++i) { - rands_.emplace_back(seed_ + i); - } - item_rands_.resize(num_data, 0.0f); - AllocateCUDAMemoryOuter(&cuda_item_rands_, static_cast(num_data), __FILE__, __LINE__); - if (max_items_in_query_aligned_ >= 2048) { - AllocateCUDAMemoryOuter(&cuda_params_buffer_, static_cast(num_data_), __FILE__, __LINE__); - } -} - -void CUDARankXENDCG::GenerateItemRands() const { - const int num_threads = OMP_NUM_THREADS(); - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) num_threads(num_threads) - for (data_size_t i = 0; i < num_queries_; ++i) { - OMP_LOOP_EX_BEGIN(); - const data_size_t start = query_boundaries_[i]; - const data_size_t end = query_boundaries_[i + 1]; - for (data_size_t j = start; j < end; ++j) { - item_rands_[j] = rands_[i].NextFloat(); - } - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); -} - -void CUDARankXENDCG::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { - GenerateItemRands(); - CopyFromHostToCUDADeviceOuter(cuda_item_rands_, item_rands_.data(), item_rands_.size(), __FILE__, __LINE__); - 
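CUDALambdarankNDCG::Init above rounds the largest query size up to a power of two (max_items_in_query_aligned_), which the bitonic argsort in the .cu file requires; decrementing before the loop keeps exact powers of two unchanged. A standalone version of that rounding (hypothetical helper name):

#include <cassert>

// Round n (n >= 1) up to the next power of two, as done for
// max_items_in_query_aligned_: 1 -> 1, 5 -> 8, 1024 -> 1024, 1025 -> 2048.
int RoundUpToPowerOfTwo(int n) {
  int aligned = 1;
  --n;                 // so that exact powers of two are preserved
  while (n > 0) {
    n >>= 1;
    aligned <<= 1;
  }
  return aligned;
}

int main() {
  assert(RoundUpToPowerOfTwo(1) == 1);
  assert(RoundUpToPowerOfTwo(5) == 8);
  assert(RoundUpToPowerOfTwo(1024) == 1024);
  assert(RoundUpToPowerOfTwo(1025) == 2048);
  return 0;
}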
LaunchGetGradientsKernel(score, gradients, hessians); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_rank_objective.cu b/src/objective/cuda/cuda_rank_objective.cu deleted file mode 100644 index 8322531df3ab..000000000000 --- a/src/objective/cuda/cuda_rank_objective.cu +++ /dev/null @@ -1,652 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifdef USE_CUDA - -#include "cuda_rank_objective.hpp" - -#include -#include -#include - -#define BITONIC_SORT_NUM_ELEMENTS_LOCAL (1024) -#define BITONIC_SORT_DEPTH_LOCAL (11) - -namespace LightGBM { - -template -__global__ void GetGradientsKernel_LambdarankNDCG(const double* cuda_scores, const label_t* cuda_labels, const data_size_t num_data, - const data_size_t num_queries, const data_size_t* cuda_query_boundaries, const double* cuda_inverse_max_dcgs, - const bool norm, const double sigmoid, const int truncation_level, - score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - __shared__ score_t shared_scores[MAX_ITEM_GREATER_THAN_1024 ? 2048 : 1024]; - __shared__ uint16_t shared_indices[MAX_ITEM_GREATER_THAN_1024 ? 2048 : 1024]; - __shared__ score_t shared_lambdas[MAX_ITEM_GREATER_THAN_1024 ? 2048 : 1024]; - __shared__ score_t shared_hessians[MAX_ITEM_GREATER_THAN_1024 ? 2048 : 1024]; - const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; - const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); - for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { - const double inverse_max_dcg = cuda_inverse_max_dcgs[query_index]; - const data_size_t query_start = cuda_query_boundaries[query_index]; - const data_size_t query_end = cuda_query_boundaries[query_index + 1]; - const data_size_t query_item_count = query_end - query_start; - const double* cuda_scores_pointer = cuda_scores + query_start; - score_t* cuda_out_gradients_pointer = cuda_out_gradients + query_start; - score_t* cuda_out_hessians_pointer = cuda_out_hessians + query_start; - const label_t* cuda_label_pointer = cuda_labels + query_start; - if (threadIdx.x < query_item_count) { - shared_scores[threadIdx.x] = cuda_scores_pointer[threadIdx.x]; - shared_indices[threadIdx.x] = static_cast(threadIdx.x); - shared_lambdas[threadIdx.x] = 0.0f; - shared_hessians[threadIdx.x] = 0.0f; - } else { - shared_scores[threadIdx.x] = kMinScore; - shared_indices[threadIdx.x] = static_cast(threadIdx.x); - } - if (MAX_ITEM_GREATER_THAN_1024) { - if (query_item_count > 1024) { - const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; - if (threadIdx_x_plus_1024 < query_item_count) { - shared_scores[threadIdx_x_plus_1024] = cuda_scores_pointer[threadIdx_x_plus_1024]; - shared_indices[threadIdx_x_plus_1024] = static_cast(threadIdx_x_plus_1024); - shared_lambdas[threadIdx_x_plus_1024] = 0.0f; - shared_hessians[threadIdx_x_plus_1024] = 0.0f; - } else { - shared_scores[threadIdx_x_plus_1024] = kMinScore; - shared_indices[threadIdx_x_plus_1024] = static_cast(threadIdx_x_plus_1024); - } - } - } - __syncthreads(); - if (MAX_ITEM_GREATER_THAN_1024) { - if (query_item_count > 1024) { - BitonicArgSort_2048(shared_scores, shared_indices); - } else { - BitonicArgSort_1024(shared_scores, shared_indices, static_cast(query_item_count)); - } - } else { - BitonicArgSort_1024(shared_scores, shared_indices, static_cast(query_item_count)); 
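The shared-memory path above pads each query's scores with kMinScore up to the power-of-two size and bitonic-argsorts the item indices, so that shared_indices[r] is the item at rank r by descending score. The host-side equivalent of that argsort (hypothetical helper; padding and the bitonic network are device-side details):

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// Indices of the items sorted by descending score: result[r] is the item at rank r.
std::vector<int> ArgSortByScoreDescending(const std::vector<double>& scores) {
  std::vector<int> indices(scores.size());
  std::iota(indices.begin(), indices.end(), 0);
  std::stable_sort(indices.begin(), indices.end(),
                   [&scores](int a, int b) { return scores[a] > scores[b]; });
  return indices;
}

int main() {
  const std::vector<double> scores = {0.1, 2.3, -0.7, 1.5};
  const std::vector<int> ranked = ArgSortByScoreDescending(scores);
  for (size_t r = 0; r < ranked.size(); ++r) {
    printf("rank %zu -> item %d (score %.1f)\n", r, ranked[r], scores[ranked[r]]);
  }
  return 0;
}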
- } - __syncthreads(); - // get best and worst score - const double best_score = shared_scores[shared_indices[0]]; - data_size_t worst_idx = query_item_count - 1; - if (worst_idx > 0 && shared_scores[shared_indices[worst_idx]] == kMinScore) { - worst_idx -= 1; - } - const double worst_score = shared_scores[shared_indices[worst_idx]]; - __shared__ double sum_lambdas; - if (threadIdx.x == 0) { - sum_lambdas = 0.0f; - } - __syncthreads(); - // start accumulate lambdas by pairs that contain at least one document above truncation level - const data_size_t num_items_i = min(query_item_count - 1, truncation_level); - const data_size_t num_j_per_i = query_item_count - 1; - const data_size_t s = num_j_per_i - num_items_i + 1; - const data_size_t num_pairs = (num_j_per_i + s) * num_items_i / 2; - double thread_sum_lambdas = 0.0f; - for (data_size_t pair_index = static_cast(threadIdx.x); pair_index < num_pairs; pair_index += static_cast(blockDim.x)) { - const double square = 2 * static_cast(pair_index) + s * s - s; - const double sqrt_result = floor(sqrt(square)); - const data_size_t row_index = static_cast(floor(sqrt(square - sqrt_result)) + 1 - s); - const data_size_t i = num_items_i - 1 - row_index; - const data_size_t j = num_j_per_i - (pair_index - (2 * s + row_index - 1) * row_index / 2); - if (cuda_label_pointer[shared_indices[i]] != cuda_label_pointer[shared_indices[j]] && shared_scores[shared_indices[j]] != kMinScore) { - data_size_t high_rank, low_rank; - if (cuda_label_pointer[shared_indices[i]] > cuda_label_pointer[shared_indices[j]]) { - high_rank = i; - low_rank = j; - } else { - high_rank = j; - low_rank = i; - } - const data_size_t high = shared_indices[high_rank]; - const int high_label = static_cast(cuda_label_pointer[high]); - const double high_score = shared_scores[high]; - const double high_label_gain = static_cast((1 << high_label) - 1); - const double high_discount = log2(2.0f + high_rank); - const data_size_t low = shared_indices[low_rank]; - const int low_label = static_cast(cuda_label_pointer[low]); - const double low_score = shared_scores[low]; - const double low_label_gain = static_cast((1 << low_label) - 1); - const double low_discount = log2(2.0f + low_rank); - - const double delta_score = high_score - low_score; - - // get dcg gap - const double dcg_gap = high_label_gain - low_label_gain; - // get discount of this pair - const double paired_discount = fabs(high_discount - low_discount); - // get delta NDCG - double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg; - // regular the delta_pair_NDCG by score distance - if (norm && best_score != worst_score) { - delta_pair_NDCG /= (0.01f + fabs(delta_score)); - } - // calculate lambda for this pair - double p_lambda = 1.0f / (1.0f + exp(sigmoid * delta_score)); - double p_hessian = p_lambda * (1.0f - p_lambda); - // update - p_lambda *= -sigmoid * delta_pair_NDCG; - p_hessian *= sigmoid * sigmoid * delta_pair_NDCG; - atomicAdd_block(shared_lambdas + low, -static_cast(p_lambda)); - atomicAdd_block(shared_hessians + low, static_cast(p_hessian)); - atomicAdd_block(shared_lambdas + high, static_cast(p_lambda)); - atomicAdd_block(shared_hessians + high, static_cast(p_hessian)); - // lambda is negative, so use minus to accumulate - thread_sum_lambdas -= 2 * p_lambda; - } - } - atomicAdd_block(&sum_lambdas, thread_sum_lambdas); - __syncthreads(); - if (norm && sum_lambdas > 0) { - const double norm_factor = log2(1 + sum_lambdas) / sum_lambdas; - if (threadIdx.x < static_cast(query_item_count)) { - 
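The pair loop above maps each flat pair_index to a pair of ranks, skips pairs with equal labels, and accumulates NDCG-weighted sigmoid lambdas and hessians with block-local atomics; the global-memory variant further down repeats the same arithmetic on a presorted index buffer. A simplified single-query host reference of that accumulation (hypothetical helper names, unweighted, standard 1 / log2(rank + 2) discount, including the 1/maxDCG factor that is precomputed per query):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

// 1 / maxDCG@truncation for one query: labels sorted descending, gain (2^l - 1),
// discount 1 / log2(rank + 2).
double InverseMaxDCG(std::vector<int> labels, int truncation_level) {
  std::sort(labels.begin(), labels.end(), [](int a, int b) { return a > b; });
  double gain = 0.0;
  const int top = std::min(truncation_level, static_cast<int>(labels.size()));
  for (int k = 0; k < top; ++k) {
    gain += ((1 << labels[k]) - 1) / std::log2(2.0 + k);
  }
  return gain > 0.0 ? 1.0 / gain : 0.0;
}

// Unweighted lambdarank gradients/hessians for one query (normalization enabled).
void LambdarankOneQuery(const std::vector<double>& scores, const std::vector<int>& labels,
                        double sigmoid, int truncation_level,
                        std::vector<double>* gradients, std::vector<double>* hessians) {
  const int count = static_cast<int>(scores.size());
  gradients->assign(count, 0.0);
  hessians->assign(count, 0.0);
  std::vector<int> ranked(count);
  std::iota(ranked.begin(), ranked.end(), 0);
  std::sort(ranked.begin(), ranked.end(),
            [&scores](int a, int b) { return scores[a] > scores[b]; });
  const double inverse_max_dcg = InverseMaxDCG(labels, truncation_level);
  const double best_score = scores[ranked.front()];
  const double worst_score = scores[ranked.back()];
  double sum_lambdas = 0.0;
  for (int i = 0; i < std::min(count - 1, truncation_level); ++i) {
    for (int j = i + 1; j < count; ++j) {
      if (labels[ranked[i]] == labels[ranked[j]]) continue;  // equal labels: no pair
      const int high_rank = labels[ranked[i]] > labels[ranked[j]] ? i : j;
      const int low_rank = labels[ranked[i]] > labels[ranked[j]] ? j : i;
      const int high = ranked[high_rank], low = ranked[low_rank];
      const double delta_score = scores[high] - scores[low];
      const double dcg_gap = ((1 << labels[high]) - 1) - ((1 << labels[low]) - 1);
      const double paired_discount =
          std::fabs(1.0 / std::log2(2.0 + high_rank) - 1.0 / std::log2(2.0 + low_rank));
      double delta_pair_ndcg = dcg_gap * paired_discount * inverse_max_dcg;
      if (best_score != worst_score) delta_pair_ndcg /= (0.01 + std::fabs(delta_score));
      double p_lambda = 1.0 / (1.0 + std::exp(sigmoid * delta_score));
      double p_hessian = p_lambda * (1.0 - p_lambda);
      p_lambda *= -sigmoid * delta_pair_ndcg;
      p_hessian *= sigmoid * sigmoid * delta_pair_ndcg;
      (*gradients)[low] -= p_lambda;
      (*hessians)[low] += p_hessian;
      (*gradients)[high] += p_lambda;
      (*hessians)[high] += p_hessian;
      sum_lambdas -= 2.0 * p_lambda;
    }
  }
  if (sum_lambdas > 0.0) {  // same normalization as the kernels when norm_ is set
    const double norm_factor = std::log2(1.0 + sum_lambdas) / sum_lambdas;
    for (int i = 0; i < count; ++i) {
      (*gradients)[i] *= norm_factor;
      (*hessians)[i] *= norm_factor;
    }
  }
}

int main() {
  std::vector<double> gradients, hessians;
  LambdarankOneQuery({1.2, 0.3, -0.5, 0.8}, {2, 0, 1, 0}, /*sigmoid=*/2.0, /*truncation_level=*/30,
                     &gradients, &hessians);
  for (size_t i = 0; i < gradients.size(); ++i) {
    printf("item %zu: gradient = %f, hessian = %f\n", i, gradients[i], hessians[i]);
  }
  return 0;
}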
cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x] * norm_factor); - cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x] * norm_factor); - } - if (MAX_ITEM_GREATER_THAN_1024) { - if (query_item_count > 1024) { - const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; - if (threadIdx_x_plus_1024 < static_cast(query_item_count)) { - cuda_out_gradients_pointer[threadIdx_x_plus_1024] = static_cast(shared_lambdas[threadIdx_x_plus_1024] * norm_factor); - cuda_out_hessians_pointer[threadIdx_x_plus_1024] = static_cast(shared_hessians[threadIdx_x_plus_1024] * norm_factor); - } - } - } - } else { - if (threadIdx.x < static_cast(query_item_count)) { - cuda_out_gradients_pointer[threadIdx.x] = static_cast(shared_lambdas[threadIdx.x]); - cuda_out_hessians_pointer[threadIdx.x] = static_cast(shared_hessians[threadIdx.x]); - } - if (MAX_ITEM_GREATER_THAN_1024) { - if (query_item_count > 1024) { - const unsigned int threadIdx_x_plus_1024 = threadIdx.x + 1024; - if (threadIdx_x_plus_1024 < static_cast(query_item_count)) { - cuda_out_gradients_pointer[threadIdx_x_plus_1024] = static_cast(shared_lambdas[threadIdx_x_plus_1024]); - cuda_out_hessians_pointer[threadIdx_x_plus_1024] = static_cast(shared_hessians[threadIdx_x_plus_1024]); - } - } - } - } - __syncthreads(); - } -} - -__global__ void GetGradientsKernel_LambdarankNDCG_Sorted( - const double* cuda_scores, const int* cuda_item_indices_buffer, const label_t* cuda_labels, const data_size_t num_data, - const data_size_t num_queries, const data_size_t* cuda_query_boundaries, const double* cuda_inverse_max_dcgs, - const bool norm, const double sigmoid, const int truncation_level, - score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; - const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); - for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { - const double inverse_max_dcg = cuda_inverse_max_dcgs[query_index]; - const data_size_t query_start = cuda_query_boundaries[query_index]; - const data_size_t query_end = cuda_query_boundaries[query_index + 1]; - const data_size_t query_item_count = query_end - query_start; - const double* cuda_scores_pointer = cuda_scores + query_start; - const int* cuda_item_indices_buffer_pointer = cuda_item_indices_buffer + query_start; - score_t* cuda_out_gradients_pointer = cuda_out_gradients + query_start; - score_t* cuda_out_hessians_pointer = cuda_out_hessians + query_start; - const label_t* cuda_label_pointer = cuda_labels + query_start; - // get best and worst score - const double best_score = cuda_scores_pointer[cuda_item_indices_buffer_pointer[0]]; - data_size_t worst_idx = query_item_count - 1; - if (worst_idx > 0 && cuda_scores_pointer[cuda_item_indices_buffer_pointer[worst_idx]] == kMinScore) { - worst_idx -= 1; - } - const double worst_score = cuda_scores_pointer[cuda_item_indices_buffer_pointer[worst_idx]]; - __shared__ double sum_lambdas; - if (threadIdx.x == 0) { - sum_lambdas = 0.0f; - } - for (int item_index = static_cast(threadIdx.x); item_index < query_item_count; item_index += static_cast(blockDim.x)) { - cuda_out_gradients_pointer[item_index] = 0.0f; - cuda_out_hessians_pointer[item_index] = 0.0f; - } - __syncthreads(); - // start accumulate lambdas by pairs that contain at least one document above truncation level - const data_size_t num_items_i = min(query_item_count 
- 1, truncation_level); - const data_size_t num_j_per_i = query_item_count - 1; - const data_size_t s = num_j_per_i - num_items_i + 1; - const data_size_t num_pairs = (num_j_per_i + s) * num_items_i / 2; - double thread_sum_lambdas = 0.0f; - for (data_size_t pair_index = static_cast(threadIdx.x); pair_index < num_pairs; pair_index += static_cast(blockDim.x)) { - const double square = 2 * static_cast(pair_index) + s * s - s; - const double sqrt_result = floor(sqrt(square)); - const data_size_t row_index = static_cast(floor(sqrt(square - sqrt_result)) + 1 - s); - const data_size_t i = num_items_i - 1 - row_index; - const data_size_t j = num_j_per_i - (pair_index - (2 * s + row_index - 1) * row_index / 2); - if (j > i) { - // skip pairs with the same labels - if (cuda_label_pointer[cuda_item_indices_buffer_pointer[i]] != cuda_label_pointer[cuda_item_indices_buffer_pointer[j]] && cuda_scores_pointer[cuda_item_indices_buffer_pointer[j]] != kMinScore) { - data_size_t high_rank, low_rank; - if (cuda_label_pointer[cuda_item_indices_buffer_pointer[i]] > cuda_label_pointer[cuda_item_indices_buffer_pointer[j]]) { - high_rank = i; - low_rank = j; - } else { - high_rank = j; - low_rank = i; - } - const data_size_t high = cuda_item_indices_buffer_pointer[high_rank]; - const int high_label = static_cast(cuda_label_pointer[high]); - const double high_score = cuda_scores_pointer[high]; - const double high_label_gain = static_cast((1 << high_label) - 1); - const double high_discount = log2(2.0f + high_rank); - const data_size_t low = cuda_item_indices_buffer_pointer[low_rank]; - const int low_label = static_cast(cuda_label_pointer[low]); - const double low_score = cuda_scores_pointer[low]; - const double low_label_gain = static_cast((1 << low_label) - 1); - const double low_discount = log2(2.0f + low_rank); - - const double delta_score = high_score - low_score; - - // get dcg gap - const double dcg_gap = high_label_gain - low_label_gain; - // get discount of this pair - const double paired_discount = fabs(high_discount - low_discount); - // get delta NDCG - double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg; - // regular the delta_pair_NDCG by score distance - if (norm && best_score != worst_score) { - delta_pair_NDCG /= (0.01f + fabs(delta_score)); - } - // calculate lambda for this pair - double p_lambda = 1.0f / (1.0f + exp(sigmoid * delta_score)); - double p_hessian = p_lambda * (1.0f - p_lambda); - // update - p_lambda *= -sigmoid * delta_pair_NDCG; - p_hessian *= sigmoid * sigmoid * delta_pair_NDCG; - atomicAdd_block(cuda_out_gradients_pointer + low, -static_cast(p_lambda)); - atomicAdd_block(cuda_out_hessians_pointer + low, static_cast(p_hessian)); - atomicAdd_block(cuda_out_gradients_pointer + high, static_cast(p_lambda)); - atomicAdd_block(cuda_out_hessians_pointer + high, static_cast(p_hessian)); - // lambda is negative, so use minus to accumulate - thread_sum_lambdas -= 2 * p_lambda; - } - } - } - atomicAdd_block(&sum_lambdas, thread_sum_lambdas); - __syncthreads(); - if (norm && sum_lambdas > 0) { - const double norm_factor = log2(1 + sum_lambdas) / sum_lambdas; - for (int item_index = static_cast(threadIdx.x); item_index < query_item_count; item_index += static_cast(blockDim.x)) { - cuda_out_gradients_pointer[item_index] *= norm_factor; - cuda_out_hessians_pointer[item_index] *= norm_factor; - } - } - __syncthreads(); - } -} - -void CUDALambdarankNDCG::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { - const int num_blocks = 
(num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; - if (max_items_in_query_aligned_ <= 1024) { - GetGradientsKernel_LambdarankNDCG<<>>(score, cuda_labels_, num_data_, - num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, - norm_, sigmoid_, truncation_level_, - gradients, hessians); - } else if (max_items_in_query_aligned_ <= 2048) { - GetGradientsKernel_LambdarankNDCG<<>>(score, cuda_labels_, num_data_, - num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, - norm_, sigmoid_, truncation_level_, - gradients, hessians); - } else { - BitonicArgSortItemsGlobal(score, num_queries_, cuda_query_boundaries_, cuda_item_indices_buffer_); - GetGradientsKernel_LambdarankNDCG_Sorted<<>>(score, cuda_item_indices_buffer_, cuda_labels_, num_data_, - num_queries_, cuda_query_boundaries_, cuda_inverse_max_dcgs_, - norm_, sigmoid_, truncation_level_, - gradients, hessians); - } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -__global__ void CalcInverseMaxDCGKernel( - const data_size_t* cuda_query_boundaries, - const label_t* cuda_labels, - const int truncation_level, - const data_size_t num_queries, - double* cuda_inverse_max_dcgs) { - __shared__ uint32_t label_sum[MAX_RANK_LABEL]; - __shared__ uint16_t label_pos[MAX_RANK_LABEL + 1]; - const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; - const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); - for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { - const data_size_t query_start = cuda_query_boundaries[query_index]; - const data_size_t query_end = cuda_query_boundaries[query_index + 1]; - const data_size_t query_count = query_end - query_start; - if (threadIdx.x < MAX_RANK_LABEL) { - label_sum[threadIdx.x] = 0; - } - __syncthreads(); - const label_t* label_pointer = cuda_labels + query_start; - if (threadIdx.x < static_cast(query_count)) { - atomicAdd_system(label_sum + (MAX_RANK_LABEL - 1 - static_cast(label_pointer[threadIdx.x])), 1); - } - __syncthreads(); - if (threadIdx.x < MAX_RANK_LABEL) { - label_pos[threadIdx.x] = label_sum[threadIdx.x]; - } - __syncthreads(); - PrefixSum(label_pos, MAX_RANK_LABEL); - __syncthreads(); - __shared__ double gain; - if (threadIdx.x == 0) { - gain = 0.0f; - } - __syncthreads(); - if (threadIdx.x < MAX_RANK_LABEL && label_sum[threadIdx.x] > 0) { - const uint16_t start_pos = label_pos[threadIdx.x]; - const uint16_t end_pos = min(label_pos[threadIdx.x + 1], truncation_level); - double label_gain = 0.0f; - for (uint16_t k = start_pos; k < end_pos; ++k) { - label_gain += ((1 << (MAX_RANK_LABEL - 1 - threadIdx.x)) - 1) / log(2.0f + k); - } - atomicAdd_system(&gain, label_gain); - } - __syncthreads(); - if (threadIdx.x == 0) { - if (gain > 0.0f) { - cuda_inverse_max_dcgs[query_index] = 1.0f / gain; - } else { - cuda_inverse_max_dcgs[query_index] = 0.0f; - } - } - __syncthreads(); - } -} - -void CUDALambdarankNDCG::LaunchCalcInverseMaxDCGKernel() { - const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; - CalcInverseMaxDCGKernel<<>>( - cuda_query_boundaries_, - cuda_labels_, - truncation_level_, - num_queries_, - cuda_inverse_max_dcgs_); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -__device__ __forceinline__ double CUDAPhi(const label_t l, double g) { - return pow(2.0f, static_cast(l)) - g; -} - -template -__global__ void GetGradientsKernel_RankXENDCG_SharedMemory( - const double* cuda_scores, - const label_t* cuda_labels, - const 
double* cuda_item_rands, - const data_size_t num_data, - const data_size_t num_queries, - const data_size_t* cuda_query_boundaries, - score_t* cuda_out_gradients, - score_t* cuda_out_hessians) { - const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; - const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); - for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { - const data_size_t item_index_start = cuda_query_boundaries[query_index]; - const data_size_t item_index_end = cuda_query_boundaries[query_index + 1]; - const data_size_t query_item_count = item_index_end - item_index_start; - score_t* cuda_out_gradients_pointer = cuda_out_gradients + item_index_start; - score_t* cuda_out_hessians_pointer = cuda_out_hessians + item_index_start; - const label_t* cuda_labels_pointer = cuda_labels + item_index_start; - const double* cuda_scores_pointer = cuda_scores + item_index_start; - const double* cuda_item_rands_pointer = cuda_item_rands + item_index_start; - const data_size_t block_reduce_size = query_item_count >= 1024 ? 1024 : query_item_count; - __shared__ double shared_rho[SHARED_MEMORY_SIZE]; - // assert that warpSize == 32 - __shared__ double shared_buffer[32]; - __shared__ double shared_params[SHARED_MEMORY_SIZE]; - __shared__ score_t shared_lambdas[SHARED_MEMORY_SIZE]; - __shared__ double reduce_result; - if (query_item_count <= 1) { - for (data_size_t i = 0; i <= query_item_count; ++i) { - cuda_out_gradients_pointer[i] = 0.0f; - cuda_out_hessians_pointer[i] = 0.0f; - } - __syncthreads(); - } else { - // compute softmax - double thread_reduce_result = kMinScore; - for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { - const double rho = cuda_scores_pointer[i]; - shared_rho[i] = rho; - if (rho > thread_reduce_result) { - thread_reduce_result = rho; - } - } - __syncthreads(); - thread_reduce_result = ShuffleReduceMax(thread_reduce_result, shared_buffer, block_reduce_size); - if (threadIdx.x == 0) { - reduce_result = thread_reduce_result; - } - __syncthreads(); - thread_reduce_result = 0.0f; - for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { - const double exp_value = exp(shared_rho[i] - reduce_result); - shared_rho[i] = exp_value; - thread_reduce_result += exp_value; - } - thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); - if (threadIdx.x == 0) { - reduce_result = thread_reduce_result; - } - __syncthreads(); - for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { - shared_rho[i] /= reduce_result; - } - __syncthreads(); - - // compute params - thread_reduce_result = 0.0f; - for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { - const double param_value = CUDAPhi(cuda_labels_pointer[i], cuda_item_rands_pointer[i]); - shared_params[i] = param_value; - thread_reduce_result += param_value; - } - thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); - if (threadIdx.x == 0) { - reduce_result = thread_reduce_result; - reduce_result = 1.0f / max(kEpsilon, reduce_result); - } - __syncthreads(); - const double inv_denominator = reduce_result; - thread_reduce_result = 0.0f; - for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { - const double term = 
-shared_params[i] * inv_denominator + shared_rho[i]; - shared_lambdas[i] = static_cast(term); - shared_params[i] = term / (1.0f - shared_rho[i]); - thread_reduce_result += shared_params[i]; - } - thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); - if (threadIdx.x == 0) { - reduce_result = thread_reduce_result; - } - __syncthreads(); - const double sum_l1 = reduce_result; - thread_reduce_result = 0.0f; - for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { - const double term = shared_rho[i] * (sum_l1 - shared_params[i]); - shared_lambdas[i] += static_cast(term); - shared_params[i] = term / (1.0f - shared_rho[i]); - thread_reduce_result += shared_params[i]; - } - thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); - if (threadIdx.x == 0) { - reduce_result = thread_reduce_result; - } - __syncthreads(); - const double sum_l2 = reduce_result; - for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { - shared_lambdas[i] += static_cast(shared_rho[i] * (sum_l2 - shared_params[i])); - cuda_out_hessians_pointer[i] = static_cast(shared_rho[i] * (1.0f - shared_rho[i])); - } - for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { - cuda_out_gradients_pointer[i] = shared_lambdas[i]; - } - __syncthreads(); - } - } -} - -__global__ void GetGradientsKernel_RankXENDCG_GlobalMemory( - const double* cuda_scores, - const label_t* cuda_labels, - const double* cuda_item_rands, - const data_size_t num_data, - const data_size_t num_queries, - const data_size_t* cuda_query_boundaries, - double* cuda_params_buffer, - score_t* cuda_out_gradients, - score_t* cuda_out_hessians) { - const data_size_t query_index_start = static_cast(blockIdx.x) * NUM_QUERY_PER_BLOCK; - const data_size_t query_index_end = min(query_index_start + NUM_QUERY_PER_BLOCK, num_queries); - for (data_size_t query_index = query_index_start; query_index < query_index_end; ++query_index) { - const data_size_t item_index_start = cuda_query_boundaries[query_index]; - const data_size_t item_index_end = cuda_query_boundaries[query_index + 1]; - const data_size_t query_item_count = item_index_end - item_index_start; - score_t* cuda_out_gradients_pointer = cuda_out_gradients + item_index_start; - score_t* cuda_out_hessians_pointer = cuda_out_hessians + item_index_start; - const label_t* cuda_labels_pointer = cuda_labels + item_index_start; - const double* cuda_scores_pointer = cuda_scores + item_index_start; - const double* cuda_item_rands_pointer = cuda_item_rands + item_index_start; - double* cuda_params_buffer_pointer = cuda_params_buffer + item_index_start; - const data_size_t block_reduce_size = query_item_count > 1024 ? 
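// --- Editorial sketch, not part of the original patch ---------------------
// Single-threaded host reference for the per-query RankXENDCG gradients that
// the shared-memory kernel above computes. The function name and the value of
// kEps (standing in for kEpsilon) are illustrative assumptions.
#include <algorithm>
#include <cmath>
#include <vector>

inline void RankXENDCGGradientsReference(const std::vector<double>& scores,
                                         const std::vector<double>& labels,
                                         const std::vector<double>& item_rands,
                                         std::vector<double>* gradients,
                                         std::vector<double>* hessians) {
  const size_t n = scores.size();
  const double kEps = 1e-15;  // assumed small constant
  gradients->assign(n, 0.0);
  hessians->assign(n, 0.0);
  if (n <= 1) return;  // mirrors the kernel's early exit for trivial queries
  // softmax over this query's scores
  std::vector<double> rho(n);
  const double max_score = *std::max_element(scores.begin(), scores.end());
  double sum_exp = 0.0;
  for (size_t i = 0; i < n; ++i) { rho[i] = std::exp(scores[i] - max_score); sum_exp += rho[i]; }
  for (size_t i = 0; i < n; ++i) rho[i] /= sum_exp;
  // phi(l, g) = 2^l - g, normalized by its sum
  std::vector<double> params(n);
  double sum_phi = 0.0;
  for (size_t i = 0; i < n; ++i) { params[i] = std::pow(2.0, labels[i]) - item_rands[i]; sum_phi += params[i]; }
  const double inv_denominator = 1.0 / std::max(kEps, sum_phi);
  // first-order term plus the two correction passes, as in the kernel
  std::vector<double> lambdas(n);
  double sum_l1 = 0.0;
  for (size_t i = 0; i < n; ++i) {
    lambdas[i] = -params[i] * inv_denominator + rho[i];
    params[i] = lambdas[i] / (1.0 - rho[i]);
    sum_l1 += params[i];
  }
  double sum_l2 = 0.0;
  for (size_t i = 0; i < n; ++i) {
    const double term = rho[i] * (sum_l1 - params[i]);
    lambdas[i] += term;
    params[i] = term / (1.0 - rho[i]);
    sum_l2 += params[i];
  }
  for (size_t i = 0; i < n; ++i) {
    (*gradients)[i] = lambdas[i] + rho[i] * (sum_l2 - params[i]);
    (*hessians)[i] = rho[i] * (1.0 - rho[i]);
  }
}
// ---------------------------------------------------------------------------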
1024 : query_item_count; - // assert that warpSize == 32, so we use buffer size 1024 / 32 = 32 - __shared__ double shared_buffer[32]; - __shared__ double reduce_result; - if (query_item_count <= 1) { - for (data_size_t i = 0; i <= query_item_count; ++i) { - cuda_out_gradients_pointer[i] = 0.0f; - cuda_out_hessians_pointer[i] = 0.0f; - } - __syncthreads(); - } else { - // compute softmax - double thread_reduce_result = kMinScore; - for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { - const double rho = cuda_scores_pointer[i]; - if (rho > thread_reduce_result) { - thread_reduce_result = rho; - } - } - __syncthreads(); - thread_reduce_result = ShuffleReduceMax(thread_reduce_result, shared_buffer, block_reduce_size); - if (threadIdx.x == 0) { - reduce_result = thread_reduce_result; - } - __syncthreads(); - thread_reduce_result = 0.0f; - for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { - const double exp_value = exp(cuda_scores_pointer[i] - reduce_result); - cuda_out_hessians_pointer[i] = exp_value; - thread_reduce_result += exp_value; - } - thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); - if (threadIdx.x == 0) { - reduce_result = thread_reduce_result; - } - __syncthreads(); - // store probability into hessians - for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { - cuda_out_hessians_pointer[i] /= reduce_result; - } - __syncthreads(); - - // compute params - thread_reduce_result = 0.0f; - for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { - const double param_value = CUDAPhi(cuda_labels_pointer[i], cuda_item_rands_pointer[i]); - cuda_params_buffer_pointer[i] = param_value; - thread_reduce_result += param_value; - } - thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); - if (threadIdx.x == 0) { - reduce_result = thread_reduce_result; - reduce_result = 1.0f / max(kEpsilon, reduce_result); - } - __syncthreads(); - const double inv_denominator = reduce_result; - thread_reduce_result = 0.0f; - for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { - const double term = -cuda_params_buffer_pointer[i] * inv_denominator + cuda_out_hessians_pointer[i]; - cuda_out_gradients_pointer[i] = static_cast(term); - const double param = term / (1.0f - cuda_out_hessians_pointer[i]); - cuda_params_buffer_pointer[i] = param; - thread_reduce_result += param; - } - thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); - if (threadIdx.x == 0) { - reduce_result = thread_reduce_result; - } - __syncthreads(); - const double sum_l1 = reduce_result; - thread_reduce_result = 0.0f; - for (data_size_t i = static_cast(threadIdx.x); i < query_item_count; i += static_cast(blockDim.x)) { - const double term = cuda_out_hessians_pointer[i] * (sum_l1 - cuda_params_buffer_pointer[i]); - cuda_out_gradients_pointer[i] += static_cast(term); - const double param = term / (1.0f - cuda_out_hessians_pointer[i]); - cuda_params_buffer_pointer[i] = param; - thread_reduce_result += param; - } - thread_reduce_result = ShuffleReduceSum(thread_reduce_result, shared_buffer, block_reduce_size); - if (threadIdx.x == 0) { - reduce_result = thread_reduce_result; - } - __syncthreads(); - const double sum_l2 = reduce_result; - for (data_size_t i = static_cast(threadIdx.x); i < 
query_item_count; i += static_cast(blockDim.x)) { - const double prob = cuda_out_hessians_pointer[i]; - cuda_out_gradients_pointer[i] += static_cast(prob * (sum_l2 - cuda_params_buffer_pointer[i])); - cuda_out_hessians_pointer[i] = static_cast(prob * (1.0f - prob)); - } - __syncthreads(); - } - } -} - -void CUDARankXENDCG::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { - const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK; - if (max_items_in_query_aligned_ <= 1024) { - GetGradientsKernel_RankXENDCG_SharedMemory<1024><<>>( - score, - cuda_labels_, - cuda_item_rands_, - num_data_, - num_queries_, - cuda_query_boundaries_, - gradients, - hessians); - } else if (max_items_in_query_aligned_ <= 2 * 1024) { - GetGradientsKernel_RankXENDCG_SharedMemory<2 * 1024><<>>( - score, - cuda_labels_, - cuda_item_rands_, - num_data_, - num_queries_, - cuda_query_boundaries_, - gradients, - hessians); - } else { - GetGradientsKernel_RankXENDCG_GlobalMemory<<>>( - score, - cuda_labels_, - cuda_item_rands_, - num_data_, - num_queries_, - cuda_query_boundaries_, - cuda_params_buffer_, - gradients, - hessians); - } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_rank_objective.hpp b/src/objective/cuda/cuda_rank_objective.hpp deleted file mode 100644 index 8343ddeb4750..000000000000 --- a/src/objective/cuda/cuda_rank_objective.hpp +++ /dev/null @@ -1,79 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifndef LIGHTGBM_NEW_CUDA_RANKING_OBJECTIVE_HPP_ -#define LIGHTGBM_NEW_CUDA_RANKING_OBJECTIVE_HPP_ - -#ifdef USE_CUDA - -#define NUM_QUERY_PER_BLOCK (10) -#define MAX_RANK_LABEL (32) - -#include -#include "../rank_objective.hpp" -#include - -#include - -namespace LightGBM { - -class CUDALambdarankNDCG : public CUDAObjectiveInterface, public LambdarankNDCG { - public: - explicit CUDALambdarankNDCG(const Config& config); - - explicit CUDALambdarankNDCG(const std::vector& strs); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; - - protected: - - void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; - - void LaunchCalcInverseMaxDCGKernel(); - - // CUDA memory, held by this object - double* cuda_lambdas_; - double* cuda_inverse_max_dcgs_; - int* cuda_item_indices_buffer_; - - // CUDA memory, held by other objects - const label_t* cuda_labels_; - const data_size_t* cuda_query_boundaries_; - - // Host memory - int max_items_in_query_aligned_; -}; - -class CUDARankXENDCG : public CUDALambdarankNDCG { - public: - explicit CUDARankXENDCG(const Config& config); - - explicit CUDARankXENDCG(const std::vector& strs); - - ~CUDARankXENDCG(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; - - private: - void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; - - // TODO(shiyu1994): move random number generation into CUDA - void GenerateItemRands() const; - - mutable std::vector item_rands_; - mutable std::vector rands_; - mutable double* cuda_item_rands_; - mutable double* cuda_params_buffer_; -}; - -} // 
namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_NEW_CUDA_RANKING_OBJECTIVE_HPP_ diff --git a/src/objective/cuda/cuda_regression_objective.cpp b/src/objective/cuda/cuda_regression_objective.cpp deleted file mode 100644 index c8b9576c8ded..000000000000 --- a/src/objective/cuda/cuda_regression_objective.cpp +++ /dev/null @@ -1,193 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifdef USE_CUDA - -#include "cuda_regression_objective.hpp" - -namespace LightGBM { - -CUDARegressionL2loss::CUDARegressionL2loss(const Config& config): -RegressionL2loss(config) {} - -CUDARegressionL2loss::CUDARegressionL2loss(const std::vector& strs): -RegressionL2loss(strs) {} - -CUDARegressionL2loss::~CUDARegressionL2loss() {} - -void CUDARegressionL2loss::Init(const Metadata& metadata, data_size_t num_data) { - RegressionL2loss::Init(metadata, num_data); - cuda_labels_ = metadata.cuda_metadata()->cuda_label(); - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); - num_get_gradients_blocks_ = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - AllocateCUDAMemoryOuter(&cuda_block_buffer_, static_cast(num_get_gradients_blocks_), __FILE__, __LINE__); - if (sqrt_) { - InitCUDAMemoryFromHostMemoryOuter(&cuda_trans_label_, trans_label_.data(), trans_label_.size(), __FILE__, __LINE__); - cuda_labels_ = cuda_trans_label_; - } -} - -void CUDARegressionL2loss::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { - LaunchGetGradientsKernel(score, gradients, hessians); -} - -double CUDARegressionL2loss::BoostFromScore(int) const { - return LaunchCalcInitScoreKernel(); -} - -void CUDARegressionL2loss::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { - LaunchConvertOutputCUDAKernel(num_data, input, output); -} - -void CUDARegressionL2loss::RenewTreeOutputCUDA( - const double* score, - const data_size_t* data_indices_in_leaf, - const data_size_t* num_data_in_leaf, - const data_size_t* data_start_in_leaf, - const int num_leaves, - double* leaf_value) const { - global_timer.Start("CUDARegressionL1loss::LaunchRenewTreeOutputCUDAKernel"); - LaunchRenewTreeOutputCUDAKernel(score, data_indices_in_leaf, num_data_in_leaf, data_start_in_leaf, num_leaves, leaf_value); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - global_timer.Stop("CUDARegressionL1loss::LaunchRenewTreeOutputCUDAKernel"); -} - -CUDARegressionL1loss::CUDARegressionL1loss(const Config& config): -CUDARegressionL2loss(config) {} - -CUDARegressionL1loss::CUDARegressionL1loss(const std::vector& strs): -CUDARegressionL2loss(strs) {} - -CUDARegressionL1loss::~CUDARegressionL1loss() {} - -void CUDARegressionL1loss::Init(const Metadata& metadata, data_size_t num_data) { - CUDARegressionL2loss::Init(metadata, num_data); - AllocateCUDAMemoryOuter(&cuda_data_indices_buffer_, static_cast(num_data), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_percentile_result_, 1, __FILE__, __LINE__); - if (cuda_weights_ != nullptr) { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION + 1; - AllocateCUDAMemoryOuter(&cuda_weights_prefix_sum_, static_cast(num_data), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_weights_prefix_sum_buffer_, static_cast(num_blocks), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_weight_by_leaf_buffer_, 
static_cast(num_data_), __FILE__, __LINE__); - } - AllocateCUDAMemoryOuter(&cuda_residual_buffer_, static_cast(num_data_), __FILE__, __LINE__); -} - -CUDARegressionHuberLoss::CUDARegressionHuberLoss(const Config& config): -CUDARegressionL2loss(config), alpha_(config.alpha) { - if (sqrt_) { - Log::Warning("Cannot use sqrt transform in %s Regression, will auto disable it", GetName()); - sqrt_ = false; - } -} - -CUDARegressionHuberLoss::CUDARegressionHuberLoss(const std::vector& strs): -CUDARegressionL2loss(strs) {} - -CUDARegressionHuberLoss::~CUDARegressionHuberLoss() {} - -CUDARegressionFairLoss::CUDARegressionFairLoss(const Config& config): -CUDARegressionL2loss(config), c_(config.fair_c) {} - -CUDARegressionFairLoss::CUDARegressionFairLoss(const std::vector& strs): -CUDARegressionL2loss(strs) {} - -CUDARegressionFairLoss::~CUDARegressionFairLoss() {} - -CUDARegressionPoissonLoss::CUDARegressionPoissonLoss(const Config& config): -CUDARegressionL2loss(config), max_delta_step_(config.poisson_max_delta_step) { - if (sqrt_) { - Log::Warning("Cannot use sqrt transform in %s Regression, will auto disable it", GetName()); - sqrt_ = false; - } -} - -CUDARegressionPoissonLoss::CUDARegressionPoissonLoss(const std::vector& strs): -CUDARegressionL2loss(strs) {} - -CUDARegressionPoissonLoss::~CUDARegressionPoissonLoss() {} - -void CUDARegressionPoissonLoss::Init(const Metadata& metadata, data_size_t num_data) { - CUDARegressionL2loss::Init(metadata, num_data); - AllocateCUDAMemoryOuter(&cuda_block_buffer_, static_cast(num_get_gradients_blocks_), __FILE__, __LINE__); - LaunchCheckLabelKernel(); -} - -double CUDARegressionPoissonLoss::LaunchCalcInitScoreKernel() const { - return Common::SafeLog(CUDARegressionL2loss::LaunchCalcInitScoreKernel()); -} - -CUDARegressionQuantileloss::CUDARegressionQuantileloss(const Config& config): -CUDARegressionL2loss(config), alpha_(config.alpha) { - CHECK(alpha_ > 0 && alpha_ < 1); -} - -CUDARegressionQuantileloss::CUDARegressionQuantileloss(const std::vector& strs): -CUDARegressionL2loss(strs) {} - -void CUDARegressionQuantileloss::Init(const Metadata& metadata, data_size_t num_data) { - CUDARegressionL2loss::Init(metadata, num_data); - AllocateCUDAMemoryOuter(&cuda_data_indices_buffer_, static_cast(num_data), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_percentile_result_, 1, __FILE__, __LINE__); - if (cuda_weights_ != nullptr) { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION + 1; - AllocateCUDAMemoryOuter(&cuda_weights_prefix_sum_, static_cast(num_data), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_weights_prefix_sum_buffer_, static_cast(num_blocks), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_weight_by_leaf_buffer_, static_cast(num_data_), __FILE__, __LINE__); - } - AllocateCUDAMemoryOuter(&cuda_residual_buffer_, static_cast(num_data_), __FILE__, __LINE__); -} - -CUDARegressionQuantileloss::~CUDARegressionQuantileloss() {} - -CUDARegressionMAPELOSS::CUDARegressionMAPELOSS(const Config& config): -CUDARegressionL1loss(config) {} - -CUDARegressionMAPELOSS::CUDARegressionMAPELOSS(const std::vector& strs): -CUDARegressionL1loss(strs) {} - -CUDARegressionMAPELOSS::~CUDARegressionMAPELOSS() {} - -void CUDARegressionMAPELOSS::Init(const Metadata& metadata, data_size_t num_data) { - CUDARegressionL1loss::Init(metadata, num_data); - if (cuda_weights_ == nullptr) { - // allocate buffer for weights when they are not allocated in L1 loss - const int num_blocks = (num_data_ + 
GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION + 1; - AllocateCUDAMemoryOuter(&cuda_weights_prefix_sum_, static_cast(num_data), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_weights_prefix_sum_buffer_, static_cast(num_blocks), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_weight_by_leaf_buffer_, static_cast(num_data_), __FILE__, __LINE__); - } - for (data_size_t i = 0; i < num_data_; ++i) { - if (std::fabs(label_[i]) < 1) { - Log::Warning( - "Some label values are < 1 in absolute value. MAPE is unstable with such values, " - "so LightGBM rounds them to 1.0 when calculating MAPE."); - break; - } - } - AllocateCUDAMemoryOuter(&cuda_label_weights_, static_cast(num_data_), __FILE__, __LINE__); - LaunchCalcLabelWeightKernel(); -} - -CUDARegressionGammaLoss::CUDARegressionGammaLoss(const Config& config): -CUDARegressionPoissonLoss(config) {} - -CUDARegressionGammaLoss::CUDARegressionGammaLoss(const std::vector& strs): -CUDARegressionPoissonLoss(strs) {} - -CUDARegressionGammaLoss::~CUDARegressionGammaLoss() {} - -CUDARegressionTweedieLoss::CUDARegressionTweedieLoss(const Config& config): -CUDARegressionPoissonLoss(config), rho_(config.tweedie_variance_power) {} - -CUDARegressionTweedieLoss::CUDARegressionTweedieLoss(const std::vector& strs): -CUDARegressionPoissonLoss(strs) {} - -CUDARegressionTweedieLoss::~CUDARegressionTweedieLoss() {} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_regression_objective.cu b/src/objective/cuda/cuda_regression_objective.cu deleted file mode 100644 index 4d288be64389..000000000000 --- a/src/objective/cuda/cuda_regression_objective.cu +++ /dev/null @@ -1,616 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifdef USE_CUDA - -#include "cuda_regression_objective.hpp" -#include - -namespace LightGBM { - -double CUDARegressionL2loss::LaunchCalcInitScoreKernel() const { - double label_sum = 0.0f, weight_sum = 0.0f; - ReduceSumGlobal(cuda_labels_, static_cast(num_data_), cuda_block_buffer_); - CopyFromCUDADeviceToHostOuter(&label_sum, cuda_block_buffer_, 1, __FILE__, __LINE__); - if (cuda_weights_ == nullptr) { - weight_sum = static_cast(num_data_); - } else { - ReduceSumGlobal(cuda_weights_, static_cast(num_data_), cuda_block_buffer_); - CopyFromCUDADeviceToHostOuter(&weight_sum, cuda_block_buffer_, 1, __FILE__, __LINE__); - } - return label_sum / weight_sum; -} - -// TODO(shiyu1994): try to use global kernels as class methods -__global__ void ConvertOutputCUDAKernel_Regression(const bool sqrt, const data_size_t num_data, const double* input, double* output) { - const int data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (data_index < num_data) { - if (sqrt) { - const double sign = input[data_index] >= 0.0f ? 
1 : -1; - output[data_index] = sign * input[data_index] * input[data_index]; - } else { - output[data_index] = input[data_index]; - } - } -} - -void CUDARegressionL2loss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { - const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - ConvertOutputCUDAKernel_Regression<<>>(sqrt_, num_data, input, output); -} - - -template -__global__ void GetGradientsKernel_RegressionL2(const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, const data_size_t num_data, - score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); - if (data_index < num_data) { - if (!USE_WEIGHT) { - cuda_out_gradients[data_index] = static_cast(cuda_scores[data_index] - cuda_labels[data_index]); - cuda_out_hessians[data_index] = 1.0f; - } else { - const score_t weight = static_cast(cuda_weights[data_index]); - cuda_out_gradients[data_index] = static_cast(cuda_scores[data_index] - cuda_labels[data_index]) * weight; - cuda_out_hessians[data_index] = weight; - } - } -} - -void CUDARegressionL2loss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - if (cuda_weights_ == nullptr) { - GetGradientsKernel_RegressionL2<<>>(score, cuda_labels_, nullptr, num_data_, gradients, hessians); - } else { - GetGradientsKernel_RegressionL2<<>>(score, cuda_labels_, cuda_weights_, num_data_, gradients, hessians); - } -} - -double CUDARegressionL1loss::LaunchCalcInitScoreKernel() const { - const double alpha = 0.9f; - if (cuda_weights_ == nullptr) { - PercentileGlobal( - cuda_labels_, nullptr, cuda_data_indices_buffer_, nullptr, nullptr, alpha, num_data_, cuda_percentile_result_); - } else { - PercentileGlobal( - cuda_labels_, cuda_weights_, cuda_data_indices_buffer_, cuda_weights_prefix_sum_, cuda_weights_prefix_sum_buffer_, alpha, num_data_, cuda_percentile_result_); - } - label_t percentile_result = 0.0f; - CopyFromCUDADeviceToHostOuter(&percentile_result, cuda_percentile_result_, 1, __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - return static_cast(percentile_result); -} - -template -__global__ void GetGradientsKernel_RegressionL1(const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, const data_size_t num_data, - score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); - if (data_index < num_data) { - if (!USE_WEIGHT) { - const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); - cuda_out_gradients[data_index] = static_cast((diff > 0.0f) - (diff < 0.0f)); - cuda_out_hessians[data_index] = 1.0f; - } else { - const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); - const score_t weight = static_cast(cuda_weights[data_index]); - cuda_out_gradients[data_index] = static_cast((diff > 0.0f) - (diff < 0.0f)) * weight; - cuda_out_hessians[data_index] = weight; - } - } -} - -void CUDARegressionL1loss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - if (cuda_weights_ == 
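// --- Editorial sketch, not part of the original patch ---------------------
// Per-datum update produced by GetGradientsKernel_RegressionL2 above, written
// as plain host code for a single (score, label, weight) triple; the weighted
// branch simply scales both terms by the sample weight.
struct GradHess { double gradient; double hessian; };

inline GradHess RegressionL2Gradient(double score, double label, double weight = 1.0) {
  return {(score - label) * weight, weight};  // gradient = w * (score - label), hessian = w
}
// ---------------------------------------------------------------------------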
nullptr) { - GetGradientsKernel_RegressionL1<<>>(score, cuda_labels_, nullptr, num_data_, gradients, hessians); - } else { - GetGradientsKernel_RegressionL1<<>>(score, cuda_labels_, cuda_weights_, num_data_, gradients, hessians); - } -} - -template -__global__ void RenewTreeOutputCUDAKernel_RegressionL1( - const double* score, - const label_t* label, - const label_t* weight, - double* residual_buffer, - label_t* weight_by_leaf, - double* weight_prefix_sum_buffer, - const data_size_t* data_indices_in_leaf, - const data_size_t* num_data_in_leaf, - const data_size_t* data_start_in_leaf, - data_size_t* data_indices_buffer, - double* leaf_value) { - const int leaf_index = static_cast(blockIdx.x); - const data_size_t data_start = data_start_in_leaf[leaf_index]; - const data_size_t num_data = num_data_in_leaf[leaf_index]; - data_size_t* data_indices_buffer_pointer = data_indices_buffer + data_start; - const label_t* weight_by_leaf_pointer = weight_by_leaf + data_start; - double* weight_prefix_sum_buffer_pointer = weight_prefix_sum_buffer + data_start; - const double* residual_buffer_pointer = residual_buffer + data_start; - const double alpha = 0.5f; - for (data_size_t inner_data_index = data_start + static_cast(threadIdx.x); inner_data_index < data_start + num_data; inner_data_index += static_cast(blockDim.x)) { - const data_size_t data_index = data_indices_in_leaf[inner_data_index]; - const label_t data_label = label[data_index]; - const double data_score = score[data_index]; - residual_buffer[inner_data_index] = static_cast(data_label) - data_score; - if (USE_WEIGHT) { - weight_by_leaf[inner_data_index] = weight[data_index]; - } - } - __syncthreads(); - // TODO(shiyu1994): replace this bitonic sort based percentile method with a more efficient one - const double renew_leaf_value = PercentileDevice( - residual_buffer_pointer, weight_by_leaf_pointer, data_indices_buffer_pointer, - weight_prefix_sum_buffer_pointer, alpha, num_data); - if (threadIdx.x == 0) { - leaf_value[leaf_index] = renew_leaf_value; - } -} - -void CUDARegressionL1loss::LaunchRenewTreeOutputCUDAKernel( - const double* score, - const data_size_t* data_indices_in_leaf, - const data_size_t* num_data_in_leaf, - const data_size_t* data_start_in_leaf, - const int num_leaves, - double* leaf_value) const { - if (cuda_weights_ == nullptr) { - RenewTreeOutputCUDAKernel_RegressionL1<<>>( - score, - cuda_labels_, - cuda_weights_, - cuda_residual_buffer_, - cuda_weight_by_leaf_buffer_, - cuda_weights_prefix_sum_, - data_indices_in_leaf, - num_data_in_leaf, - data_start_in_leaf, - cuda_data_indices_buffer_, - leaf_value); - } else { - RenewTreeOutputCUDAKernel_RegressionL1<<>>( - score, - cuda_labels_, - cuda_weights_, - cuda_residual_buffer_, - cuda_weight_by_leaf_buffer_, - cuda_weights_prefix_sum_, - data_indices_in_leaf, - num_data_in_leaf, - data_start_in_leaf, - cuda_data_indices_buffer_, - leaf_value); - } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -template -__global__ void GetGradientsKernel_Huber(const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, const data_size_t num_data, - const double alpha, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); - if (data_index < num_data) { - if (!USE_WEIGHT) { - const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); - if (fabs(diff) <= alpha) { - cuda_out_gradients[data_index] = static_cast(diff); - } else { - const score_t sign = 
static_cast((diff > 0.0f) - (diff < 0.0f)); - cuda_out_gradients[data_index] = static_cast(sign * alpha); - } - cuda_out_hessians[data_index] = 1.0f; - } else { - const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); - const score_t weight = static_cast(cuda_weights[data_index]); - if (fabs(diff) <= alpha) { - cuda_out_gradients[data_index] = static_cast(diff) * weight; - } else { - const score_t sign = static_cast((diff > 0.0f) - (diff < 0.0f)); - cuda_out_gradients[data_index] = static_cast(sign * alpha) * weight; - } - cuda_out_hessians[data_index] = weight; - } - } -} - -void CUDARegressionHuberLoss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - if (cuda_weights_ == nullptr) { - GetGradientsKernel_Huber<<>>(score, cuda_labels_, nullptr, num_data_, alpha_, gradients, hessians); - } else { - GetGradientsKernel_Huber<<>>(score, cuda_labels_, cuda_weights_, num_data_, alpha_, gradients, hessians); - } -} - -template -__global__ void GetGradientsKernel_Fair(const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, const data_size_t num_data, - const double c, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); - if (data_index < num_data) { - if (!USE_WEIGHT) { - const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); - cuda_out_gradients[data_index] = static_cast(c * diff / (fabs(diff) + c)); - cuda_out_hessians[data_index] = static_cast(c * c / ((fabs(diff) + c) * (fabs(diff) + c))); - } else { - const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); - const score_t weight = static_cast(cuda_weights[data_index]); - cuda_out_gradients[data_index] = static_cast(c * diff / (fabs(diff) + c) * weight); - cuda_out_hessians[data_index] = static_cast(c * c / ((fabs(diff) + c) * (fabs(diff) + c)) * weight); - } - } -} - -void CUDARegressionFairLoss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - if (cuda_weights_ == nullptr) { - GetGradientsKernel_Fair<<>>(score, cuda_labels_, nullptr, num_data_, c_, gradients, hessians); - } else { - GetGradientsKernel_Fair<<>>(score, cuda_labels_, cuda_weights_, num_data_, c_, gradients, hessians); - } -} - -void CUDARegressionPoissonLoss::LaunchCheckLabelKernel() const { - ReduceSumGlobal(cuda_labels_, static_cast(num_data_), cuda_block_buffer_); - double label_sum = 0.0f; - CopyFromCUDADeviceToHostOuter(&label_sum, cuda_block_buffer_, 1, __FILE__, __LINE__); - - ReduceMinGlobal(cuda_labels_, static_cast(num_data_), cuda_block_buffer_); - double label_min = 0.0f; - CopyFromCUDADeviceToHostOuter(&label_min, cuda_block_buffer_, 1, __FILE__, __LINE__); - - if (label_min < 0.0f) { - Log::Fatal("[%s]: at least one target label is negative", GetName()); - } - if (label_sum == 0.0f) { - Log::Fatal("[%s]: sum of labels is zero", GetName()); - } -} - -template -__global__ void GetGradientsKernel_Poisson(const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, const data_size_t num_data, - const double max_delta_step, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const data_size_t data_index = 
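// --- Editorial sketch, not part of the original patch ---------------------
// The Huber and Fair kernels above reduce to these per-datum formulas
// (weight handling omitted; the weighted branches multiply both terms by w):
#include <cmath>

inline double HuberGradient(double score, double label, double alpha) {
  const double diff = score - label;
  if (std::fabs(diff) <= alpha) return diff;  // quadratic region
  return diff > 0.0 ? alpha : -alpha;         // linear region; the hessian stays 1
}

inline void FairGradient(double score, double label, double c,
                         double* gradient, double* hessian) {
  const double diff = score - label;
  *gradient = c * diff / (std::fabs(diff) + c);
  *hessian = c * c / ((std::fabs(diff) + c) * (std::fabs(diff) + c));
}
// ---------------------------------------------------------------------------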
static_cast(blockDim.x * blockIdx.x + threadIdx.x); - if (data_index < num_data) { - if (!USE_WEIGHT) { - cuda_out_gradients[data_index] = static_cast(exp(cuda_scores[data_index]) - cuda_labels[data_index]); - cuda_out_hessians[data_index] = static_cast(std::exp(cuda_scores[data_index] + max_delta_step)); - } else { - const score_t weight = static_cast(cuda_weights[data_index]); - cuda_out_gradients[data_index] = static_cast(exp(cuda_scores[data_index]) - cuda_labels[data_index]) * weight; - cuda_out_hessians[data_index] = static_cast(std::exp(cuda_scores[data_index] + max_delta_step)) * weight; - } - } -} - -void CUDARegressionPoissonLoss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - if (cuda_weights_ == nullptr) { - GetGradientsKernel_Poisson<<>>(score, cuda_labels_, nullptr, num_data_, max_delta_step_, gradients, hessians); - } else { - GetGradientsKernel_Poisson<<>>(score, cuda_labels_, cuda_weights_, num_data_, max_delta_step_, gradients, hessians); - } -} - -// TODO(shiyu1994): try to use global kernels as class methods -__global__ void ConvertOutputCUDAKernel_Regression_Poissson(const bool sqrt, const data_size_t num_data, const double* input, double* output) { - const int data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (data_index < num_data) { - output[data_index] = exp(input[data_index]); - } -} - -void CUDARegressionPoissonLoss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { - const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - ConvertOutputCUDAKernel_Regression_Poissson<<>>(sqrt_, num_data, input, output); -} - -double CUDARegressionQuantileloss::LaunchCalcInitScoreKernel() const { - if (cuda_weights_ == nullptr) { - PercentileGlobal( - cuda_labels_, nullptr, cuda_data_indices_buffer_, nullptr, nullptr, alpha_, num_data_, cuda_percentile_result_); - } else { - PercentileGlobal( - cuda_labels_, cuda_weights_, cuda_data_indices_buffer_, cuda_weights_prefix_sum_, cuda_weights_prefix_sum_buffer_, alpha_, num_data_, cuda_percentile_result_); - } - label_t percentile_result = 0.0f; - CopyFromCUDADeviceToHostOuter(&percentile_result, cuda_percentile_result_, 1, __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - return static_cast(percentile_result); -} - -template -__global__ void RenewTreeOutputCUDAKernel_RegressionQuantile( - const double* score, - const label_t* label, - const label_t* weight, - double* residual_buffer, - label_t* weight_by_leaf, - double* weight_prefix_sum_buffer, - const data_size_t* data_indices_in_leaf, - const data_size_t* num_data_in_leaf, - const data_size_t* data_start_in_leaf, - data_size_t* data_indices_buffer, - double* leaf_value, - const double alpha) { - const int leaf_index = static_cast(blockIdx.x); - const data_size_t data_start = data_start_in_leaf[leaf_index]; - const data_size_t num_data = num_data_in_leaf[leaf_index]; - data_size_t* data_indices_buffer_pointer = data_indices_buffer + data_start; - const label_t* weight_by_leaf_pointer = weight_by_leaf + data_start; - double* weight_prefix_sum_buffer_pointer = weight_prefix_sum_buffer + data_start; - const double* residual_buffer_pointer = residual_buffer + data_start; - for (data_size_t inner_data_index = data_start + static_cast(threadIdx.x); inner_data_index < 
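// --- Editorial sketch, not part of the original patch ---------------------
// GetGradientsKernel_Poisson above works in log space: the raw score is the
// log of the predicted mean, so ConvertOutputCUDAKernel_Regression_Poissson
// exponentiates it. Per datum, unweighted:
#include <cmath>

inline void PoissonGradient(double score, double label, double max_delta_step,
                            double* gradient, double* hessian) {
  *gradient = std::exp(score) - label;
  *hessian = std::exp(score + max_delta_step);  // max_delta_step stabilizes the Newton step
}
// ---------------------------------------------------------------------------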
data_start + num_data; inner_data_index += static_cast(blockDim.x)) { - const data_size_t data_index = data_indices_in_leaf[inner_data_index]; - const label_t data_label = label[data_index]; - const double data_score = score[data_index]; - residual_buffer[inner_data_index] = static_cast(data_label) - data_score; - if (USE_WEIGHT) { - weight_by_leaf[inner_data_index] = weight[data_index]; - } - } - __syncthreads(); - // TODO(shiyu1994): replace this bitonic sort based percentile method with a more efficient one - const double renew_leaf_value = PercentileDevice( - residual_buffer_pointer, weight_by_leaf_pointer, data_indices_buffer_pointer, - weight_prefix_sum_buffer_pointer, alpha, num_data); - if (threadIdx.x == 0) { - leaf_value[leaf_index] = renew_leaf_value; - } -} - -void CUDARegressionQuantileloss::LaunchRenewTreeOutputCUDAKernel( - const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf, - const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const { - if (cuda_weights_ == nullptr) { - RenewTreeOutputCUDAKernel_RegressionQuantile<<>>( - score, - cuda_labels_, - cuda_weights_, - cuda_residual_buffer_, - cuda_weight_by_leaf_buffer_, - cuda_weights_prefix_sum_, - data_indices_in_leaf, - num_data_in_leaf, - data_start_in_leaf, - cuda_data_indices_buffer_, - leaf_value, - alpha_); - } else { - RenewTreeOutputCUDAKernel_RegressionQuantile<<>>( - score, - cuda_labels_, - cuda_weights_, - cuda_residual_buffer_, - cuda_weight_by_leaf_buffer_, - cuda_weights_prefix_sum_, - data_indices_in_leaf, - num_data_in_leaf, - data_start_in_leaf, - cuda_data_indices_buffer_, - leaf_value, - alpha_); - } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -template -__global__ void GetGradientsKernel_RegressionQuantile(const double* cuda_scores, const label_t* cuda_labels, - const label_t* cuda_weights, const data_size_t num_data, const double alpha, - score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); - if (data_index < num_data) { - if (!USE_WEIGHT) { - const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); - if (diff >= 0.0f) { - cuda_out_gradients[data_index] = (1.0f - alpha); - } else { - cuda_out_gradients[data_index] = -alpha; - } - cuda_out_hessians[data_index] = 1.0f; - } else { - const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); - const score_t weight = static_cast(cuda_weights[data_index]); - if (diff >= 0.0f) { - cuda_out_gradients[data_index] = (1.0f - alpha) * weight; - } else { - cuda_out_gradients[data_index] = -alpha * weight; - } - cuda_out_hessians[data_index] = weight; - } - } -} - -void CUDARegressionQuantileloss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - if (cuda_weights_ == nullptr) { - GetGradientsKernel_RegressionQuantile<<>>(score, cuda_labels_, nullptr, num_data_, alpha_, gradients, hessians); - } else { - GetGradientsKernel_RegressionQuantile<<>>(score, cuda_labels_, cuda_weights_, num_data_, alpha_, gradients, hessians); - } -} - -template -__global__ void CalcLabelWeightKernel( - const label_t* cuda_labels, - const label_t* cuda_weights, - const data_size_t num_data, - label_t* label_weights -) { - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - 
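// --- Editorial sketch, not part of the original patch ---------------------
// GetGradientsKernel_RegressionQuantile above is the pinball-loss subgradient:
// for quantile level alpha, over-predictions get (1 - alpha) and
// under-predictions get -alpha, with a constant hessian (both scaled by the
// sample weight when weights are present).
inline double QuantileGradient(double score, double label, double alpha) {
  return (score - label) >= 0.0 ? (1.0 - alpha) : -alpha;
}
// ---------------------------------------------------------------------------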
if (data_index < num_data) { - const label_t label = cuda_labels[data_index]; - if (!USE_WEIGHT) { - label_weights[data_index] = 1.0f / max(1.0f, fabs(label)); - } else { - const label_t weight = cuda_weights[data_index]; - label_weights[data_index] = 1.0f / max(1.0f, fabs(label)) * weight; - } - } -} - -void CUDARegressionMAPELOSS::LaunchCalcLabelWeightKernel() { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - if (cuda_weights_ == nullptr) { - CalcLabelWeightKernel<<>>(cuda_labels_, cuda_weights_, num_data_, cuda_label_weights_); - } else { - CalcLabelWeightKernel<<>>(cuda_labels_, cuda_weights_, num_data_, cuda_label_weights_); - } -} - -template -__global__ void GetGradientsKernel_RegressionMAPELOSS(const double* cuda_scores, const label_t* cuda_labels, - const label_t* cuda_weights, const label_t* cuda_label_weights, const data_size_t num_data, - score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); - if (data_index < num_data) { - const double diff = cuda_scores[data_index] - static_cast(cuda_labels[data_index]); - const label_t label_weight = cuda_label_weights[data_index]; - const double sign = static_cast((diff > 0) - (diff < 0)); - if (!USE_WEIGHT) { - cuda_out_gradients[data_index] = static_cast(sign * label_weight); - cuda_out_hessians[data_index] = 1.0f; - } else { - const score_t weight = static_cast(cuda_weights[data_index]); - cuda_out_gradients[data_index] = static_cast(sign * label_weight) * weight; - cuda_out_hessians[data_index] = weight; - } - } -} - -void CUDARegressionMAPELOSS::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - if (cuda_weights_ == nullptr) { - GetGradientsKernel_RegressionMAPELOSS<<>>(score, cuda_labels_, nullptr, cuda_label_weights_, num_data_, gradients, hessians); - } else { - GetGradientsKernel_RegressionMAPELOSS<<>>(score, cuda_labels_, cuda_weights_, cuda_label_weights_, num_data_, gradients, hessians); - } -} - -double CUDARegressionMAPELOSS::LaunchCalcInitScoreKernel() const { - PercentileGlobal( - cuda_labels_, cuda_label_weights_, cuda_data_indices_buffer_, - cuda_weights_prefix_sum_, cuda_weights_prefix_sum_buffer_, 0.5f, num_data_, cuda_percentile_result_); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - label_t percentile_result = 0.0f; - CopyFromCUDADeviceToHostOuter(&percentile_result, cuda_percentile_result_, 1, __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - return static_cast(percentile_result); -} - -__global__ void RenewTreeOutputCUDAKernel_RegressionMAPE( - const double* score, - const label_t* label, - const label_t* weight, - double* residual_buffer, - label_t* weight_by_leaf, - double* weight_prefix_sum_buffer, - const data_size_t* data_indices_in_leaf, - const data_size_t* num_data_in_leaf, - const data_size_t* data_start_in_leaf, - data_size_t* data_indices_buffer, - double* leaf_value) { - const int leaf_index = static_cast(blockIdx.x); - const data_size_t data_start = data_start_in_leaf[leaf_index]; - const data_size_t num_data = num_data_in_leaf[leaf_index]; - data_size_t* data_indices_buffer_pointer = data_indices_buffer + data_start; - const label_t* weight_by_leaf_pointer = weight_by_leaf + data_start; - double* weight_prefix_sum_buffer_pointer = weight_prefix_sum_buffer + 
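// --- Editorial sketch, not part of the original patch ---------------------
// MAPE above is treated as a weighted L1: each datum gets a fixed label weight
// 1 / max(1, |label|), computed once by CalcLabelWeightKernel, and the
// gradient is the sign of the residual scaled by that weight (times the sample
// weight in the weighted branch). Unweighted host reference:
#include <algorithm>
#include <cmath>

inline double MAPEGradient(double score, double label) {
  const double diff = score - label;
  const double label_weight = 1.0 / std::max(1.0, std::fabs(label));
  return ((diff > 0.0) - (diff < 0.0)) * label_weight;
}
// ---------------------------------------------------------------------------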
data_start; - const double* residual_buffer_pointer = residual_buffer + data_start; - const double alpha = 0.5f; - for (data_size_t inner_data_index = data_start + static_cast(threadIdx.x); inner_data_index < data_start + num_data; inner_data_index += static_cast(blockDim.x)) { - const data_size_t data_index = data_indices_in_leaf[inner_data_index]; - const label_t data_label = label[data_index]; - const double data_score = score[data_index]; - residual_buffer[inner_data_index] = static_cast(data_label) - data_score; - weight_by_leaf[inner_data_index] = weight[data_index]; - } - __syncthreads(); - // TODO(shiyu1994): replace this bitonic sort based percentile method with a more efficient one - const double renew_leaf_value = PercentileDevice( - residual_buffer_pointer, weight_by_leaf_pointer, data_indices_buffer_pointer, - weight_prefix_sum_buffer_pointer, alpha, num_data); - if (threadIdx.x == 0) { - leaf_value[leaf_index] = renew_leaf_value; - } -} - -void CUDARegressionMAPELOSS::LaunchRenewTreeOutputCUDAKernel( - const double* score, - const data_size_t* data_indices_in_leaf, - const data_size_t* num_data_in_leaf, - const data_size_t* data_start_in_leaf, - const int num_leaves, - double* leaf_value) const { - RenewTreeOutputCUDAKernel_RegressionMAPE<<>>( - score, - cuda_labels_, - cuda_label_weights_, - cuda_residual_buffer_, - cuda_weight_by_leaf_buffer_, - cuda_weights_prefix_sum_, - data_indices_in_leaf, - num_data_in_leaf, - data_start_in_leaf, - cuda_data_indices_buffer_, - leaf_value); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -template -__global__ void GetGradientsKernel_Gamma(const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, const data_size_t num_data, - const double max_delta_step, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); - if (data_index < num_data) { - if (!USE_WEIGHT) { - cuda_out_gradients[data_index] = static_cast(1.0 - cuda_labels[data_index] / exp(cuda_scores[data_index])); - cuda_out_hessians[data_index] = static_cast(cuda_labels[data_index] / exp(cuda_scores[data_index])); - } else { - const score_t weight = static_cast(cuda_weights[data_index]); - cuda_out_gradients[data_index] = static_cast(1.0 - cuda_labels[data_index] / exp(cuda_scores[data_index])) * weight; - cuda_out_hessians[data_index] = static_cast(cuda_labels[data_index] / exp(cuda_scores[data_index])) * weight; - } - } -} - -void CUDARegressionGammaLoss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - if (cuda_weights_ == nullptr) { - GetGradientsKernel_Gamma<<>>(score, cuda_labels_, nullptr, num_data_, max_delta_step_, gradients, hessians); - } else { - GetGradientsKernel_Gamma<<>>(score, cuda_labels_, cuda_weights_, num_data_, max_delta_step_, gradients, hessians); - } -} - -template -__global__ void GetGradientsKernel_Tweedie(const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, const data_size_t num_data, - const double rho, score_t* cuda_out_gradients, score_t* cuda_out_hessians) { - const data_size_t data_index = static_cast(blockDim.x * blockIdx.x + threadIdx.x); - if (data_index < num_data) { - if (!USE_WEIGHT) { - cuda_out_gradients[data_index] = static_cast(-cuda_labels[data_index] * exp((1 - rho) * cuda_scores[data_index]) + exp((2 - rho) * 
cuda_scores[data_index])); - cuda_out_hessians[data_index] = static_cast(-cuda_labels[data_index] * (1 - rho) * exp((1 - rho) * cuda_scores[data_index]) + - (2 - rho) * exp((2 - rho) * cuda_scores[data_index])); - } else { - const score_t weight = static_cast(cuda_weights[data_index]); - cuda_out_gradients[data_index] = static_cast(-cuda_labels[data_index] * exp((1 - rho) * cuda_scores[data_index]) + - exp((2 - rho) * cuda_scores[data_index])) * weight; - cuda_out_hessians[data_index] = static_cast(-cuda_labels[data_index] * (1 - rho) * exp((1 - rho) * cuda_scores[data_index]) + - (2 - rho) * exp((2 - rho) * cuda_scores[data_index])) * weight; - } - } -} - -void CUDARegressionTweedieLoss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION; - if (cuda_weights_ == nullptr) { - GetGradientsKernel_Tweedie<<>>(score, cuda_labels_, nullptr, num_data_, rho_, gradients, hessians); - } else { - GetGradientsKernel_Tweedie<<>>(score, cuda_labels_, cuda_weights_, num_data_, rho_, gradients, hessians); - } -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_regression_objective.hpp b/src/objective/cuda/cuda_regression_objective.hpp deleted file mode 100644 index e3b2d77f9ec8..000000000000 --- a/src/objective/cuda/cuda_regression_objective.hpp +++ /dev/null @@ -1,272 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. - */ - -#ifndef LIGHTGBM_NEW_CUDA_REGRESSION_OBJECTIVE_HPP_ -#define LIGHTGBM_NEW_CUDA_REGRESSION_OBJECTIVE_HPP_ - -#ifdef USE_CUDA - -#define GET_GRADIENTS_BLOCK_SIZE_REGRESSION (1024) - -#include -#include "../regression_objective.hpp" - -namespace LightGBM { - -class CUDARegressionL2loss : public CUDAObjectiveInterface, public RegressionL2loss { - public: - explicit CUDARegressionL2loss(const Config& config); - - explicit CUDARegressionL2loss(const std::vector& strs); - - ~CUDARegressionL2loss(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; - - void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; - - double BoostFromScore(int) const override; - - void RenewTreeOutputCUDA(const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf, - const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const override; - - std::function GetCUDAConvertOutputFunc() const override { - return [this] (data_size_t num_data, const double* input, double* output) { - ConvertOutputCUDA(num_data, input, output); - }; - } - - bool IsConstantHessian() const override { - if (cuda_weights_ == nullptr) { - return true; - } else { - return false; - } - } - - protected: - virtual double LaunchCalcInitScoreKernel() const; - - virtual void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; - - virtual void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; - - virtual void LaunchRenewTreeOutputCUDAKernel( - const double* /*score*/, const data_size_t* /*data_indices_in_leaf*/, const data_size_t* /*num_data_in_leaf*/, - const data_size_t* /*data_start_in_leaf*/, const int 
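// --- Editorial sketch, not part of the original patch ---------------------
// Gamma and Tweedie (kernels above) also operate on log-space scores; per
// datum, unweighted, with rho the Tweedie variance power:
#include <cmath>

inline void GammaGradient(double score, double label, double* gradient, double* hessian) {
  const double mu = std::exp(score);
  *gradient = 1.0 - label / mu;
  *hessian = label / mu;
}

inline void TweedieGradient(double score, double label, double rho,
                            double* gradient, double* hessian) {
  *gradient = -label * std::exp((1.0 - rho) * score) + std::exp((2.0 - rho) * score);
  *hessian = -label * (1.0 - rho) * std::exp((1.0 - rho) * score) +
             (2.0 - rho) * std::exp((2.0 - rho) * score);
}
// ---------------------------------------------------------------------------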
/*num_leaves*/, double* /*leaf_value*/) const {} - - const label_t* cuda_labels_; - const label_t* cuda_weights_; - label_t* cuda_trans_label_; - double* cuda_block_buffer_; - data_size_t num_get_gradients_blocks_; - data_size_t num_init_score_blocks_; -}; - -class CUDARegressionL1loss : public CUDARegressionL2loss { - public: - explicit CUDARegressionL1loss(const Config& config); - - explicit CUDARegressionL1loss(const std::vector& strs); - - ~CUDARegressionL1loss(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - const char* GetName() const override { - return "regression_l1"; - } - - bool IsRenewTreeOutput() const override { return true; } - - protected: - data_size_t* cuda_data_indices_buffer_; - mutable double* cuda_weights_prefix_sum_; - double* cuda_weights_prefix_sum_buffer_; - mutable double* cuda_residual_buffer_; - mutable label_t* cuda_weight_by_leaf_buffer_; - label_t* cuda_percentile_result_; - - double LaunchCalcInitScoreKernel() const override; - - void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; - - void LaunchRenewTreeOutputCUDAKernel( - const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf, - const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const override; -}; - -class CUDARegressionHuberLoss : public CUDARegressionL2loss { - public: - explicit CUDARegressionHuberLoss(const Config& config); - - explicit CUDARegressionHuberLoss(const std::vector& strs); - - ~CUDARegressionHuberLoss(); - - const char* GetName() const override { - return "huber"; - } - - private: - void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; - - const double alpha_ = 0.0f; -}; - -class CUDARegressionFairLoss : public CUDARegressionL2loss { - public: - explicit CUDARegressionFairLoss(const Config& config); - - explicit CUDARegressionFairLoss(const std::vector& strs); - - ~CUDARegressionFairLoss(); - - const char* GetName() const override { - return "fair"; - } - - bool IsConstantHessian() const override { - return false; - } - - private: - void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; - - const double c_ = 0.0f; -}; - -class CUDARegressionPoissonLoss : public CUDARegressionL2loss { - public: - explicit CUDARegressionPoissonLoss(const Config& config); - - explicit CUDARegressionPoissonLoss(const std::vector& strs); - - ~CUDARegressionPoissonLoss(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - double LaunchCalcInitScoreKernel() const override; - - bool IsConstantHessian() const override { - return false; - } - - const char* GetName() const override { - return "poisson"; - } - - protected: - void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; - - void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const override; - - void LaunchCheckLabelKernel() const; - - const double max_delta_step_ = 0.0f; - mutable double* cuda_block_buffer_; -}; - -class CUDARegressionQuantileloss : public CUDARegressionL2loss { - public: - explicit CUDARegressionQuantileloss(const Config& config); - - explicit CUDARegressionQuantileloss(const std::vector& strs); - - ~CUDARegressionQuantileloss(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - const char* GetName() const override { - return "quantile"; 
- } - - bool IsRenewTreeOutput() const override { return true; } - - private: - void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; - - double LaunchCalcInitScoreKernel() const override; - - void LaunchRenewTreeOutputCUDAKernel( - const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf, - const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const override; - - const double alpha_ = 0.0f; - data_size_t* cuda_data_indices_buffer_; - mutable double* cuda_weights_prefix_sum_; - double* cuda_weights_prefix_sum_buffer_; - mutable double* cuda_residual_buffer_; - mutable label_t* cuda_weight_by_leaf_buffer_; - label_t* cuda_percentile_result_; -}; - -class CUDARegressionMAPELOSS : public CUDARegressionL1loss { - public: - explicit CUDARegressionMAPELOSS(const Config& config); - - explicit CUDARegressionMAPELOSS(const std::vector& strs); - - ~CUDARegressionMAPELOSS(); - - void Init(const Metadata& metadata, data_size_t num_data) override; - - bool IsRenewTreeOutput() const override { return true; } - - private: - void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; - - double LaunchCalcInitScoreKernel() const override; - - void LaunchRenewTreeOutputCUDAKernel( - const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf, - const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const override; - - void LaunchCalcLabelWeightKernel(); - - label_t* cuda_label_weights_; -}; - -class CUDARegressionGammaLoss : public CUDARegressionPoissonLoss { - public: - explicit CUDARegressionGammaLoss(const Config& config); - - explicit CUDARegressionGammaLoss(const std::vector& strs); - - ~CUDARegressionGammaLoss(); - - const char* GetName() const override { - return "gamma"; - } - - private: - void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; -}; - -class CUDARegressionTweedieLoss : public CUDARegressionPoissonLoss { - public: - explicit CUDARegressionTweedieLoss(const Config& config); - - explicit CUDARegressionTweedieLoss(const std::vector& strs); - - ~CUDARegressionTweedieLoss(); - - const char* GetName() const override { - return "tweedie"; - } - - private: - const double rho_ = 0.0f; - - void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override; -}; - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_NEW_CUDA_REGRESSION_OBJECTIVE_HPP_ diff --git a/src/objective/cuda/cuda_xentropy_objective.cpp b/src/objective/cuda/cuda_xentropy_objective.cpp deleted file mode 100644 index d9247736ff43..000000000000 --- a/src/objective/cuda/cuda_xentropy_objective.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
- */ - -#include "cuda_xentropy_objective.hpp" - -namespace LightGBM { - -CUDACrossEntropy::CUDACrossEntropy(const Config& config): CrossEntropy(config) {} - -CUDACrossEntropy::CUDACrossEntropy(const std::vector& strs): CrossEntropy(strs) {} - -CUDACrossEntropy::~CUDACrossEntropy() {} - -void CUDACrossEntropy::Init(const Metadata& metadata, data_size_t num_data) { - CrossEntropy::Init(metadata, num_data); - const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_XENTROPY - 1) / GET_GRADIENTS_BLOCK_SIZE_XENTROPY; - AllocateCUDAMemoryOuter(&cuda_reduce_sum_buffer_, static_cast(num_blocks), __FILE__, __LINE__); - cuda_labels_ = metadata.cuda_metadata()->cuda_label(); - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); -} - -double CUDACrossEntropy::BoostFromScore(int) const { - return LaunchCalcInitScoreKernel(); -} - -void CUDACrossEntropy::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { - LaunchGetGradientsKernel(score, gradients, hessians); -} - -void CUDACrossEntropy::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { - LaunchConvertOutputCUDAKernel(num_data, input, output); -} - -CUDACrossEntropyLambda::CUDACrossEntropyLambda(const Config& config): CrossEntropyLambda(config) {} - -CUDACrossEntropyLambda::CUDACrossEntropyLambda(const std::vector& strs): CrossEntropyLambda(strs) {} - -CUDACrossEntropyLambda::~CUDACrossEntropyLambda() {} - -void CUDACrossEntropyLambda::Init(const Metadata& metadata, data_size_t num_data) { - CrossEntropyLambda::Init(metadata, num_data); - const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_XENTROPY - 1) / GET_GRADIENTS_BLOCK_SIZE_XENTROPY; - AllocateCUDAMemoryOuter(&cuda_reduce_sum_buffer_, static_cast(num_blocks), __FILE__, __LINE__); - cuda_labels_ = metadata.cuda_metadata()->cuda_label(); - cuda_weights_ = metadata.cuda_metadata()->cuda_weights(); -} - -double CUDACrossEntropyLambda::BoostFromScore(int) const { - return LaunchCalcInitScoreKernel(); -} - -void CUDACrossEntropyLambda::GetGradients(const double* score, score_t* gradients, score_t* hessians) const { - LaunchGetGradientsKernel(score, gradients, hessians); -} - -void CUDACrossEntropyLambda::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const { - LaunchConvertOutputCUDAKernel(num_data, input, output); -} - -} // namespace LightGBM diff --git a/src/objective/cuda/cuda_xentropy_objective.cu b/src/objective/cuda/cuda_xentropy_objective.cu deleted file mode 100644 index 3595877470a4..000000000000 --- a/src/objective/cuda/cuda_xentropy_objective.cu +++ /dev/null @@ -1,144 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
- */ - -#include -#include "cuda_xentropy_objective.hpp" - -namespace LightGBM { - -double CUDACrossEntropy::LaunchCalcInitScoreKernel() const { - double suml = 0.0f; - double sumw = 0.0f; - if (cuda_weights_ == nullptr) { - sumw = static_cast(num_data_); - ReduceSumGlobal(cuda_labels_, static_cast(num_data_), cuda_reduce_sum_buffer_); - CopyFromCUDADeviceToHostOuter(&suml, cuda_reduce_sum_buffer_, 1, __FILE__, __LINE__); - } else { - ReduceDotProductGlobal(cuda_labels_, cuda_weights_, static_cast(num_data_), cuda_reduce_sum_buffer_); - CopyFromCUDADeviceToHostOuter(&suml, cuda_reduce_sum_buffer_, 1, __FILE__, __LINE__); - ReduceSumGlobal(cuda_weights_, static_cast(num_data_), cuda_reduce_sum_buffer_); - CopyFromCUDADeviceToHostOuter(&sumw, cuda_reduce_sum_buffer_, 1, __FILE__, __LINE__); - } - double pavg = suml / sumw; - pavg = std::min(pavg, 1.0 - kEpsilon); - pavg = std::max(pavg, kEpsilon); - return std::log(pavg / (1.0f - pavg)); -} - -template -__global__ void GetGradientsKernel_CrossEntropy( - const double* cuda_scores, - const label_t* cuda_labels, - const label_t* cuda_weights, - const data_size_t num_data, - score_t* cuda_out_gradients, - score_t* cuda_out_hessians) { - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (data_index < num_data) { - if (USE_WEIGHT) { - const double z = 1.0f / (1.0f + exp(-cuda_scores[data_index])); - const label_t weight = cuda_weights[data_index]; - cuda_out_gradients[data_index] = static_cast(z - cuda_labels[data_index] * weight); - cuda_out_hessians[data_index] = static_cast(z * (1.0f - z) * weight); - } else { - const double z = 1.0f / (1.0f + exp(-cuda_scores[data_index])); - cuda_out_gradients[data_index] = static_cast(z - cuda_labels[data_index]); - cuda_out_hessians[data_index] = static_cast(z * (1.0f - z)); - } - } -} - -void CUDACrossEntropy::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_XENTROPY - 1) / GET_GRADIENTS_BLOCK_SIZE_XENTROPY; - if (cuda_weights_ == nullptr) { - GetGradientsKernel_CrossEntropy<<>>(score, cuda_labels_, nullptr, num_data_, gradients, hessians); - } else { - GetGradientsKernel_CrossEntropy<<>>(score, cuda_labels_, cuda_weights_, num_data_, gradients, hessians); - } -} - -__global__ void ConvertOutputCUDAKernel_CrossEntropy(const data_size_t num_data, const double* input, double* output) { - const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (data_index < num_data) { - output[data_index] = 1.0f / (1.0f + exp(-input[data_index])); - } -} - -void CUDACrossEntropy::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { - const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_XENTROPY - 1) / GET_GRADIENTS_BLOCK_SIZE_XENTROPY; - ConvertOutputCUDAKernel_CrossEntropy<<>>(num_data, input, output); -} - -double CUDACrossEntropyLambda::LaunchCalcInitScoreKernel() const { - double suml = 0.0f; - double sumw = 0.0f; - if (cuda_weights_ == nullptr) { - sumw = static_cast(num_data_); - ReduceSumGlobal(cuda_labels_, static_cast(num_data_), cuda_reduce_sum_buffer_); - CopyFromCUDADeviceToHostOuter(&suml, cuda_reduce_sum_buffer_, 1, __FILE__, __LINE__); - } else { - ReduceDotProductGlobal(cuda_labels_, cuda_weights_, static_cast(num_data_), cuda_reduce_sum_buffer_); - CopyFromCUDADeviceToHostOuter(&suml, cuda_reduce_sum_buffer_, 1, __FILE__, __LINE__); - ReduceSumGlobal(cuda_weights_, 
static_cast(num_data_), cuda_reduce_sum_buffer_); - CopyFromCUDADeviceToHostOuter(&sumw, cuda_reduce_sum_buffer_, 1, __FILE__, __LINE__); - } - double havg = suml / sumw; - return std::log(std::exp(havg) - 1.0f); -} - -template -__global__ void GetGradientsKernel_CrossEntropyLambda( - const double* cuda_scores, - const label_t* cuda_labels, - const label_t* cuda_weights, - const data_size_t num_data, - score_t* cuda_out_gradients, - score_t* cuda_out_hessians) { - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (data_index < num_data) { - if (USE_WEIGHT) { - const double w = static_cast(cuda_weights[data_index]); - const double y = static_cast(cuda_labels[data_index]); - const double epf = exp(cuda_scores[data_index]); - const double hhat = log(1.0f + epf); - const double z = 1.0f - exp(-w * hhat); - const double enf = 1.0f / epf; // = std::exp(-cuda_scores[data_index]); - cuda_out_gradients[data_index] = static_cast((1.0f - y / z) * w / (1.0f + enf)); - const double c = 1.0f / (1.0f - z); - double d = 1.0f + epf; - const double a = w * epf / (d * d); - d = c - 1.0f; - const double b = (c / (d * d) ) * (1.0f + w * epf - c); - cuda_out_hessians[data_index] = static_cast(a * (1.0f + y * b)); - } else { - const double z = 1.0f / (1.0f + exp(-cuda_scores[data_index])); - cuda_out_gradients[data_index] = static_cast(z - cuda_labels[data_index]); - cuda_out_hessians[data_index] = static_cast(z * (1.0f - z)); - } - } -} - -void CUDACrossEntropyLambda::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const { - const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_XENTROPY - 1) / GET_GRADIENTS_BLOCK_SIZE_XENTROPY; - if (cuda_weights_ == nullptr) { - GetGradientsKernel_CrossEntropyLambda<<>>(score, cuda_labels_, nullptr, num_data_, gradients, hessians); - } else { - GetGradientsKernel_CrossEntropyLambda<<>>(score, cuda_labels_, cuda_weights_, num_data_, gradients, hessians); - } -} - -__global__ void ConvertOutputCUDAKernel_CUDACrossEntropyLambda(const data_size_t num_data, const double* input, double* output) { - const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (data_index < num_data) { - output[data_index] = log(1.0f + exp(input[data_index])); - } -} - -void CUDACrossEntropyLambda::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const { - const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_XENTROPY - 1) / GET_GRADIENTS_BLOCK_SIZE_XENTROPY; - ConvertOutputCUDAKernel_CUDACrossEntropyLambda<<>>(num_data, input, output); -} - -} // namespace LightGBM diff --git a/src/objective/cuda/cuda_xentropy_objective.hpp b/src/objective/cuda/cuda_xentropy_objective.hpp deleted file mode 100644 index 39e47bb5fa5a..000000000000 --- a/src/objective/cuda/cuda_xentropy_objective.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
- */ -#ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_XENTROPY_OBJECTIVE_HPP_ -#define LIGHTGBM_OBJECTIVE_CUDA_CUDA_XENTROPY_OBJECTIVE_HPP_ - -#include -#include "../xentropy_objective.hpp" - -#define GET_GRADIENTS_BLOCK_SIZE_XENTROPY (1024) - -namespace LightGBM { - -class CUDACrossEntropy: public CUDAObjectiveInterface, public CrossEntropy { - public: - explicit CUDACrossEntropy(const Config& config); - - explicit CUDACrossEntropy(const std::vector& strs); - - ~CUDACrossEntropy(); - - virtual void Init(const Metadata& metadata, data_size_t num_data) override; - - double BoostFromScore(int) const override; - - virtual void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; - - void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; - - std::function GetCUDAConvertOutputFunc() const override { - return [this] (data_size_t num_data, const double* input, double* output) { - ConvertOutputCUDA(num_data, input, output); - }; - } - - private: - void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; - - double LaunchCalcInitScoreKernel() const; - - void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; - - const label_t* cuda_labels_; - const label_t* cuda_weights_; - double* cuda_reduce_sum_buffer_; -}; - -class CUDACrossEntropyLambda: public CUDAObjectiveInterface, public CrossEntropyLambda { - public: - explicit CUDACrossEntropyLambda(const Config& config); - - explicit CUDACrossEntropyLambda(const std::vector& strs); - - ~CUDACrossEntropyLambda(); - - virtual void Init(const Metadata& metadata, data_size_t num_data) override; - - double BoostFromScore(int) const override; - - virtual void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override; - - void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override; - - std::function GetCUDAConvertOutputFunc() const override { - return [this] (data_size_t num_data, const double* input, double* output) { - ConvertOutputCUDA(num_data, input, output); - }; - } - - private: - void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const; - - double LaunchCalcInitScoreKernel() const; - - void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const; - - const label_t* cuda_labels_; - const label_t* cuda_weights_; - double* cuda_reduce_sum_buffer_; -}; - -} // namespace LightGBM - -#endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_XENTROPY_OBJECTIVE_HPP_ diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index 30d3d075b30e..d58b6f26aa4c 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -10,89 +10,43 @@ #include "regression_objective.hpp" #include "xentropy_objective.hpp" -#include "cuda/cuda_binary_objective.hpp" -#include "cuda/cuda_multiclass_objective.hpp" -#include "cuda/cuda_regression_objective.hpp" -#include "cuda/cuda_rank_objective.hpp" -#include "cuda/cuda_xentropy_objective.hpp" - namespace LightGBM { ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& type, const Config& config) { - if (config.device_type == std::string("cuda")) { - if (type == std::string("regression")) { - return new CUDARegressionL2loss(config); - } else if (type == std::string("regression_l1")) { - return new CUDARegressionL1loss(config); - } else if (type 
== std::string("quantile")) { - return new CUDARegressionQuantileloss(config); - } else if (type == std::string("huber")) { - return new CUDARegressionHuberLoss(config); - } else if (type == std::string("fair")) { - return new CUDARegressionFairLoss(config); - } else if (type == std::string("poisson")) { - return new CUDARegressionFairLoss(config); - } else if (type == std::string("binary")) { - return new CUDABinaryLogloss(config); - } else if (type == std::string("lambdarank")) { - return new CUDALambdarankNDCG(config); - } else if (type == std::string("rank_xendcg")) { - return new CUDARankXENDCG(config); - } else if (type == std::string("multiclass")) { - return new CUDAMulticlassSoftmax(config); - } else if (type == std::string("multiclassova")) { - return new CUDAMulticlassOVA(config); - } else if (type == std::string("cross_entropy")) { - return new CUDACrossEntropy(config); - } else if (type == std::string("cross_entropy_lambda")) { - return new CUDACrossEntropyLambda(config); - } else if (type == std::string("mape")) { - return new CUDARegressionMAPELOSS(config); - } else if (type == std::string("gamma")) { - return new CUDARegressionGammaLoss(config); - } else if (type == std::string("tweedie")) { - return new CUDARegressionTweedieLoss(config); - } else if (type == std::string("custom")) { - // TODO(shiyu1994): when using customized objective function - // TODO(shiyu1994): we should copy gradients manually to GPU - return nullptr; - } - } else { - if (type == std::string("regression")) { - return new RegressionL2loss(config); - } else if (type == std::string("regression_l1")) { - return new RegressionL1loss(config); - } else if (type == std::string("quantile")) { - return new RegressionQuantileloss(config); - } else if (type == std::string("huber")) { - return new RegressionHuberLoss(config); - } else if (type == std::string("fair")) { - return new RegressionFairLoss(config); - } else if (type == std::string("poisson")) { - return new RegressionPoissonLoss(config); - } else if (type == std::string("binary")) { - return new BinaryLogloss(config); - } else if (type == std::string("lambdarank")) { - return new LambdarankNDCG(config); - } else if (type == std::string("rank_xendcg")) { - return new RankXENDCG(config); - } else if (type == std::string("multiclass")) { - return new MulticlassSoftmax(config); - } else if (type == std::string("multiclassova")) { - return new MulticlassOVA(config); - } else if (type == std::string("cross_entropy")) { - return new CrossEntropy(config); - } else if (type == std::string("cross_entropy_lambda")) { - return new CrossEntropyLambda(config); - } else if (type == std::string("mape")) { - return new RegressionMAPELOSS(config); - } else if (type == std::string("gamma")) { - return new RegressionGammaLoss(config); - } else if (type == std::string("tweedie")) { - return new RegressionTweedieLoss(config); - } else if (type == std::string("custom")) { - return nullptr; - } + if (type == std::string("regression")) { + return new RegressionL2loss(config); + } else if (type == std::string("regression_l1")) { + return new RegressionL1loss(config); + } else if (type == std::string("quantile")) { + return new RegressionQuantileloss(config); + } else if (type == std::string("huber")) { + return new RegressionHuberLoss(config); + } else if (type == std::string("fair")) { + return new RegressionFairLoss(config); + } else if (type == std::string("poisson")) { + return new RegressionPoissonLoss(config); + } else if (type == std::string("binary")) { + return new 
BinaryLogloss(config); + } else if (type == std::string("lambdarank")) { + return new LambdarankNDCG(config); + } else if (type == std::string("rank_xendcg")) { + return new RankXENDCG(config); + } else if (type == std::string("multiclass")) { + return new MulticlassSoftmax(config); + } else if (type == std::string("multiclassova")) { + return new MulticlassOVA(config); + } else if (type == std::string("cross_entropy")) { + return new CrossEntropy(config); + } else if (type == std::string("cross_entropy_lambda")) { + return new CrossEntropyLambda(config); + } else if (type == std::string("mape")) { + return new RegressionMAPELOSS(config); + } else if (type == std::string("gamma")) { + return new RegressionGammaLoss(config); + } else if (type == std::string("tweedie")) { + return new RegressionTweedieLoss(config); + } else if (type == std::string("custom")) { + return nullptr; } Log::Fatal("Unknown objective type name: %s", type.c_str()); return nullptr; From 8fb8562a29c9512bbab6b355dbfc5b79d79adf95 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 13 Sep 2021 15:21:54 +0000 Subject: [PATCH 070/166] use shuffle based prefix sum --- include/LightGBM/cuda/cuda_algorithms.hpp | 349 +---- .../LightGBM/cuda/cuda_objective_function.hpp | 26 - src/application/cuda/cuda_predictor.hpp | 1 - src/boosting/cuda/cuda_score_updater.cpp | 19 - src/boosting/gbdt.cpp | 35 +- src/cuda/cuda_algorithms.cu | 1223 ----------------- src/main.cpp | 10 +- src/objective/multiclass_objective.hpp | 40 - .../cuda/cuda_best_split_finder.cu | 258 ++-- src/treelearner/cuda/cuda_data_partition.cpp | 2 +- src/treelearner/cuda/cuda_data_partition.cu | 215 ++- src/treelearner/cuda/cuda_data_partition.hpp | 3 +- .../cuda/cuda_histogram_constructor.cu | 29 +- .../cuda/cuda_histogram_constructor.hpp | 2 +- .../cuda/new_cuda_tree_learner.cpp | 25 - .../cuda/new_cuda_tree_learner.hpp | 8 - src/treelearner/serial_tree_learner.cpp | 8 - 17 files changed, 296 insertions(+), 1957 deletions(-) delete mode 100644 include/LightGBM/cuda/cuda_objective_function.hpp diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index ff28bbba201b..b006dcc10270 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -29,109 +29,6 @@ namespace LightGBM { -#define ReduceSumInner(values, n) \ - const unsigned int thread_index = threadIdx.x; \ - for (size_t s = 1; s < n; s <<= 1) { \ - if (thread_index % (s << 1) == 0 && (thread_index + s) < n) { \ - values[thread_index] += values[thread_index + s]; \ - } \ - __syncthreads(); \ - } - - -#define ReduceSumConflictFreeInner(values, n) \ - const unsigned int thread_index = threadIdx.x; \ - for (size_t s = 1; s < n; s <<= 1) { \ - if (thread_index % (s << 1) == 0 && (thread_index + s) < n) { \ - values[CONFLICT_FREE_INDEX(thread_index)] += values[CONFLICT_FREE_INDEX(thread_index + s)]; \ - } \ - __syncthreads(); \ - } \ - -template -__device__ void ReduceSum(T* values, size_t n) { - ReduceSumInner(values, n); -} - -template -__device__ void ReduceSumConflictFree(T* values, size_t n) { - ReduceSumConflictFreeInner(values, n); -} - -template -void ReduceSumGlobal(const VAL_T* values, size_t n, REDUCE_T* block_buffer); - -template -__global__ void BlockReduceSum(T* block_buffer, const data_size_t num_blocks); - -template -void ReduceDotProductGlobal(const VAL_A_T* a, const VAL_B_T* b, size_t n, REDUCE_T* block_buffer); - -template -void ReduceMaxGlobal(const VAL_T* values, size_t n, REDUCE_T* block_buffer); - -template 
-void ReduceMinGlobal(const VAL_T* values, size_t n, REDUCE_T* block_buffer); - -template -__device__ void ReduceMax(T* values, size_t n); - -template -void GlobalInclusivePrefixSum(T* values, T* block_buffer, size_t n); - -template -void GlobalGenAUCPosNegSum(const label_t* labels, - const label_t* weights, - const data_size_t* sorted_indices, - double* sum_pos_buffer, - double* block_sum_pos_buffer, - const data_size_t num_data); - -template -__global__ void GlobalInclusivePrefixSumReduceBlockKernel(T* block_buffer, data_size_t num_blocks); - -template -__global__ void GlobalInclusivePrefixSumAddBlockBaseKernel(const T* block_buffer, T* values, data_size_t num_data); - -__global__ void GlobalGenAUCMarkKernel(const double* scores, - const data_size_t* sorted_indices, - data_size_t* mark_buffer, - data_size_t* block_mark_buffer, - uint16_t* block_mark_first_zero, - data_size_t num_data); - -__global__ void GlobalInclusivePrefixSumReduceBlockZeroOutKernel( - data_size_t* block_buffer, - const uint16_t* block_mark_first_zero, - data_size_t num_blocks); - -__global__ void GlobalInclusivePrefixSumAddBlockBaseGenAUCMarkKernel( - const data_size_t* block_buffer, - data_size_t* values, - const uint16_t* block_first_zero, - data_size_t num_data); - -void GloblGenAUCMark(const double* scores, - const data_size_t* sorted_indices, - data_size_t* mark_buffer, - data_size_t* block_mark_buffer, - uint16_t* block_mark_first_zero, - const data_size_t num_data); - -template -void GlobalCalcAUC(const double* sum_pos_buffer, - const double* sum_neg_buffer, - const data_size_t* mark_buffer, - const data_size_t num_data, - double* block_buffer); - -template -void GlobalCalcAveragePrecision(const double* sum_pos_buffer, - const double* sum_neg_buffer, - const data_size_t* mark_buffer, - const data_size_t num_data, - double* block_buffer); - template __device__ void PrefixSum(T* values, size_t n) { unsigned int offset = 1; @@ -206,103 +103,35 @@ __device__ __forceinline__ void PrefixSumConflictFree(T* values, size_t n) { } } -template -void BitonicSortGlobal(VAL_T* values, const size_t len); - -template -void BitonicArgSortGlobal(const VAL_T* values, INDEX_T* indices, const size_t len); - -template -__global__ void BitonicArgSortGlobalKernel(const VAL_T* values, INDEX_T* indices, const int num_total_data); - -template -__global__ void BitonicArgCompareKernel(const VAL_T* values, INDEX_T* indices, const int half_segment_length, const int outer_segment_length, const int len); - -template -__global__ void BitonicArgSortMergeKernel(const VAL_T* values, INDEX_T* indices, const int segment_length, const int len); - -void BitonicArgSortItemsGlobal(const double* values, - const int num_queries, - const data_size_t* cuda_query_boundaries, - data_size_t* out_indices); - -__device__ __forceinline__ void BitonicArgSort_1024(const score_t* scores, uint16_t* indices, const uint16_t num_items) { - uint16_t depth = 1; - uint16_t num_items_aligend = 1; - uint16_t num_items_ref = num_items - 1; - while (num_items_ref > 0) { - num_items_ref >>= 1; - num_items_aligend <<= 1; - ++depth; - } - for (uint16_t outer_depth = depth - 1; outer_depth >= 1; --outer_depth) { - const uint16_t outer_segment_length = 1 << (depth - outer_depth); - const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; - const bool ascending = (outer_segment_index % 2 > 0); - for (uint16_t inner_depth = outer_depth; inner_depth < depth; ++inner_depth) { - const uint16_t segment_length = 1 << (depth - inner_depth); - const uint16_t 
half_segment_length = segment_length >> 1; - const uint16_t half_segment_index = threadIdx.x / half_segment_length; - if (threadIdx.x < num_items_aligend) { - if (half_segment_index % 2 == 0) { - const uint16_t index_to_compare = threadIdx.x + half_segment_length; - if ((scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) == ascending) { - const uint16_t index = indices[threadIdx.x]; - indices[threadIdx.x] = indices[index_to_compare]; - indices[index_to_compare] = index; - } - } - } - __syncthreads(); - } - } -} - -__device__ __forceinline__ void BitonicArgSort_2048(const score_t* scores, uint16_t* indices) { - for (uint16_t base = 0; base < 2048; base += 1024) { - for (uint16_t outer_depth = 10; outer_depth >= 1; --outer_depth) { - const uint16_t outer_segment_length = 1 << (11 - outer_depth); - const uint16_t outer_segment_index = threadIdx.x / outer_segment_length; - const bool ascending = (base == 0) ? (outer_segment_index % 2 > 0) : (outer_segment_index % 2 == 0); - for (uint16_t inner_depth = outer_depth; inner_depth < 11; ++inner_depth) { - const uint16_t segment_length = 1 << (11 - inner_depth); - const uint16_t half_segment_length = segment_length >> 1; - const uint16_t half_segment_index = threadIdx.x / half_segment_length; - if (half_segment_index % 2 == 0) { - const uint16_t index_to_compare = threadIdx.x + half_segment_length + base; - if ((scores[indices[threadIdx.x + base]] > scores[indices[index_to_compare]]) == ascending) { - const uint16_t index = indices[threadIdx.x + base]; - indices[threadIdx.x + base] = indices[index_to_compare]; - indices[index_to_compare] = index; - } - } - __syncthreads(); - } +template +__device__ __forceinline__ T ShufflePrefixSum(T value, T* shared_mem_buffer) { + const uint32_t mask = 0xffffffff; + const uint32_t warpLane = threadIdx.x % warpSize; + const uint32_t warpID = threadIdx.x / warpSize; + const uint32_t num_warp = blockDim.x / warpSize; + for (uint32_t offset = 1; offset < warpSize; offset <<= 1) { + const T other_value = __shfl_up_sync(mask, value, offset); + if (warpLane >= offset) { + value += other_value; } } - const unsigned int index_to_compare = threadIdx.x + 1024; - if (scores[indices[index_to_compare]] > scores[indices[threadIdx.x]]) { - const uint16_t temp_index = indices[index_to_compare]; - indices[index_to_compare] = indices[threadIdx.x]; - indices[threadIdx.x] = temp_index; + if (warpLane == warpSize - 1) { + shared_mem_buffer[warpID] = value; } __syncthreads(); - for (uint16_t base = 0; base < 2048; base += 1024) { - for (uint16_t inner_depth = 1; inner_depth < 11; ++inner_depth) { - const uint16_t segment_length = 1 << (11 - inner_depth); - const uint16_t half_segment_length = segment_length >> 1; - const uint16_t half_segment_index = threadIdx.x / half_segment_length; - if (half_segment_index % 2 == 0) { - const uint16_t index_to_compare = threadIdx.x + half_segment_length + base; - if (scores[indices[threadIdx.x + base]] < scores[indices[index_to_compare]]) { - const uint16_t index = indices[threadIdx.x + base]; - indices[threadIdx.x + base] = indices[index_to_compare]; - indices[index_to_compare] = index; - } + if (warpID == 0) { + T warp_sum = (warpLane < num_warp ? 
shared_mem_buffer[warpLane] : 0); + for (uint32_t offset = 1; offset < warpSize; offset <<= 1) { + const T other_warp_sum = __shfl_up_sync(mask, warp_sum, offset); + if (warpLane >= offset) { + warp_sum += other_warp_sum; } - __syncthreads(); } + shared_mem_buffer[warpLane] = warp_sum; } + __syncthreads(); + const T warp_base = warpID == 0 ? 0 : shared_mem_buffer[warpID - 1]; + return warp_base + value; } template @@ -368,140 +197,6 @@ __device__ __forceinline__ T ShuffleReduceMax(T value, T* shared_mem_buffer, con return value; } -template -__device__ __forceinline__ T ShuffleReduceMinWarp(T value, const data_size_t len) { - if (len > 0) { - const uint32_t mask = (0xffffffff >> (warpSize - len)); - for (int offset = warpSize / 2; offset > 0; offset >>= 1) { - const T other_value = __shfl_down_sync(mask, value, offset); - value = (other_value < value) ? other_value : value; - } - } - return value; -} - -// reduce values from an 1-dimensional block (block size must be no greather than 1024) -template -__device__ __forceinline__ T ShuffleReduceMin(T value, T* shared_mem_buffer, const size_t len) { - const uint32_t warpLane = threadIdx.x % warpSize; - const uint32_t warpID = threadIdx.x / warpSize; - const data_size_t warp_len = min(static_cast(warpSize), static_cast(len) - static_cast(warpID * warpSize)); - value = ShuffleReduceMinWarp(value, warp_len); - if (warpLane == 0) { - shared_mem_buffer[warpID] = value; - } - __syncthreads(); - const data_size_t num_warp = static_cast((len + warpSize - 1) / warpSize); - if (warpID == 0) { - value = (warpLane < num_warp ? shared_mem_buffer[warpLane] : shared_mem_buffer[0]); - value = ShuffleReduceMinWarp(value, num_warp); - } - return value; -} - -template -__device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, const int len); - -template -__device__ void PrefixSumDevice(const VAL_T* in_values, - const INDEX_T* sorted_indices, - REDUCE_VAL_T* out_values, - const INDEX_T num_data) { - __shared__ REDUCE_VAL_T shared_buffer[1025]; - const INDEX_T num_data_per_thread = (num_data + static_cast(blockDim.x) - 1) / static_cast(blockDim.x); - const INDEX_T start = num_data_per_thread * static_cast(threadIdx.x); - const INDEX_T end = min(start + num_data_per_thread, num_data); - REDUCE_VAL_T thread_sum = 0; - for (INDEX_T index = start; index < end; ++index) { - thread_sum += static_cast(in_values[sorted_indices[index]]); - } - shared_buffer[threadIdx.x] = thread_sum; - __syncthreads(); - PrefixSum(shared_buffer, blockDim.x); - const REDUCE_VAL_T thread_base = shared_buffer[threadIdx.x]; - for (INDEX_T index = start; index < end; ++index) { - out_values[index] = thread_base + static_cast(in_values[sorted_indices[index]]); - } - __syncthreads(); -} - -template -__device__ VAL_T PercentileDevice(const VAL_T* values, - const WEIGHT_T* weights, - INDEX_T* indices, - REDUCE_WEIGHT_T* weights_prefix_sum, - const double alpha, - const INDEX_T len); - -template -__global__ void PercentileGlobalKernel(const VAL_T* values, - const WEIGHT_T* weights, - const INDEX_T* sorted_indices, - const WEIGHT_REDUCE_T* weights_prefix_sum, - const double alpha, - const INDEX_T len, - VAL_T* out_value) { - if (!USE_WEIGHT) { - const double float_pos = (1.0f - alpha) * len; - const INDEX_T pos = static_cast(float_pos); - if (pos < 1) { - *out_value = values[sorted_indices[0]]; - } else if (pos >= len) { - *out_value = values[sorted_indices[len - 1]]; - } else { - const double bias = float_pos - static_cast(pos); - const VAL_T v1 = values[sorted_indices[pos - 1]]; - 
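// --- Usage sketch (not part of the patch) ---
// Illustrates how a kernel could call the new shuffle-based ShufflePrefixSum added above
// to turn per-thread values into block-local inclusive prefix sums. The kernel below is a
// hypothetical example, not code from this diff; only ShufflePrefixSum itself comes from it.
// Assumptions: blockDim.x is a multiple of warpSize and at most 1024, and every thread of
// the block reaches the call, since the warp shuffles need full participation.
__global__ void ExampleBlockPrefixSumKernel(const int* in_values, int* out_prefix_sums, const int len) {
  __shared__ int shared_mem_buffer[32];  // 32 slots: warp 0 scans one partial sum per warp
  const int index = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
  const int value = (index < len) ? in_values[index] : 0;  // out-of-range threads contribute 0
  const int inclusive_sum = ShufflePrefixSum<int>(value, shared_mem_buffer);
  if (index < len) {
    out_prefix_sums[index] = inclusive_sum;  // prefix sum within this thread block only
  }
}
// Example launch: ExampleBlockPrefixSumKernel<<<num_blocks, 1024>>>(in_values, out_prefix_sums, len);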
const VAL_T v2 = values[sorted_indices[pos]]; - *out_value = static_cast(v1 - (v1 - v2) * bias); - } - } else { - const WEIGHT_REDUCE_T threshold = weights_prefix_sum[len - 1] * (1.0f - alpha); - __shared__ INDEX_T pos; - if (threadIdx.x == 0) { - pos = len; - } - __syncthreads(); - for (INDEX_T index = static_cast(threadIdx.x); index < len; index += static_cast(blockDim.x)) { - if (weights_prefix_sum[index] > threshold && (index == 0 || weights_prefix_sum[index - 1] <= threshold)) { - pos = index; - } - } - __syncthreads(); - pos = min(pos, len - 1); - if (pos == 0 || pos == len - 1) { - *out_value = values[pos]; - } - const VAL_T v1 = values[sorted_indices[pos - 1]]; - const VAL_T v2 = values[sorted_indices[pos]]; - *out_value = static_cast(v1 - (v1 - v2) * (threshold - weights_prefix_sum[pos - 1]) / (weights_prefix_sum[pos] - weights_prefix_sum[pos - 1])); - } -} - -template -void GlobalInclusiveArgPrefixSum(const INDEX_T* sorted_indices, const VAL_T* in_values, REDUCE_T* out_values, REDUCE_T* block_buffer, size_t n); - -template -void PercentileGlobal(const VAL_T* values, - const WEIGHT_T* weights, - INDEX_T* indices, - WEIGHT_REDUCE_T* weights_prefix_sum, - WEIGHT_REDUCE_T* weights_prefix_sum_buffer, - const double alpha, - const INDEX_T len, - VAL_T* cuda_out_value) { - if (len <= 1) { - CopyFromCUDADeviceToCUDADeviceOuter(cuda_out_value, values, 1, __FILE__, __LINE__); - } - BitonicArgSortGlobal(values, indices, len); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - if (USE_WEIGHT) { - GlobalInclusiveArgPrefixSum(indices, weights, weights_prefix_sum, weights_prefix_sum_buffer, static_cast(len)); - } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - PercentileGlobalKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(values, weights, indices, weights_prefix_sum, alpha, len, cuda_out_value); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - } // namespace LightGBM #endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp deleted file mode 100644 index 44af57132105..000000000000 --- a/include/LightGBM/cuda/cuda_objective_function.hpp +++ /dev/null @@ -1,26 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for - * license information. 
- */ - -#ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_OBJECTIVE_HPP_ -#define LIGHTGBM_OBJECTIVE_CUDA_CUDA_OBJECTIVE_HPP_ - -#ifdef USE_CUDA - -#include -#include -#include - -namespace LightGBM { - -class CUDAObjectiveInterface { - public: - virtual void ConvertOutputCUDA(const data_size_t /*num_data*/, const double* /*input*/, double* /*output*/) const {} -}; - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_OBJECTIVE_HPP_ diff --git a/src/application/cuda/cuda_predictor.hpp b/src/application/cuda/cuda_predictor.hpp index 3465952cdb8b..f795dfa7dfb8 100644 --- a/src/application/cuda/cuda_predictor.hpp +++ b/src/application/cuda/cuda_predictor.hpp @@ -5,7 +5,6 @@ #ifndef LIGHTGBM_APPLICATION_CUDA_CUDA_PREDICTOR_HPP_ #define LIGHTGBM_APPLICATION_CUDA_CUDA_PREDICTOR_HPP_ -#include #include #include #include diff --git a/src/boosting/cuda/cuda_score_updater.cpp b/src/boosting/cuda/cuda_score_updater.cpp index 10026955bbe2..336842ea5fa8 100644 --- a/src/boosting/cuda/cuda_score_updater.cpp +++ b/src/boosting/cuda/cuda_score_updater.cpp @@ -50,25 +50,6 @@ inline void CUDAScoreUpdater::AddScore(const TreeLearner* tree_learner, const Tr Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); const size_t offset = static_cast(num_data_) * cur_tree_id; tree_learner->AddPredictionToScore(tree, cuda_score_ + offset); - std::vector class_train_score(num_data_, 0.0f); - CopyFromCUDADeviceToHostOuter(class_train_score.data(), cuda_score_ + offset, num_data_, __FILE__, __LINE__); - const int num_threads = OMP_NUM_THREADS(); - std::vector thread_max_abs_train_score(num_threads, 0.0f); - Threading::For(0, num_data_, 512, - [&thread_max_abs_train_score, &class_train_score] (int thread_index, data_size_t start, data_size_t end) { - for (data_size_t data_index = start; data_index < end; ++data_index) { - if (std::fabs(class_train_score[data_index]) > std::fabs(thread_max_abs_train_score[thread_index])) { - thread_max_abs_train_score[thread_index] = class_train_score[data_index]; - } - } - }); - double max_abs_train_score = 0.0f; - for (int thread_index = 0; thread_index < num_threads; ++thread_index) { - if (std::fabs(thread_max_abs_train_score[thread_index]) > std::fabs(max_abs_train_score)) { - max_abs_train_score = thread_max_abs_train_score[thread_index]; - } - } - Log::Warning("class %d max_abs_train_score = %f", cur_tree_id, max_abs_train_score); } inline void CUDAScoreUpdater::AddScore(const Tree* tree, const data_size_t* data_indices, diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index c0695b74bcde..4e3dec289bac 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -113,23 +113,18 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective num_data_ = train_data_->num_data(); // create buffer for gradients and Hessians + size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + if (config_->device_type == std::string("cuda")) { + AllocateCUDAMemoryOuter(&gradients_pointer_, total_size, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&hessians_pointer_, total_size, __FILE__, __LINE__); + } if (objective_function_ != nullptr) { - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - if (config_->device_type == std::string("cuda")) { - AllocateCUDAMemoryOuter(&gradients_pointer_, total_size, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&hessians_pointer_, total_size, __FILE__, __LINE__); - } else { - gradients_.resize(total_size); - hessians_.resize(total_size); + 
gradients_.resize(total_size); + hessians_.resize(total_size); + if (config_->device_type == std::string("cpu")) { gradients_pointer_ = gradients_.data(); hessians_pointer_ = hessians_.data(); } - } else { - if (config_->device_type == std::string("cuda")) { - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - AllocateCUDAMemoryOuter(&gradients_pointer_, total_size, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&hessians_pointer_, total_size, __FILE__, __LINE__); - } } // get max feature index max_feature_idx_ = train_data_->num_total_features() - 1; @@ -196,7 +191,7 @@ void GBDT::Boosting() { // objective function will calculate gradients and hessians int64_t num_score = 0; objective_function_-> - GetGradients(GetTrainingScore(&num_score), gradients_pointer_, hessians_pointer_); + GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data()); } data_size_t GBDT::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) { @@ -399,10 +394,14 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } if (config_->device_type == std::string("cuda")) { const size_t total_size = static_cast(num_data_ * num_class_); - CopyFromHostToCUDADeviceOuter(gradients_pointer_, gradients, total_size, __FILE__, __LINE__); - CopyFromHostToCUDADeviceOuter(hessians_pointer_, hessians, total_size, __FILE__, __LINE__); - } - if (gradients == nullptr || hessians == nullptr) { + const score_t* host_gradients = gradients == nullptr ? gradients_.data() : gradients; + const score_t* host_hessians = hessians == nullptr ? hessians_.data() : hessians; + global_timer.Start("Copy gradients from Host to CUDA"); + CopyFromHostToCUDADeviceOuter(gradients_pointer_, host_gradients, total_size, __FILE__, __LINE__); + CopyFromHostToCUDADeviceOuter(hessians_pointer_, host_hessians, total_size, __FILE__, __LINE__); + global_timer.Stop("Copy gradients from Host to CUDA"); + } + if (gradients == nullptr || hessians == nullptr || config_->device_type == std::string("cuda")) { gradients = gradients_pointer_; hessians = hessians_pointer_; } diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 894606f3b6ae..cbb7e89cbd20 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -7,956 +7,6 @@ namespace LightGBM { -template -__global__ void BitonicSortGlobalKernel(T* values, const int num_total_data) { - const int thread_index = static_cast(threadIdx.x); - const int low = static_cast(blockIdx.x * BITONIC_SORT_NUM_ELEMENTS); - const bool outer_ascending = ASCENDING ? (blockIdx.x % 2 == 0) : (blockIdx.x % 2 == 1); - T* values_pointer = values + low; - const int num_data = min(BITONIC_SORT_NUM_ELEMENTS, num_total_data - low); - __shared__ T shared_values[BITONIC_SORT_NUM_ELEMENTS]; - if (thread_index < num_data) { - shared_values[thread_index] = values_pointer[thread_index]; - } - __syncthreads(); - for (int depth = BITONIC_SORT_DEPTH - 1; depth >= 1; --depth) { - const int segment_length = 1 << (BITONIC_SORT_DEPTH - depth); - const int segment_index = thread_index / segment_length; - const bool ascending = outer_ascending ? 
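// --- Illustration sketch (not part of the patch) ---
// Condenses the gradient flow that the gbdt.cpp hunks above set up: gradients and hessians
// are produced on the host (by the objective function in Boosting(), or supplied by the
// caller of TrainOneIter), and when device_type is "cuda" they are copied once per boosting
// iteration into the device buffers gradients_pointer_ / hessians_pointer_ that the CUDA
// tree learner reads. The helper below is a hypothetical condensation for illustration; it
// is not a member added by this patch.
void GBDT::CopyGradientsToDeviceIfNeeded(const score_t* host_gradients, const score_t* host_hessians) {
  if (config_->device_type != std::string("cuda")) {
    return;  // the CPU tree learner reads the host buffers gradients_ / hessians_ directly
  }
  const size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
  // same host-to-device copy helper used by TrainOneIter in the hunk above
  CopyFromHostToCUDADeviceOuter(gradients_pointer_, host_gradients, total_size, __FILE__, __LINE__);
  CopyFromHostToCUDADeviceOuter(hessians_pointer_, host_hessians, total_size, __FILE__, __LINE__);
}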
(segment_index % 2 == 0) : (segment_index % 2 == 1); - const int num_total_segment = (num_data + segment_length - 1) / segment_length; - { - const int inner_depth = depth; - const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); - const int inner_segment_index_half = thread_index / inner_segment_length_half; - const int offset = ((inner_segment_index_half >> 1) == num_total_segment - 1 && ascending == outer_ascending) ? - (num_total_segment * segment_length - num_data) : 0; - const int segment_start = segment_index * segment_length; - if (inner_segment_index_half % 2 == 0) { - if (thread_index >= offset + segment_start) { - const int index_to_compare = thread_index + inner_segment_length_half - offset; - if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { - const T tmp = shared_values[thread_index]; - shared_values[thread_index] = shared_values[index_to_compare]; - shared_values[index_to_compare] = tmp; - } - } - } - __syncthreads(); - } - for (int inner_depth = depth + 1; inner_depth < BITONIC_SORT_DEPTH; ++inner_depth) { - const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); - const int inner_segment_index_half = thread_index / inner_segment_length_half; - if (inner_segment_index_half % 2 == 0) { - const int index_to_compare = thread_index + inner_segment_length_half; - if (index_to_compare < num_data && (shared_values[thread_index] > shared_values[index_to_compare]) == ascending) { - const T tmp = shared_values[thread_index]; - shared_values[thread_index] = shared_values[index_to_compare]; - shared_values[index_to_compare] = tmp; - } - } - __syncthreads(); - } - } - if (thread_index < num_data) { - values_pointer[thread_index] = shared_values[thread_index]; - } -} - -template -__global__ void BitonicSortMergeKernel(VAL_T* values, const int segment_length, const int len) { - const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const int segment_index = thread_index / segment_length; - const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); - __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS]; - const int offset = static_cast(blockIdx.x * blockDim.x); - const int local_len = min(BITONIC_SORT_NUM_ELEMENTS, len - offset); - if (thread_index < len) { - shared_values[threadIdx.x] = values[thread_index]; - } - __syncthreads(); - int half_segment_length = BITONIC_SORT_NUM_ELEMENTS / 2; - while (half_segment_length >= 1) { - const int half_segment_index = static_cast(threadIdx.x) / half_segment_length; - if (half_segment_index % 2 == 0) { - const int index_to_compare = static_cast(threadIdx.x) + half_segment_length; - if (index_to_compare < local_len && ((shared_values[threadIdx.x] > shared_values[index_to_compare]) == ascending)) { - const VAL_T tmp = shared_values[index_to_compare]; - shared_values[index_to_compare] = shared_values[threadIdx.x]; - shared_values[threadIdx.x] = tmp; - } - } - __syncthreads(); - half_segment_length >>= 1; - } - if (thread_index < len) { - values[thread_index] = shared_values[threadIdx.x]; - } -} - -template -__global__ void BitonicCompareKernel(VAL_T* values, const int half_segment_length, const int outer_segment_length, const int len) { - const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const int segment_index = thread_index / outer_segment_length; - const int half_segment_index = thread_index / half_segment_length; - const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); - if (half_segment_index % 2 == 0) { - const int num_total_segment = (len + outer_segment_length - 1) / outer_segment_length; - if (BEGIN && (half_segment_index >> 1) == num_total_segment - 1 && ascending == ASCENDING) { - const int offset = num_total_segment * outer_segment_length - len; - const int segment_start = segment_index * outer_segment_length; - if (thread_index >= offset + segment_start) { - const int index_to_compare = thread_index + half_segment_length - offset; - if (index_to_compare < len && (values[thread_index] > values[index_to_compare]) == ascending) { - const VAL_T tmp = values[index_to_compare]; - values[index_to_compare] = values[thread_index]; - values[thread_index] = tmp; - } - } - } else { - const int index_to_compare = thread_index + half_segment_length; - if (index_to_compare < len) { - if ((values[thread_index] > values[index_to_compare]) == ascending) { - const VAL_T tmp = values[index_to_compare]; - values[index_to_compare] = values[thread_index]; - values[thread_index] = tmp; - } - } - } - } -} - -template -void BitonicSortGlobalHelper(VAL_T* values, const size_t len) { - int max_depth = 1; - int len_to_shift = static_cast(len) - 1; - while (len_to_shift > 0) { - ++max_depth; - len_to_shift >>= 1; - } - const int num_blocks = (static_cast(len) + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; - BitonicSortGlobalKernel<<>>(values, static_cast(len)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - for (int depth = max_depth - 11; depth >= 1; --depth) { - const int segment_length = (1 << (max_depth - depth)); - int half_segment_length = (segment_length >> 1); - { - BitonicCompareKernel<<>>(values, half_segment_length, segment_length, static_cast(len)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - half_segment_length >>= 1; - } - for (int inner_depth = depth + 1; inner_depth <= max_depth - 11; ++inner_depth) { - BitonicCompareKernel<<>>(values, half_segment_length, segment_length, static_cast(len)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - 
half_segment_length >>= 1; - } - BitonicSortMergeKernel<<>>(values, segment_length, static_cast(len)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - } -} - -template <> -void BitonicSortGlobal(int* values, const size_t len) { - BitonicSortGlobalHelper(values, len); -} - -template <> -void BitonicSortGlobal(double* values, const size_t len) { - BitonicSortGlobalHelper(values, len); -} - -template <> -void BitonicSortGlobal(double* values, const size_t len) { - BitonicSortGlobalHelper(values, len); -} - -template -__global__ void BitonicArgSortGlobalKernel(const VAL_T* values, INDEX_T* indices, const int num_total_data) { - const int thread_index = static_cast(threadIdx.x); - const int low = static_cast(blockIdx.x * BITONIC_SORT_NUM_ELEMENTS); - const bool outer_ascending = ASCENDING ? (blockIdx.x % 2 == 0) : (blockIdx.x % 2 == 1); - const VAL_T* values_pointer = values + low; - INDEX_T* indices_pointer = indices + low; - const int num_data = min(BITONIC_SORT_NUM_ELEMENTS, num_total_data - low); - __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS]; - __shared__ INDEX_T shared_indices[BITONIC_SORT_NUM_ELEMENTS]; - if (thread_index < num_data) { - shared_values[thread_index] = values_pointer[thread_index]; - shared_indices[thread_index] = static_cast(thread_index + blockIdx.x * blockDim.x); - } - __syncthreads(); - for (int depth = BITONIC_SORT_DEPTH - 1; depth >= 1; --depth) { - const int segment_length = 1 << (BITONIC_SORT_DEPTH - depth); - const int segment_index = thread_index / segment_length; - const bool ascending = outer_ascending ? (segment_index % 2 == 0) : (segment_index % 2 == 1); - const int num_total_segment = (num_data + segment_length - 1) / segment_length; - { - const int inner_depth = depth; - const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); - const int inner_segment_index_half = thread_index / inner_segment_length_half; - const int offset = ((inner_segment_index_half >> 1) == num_total_segment - 1 && ascending == outer_ascending) ? 
- (num_total_segment * segment_length - num_data) : 0; - const int segment_start = segment_index * segment_length; - if (inner_segment_index_half % 2 == 0) { - if (thread_index >= offset + segment_start) { - const int index_to_compare = thread_index + inner_segment_length_half - offset; - const INDEX_T this_index = shared_indices[thread_index]; - const INDEX_T other_index = shared_indices[index_to_compare]; - const VAL_T this_value = shared_values[thread_index]; - const VAL_T other_value = shared_values[index_to_compare]; - if (index_to_compare < num_data && (this_value > other_value) == ascending) { - shared_indices[thread_index] = other_index; - shared_indices[index_to_compare] = this_index; - shared_values[thread_index] = other_value; - shared_values[index_to_compare] = this_value; - } - } - } - __syncthreads(); - } - for (int inner_depth = depth + 1; inner_depth < BITONIC_SORT_DEPTH; ++inner_depth) { - const int inner_segment_length_half = 1 << (BITONIC_SORT_DEPTH - 1 - inner_depth); - const int inner_segment_index_half = thread_index / inner_segment_length_half; - if (inner_segment_index_half % 2 == 0) { - const int index_to_compare = thread_index + inner_segment_length_half; - const INDEX_T this_index = shared_indices[thread_index]; - const INDEX_T other_index = shared_indices[index_to_compare]; - const VAL_T this_value = shared_values[thread_index]; - const VAL_T other_value = shared_values[index_to_compare]; - if (index_to_compare < num_data && (this_value > other_value) == ascending) { - shared_indices[thread_index] = other_index; - shared_indices[index_to_compare] = this_index; - shared_values[thread_index] = other_value; - shared_values[index_to_compare] = this_value; - } - } - __syncthreads(); - } - } - if (thread_index < num_data) { - indices_pointer[thread_index] = shared_indices[thread_index]; - } -} - -template -__global__ void BitonicArgSortMergeKernel(const VAL_T* values, INDEX_T* indices, const int segment_length, const int len) { - const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const int segment_index = thread_index / segment_length; - const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); - __shared__ VAL_T shared_values[BITONIC_SORT_NUM_ELEMENTS]; - __shared__ INDEX_T shared_indices[BITONIC_SORT_NUM_ELEMENTS]; - const int offset = static_cast(blockIdx.x * blockDim.x); - const int local_len = min(BITONIC_SORT_NUM_ELEMENTS, len - offset); - if (thread_index < len) { - const INDEX_T index = indices[thread_index]; - shared_values[threadIdx.x] = values[index]; - shared_indices[threadIdx.x] = index; - } - __syncthreads(); - int half_segment_length = BITONIC_SORT_NUM_ELEMENTS / 2; - while (half_segment_length >= 1) { - const int half_segment_index = static_cast(threadIdx.x) / half_segment_length; - if (half_segment_index % 2 == 0) { - const int index_to_compare = static_cast(threadIdx.x) + half_segment_length; - const INDEX_T this_index = shared_indices[threadIdx.x]; - const INDEX_T other_index = shared_indices[index_to_compare]; - const VAL_T this_value = shared_values[threadIdx.x]; - const VAL_T other_value = shared_values[index_to_compare]; - if (index_to_compare < local_len && ((this_value > other_value) == ascending)) { - shared_indices[threadIdx.x] = other_index; - shared_indices[index_to_compare] = this_index; - shared_values[threadIdx.x] = other_value; - shared_values[index_to_compare] = this_value; - } - } - __syncthreads(); - half_segment_length >>= 1; - } - if (thread_index < len) { - indices[thread_index] = shared_indices[threadIdx.x]; - } -} - -template -__global__ void BitonicArgCompareKernel(const VAL_T* values, INDEX_T* indices, const int half_segment_length, const int outer_segment_length, const int len) { - const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const int segment_index = thread_index / outer_segment_length; - const int half_segment_index = thread_index / half_segment_length; - const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); - if (half_segment_index % 2 == 0) { - const int num_total_segment = (len + outer_segment_length - 1) / outer_segment_length; - if (BEGIN && (half_segment_index >> 1) == num_total_segment - 1 && ascending == ASCENDING) { - const int offset = num_total_segment * outer_segment_length - len; - const int segment_start = segment_index * outer_segment_length; - if (thread_index >= offset + segment_start) { - const int index_to_compare = thread_index + half_segment_length - offset; - if (index_to_compare < len) { - const INDEX_T this_index = indices[thread_index]; - const INDEX_T other_index = indices[index_to_compare]; - if ((values[this_index] > values[other_index]) == ascending) { - indices[thread_index] = other_index; - indices[index_to_compare] = this_index; - } - } - } - } else { - const int index_to_compare = thread_index + half_segment_length; - if (index_to_compare < len) { - const INDEX_T this_index = indices[thread_index]; - const INDEX_T other_index = indices[index_to_compare]; - if ((values[this_index] > values[other_index]) == ascending) { - indices[thread_index] = other_index; - indices[index_to_compare] = this_index; - } - } - } - } -} - -template -void BitonicArgSortGlobalHelper(const VAL_T* values, INDEX_T* indices, const size_t len) { - int max_depth = 1; - int len_to_shift = static_cast(len) - 1; - while (len_to_shift > 0) { - ++max_depth; - len_to_shift >>= 1; - } - const int num_blocks = (static_cast(len) + BITONIC_SORT_NUM_ELEMENTS - 1) / BITONIC_SORT_NUM_ELEMENTS; - BitonicArgSortGlobalKernel<<>>(values, indices, static_cast(len)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - for (int depth = max_depth - 11; depth >= 1; --depth) { - const int segment_length = (1 << (max_depth - depth)); - int half_segment_length = (segment_length >> 1); - { - BitonicArgCompareKernel<<>>( - values, indices, half_segment_length, segment_length, static_cast(len)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - half_segment_length >>= 1; - } - for (int inner_depth = depth + 1; inner_depth <= max_depth - 11; ++inner_depth) { - BitonicArgCompareKernel<<>>( - values, indices, half_segment_length, segment_length, static_cast(len)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - half_segment_length >>= 1; - } - BitonicArgSortMergeKernel<<>>( - values, indices, segment_length, static_cast(len)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - } -} - -template <> -void BitonicArgSortGlobal(const double* values, data_size_t* indices, const size_t len) { - BitonicArgSortGlobalHelper(values, indices, len); -} - -template <> -void BitonicArgSortGlobal(const double* values, data_size_t* indices, const size_t len) { - BitonicArgSortGlobalHelper(values, indices, len); -} - -template <> -void BitonicArgSortGlobal(const label_t* values, data_size_t* indices, const size_t len) { - BitonicArgSortGlobalHelper(values, indices, len); -} - - -template -__device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, const int len) { - __shared__ VAL_T shared_values[BLOCK_DIM]; - __shared__ INDEX_T shared_indices[BLOCK_DIM]; - int len_to_shift = len - 1; - int max_depth = 1; - while (len_to_shift > 0) { - len_to_shift >>= 1; - ++max_depth; - } - const int num_blocks = (len + static_cast(BLOCK_DIM) - 1) / static_cast(BLOCK_DIM); - for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * static_cast(BLOCK_DIM) + static_cast(threadIdx.x); - if (this_index < len) { - 
shared_values[threadIdx.x] = values[this_index]; - shared_indices[threadIdx.x] = this_index; - } else { - shared_indices[threadIdx.x] = len; - } - __syncthreads(); - for (int depth = max_depth - 1; depth > max_depth - static_cast(MAX_DEPTH); --depth) { - const int segment_length = (1 << (max_depth - depth)); - const int segment_index = this_index / segment_length; - const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); - { - const int half_segment_length = (segment_length >> 1); - const int half_segment_index = this_index / half_segment_length; - const int num_total_segment = (len + segment_length - 1) / segment_length; - const int offset = (segment_index == num_total_segment - 1 && ascending == ASCENDING) ? - (num_total_segment * segment_length - len) : 0; - if (half_segment_index % 2 == 0) { - const int segment_start = segment_index * segment_length; - if (this_index >= offset + segment_start) { - const int other_index = static_cast(threadIdx.x) + half_segment_length - offset; - const INDEX_T this_data_index = shared_indices[threadIdx.x]; - const INDEX_T other_data_index = shared_indices[other_index]; - const VAL_T this_value = shared_values[threadIdx.x]; - const VAL_T other_value = shared_values[other_index]; - if (other_data_index < len && (this_value > other_value) == ascending) { - shared_indices[threadIdx.x] = other_data_index; - shared_indices[other_index] = this_data_index; - shared_values[threadIdx.x] = other_value; - shared_values[other_index] = this_value; - } - } - } - __syncthreads(); - } - for (int inner_depth = depth + 1; inner_depth < max_depth; ++inner_depth) { - const int half_segment_length = (1 << (max_depth - inner_depth - 1)); - const int half_segment_index = this_index / half_segment_length; - if (half_segment_index % 2 == 0) { - const int other_index = static_cast(threadIdx.x) + half_segment_length; - const INDEX_T this_data_index = shared_indices[threadIdx.x]; - const INDEX_T other_data_index = shared_indices[other_index]; - const VAL_T this_value = shared_values[threadIdx.x]; - const VAL_T other_value = shared_values[other_index]; - if (other_data_index < len && (this_value > other_value) == ascending) { - shared_indices[threadIdx.x] = other_data_index; - shared_indices[other_index] = this_data_index; - shared_values[threadIdx.x] = other_value; - shared_values[other_index] = this_value; - } - } - __syncthreads(); - } - } - if (this_index < len) { - indices[this_index] = shared_indices[threadIdx.x]; - } - __syncthreads(); - } - for (int depth = max_depth - static_cast(MAX_DEPTH); depth >= 1; --depth) { - const int segment_length = (1 << (max_depth - depth)); - { - const int num_total_segment = (len + segment_length - 1) / segment_length; - const int half_segment_length = (segment_length >> 1); - for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * static_cast(BLOCK_DIM) + static_cast(threadIdx.x); - const int segment_index = this_index / segment_length; - const int half_segment_index = this_index / half_segment_length; - const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); - const int offset = (segment_index == num_total_segment - 1 && ascending == ASCENDING) ? 
- (num_total_segment * segment_length - len) : 0; - if (half_segment_index % 2 == 0) { - const int segment_start = segment_index * segment_length; - if (this_index >= offset + segment_start) { - const int other_index = this_index + half_segment_length - offset; - if (other_index < len) { - const INDEX_T this_data_index = indices[this_index]; - const INDEX_T other_data_index = indices[other_index]; - const VAL_T this_value = values[this_data_index]; - const VAL_T other_value = values[other_data_index]; - if ((this_value > other_value) == ascending) { - indices[this_index] = other_data_index; - indices[other_index] = this_data_index; - } - } - } - } - } - __syncthreads(); - } - for (int inner_depth = depth + 1; inner_depth <= max_depth - static_cast(MAX_DEPTH); ++inner_depth) { - const int half_segment_length = (1 << (max_depth - inner_depth - 1)); - for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * static_cast(BLOCK_DIM) + static_cast(threadIdx.x); - const int segment_index = this_index / segment_length; - const int half_segment_index = this_index / half_segment_length; - const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); - if (half_segment_index % 2 == 0) { - const int other_index = this_index + half_segment_length; - if (other_index < len) { - const INDEX_T this_data_index = indices[this_index]; - const INDEX_T other_data_index = indices[other_index]; - const VAL_T this_value = values[this_data_index]; - const VAL_T other_value = values[other_data_index]; - if ((this_value > other_value) == ascending) { - indices[this_index] = other_data_index; - indices[other_index] = this_data_index; - } - } - } - __syncthreads(); - } - } - for (int block_index = 0; block_index < num_blocks; ++block_index) { - const int this_index = block_index * static_cast(BLOCK_DIM) + static_cast(threadIdx.x); - const int segment_index = this_index / segment_length; - const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); - if (this_index < len) { - const INDEX_T index = indices[this_index]; - shared_values[threadIdx.x] = values[index]; - shared_indices[threadIdx.x] = index; - } else { - shared_indices[threadIdx.x] = len; - } - __syncthreads(); - for (int inner_depth = max_depth - static_cast(MAX_DEPTH) + 1; inner_depth < max_depth; ++inner_depth) { - const int half_segment_length = (1 << (max_depth - inner_depth - 1)); - const int half_segment_index = this_index / half_segment_length; - if (half_segment_index % 2 == 0) { - const int other_index = static_cast(threadIdx.x) + half_segment_length; - const INDEX_T this_data_index = shared_indices[threadIdx.x]; - const INDEX_T other_data_index = shared_indices[other_index]; - const VAL_T this_value = shared_values[threadIdx.x]; - const VAL_T other_value = shared_values[other_index]; - if (other_data_index < len && (this_value > other_value) == ascending) { - shared_indices[threadIdx.x] = other_data_index; - shared_indices[other_index] = this_data_index; - shared_values[threadIdx.x] = other_value; - shared_values[other_index] = this_value; - } - } - __syncthreads(); - } - if (this_index < len) { - indices[this_index] = shared_indices[threadIdx.x]; - } - __syncthreads(); - } - } -} - -__global__ void BitonicArgSortItemsGlobalKernel(const double* scores, - const int num_queries, - const data_size_t* cuda_query_boundaries, - data_size_t* out_indices) { - const int query_index_start = static_cast(blockIdx.x) * BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE; - const int query_index_end = min(query_index_start + BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE, num_queries); - for (int query_index = query_index_start; query_index < query_index_end; ++query_index) { - const data_size_t query_item_start = cuda_query_boundaries[query_index]; - const data_size_t query_item_end = cuda_query_boundaries[query_index + 1]; - const data_size_t num_items_in_query = query_item_end - query_item_start; - BitonicArgSortDevice(scores + query_item_start, - out_indices + query_item_start, - num_items_in_query); - __syncthreads(); - } -} - -void BitonicArgSortItemsGlobal( - const double* scores, - const int num_queries, - const data_size_t* cuda_query_boundaries, - data_size_t* out_indices) { - const int num_blocks = (num_queries + BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE - 1) / BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE; - BitonicArgSortItemsGlobalKernel<<>>( - scores, num_queries, cuda_query_boundaries, out_indices); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -__device__ void PrefixSumZeroOut(data_size_t* values, size_t n) { - unsigned int offset = 1; - unsigned int threadIdx_x = static_cast(threadIdx.x); - const data_size_t last_element = values[n - 1]; - __syncthreads(); - for (int d = (n >> 1); d > 0; d >>= 1) { - if (threadIdx_x < d) { - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - if (values[dst_pos] != 0) { - values[dst_pos] += values[src_pos]; - } - } - offset <<= 1; - __syncthreads(); - } - if (threadIdx_x == 0) { - values[n - 1] = 0; - } - __syncthreads(); - for (int d = 1; d < n; d <<= 1) { - offset >>= 1; - if (threadIdx_x < d) { - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const data_size_t src_val = values[src_pos]; - values[src_pos] = values[dst_pos]; - if (src_val != 0) { - values[dst_pos] += src_val; - } else { - values[dst_pos] = 0; - } - } - __syncthreads(); - } - if 
(threadIdx.x == 0) { - if (last_element != 0) { - values[n] = values[n - 1] + last_element; - } else { - values[n] = 0; - } - } - __syncthreads(); -} - -__device__ void PrefixSumZeroOut(data_size_t* values, bool* is_all_non_zero, size_t n) { - unsigned int offset = 1; - unsigned int threadIdx_x = static_cast(threadIdx.x); - const data_size_t last_element = values[n - 1]; - const bool last_is_all_non_zero = is_all_non_zero[n - 1]; - __syncthreads(); - for (int d = (n >> 1); d > 0; d >>= 1) { - if (threadIdx_x < d) { - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - if (is_all_non_zero[dst_pos]) { - values[dst_pos] += values[src_pos]; - is_all_non_zero[dst_pos] &= is_all_non_zero[src_pos]; - } - } - offset <<= 1; - __syncthreads(); - } - if (threadIdx_x == 0) { - values[n - 1] = 0; - is_all_non_zero[n - 1] = true; - } - __syncthreads(); - for (int d = 1; d < n; d <<= 1) { - offset >>= 1; - if (threadIdx_x < d) { - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const data_size_t src_val = values[src_pos]; - const bool src_is_all_non_zero = is_all_non_zero[src_pos]; - values[src_pos] = values[dst_pos]; - is_all_non_zero[src_pos] = is_all_non_zero[dst_pos]; - if (src_is_all_non_zero) { - values[dst_pos] += src_val; - } else { - values[dst_pos] = src_val; - } - is_all_non_zero[dst_pos] &= src_is_all_non_zero; - } - __syncthreads(); - } - if (threadIdx.x == 0) { - if (last_is_all_non_zero) { - values[n] = values[n - 1] + last_element; - is_all_non_zero[n] = is_all_non_zero[n - 1]; - } else { - values[n] = last_element; - is_all_non_zero[n] = last_is_all_non_zero; - } - } - __syncthreads(); -} - -template -__global__ void GlobalInclusivePrefixSumKernel(T* values, T* block_buffer, data_size_t num_data) { - __shared__ T shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - shared_buffer[threadIdx.x] = (data_index < num_data ? values[data_index] : 0); - __syncthreads(); - PrefixSum(shared_buffer, blockDim.x); - if (data_index < num_data) { - values[data_index] = shared_buffer[threadIdx.x + 1]; - } - if (threadIdx.x == 0) { - block_buffer[blockIdx.x + 1] = shared_buffer[blockDim.x]; - } -} - -template -__global__ void GlobalInclusiveArgPrefixSumKernel( - const INDEX_T* sorted_indices, const VAL_T* in_values, REDUCE_T* out_values, REDUCE_T* block_buffer, data_size_t num_data) { - __shared__ REDUCE_T shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (data_index < num_data) { - if (sorted_indices[data_index] >= num_data || sorted_indices[data_index] < 0) { - printf("error find sorted_indices[%d] = %d\n", data_index, sorted_indices[data_index]); - } - } - shared_buffer[threadIdx.x] = (data_index < num_data ? 
in_values[sorted_indices[data_index]] : 0); - __syncthreads(); - PrefixSum(shared_buffer, blockDim.x); - if (data_index < num_data) { - out_values[data_index] = shared_buffer[threadIdx.x + 1]; - } - if (threadIdx.x == 0) { - block_buffer[blockIdx.x + 1] = shared_buffer[blockDim.x]; - } -} - -template -__global__ void GlobalInclusivePrefixSumReduceBlockKernel(T* block_buffer, data_size_t num_blocks) { - __shared__ T shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; - T thread_sum = 0; - const data_size_t num_blocks_per_thread = (num_blocks + static_cast(blockDim.x)) / static_cast(blockDim.x); - const data_size_t thread_start_block_index = static_cast(threadIdx.x) * num_blocks_per_thread; - const data_size_t thread_end_block_index = min(thread_start_block_index + num_blocks_per_thread, num_blocks + 1); - for (data_size_t block_index = thread_start_block_index; block_index < thread_end_block_index; ++block_index) { - thread_sum += block_buffer[block_index]; - } - shared_buffer[threadIdx.x] = thread_sum; - __syncthreads(); - PrefixSum(shared_buffer, blockDim.x); - const T thread_sum_base = shared_buffer[threadIdx.x]; - for (data_size_t block_index = thread_start_block_index; block_index < thread_end_block_index; ++block_index) { - block_buffer[block_index] += thread_sum_base; - } -} - -__global__ void GlobalInclusivePrefixSumReduceBlockZeroOutKernel(data_size_t* block_buffer, const uint16_t* block_mark_first_zero, data_size_t num_blocks) { - __shared__ data_size_t shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; - __shared__ bool is_all_non_zero[GLOBAL_PREFIX_SUM_BLOCK_SIZE]; - data_size_t thread_sum = 0; - const data_size_t num_blocks_per_thread = (num_blocks + static_cast(blockDim.x) - 1) / static_cast(blockDim.x); - const data_size_t thread_start_block_index = static_cast(threadIdx.x) * num_blocks_per_thread; - const data_size_t thread_end_block_index = min(thread_start_block_index + num_blocks_per_thread, num_blocks); - bool thread_is_all_non_zero = true; - data_size_t first_with_zero_block = thread_end_block_index; - for (data_size_t block_index = thread_start_block_index; block_index < thread_end_block_index; ++block_index) { - const uint16_t mark_first_zero = block_mark_first_zero[block_index]; - const data_size_t block_buffer_value = block_buffer[block_index]; - if (mark_first_zero == GLOBAL_PREFIX_SUM_BLOCK_SIZE) { - thread_sum += block_buffer_value; - } else { - thread_is_all_non_zero = false; - thread_sum = block_buffer_value; - if (first_with_zero_block == thread_end_block_index) { - first_with_zero_block = block_index; - } - } - } - is_all_non_zero[threadIdx.x] = thread_is_all_non_zero; - shared_buffer[threadIdx.x] = thread_sum; - __syncthreads(); - PrefixSumZeroOut(shared_buffer, is_all_non_zero, blockDim.x); - data_size_t thread_sum_base = shared_buffer[threadIdx.x]; - for (data_size_t block_index = thread_start_block_index; block_index < first_with_zero_block; ++block_index) { - block_buffer[block_index] += thread_sum_base; - } -} - -template -__global__ void GlobalInclusivePrefixSumAddBlockBaseKernel(const T* block_buffer, T* values, data_size_t num_data) { - const T block_sum_base = block_buffer[blockIdx.x]; - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (data_index < num_data) { - values[data_index] += block_sum_base; - } -} - -__global__ void GlobalInclusivePrefixSumAddBlockBaseGenAUCMarkKernel( - const data_size_t* block_buffer, - data_size_t* values, - const uint16_t* block_first_zero, - data_size_t num_data) { - const data_size_t 
block_sum_base = (blockIdx.x == 0 ? 0 : block_buffer[blockIdx.x - 1]); - const uint16_t first_zero = block_first_zero[blockIdx.x]; - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (data_index < num_data && threadIdx.x < first_zero) { - values[data_index] += block_sum_base; - } -} - -template -void GlobalInclusivePrefixSum(T* values, T* block_buffer, size_t n) { - const data_size_t num_data = static_cast(n); - const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; - GlobalInclusivePrefixSumKernel<<>>( - values, block_buffer, num_data); - GlobalInclusivePrefixSumReduceBlockKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>( - block_buffer, num_blocks); - GlobalInclusivePrefixSumAddBlockBaseKernel<<>>( - block_buffer, values, num_data); -} - -template -void GlobalInclusiveArgPrefixSumInner(const INDEX_T* sorted_indices, const VAL_T* in_values, REDUCE_T* out_values, REDUCE_T* block_buffer, size_t n) { - const data_size_t num_data = static_cast(n); - const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; - GlobalInclusiveArgPrefixSumKernel<<>>( - sorted_indices, in_values, out_values, block_buffer, num_data); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - GlobalInclusivePrefixSumReduceBlockKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>( - block_buffer, num_blocks); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - GlobalInclusivePrefixSumAddBlockBaseKernel<<>>( - block_buffer, out_values, num_data); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -template <> -void GlobalInclusiveArgPrefixSum(const data_size_t* sorted_indices, const label_t* in_values, double* out_values, double* block_buffer, size_t n) { - GlobalInclusiveArgPrefixSumInner(sorted_indices, in_values, out_values, block_buffer, n); -} - -__global__ void GlobalGenAUCMarkKernel(const double* scores, - const data_size_t* sorted_indices, - data_size_t* mark_buffer, - data_size_t* block_mark_buffer, - uint16_t* block_mark_first_zero, - data_size_t num_data) { - __shared__ data_size_t shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; - __shared__ uint16_t shuffle_reduce_shared_buffer[32]; - __shared__ bool is_all_non_zero[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (data_index < num_data) { - if (data_index > 0) { - shared_buffer[threadIdx.x] = static_cast(scores[sorted_indices[data_index]] == scores[sorted_indices[data_index - 1]]); - } else { - shared_buffer[threadIdx.x] = 0; - } - } else { - shared_buffer[threadIdx.x] = 0; - } - is_all_non_zero[threadIdx.x] = static_cast(shared_buffer[threadIdx.x]); - __syncthreads(); - uint16_t block_first_zero = (shared_buffer[threadIdx.x] == 0 ? 
threadIdx.x : blockDim.x); - PrefixSumZeroOut(shared_buffer, is_all_non_zero, blockDim.x); - block_first_zero = ShuffleReduceMin(block_first_zero, shuffle_reduce_shared_buffer, blockDim.x); - if (data_index < num_data) { - mark_buffer[data_index] = shared_buffer[threadIdx.x + 1]; - } - if (threadIdx.x == 0) { - block_mark_buffer[blockIdx.x] = shared_buffer[blockDim.x]; - block_mark_first_zero[blockIdx.x] = block_first_zero; - } -} - -void GloblGenAUCMark(const double* scores, - const data_size_t* sorted_indices, - data_size_t* mark_buffer, - data_size_t* block_mark_buffer, - uint16_t* block_mark_first_zero, - const data_size_t num_data) { - const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; - GlobalGenAUCMarkKernel<<>>(scores, sorted_indices, mark_buffer, block_mark_buffer, block_mark_first_zero, num_data); - GlobalInclusivePrefixSumReduceBlockZeroOutKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>( - block_mark_buffer, block_mark_first_zero, num_blocks); - GlobalInclusivePrefixSumAddBlockBaseGenAUCMarkKernel<<>>( - block_mark_buffer, mark_buffer, block_mark_first_zero, num_data); -} - -template -__global__ void GlobalGenAUCPosSumKernel( - const label_t* labels, - const label_t* weights, - const data_size_t* sorted_indices, - double* sum_pos_buffer, - double* block_sum_pos_buffer, - const data_size_t num_data) { - __shared__ double shared_buffer[GLOBAL_PREFIX_SUM_BLOCK_SIZE + 1]; - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const double pos = IS_POS ? - (USE_WEIGHT ? - (data_index < num_data ? static_cast(labels[sorted_indices[data_index]] > 0) * weights[sorted_indices[data_index]] : 0.0f) : - (data_index < num_data ? static_cast(labels[sorted_indices[data_index]] > 0) : 0.0f)) : - (USE_WEIGHT ? - (data_index < num_data ? static_cast(labels[sorted_indices[data_index]] <= 0) * weights[sorted_indices[data_index]] : 0.0f) : - (data_index < num_data ? 
static_cast(labels[sorted_indices[data_index]] <= 0) : 0.0f)); - - shared_buffer[threadIdx.x] = pos; - __syncthreads(); - PrefixSum(shared_buffer, blockDim.x); - if (data_index < num_data) { - sum_pos_buffer[data_index] = shared_buffer[threadIdx.x + 1]; - } - if (threadIdx.x == 0) { - block_sum_pos_buffer[blockIdx.x + 1] = shared_buffer[blockDim.x]; - } -} - -template -void GlobalGenAUCPosNegSumInner(const label_t* labels, - const label_t* weights, - const data_size_t* sorted_indices, - double* sum_pos_buffer, - double* block_sum_pos_buffer, - const data_size_t num_data) { - const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; - GlobalGenAUCPosSumKernel<<>>(labels, weights, sorted_indices, sum_pos_buffer, block_sum_pos_buffer, num_data); - GlobalInclusivePrefixSumReduceBlockKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>( - block_sum_pos_buffer, num_blocks); - GlobalInclusivePrefixSumAddBlockBaseKernel<<>>( - block_sum_pos_buffer, sum_pos_buffer, num_data); -} - -template <> -void GlobalGenAUCPosNegSum(const label_t* labels, - const label_t* weights, - const data_size_t* sorted_indices, - double* sum_pos_buffer, - double* block_sum_pos_buffer, - const data_size_t num_data) { - GlobalGenAUCPosNegSumInner(labels, weights, sorted_indices, sum_pos_buffer, block_sum_pos_buffer, num_data); -} - -template <> -void GlobalGenAUCPosNegSum(const label_t* labels, - const label_t* weights, - const data_size_t* sorted_indices, - double* sum_pos_buffer, - double* block_sum_pos_buffer, - const data_size_t num_data) { - GlobalGenAUCPosNegSumInner(labels, weights, sorted_indices, sum_pos_buffer, block_sum_pos_buffer, num_data); -} - -template <> -void GlobalGenAUCPosNegSum(const label_t* labels, - const label_t* weights, - const data_size_t* sorted_indices, - double* sum_pos_buffer, - double* block_sum_pos_buffer, - const data_size_t num_data) { - GlobalGenAUCPosNegSumInner(labels, weights, sorted_indices, sum_pos_buffer, block_sum_pos_buffer, num_data); -} - -template -__global__ void GlobalCalcAUCKernel( - const double* sum_pos_buffer, - const double* sum_neg_buffer, - const data_size_t* mark_buffer, - const data_size_t num_data, - double* block_buffer) { - __shared__ double shared_buffer[32]; - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - double area = 0.0f; - if (data_index < num_data) { - if (data_index == num_data - 1 || mark_buffer[data_index + 1] == 0) { - const data_size_t prev_data_index = data_index - mark_buffer[data_index] - 1; - const double prev_sum_pos = (prev_data_index < 0 ? 0.0f : sum_pos_buffer[prev_data_index]); - if (USE_WEIGHT) { - const double prev_sum_neg = (prev_data_index < 0 ? 
0.0f : sum_neg_buffer[prev_data_index]); - const double cur_pos = sum_pos_buffer[data_index] - prev_sum_pos; - const double cur_neg = sum_neg_buffer[data_index] - prev_sum_neg; - area = cur_neg * (cur_pos * 0.5f + prev_sum_pos); - } else { - const double cur_pos = sum_pos_buffer[data_index] - prev_sum_pos; - const double cur_neg = static_cast(data_index - prev_data_index) - cur_pos; - area = cur_neg * (cur_pos * 0.5f + prev_sum_pos); - } - } - } - area = ShuffleReduceSum(area, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - block_buffer[blockIdx.x] = area; - } -} - -template -__global__ void BlockReduceSum(T* block_buffer, const data_size_t num_blocks) { - __shared__ T shared_buffer[32]; - T thread_sum = 0; - for (data_size_t block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { - thread_sum += block_buffer[block_index]; - } - thread_sum = ShuffleReduceSum(thread_sum, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - block_buffer[0] = thread_sum; - } -} - template __global__ void BlockReduceMax(T* block_buffer, const data_size_t num_blocks) { __shared__ T shared_buffer[32]; @@ -973,277 +23,4 @@ __global__ void BlockReduceMax(T* block_buffer, const data_size_t num_blocks) { } } -template -__global__ void BlockReduceMin(T* block_buffer, const data_size_t num_blocks) { - __shared__ T shared_buffer[32]; - T thread_min = 0; - for (data_size_t block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { - const T value = block_buffer[block_index]; - if (value < thread_min) { - thread_min = value; - } - } - thread_min = ShuffleReduceMin(thread_min, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - block_buffer[0] = thread_min; - } -} - -template -void GlobalCalcAUCInner(const double* sum_pos_buffer, - const double* sum_neg_buffer, - const data_size_t* mark_buffer, - const data_size_t num_data, - double* block_buffer) { - const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; - GlobalCalcAUCKernel<<>>(sum_pos_buffer, sum_neg_buffer, mark_buffer, num_data, block_buffer); - BlockReduceSum<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks); -} - -template <> -void GlobalCalcAUC(const double* sum_pos_buffer, - const double* sum_neg_buffer, - const data_size_t* mark_buffer, - const data_size_t num_data, - double* block_buffer) { - GlobalCalcAUCInner(sum_pos_buffer, sum_neg_buffer, mark_buffer, num_data, block_buffer); -} - -template <> -void GlobalCalcAUC(const double* sum_pos_buffer, - const double* sum_neg_buffer, - const data_size_t* mark_buffer, - const data_size_t num_data, - double* block_buffer) { - GlobalCalcAUCInner(sum_pos_buffer, sum_neg_buffer, mark_buffer, num_data, block_buffer); -} - -template -__global__ void GlobalCalcAveragePrecisionKernel( - const double* sum_pos_buffer, - const double* sum_neg_buffer, - const data_size_t* mark_buffer, - const data_size_t num_data, - double* block_buffer) { - __shared__ double shared_buffer[32]; - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - double area = 0.0f; - if (data_index < num_data) { - if (data_index == num_data - 1 || mark_buffer[data_index + 1] == 0) { - const data_size_t prev_data_index = data_index - mark_buffer[data_index] - 1; - const double prev_sum_pos = (prev_data_index < 0 ? 0.0f : sum_pos_buffer[prev_data_index]); - if (USE_WEIGHT) { - const double prev_sum_neg = (prev_data_index < 0 ? 
0.0f : sum_neg_buffer[prev_data_index]); - const double cur_pos = sum_pos_buffer[data_index] - prev_sum_pos; - const double cur_neg = sum_neg_buffer[data_index] - prev_sum_neg; - area = cur_pos * (cur_pos + prev_sum_pos) / (prev_sum_neg + prev_sum_pos + cur_pos + cur_neg); - } else { - const double cur_pos = sum_pos_buffer[data_index] - prev_sum_pos; - const double cur_neg = static_cast(data_index - prev_data_index) - cur_pos; - area = cur_pos * (cur_pos + prev_sum_pos) / static_cast(data_index + 1); - } - } - } - area = ShuffleReduceSum(area, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - block_buffer[blockIdx.x] = area; - } -} - -template -void GlobalCalcAveragePrecisionInner(const double* sum_pos_buffer, - const double* sum_neg_buffer, - const data_size_t* mark_buffer, - const data_size_t num_data, - double* block_buffer) { - const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; - GlobalCalcAveragePrecisionKernel<<>>(sum_pos_buffer, sum_neg_buffer, mark_buffer, num_data, block_buffer); - BlockReduceSum<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks); -} - -template <> -void GlobalCalcAveragePrecision(const double* sum_pos_buffer, - const double* sum_neg_buffer, - const data_size_t* mark_buffer, - const data_size_t num_data, - double* block_buffer) { - GlobalCalcAveragePrecisionInner(sum_pos_buffer, sum_neg_buffer, mark_buffer, num_data, block_buffer); -} - -template <> -void GlobalCalcAveragePrecision(const double* sum_pos_buffer, - const double* sum_neg_buffer, - const data_size_t* mark_buffer, - const data_size_t num_data, - double* block_buffer) { - GlobalCalcAveragePrecisionInner(sum_pos_buffer, sum_neg_buffer, mark_buffer, num_data, block_buffer); -} - -template -__global__ void ReduceSumGlobalKernel(const VAL_T* values, const data_size_t num_value, REDUCE_T* block_buffer) { - __shared__ REDUCE_T shared_buffer[32]; - const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - const REDUCE_T value = (data_index < num_value ? static_cast(values[data_index]) : 0.0f); - const REDUCE_T reduce_value = ShuffleReduceSum(value, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - block_buffer[blockIdx.x] = reduce_value; - } -} - -template -void ReduceSumGlobalInner(const VAL_T* values, size_t n, REDUCE_T* block_buffer) { - const data_size_t num_value = static_cast(n); - const data_size_t num_blocks = (num_value + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; - ReduceSumGlobalKernel<<>>(values, num_value, block_buffer); - BlockReduceSum<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks); -} - -template -__global__ void ReduceDotProductGlobalKernel(const VAL_A_T* a, const VAL_B_T* b, const data_size_t num_value, REDUCE_T* block_buffer) { - __shared__ REDUCE_T shared_buffer[32]; - const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - const REDUCE_T value = (data_index < num_value ? 
static_cast(a[data_index]) * static_cast(b[data_index]) : 0.0f); - const REDUCE_T reduce_value = ShuffleReduceSum(value, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - block_buffer[blockIdx.x] = reduce_value; - } -} - -template <> -void ReduceSumGlobal(const label_t* values, size_t n, double* block_buffer) { - ReduceSumGlobalInner(values, n, block_buffer); -} - -template -void ReduceDotProductGlobalInner(const VAL_A_T* a, const VAL_B_T* b, size_t n, REDUCE_T* block_buffer) { - const data_size_t num_value = static_cast(n); - const data_size_t num_blocks = (num_value + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; - ReduceDotProductGlobalKernel<<>>(a, b, num_value, block_buffer); - BlockReduceSum<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks); -} - -template <> -void ReduceDotProductGlobal(const label_t* a, const label_t* b, size_t n, double* block_buffer) { - ReduceDotProductGlobalInner(a, b, n, block_buffer); -} - -template -__global__ void ReduceMaxGlobalKernel(const VAL_T* values, const data_size_t num_value, REDUCE_T* block_buffer) { - __shared__ REDUCE_T shared_buffer[32]; - const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - const REDUCE_T value = (data_index < num_value ? static_cast(values[data_index]) : 0.0f); - const REDUCE_T reduce_value = ShuffleReduceMax(value, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - block_buffer[blockIdx.x] = reduce_value; - } -} - -template -void ReduceMaxGlobalInner(const VAL_T* values, size_t n, REDUCE_T* block_buffer) { - const data_size_t num_value = static_cast(n); - const data_size_t num_blocks = (num_value + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; - ReduceMaxGlobalKernel<<>>(values, num_value, block_buffer); - BlockReduceMax<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks); -} - -template <> -void ReduceMaxGlobal(const label_t* values, size_t n, double* block_buffer) { - ReduceMaxGlobalInner(values, n, block_buffer); -} - -template -__global__ void ReduceMinGlobalKernel(const VAL_T* values, const data_size_t num_value, REDUCE_T* block_buffer) { - __shared__ REDUCE_T shared_buffer[32]; - const data_size_t data_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - const REDUCE_T value = (data_index < num_value ? 
static_cast(values[data_index]) : 0.0f); - const REDUCE_T reduce_value = ShuffleReduceMin(value, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - block_buffer[blockIdx.x] = reduce_value; - } -} - -template -void ReduceMinGlobalInner(const VAL_T* values, size_t n, REDUCE_T* block_buffer) { - const data_size_t num_value = static_cast(n); - const data_size_t num_blocks = (num_value + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; - ReduceMinGlobalKernel<<>>(values, num_value, block_buffer); - BlockReduceMin<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks); -} - -template <> -void ReduceMinGlobal(const label_t* values, size_t n, double* block_buffer) { - ReduceMinGlobalInner(values, n, block_buffer); -} - -template -__device__ VAL_T PercentileDeviceInner(const VAL_T* values, - const WEIGHT_T* weights, - INDEX_T* indices, - REDUCE_WEIGHT_T* weights_prefix_sum, - const double alpha, - const INDEX_T len) { - if (len <= 1) { - return values[0]; - } - if (!USE_WEIGHT) { - BitonicArgSortDevice(values, indices, len); - const double float_pos = (1.0f - alpha) * len; - const INDEX_T pos = static_cast(float_pos); - if (pos < 1) { - return values[indices[0]]; - } else if (pos >= len) { - return values[indices[len - 1]]; - } else { - const double bias = float_pos - pos; - const VAL_T v1 = values[indices[pos - 1]]; - const VAL_T v2 = values[indices[pos]]; - return static_cast(v1 - (v1 - v2) * bias); - } - } else { - BitonicArgSortDevice(values, indices, len); - PrefixSumDevice(weights, indices, weights_prefix_sum, len); - const REDUCE_WEIGHT_T threshold = weights_prefix_sum[len - 1] * (1.0f - alpha); - __shared__ INDEX_T pos; - if (threadIdx.x == 0) { - pos = len; - } - __syncthreads(); - for (INDEX_T index = static_cast(threadIdx.x); index < len; index += static_cast(blockDim.x)) { - if (weights_prefix_sum[index] > threshold && (index == 0 || weights_prefix_sum[index - 1] <= threshold)) { - pos = index; - } - } - __syncthreads(); - pos = min(pos, len - 1); - if (pos == 0 || pos == len - 1) { - return values[pos]; - } - const VAL_T v1 = values[indices[pos - 1]]; - const VAL_T v2 = values[indices[pos]]; - return static_cast(v1 - (v1 - v2) * (threshold - weights_prefix_sum[pos - 1]) / (weights_prefix_sum[pos] - weights_prefix_sum[pos - 1])); - } -} - -template <> -__device__ double PercentileDevice( - const double* values, - const label_t* weights, - data_size_t* indices, - double* weights_prefix_sum, - const double alpha, - const data_size_t len) { - return PercentileDeviceInner(values, weights, indices, weights_prefix_sum, alpha, len); -} - -template <> -__device__ double PercentileDevice( - const double* values, - const label_t* weights, - data_size_t* indices, - double* weights_prefix_sum, - const double alpha, - const data_size_t len) { - return PercentileDeviceInner(values, weights, indices, weights_prefix_sum, alpha, len); -} - } // namespace LightGBM diff --git a/src/main.cpp b/src/main.cpp index 8034da826811..776ffefd7e81 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -8,10 +8,16 @@ #include "network/linkers.h" -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { bool success = false; + std::string config_str = std::string("config=train.conf"); + char* argv = new char[config_str.size() + 1]; + for (size_t i = 0; i < config_str.size(); ++i) { + argv[i] = config_str[i]; + } + argv[config_str.size()] = '\0'; try { - LightGBM::Application app(argc, argv); + LightGBM::Application app(2, &argv - 1); app.Run(); #ifdef USE_MPI diff --git 
a/src/objective/multiclass_objective.hpp b/src/objective/multiclass_objective.hpp index 07db1dfeb963..5379caec1199 100644 --- a/src/objective/multiclass_objective.hpp +++ b/src/objective/multiclass_objective.hpp @@ -127,46 +127,6 @@ class MulticlassSoftmax: public ObjectiveFunction { } } } - for (int class_index = 0; class_index < num_class_; ++class_index) { - const size_t offset = static_cast(class_index * num_data_); - const score_t* host_gradients_ptr = gradients + offset; - const score_t* host_hessians_ptr = hessians + offset; - const int num_threads = OMP_NUM_THREADS(); - std::vector thread_abs_max_gradient(num_threads, 0.0f); - std::vector thread_abs_max_hessian(num_threads, 0.0f); - std::vector thread_abs_min_hessian(num_threads, std::numeric_limits::infinity()); - Threading::For(0, num_data_, 512, - [&thread_abs_max_gradient, &thread_abs_max_hessian, &thread_abs_min_hessian, host_gradients_ptr, host_hessians_ptr] (int thread_index, data_size_t start, data_size_t end) { - for (data_size_t index = start; index < end; ++index) { - const score_t gradient = host_gradients_ptr[index]; - const score_t hessian = host_hessians_ptr[index]; - if (std::fabs(gradient) > std::fabs(thread_abs_max_gradient[thread_index])) { - thread_abs_max_gradient[thread_index] = gradient; - } - if (std::fabs(hessian) > std::fabs(thread_abs_max_hessian[thread_index])) { - thread_abs_max_hessian[thread_index] = hessian; - } - if (std::fabs(hessian) < std::fabs(thread_abs_min_hessian[thread_index])) { - thread_abs_min_hessian[thread_index] = hessian; - } - } - }); - double max_abs_gradient = 0.0f; - double max_abs_hessian = 0.0f; - double min_abs_hessian = std::numeric_limits::infinity(); - for (int thread_index = 0; thread_index < num_threads; ++thread_index) { - if (std::fabs(thread_abs_max_gradient[thread_index]) > std::fabs(max_abs_gradient)) { - max_abs_gradient = thread_abs_max_gradient[thread_index]; - } - if (std::fabs(thread_abs_max_hessian[thread_index] > std::fabs(max_abs_hessian))) { - max_abs_hessian = thread_abs_max_hessian[thread_index]; - } - if (std::fabs(thread_abs_min_hessian[thread_index] < std::fabs(min_abs_hessian))) { - min_abs_hessian = thread_abs_min_hessian[thread_index]; - } - } - Log::Warning("class %d max_abs_gradient = %f, max_abs_hessian = %f, min_abs_hessian = %f", class_index, max_abs_gradient, max_abs_hessian, min_abs_hessian); - } } void ConvertOutput(const double* input, double* output) const override { diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index da6e1519d37d..e7c8e959de8a 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -11,25 +11,55 @@ namespace LightGBM { -__device__ void ReduceBestGain(double* gain, hist_t* sum_gradients, - hist_t* sum_hessians, uint8_t* found, - uint32_t* threshold_value) { - const unsigned int tid = threadIdx.x; - const unsigned int conflict_free_tid_plus_1 = CONFLICT_FREE_INDEX(tid + 1); - for (unsigned int s = 1; s < MAX_NUM_BIN_IN_FEATURE; s *= 2) { - if (tid % (2 * s) == 0 && (tid + s) < MAX_NUM_BIN_IN_FEATURE) { - const uint32_t tid_s = tid + s; - const uint32_t conflict_free_tid_s_plus_1 = CONFLICT_FREE_INDEX(tid_s + 1); - if ((found[tid_s] && !found[tid]) || (found[tid_s] && found[tid] && gain[tid_s] > gain[tid])) { - gain[tid] = gain[tid_s]; - sum_gradients[conflict_free_tid_plus_1] = sum_gradients[conflict_free_tid_s_plus_1]; - sum_hessians[conflict_free_tid_plus_1] = sum_hessians[conflict_free_tid_s_plus_1]; - 
found[tid] = found[tid_s];
-        threshold_value[tid] = threshold_value[tid_s];
-      }
+__device__ void ReduceBestGainWarp(double gain, bool found, uint32_t thread_index, double* out_gain, bool* out_found, uint32_t* out_thread_index) {
+  const uint32_t mask = 0xffffffff;
+  const uint32_t warpLane = threadIdx.x % warpSize;
+  for (uint32_t offset = warpSize / 2; offset > 0; offset >>= 1) {
+    const bool other_found = __shfl_down_sync(mask, found, offset);
+    const double other_gain = __shfl_down_sync(mask, gain, offset);
+    const uint32_t other_thread_index = __shfl_down_sync(mask, thread_index, offset);
+    if ((other_found && found && other_gain > gain) || (!found && other_found)) {
+      found = other_found;
+      gain = other_gain;
+      thread_index = other_thread_index;
+    }
+  }
+  if (warpLane == 0) {
+    *out_gain = gain;
+    *out_found = found;
+    *out_thread_index = thread_index;
+  }
+}
+
+__device__ uint32_t ReduceBestGainBlock(double gain, bool found, uint32_t thread_index) {
+  const uint32_t mask = 0xffffffff;
+  for (uint32_t offset = warpSize / 2; offset > 0; offset >>= 1) {
+    const bool other_found = __shfl_down_sync(mask, found, offset);
+    const double other_gain = __shfl_down_sync(mask, gain, offset);
+    const uint32_t other_thread_index = __shfl_down_sync(mask, thread_index, offset);
+    if ((other_found && found && other_gain > gain) || (!found && other_found)) {
+      found = other_found;
+      gain = other_gain;
+      thread_index = other_thread_index;
     }
-    __syncthreads();
   }
+  return thread_index;
+}
+
+__device__ uint32_t ReduceBestGain(double gain, bool found, uint32_t thread_index,
+  double* shared_gain_buffer, bool* shared_found_buffer, uint32_t* shared_thread_index_buffer) {
+  const uint32_t warpID = threadIdx.x / warpSize;
+  const uint32_t warpLane = threadIdx.x % warpSize;
+  const uint32_t num_warp = blockDim.x / warpSize;
+  ReduceBestGainWarp(gain, found, thread_index, shared_gain_buffer + warpID, shared_found_buffer + warpID, shared_thread_index_buffer + warpID);
+  __syncthreads();
+  if (warpID == 0) {
+    gain = warpLane < num_warp ? shared_gain_buffer[warpLane] : kMinScore;
+    found = warpLane < num_warp ? shared_found_buffer[warpLane] : false;
+    thread_index = warpLane < num_warp ?
shared_thread_index_buffer[warpLane] : 0; + thread_index = ReduceBestGainBlock(gain, found, thread_index); + } + return thread_index; } __device__ void ReduceBestGainForLeaves(double* gain, int* leaves, int cuda_cur_num_leaves) { @@ -46,6 +76,50 @@ __device__ void ReduceBestGainForLeaves(double* gain, int* leaves, int cuda_cur_ } } +__device__ void ReduceBestGainForLeavesWarp(double gain, int leaf_index, double* out_gain, int* out_leaf_index) { + const uint32_t mask = 0xffffffff; + const uint32_t warpLane = threadIdx.x % warpSize; + for (uint32_t offset = warpSize / 2; offset > 0; offset >>= 1) { + const int other_leaf_index = __shfl_down_sync(mask, leaf_index, offset); + const double other_gain = __shfl_down_sync(mask, gain, offset); + if ((leaf_index != -1 && other_leaf_index != -1 && other_gain > gain) || (leaf_index == -1 && other_leaf_index != -1)) { + gain = other_gain; + leaf_index = other_leaf_index; + } + } + if (warpLane == 0) { + *out_gain = gain; + *out_leaf_index = leaf_index; + } +} + +__device__ int ReduceBestGainForLeavesBlock(double gain, int leaf_index) { + const uint32_t mask = 0xffffffff; + for (uint32_t offset = warpSize / 2; offset > 0; offset >>= 1) { + const int other_leaf_index = __shfl_down_sync(mask, leaf_index, offset); + const double other_gain = __shfl_down_sync(mask, gain, offset); + if ((leaf_index != -1 && other_leaf_index != -1 && other_gain > gain) || (leaf_index == -1 && other_leaf_index != -1)) { + gain = other_gain; + leaf_index = other_leaf_index; + } + } + return leaf_index; +} + +__device__ int ReduceBestGainForLeaves(double gain, int leaf_index, double* shared_gain_buffer, int* shared_leaf_index_buffer) { + const uint32_t warpID = threadIdx.x / warpSize; + const uint32_t warpLane = threadIdx.x % warpSize; + const uint32_t num_warp = blockDim.x / warpSize; + ReduceBestGainForLeavesWarp(gain, leaf_index, shared_gain_buffer + warpID, shared_leaf_index_buffer + warpID); + __syncthreads(); + if (warpID == 0) { + gain = warpLane < num_warp ? shared_gain_buffer[warpLane] : 0.0f; + leaf_index = warpLane < num_warp ? shared_leaf_index_buffer[warpLane] : -1; + leaf_index = ReduceBestGainForLeavesBlock(gain, leaf_index); + } + return leaf_index; +} + __device__ double ThresholdL1(double s, double l1) { const double reg_s = fmax(0.0, fabs(s) - l1); if (s >= 0.0f) { @@ -134,58 +208,53 @@ __device__ void FindBestSplitsForLeafKernelInner( cuda_best_split_info->is_valid = false; - __shared__ hist_t local_grad_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION]; - __shared__ hist_t local_hess_hist[MAX_NUM_BIN_IN_FEATURE + 1 + (MAX_NUM_BIN_IN_FEATURE + 1) / LOG_NUM_BANKS_DATA_PARTITION]; - __shared__ double local_gain[MAX_NUM_BIN_IN_FEATURE]; - __shared__ uint8_t threshold_found[MAX_NUM_BIN_IN_FEATURE]; - __shared__ uint32_t threshold_value[MAX_NUM_BIN_IN_FEATURE]; - + __shared__ hist_t shared_mem_buffer[32]; + hist_t local_grad_hist = 0.0f; + hist_t local_hess_hist = 0.0f; + double local_gain = 0.0f; + bool threshold_found = false; + uint32_t threshold_value = 0; + __shared__ uint32_t best_thread_index; + __shared__ double shared_gain_buffer[32]; + __shared__ bool shared_found_buffer[32]; + __shared__ uint32_t shared_thread_index_buffer[32]; const unsigned int threadIdx_x = threadIdx.x; - const bool skip_sum = (skip_default_bin && (threadIdx_x + feature_mfb_offset) == static_cast(feature_default_bin)); + const bool skip_sum = reverse ? 
+ (skip_default_bin && (feature_num_bin - 1 - threadIdx_x) == static_cast(feature_default_bin)) : + (skip_default_bin && (threadIdx_x + feature_mfb_offset) == static_cast(feature_default_bin)); const uint32_t feature_num_bin_minus_offset = feature_num_bin - feature_mfb_offset; - const bool skip_split = (skip_default_bin && (feature_num_bin_minus_offset - 1 - threadIdx_x + feature_mfb_offset == static_cast(feature_default_bin))); - const unsigned int bin_offset = threadIdx_x << 1; - const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); if (!reverse) { if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { - local_grad_hist[conflict_free_threadIdx_x] = feature_hist_ptr[bin_offset]; - const hist_t hess = feature_hist_ptr[bin_offset + 1]; - local_hess_hist[conflict_free_threadIdx_x] = hess; + const unsigned int bin_offset = threadIdx_x << 1; + local_grad_hist = feature_hist_ptr[bin_offset]; + local_hess_hist = feature_hist_ptr[bin_offset + 1]; } else { - local_grad_hist[conflict_free_threadIdx_x] = 0.0f; - local_hess_hist[conflict_free_threadIdx_x] = 0.0f; + local_grad_hist = 0.0f; + local_hess_hist = 0.0f; } } else { - if (threadIdx_x < feature_num_bin_minus_offset) { - const unsigned int write_index = feature_num_bin_minus_offset - 1 - threadIdx_x; - const unsigned int conflict_free_write_index = CONFLICT_FREE_INDEX(write_index); - if (!skip_sum) { - local_grad_hist[conflict_free_write_index] = feature_hist_ptr[bin_offset]; - const hist_t hess = feature_hist_ptr[bin_offset + 1]; - local_hess_hist[conflict_free_write_index] = hess; - } else { - local_grad_hist[conflict_free_write_index] = 0.0f; - local_hess_hist[conflict_free_write_index] = 0.0f; - } + if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { + const unsigned int read_index = feature_num_bin_minus_offset - 1 - threadIdx_x; + const unsigned int bin_offset = read_index << 1; + local_grad_hist = feature_hist_ptr[bin_offset]; + local_hess_hist = feature_hist_ptr[bin_offset + 1]; } else { - local_grad_hist[conflict_free_threadIdx_x] = 0.0f; - local_hess_hist[conflict_free_threadIdx_x] = 0.0f; + local_grad_hist = 0.0f; + local_hess_hist = 0.0f; } } __syncthreads(); if (threadIdx_x == 0) { - local_hess_hist[conflict_free_threadIdx_x] += kEpsilon; + local_hess_hist += kEpsilon; } - local_gain[threadIdx_x] = kMinScore; - __syncthreads(); - PrefixSumConflictFree(local_grad_hist, MAX_NUM_BIN_IN_FEATURE); - PrefixSumConflictFree(local_hess_hist, MAX_NUM_BIN_IN_FEATURE); + local_gain = kMinScore; + local_grad_hist = ShufflePrefixSum(local_grad_hist, shared_mem_buffer); __syncthreads(); - const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); + local_hess_hist = ShufflePrefixSum(local_hess_hist, shared_mem_buffer); if (reverse) { - if (threadIdx_x >= static_cast(na_as_missing) && threadIdx_x <= feature_num_bin - 2 && !skip_split) { - const double sum_right_gradient = local_grad_hist[conflict_free_threadIdx_x_plus_1]; - const double sum_right_hessian = local_hess_hist[conflict_free_threadIdx_x_plus_1]; + if (threadIdx_x >= static_cast(na_as_missing) && threadIdx_x <= feature_num_bin - 2 && !skip_sum) { + const double sum_right_gradient = local_grad_hist; + const double sum_right_hessian = local_hess_hist; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); const double sum_left_gradient = sum_gradients - sum_right_gradient; const double sum_left_hessian = sum_hessians - sum_right_hessian; @@ -198,22 +267,22 @@ __device__ 
void FindBestSplitsForLeafKernelInner( lambda_l2); // gain with split is worse than without split if (current_gain <= min_gain_shift) { - threshold_found[threadIdx_x] = 0; + threshold_found = false; } else { - local_gain[threadIdx_x] = current_gain - min_gain_shift; - threshold_value[threadIdx_x] = static_cast(feature_num_bin - 2 - threadIdx_x); - threshold_found[threadIdx_x] = 1; + local_gain = current_gain - min_gain_shift; + threshold_value = static_cast(feature_num_bin - 2 - threadIdx_x); + threshold_found = true; } } else { - threshold_found[threadIdx_x] = 0; + threshold_found = false; } } else { - threshold_found[threadIdx_x] = 0; + threshold_found = true; } } else { - if (threadIdx_x <= feature_num_bin_minus_offset - 2 /* TODO(shiyu1994): skip default */) { - const double sum_left_gradient = local_grad_hist[conflict_free_threadIdx_x_plus_1]; - const double sum_left_hessian = local_hess_hist[conflict_free_threadIdx_x_plus_1]; + if (threadIdx_x <= feature_num_bin_minus_offset - 2 && !skip_sum) { + const double sum_left_gradient = local_grad_hist; + const double sum_left_hessian = local_hess_hist; const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); const double sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian; @@ -226,30 +295,33 @@ __device__ void FindBestSplitsForLeafKernelInner( lambda_l2); // gain with split is worse than without split if (current_gain <= min_gain_shift) { - threshold_found[threadIdx_x] = 0; + threshold_found = false; } else { - local_gain[threadIdx_x] = current_gain - min_gain_shift; - threshold_value[threadIdx_x] = static_cast(threadIdx_x + feature_mfb_offset); - threshold_found[threadIdx_x] = 1; + local_gain = current_gain - min_gain_shift; + threshold_value = static_cast(threadIdx_x + feature_mfb_offset); + threshold_found = true; } } else { - threshold_found[threadIdx_x] = 0; + threshold_found = false; } } else { - threshold_found[threadIdx_x] = 0; + threshold_found = false; } } __syncthreads(); - ReduceBestGain(local_gain, local_grad_hist, local_hess_hist, threshold_found, threshold_value); - const uint8_t found = threshold_found[0]; - if (found && threadIdx_x == 0) { + const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_gain_buffer, shared_found_buffer, shared_thread_index_buffer); + if (threadIdx_x == 0) { + best_thread_index = result; + } + __syncthreads(); + if (threshold_found && threadIdx_x == best_thread_index) { cuda_best_split_info->is_valid = true; - cuda_best_split_info->threshold = threshold_value[0]; - cuda_best_split_info->gain = local_gain[0]; + cuda_best_split_info->threshold = threshold_value; + cuda_best_split_info->gain = local_gain; cuda_best_split_info->default_left = assume_out_default_left; if (reverse) { - const double sum_right_gradient = local_grad_hist[1]; - const double sum_right_hessian = local_hess_hist[1] - kEpsilon; + const double sum_right_gradient = local_grad_hist; + const double sum_right_hessian = local_hess_hist - kEpsilon; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); const double sum_left_gradient = sum_gradients - sum_right_gradient; const double sum_left_hessian = sum_hessians - sum_right_hessian - kEpsilon; @@ -271,8 +343,8 @@ __device__ void FindBestSplitsForLeafKernelInner( cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); } else { - 
const double sum_left_gradient = local_grad_hist[1];
-      const double sum_left_hessian = local_hess_hist[1] - kEpsilon;
+      const double sum_left_gradient = local_grad_hist;
+      const double sum_left_hessian = local_hess_hist - kEpsilon;
       const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor));
       const double sum_right_gradient = sum_gradients - sum_left_gradient;
       const double sum_right_hessian = sum_hessians - sum_left_hessian - kEpsilon;
@@ -674,28 +746,20 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel(
 
 __global__ void FindBestFromAllSplitsKernel(const int cur_num_leaves,
   CUDASplitInfo* cuda_leaf_best_split_info, int* cuda_best_split_info_buffer) {
-  __shared__ double thread_best_gain[NUM_THREADS_FIND_BEST_LEAF];
-  __shared__ int thread_best_leaf[NUM_THREADS_FIND_BEST_LEAF];
-  const unsigned int threadIdx_x = threadIdx.x;
-  thread_best_gain[threadIdx_x] = kMinScore;
-  thread_best_leaf[threadIdx_x] = -1;
-  const int num_leaves_per_thread = (cur_num_leaves + NUM_THREADS_FIND_BEST_LEAF - 1) / NUM_THREADS_FIND_BEST_LEAF;
-  const int cur_num_valid_threads = (cur_num_leaves + num_leaves_per_thread - 1) / num_leaves_per_thread;
-  if (threadIdx_x < static_cast(cur_num_valid_threads)) {
-    const int start = num_leaves_per_thread * threadIdx_x;
-    const int end = min(start + num_leaves_per_thread, cur_num_leaves);
-    for (int leaf_index = threadIdx_x; leaf_index < cur_num_leaves; leaf_index += cur_num_valid_threads) {
-      const double leaf_best_gain = cuda_leaf_best_split_info[leaf_index].gain;
-      if (cuda_leaf_best_split_info[leaf_index].is_valid && leaf_best_gain > thread_best_gain[threadIdx_x]) {
-        thread_best_gain[threadIdx_x] = leaf_best_gain;
-        thread_best_leaf[threadIdx_x] = leaf_index;
-      }
+  __shared__ double gain_shared_buffer[32];
+  __shared__ int leaf_index_shared_buffer[32];
+  double thread_best_gain = kMinScore;
+  int thread_best_leaf_index = -1;
+  const int threadIdx_x = static_cast(threadIdx.x);
+  for (int leaf_index = threadIdx_x; leaf_index < cur_num_leaves; leaf_index += static_cast(blockDim.x)) {
+    const double leaf_best_gain = cuda_leaf_best_split_info[leaf_index].gain;
+    if (cuda_leaf_best_split_info[leaf_index].is_valid && leaf_best_gain > thread_best_gain) {
+      thread_best_gain = leaf_best_gain;
+      thread_best_leaf_index = leaf_index;
     }
   }
-  __syncthreads();
-  ReduceBestGainForLeaves(thread_best_gain, thread_best_leaf, cur_num_valid_threads);
+  const int best_leaf_index = ReduceBestGainForLeaves(thread_best_gain, thread_best_leaf_index, gain_shared_buffer, leaf_index_shared_buffer);
   if (threadIdx_x == 0) {
-    const int best_leaf_index = thread_best_leaf[0];
     cuda_best_split_info_buffer[6] = best_leaf_index;
     if (best_leaf_index != -1) {
       cuda_leaf_best_split_info[best_leaf_index].is_valid = false;
diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp
index 604a4728a3fd..095c1cb2b879 100644
--- a/src/treelearner/cuda/cuda_data_partition.cpp
+++ b/src/treelearner/cuda/cuda_data_partition.cpp
@@ -51,7 +51,7 @@ void CUDADataPartition::Init() {
   AllocateCUDAMemoryOuter(&cuda_leaf_data_end_, static_cast(num_leaves_), __FILE__, __LINE__);
   AllocateCUDAMemoryOuter(&cuda_leaf_num_data_, static_cast(num_leaves_), __FILE__, __LINE__);
   // leave some space for alignment
-  AllocateCUDAMemoryOuter(&cuda_data_to_left_, static_cast(num_data_) + 1024 * 8, __FILE__, __LINE__);
+  AllocateCUDAMemoryOuter(&cuda_block_to_left_offset_, static_cast(num_data_), __FILE__, __LINE__);
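// Reviewer aside: the hunks in this patch repeatedly replace bank-conflict-free
// shared-memory scans (PrefixSumConflictFree / ReduceSumConflictFree) with
// warp-shuffle primitives such as ShufflePrefixSum. The sketch below shows the
// general warp-shuffle inclusive-scan pattern in isolation, assuming 32-thread
// warps and a block size that is a multiple of warpSize. The helper names
// WarpInclusiveScan, BlockInclusiveScan and InclusiveScanKernel are illustrative
// only and are not part of this patch or of the LightGBM code base.
#include <cuda_runtime.h>

__device__ unsigned int WarpInclusiveScan(unsigned int value) {
  const unsigned int lane = threadIdx.x % warpSize;
  // Each step adds the value held by the thread `offset` lanes below this one.
  for (int offset = 1; offset < warpSize; offset <<= 1) {
    const unsigned int other = __shfl_up_sync(0xffffffffu, value, offset);
    if (lane >= static_cast<unsigned int>(offset)) {
      value += other;
    }
  }
  return value;
}

__device__ unsigned int BlockInclusiveScan(unsigned int value, unsigned int* warp_totals) {
  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int warp_id = threadIdx.x / warpSize;
  value = WarpInclusiveScan(value);
  // The last lane of each warp publishes its warp total to shared memory.
  if (lane == warpSize - 1) {
    warp_totals[warp_id] = value;
  }
  __syncthreads();
  // The first warp scans the per-warp totals in place.
  if (warp_id == 0) {
    unsigned int total = (threadIdx.x < blockDim.x / warpSize) ? warp_totals[lane] : 0u;
    total = WarpInclusiveScan(total);
    if (threadIdx.x < blockDim.x / warpSize) {
      warp_totals[lane] = total;
    }
  }
  __syncthreads();
  // Add the exclusive sum of all preceding warps to each thread's value.
  if (warp_id > 0) {
    value += warp_totals[warp_id - 1];
  }
  return value;
}

__global__ void InclusiveScanKernel(const unsigned int* in, unsigned int* out, int n) {
  __shared__ unsigned int warp_totals[32];  // enough for up to 1024 threads per block
  const int i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
  const unsigned int v = (i < n) ? in[i] : 0u;
  const unsigned int scanned = BlockInclusiveScan(v, warp_totals);
  if (i < n) {
    out[i] = scanned;  // per-block inclusive prefix sum, analogous to the left/right offsets above
  }
}
// Example launch (hypothetical): InclusiveScanKernel<<<(n + 255) / 256, 256>>>(d_in, d_out, n);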
  AllocateCUDAMemoryOuter(&cuda_data_index_to_leaf_index_, static_cast(num_data_), __FILE__, __LINE__);
   AllocateCUDAMemoryOuter(&cuda_block_data_to_left_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__);
   AllocateCUDAMemoryOuter(&cuda_block_data_to_right_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__);
diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu
index 8ee4d58712c5..472a00ac7566 100644
--- a/src/treelearner/cuda/cuda_data_partition.cu
+++ b/src/treelearner/cuda/cuda_data_partition.cu
@@ -27,20 +27,21 @@ void CUDADataPartition::LaunchFillDataIndicesBeforeTrain() {
   FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_, cuda_data_index_to_leaf_index_);
 }
 
-__device__ void PrepareOffset(const data_size_t num_data_in_leaf_ref, const uint8_t* split_to_left_bit_vector,
+__device__ void PrepareOffset(const data_size_t num_data_in_leaf_ref, uint16_t* block_to_left_offset,
   data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer,
-  const int split_indices_block_size_data_partition,
-  uint16_t* thread_to_left_offset_cnt) {
+  const uint16_t thread_to_left_offset_cnt) {
+  __shared__ uint16_t shared_mem_buffer[32];
   const unsigned int threadIdx_x = threadIdx.x;
   const unsigned int blockDim_x = blockDim.x;
-  __syncthreads();
-  ReduceSumConflictFree(thread_to_left_offset_cnt, static_cast(split_indices_block_size_data_partition));
-  __syncthreads();
-  if (threadIdx_x == 0) {
-    const data_size_t num_data_in_block = (blockIdx.x + 1) * blockDim_x <= num_data_in_leaf_ref ? static_cast(blockDim_x) :
-      num_data_in_leaf_ref - static_cast(blockIdx.x * blockDim_x);
+  const uint16_t thread_to_left_offset = ShufflePrefixSum(thread_to_left_offset_cnt, shared_mem_buffer);
+  const data_size_t num_data_in_block = (blockIdx.x + 1) * blockDim_x <= num_data_in_leaf_ref ?
static_cast(blockDim_x) : + num_data_in_leaf_ref - static_cast(blockIdx.x * blockDim_x); + if (static_cast(threadIdx_x) < num_data_in_block) { + block_to_left_offset[threadIdx_x] = thread_to_left_offset; + } + if (threadIdx_x == blockDim_x - 1) { if (num_data_in_block > 0) { - const data_size_t data_to_left = static_cast(thread_to_left_offset_cnt[0]); + const data_size_t data_to_left = static_cast(thread_to_left_offset); block_to_left_offset_buffer[blockIdx.x + 1] = data_to_left; block_to_right_offset_buffer[blockIdx.x + 1] = num_data_in_block - data_to_left; } else { @@ -255,42 +256,31 @@ __global__ void GenDataToLeftBitVectorKernel0(const int best_split_feature_ref, // values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left, + uint16_t* block_to_left_offset, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition, int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; + uint16_t thread_to_left_offset_cnt = 0; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + thread_to_left_offset_cnt = split_missing_default_to_left; } else if ((bin < min_bin_ref || bin > max_bin_ref)) { if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + thread_to_left_offset_cnt = split_missing_default_to_left; } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; + thread_to_left_offset_cnt = split_default_to_left; } - } else if (bin > th) { - cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; - } else { - cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + } else if (bin <= th) { + thread_to_left_offset_cnt = 1; } - } else { - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; } __syncthreads(); - PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, - split_indices_block_size_data_partition, thread_to_left_offset_cnt); + PrepareOffset(num_data_in_leaf, block_to_left_offset + blockIdx.x * blockDim.x, block_to_left_offset_buffer, block_to_right_offset_buffer, + thread_to_left_offset_cnt); } // min_bin_ref == max_bin_ref @@ -301,56 +291,47 @@ __global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, 
// values from feature const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint8_t* cuda_data_to_left, + uint16_t* block_to_left_offset, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const int split_indices_block_size_data_partition, int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, const int default_leaf_index, const int missing_default_leaf_index) { - __shared__ uint16_t thread_to_left_offset_cnt[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1) / NUM_BANKS_DATA_PARTITION]; + uint16_t thread_to_left_offset_cnt = 0; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + thread_to_left_offset_cnt = split_missing_default_to_left; } else if (bin != max_bin_ref) { if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + thread_to_left_offset_cnt = split_missing_default_to_left; } else { - cuda_data_to_left[local_data_index] = split_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_default_to_left; + thread_to_left_offset_cnt = split_default_to_left; } } else { if (MISSING_IS_NA && !MFB_IS_NA) { - cuda_data_to_left[local_data_index] = split_missing_default_to_left; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = split_missing_default_to_left; + thread_to_left_offset_cnt = split_missing_default_to_left; } else { if (MAX_TO_LEFT) { - cuda_data_to_left[local_data_index] = 1; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 1; + thread_to_left_offset_cnt = 1; } else { - cuda_data_to_left[local_data_index] = 0; - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + thread_to_left_offset_cnt = 0; } } } } else { - thread_to_left_offset_cnt[CONFLICT_FREE_INDEX(threadIdx.x)] = 0; + thread_to_left_offset_cnt = 0; } __syncthreads(); - PrepareOffset(num_data_in_leaf, cuda_data_to_left, block_to_left_offset_buffer, block_to_right_offset_buffer, - split_indices_block_size_data_partition, thread_to_left_offset_cnt); + PrepareOffset(num_data_in_leaf, block_to_left_offset, block_to_left_offset_buffer, block_to_right_offset_buffer, + thread_to_left_offset_cnt); } #define GenBitVector_ARGS \ split_feature_index, num_data_in_leaf, data_indices_in_leaf, \ th, num_features_, \ column_data, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, \ - split_missing_default_to_left, cuda_data_to_left_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, \ - split_indices_block_size_data_partition_aligned, \ + split_missing_default_to_left, cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, \ cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index, 
missing_default_leaf_index template @@ -847,14 +828,11 @@ __global__ void AggregateBlockOffsetKernel0( data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, const data_size_t num_blocks) { - __shared__ uint32_t block_to_left_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + - (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; - __shared__ uint32_t block_to_right_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + - (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; + __shared__ uint32_t shared_mem_buffer[32]; + __shared__ uint32_t to_left_total_count; const data_size_t num_data_in_leaf = cuda_leaf_num_data[left_leaf_index]; const unsigned int blockDim_x = blockDim.x; const unsigned int threadIdx_x = threadIdx.x; - const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); const data_size_t num_blocks_plus_1 = num_blocks + 1; const uint32_t num_blocks_per_thread = (num_blocks_plus_1 + blockDim_x - 1) / blockDim_x; const uint32_t remain = num_blocks_plus_1 - ((num_blocks_per_thread - 1) * blockDim_x); @@ -877,20 +855,24 @@ __global__ void AggregateBlockOffsetKernel0( block_to_right_offset_buffer[block_index] += block_to_right_offset_buffer[block_index - 1]; } __syncthreads(); - if (thread_start_block_index < thread_end_block_index) { - block_to_left_offset[conflict_free_threadIdx_x] = block_to_left_offset_buffer[thread_end_block_index - 1]; - block_to_right_offset[conflict_free_threadIdx_x] = block_to_right_offset_buffer[thread_end_block_index - 1]; + uint32_t block_to_left_offset = 0; + uint32_t block_to_right_offset = 0; + if (thread_start_block_index < thread_end_block_index && thread_start_block_index > 1) { + block_to_left_offset = block_to_left_offset_buffer[thread_start_block_index - 1]; + block_to_right_offset = block_to_right_offset_buffer[thread_start_block_index - 1]; } else { - block_to_left_offset[conflict_free_threadIdx_x] = 0; - block_to_right_offset[conflict_free_threadIdx_x] = 0; + block_to_left_offset = 0; + block_to_right_offset = 0; } + block_to_left_offset = ShufflePrefixSum(block_to_left_offset, shared_mem_buffer); __syncthreads(); - PrefixSumConflictFree(block_to_left_offset, blockDim_x); - PrefixSumConflictFree(block_to_right_offset, blockDim_x); + block_to_right_offset = ShufflePrefixSum(block_to_right_offset, shared_mem_buffer); + if (threadIdx_x == blockDim_x - 1) { + to_left_total_count = block_to_left_offset + block_to_left_offset_buffer[num_blocks]; + } __syncthreads(); - const uint32_t to_left_total_count = block_to_left_offset[CONFLICT_FREE_INDEX(blockDim_x)]; - const uint32_t to_left_thread_block_offset = block_to_left_offset[conflict_free_threadIdx_x]; - const uint32_t to_right_thread_block_offset = block_to_right_offset[conflict_free_threadIdx_x] + to_left_total_count; + const uint32_t to_left_thread_block_offset = block_to_left_offset; + const uint32_t to_right_thread_block_offset = block_to_right_offset + to_left_total_count; for (uint32_t block_index = thread_start_block_index; block_index < thread_end_block_index; ++block_index) { block_to_left_offset_buffer[block_index] += to_left_thread_block_offset; block_to_right_offset_buffer[block_index] += to_right_thread_block_offset; @@ -913,29 +895,26 @@ __global__ void AggregateBlockOffsetKernel1( data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* 
cuda_leaf_num_data, const data_size_t* cuda_data_indices, const data_size_t num_blocks, const data_size_t num_blocks_aligned) { - __shared__ uint32_t block_to_left_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + - (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; - __shared__ uint32_t block_to_right_offset[AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2 + - (AGGREGATE_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; + __shared__ uint32_t shared_mem_buffer[32]; + __shared__ uint32_t to_left_total_count; const data_size_t num_data_in_leaf = cuda_leaf_num_data[left_leaf_index]; const unsigned int threadIdx_x = threadIdx.x; - const unsigned int conflict_free_threadIdx_x = CONFLICT_FREE_INDEX(threadIdx_x); - const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); + uint32_t block_to_left_offset = 0; + uint32_t block_to_right_offset = 0; if (threadIdx_x < static_cast(num_blocks)) { - block_to_left_offset[conflict_free_threadIdx_x] = block_to_left_offset_buffer[threadIdx_x + 1]; - block_to_right_offset[conflict_free_threadIdx_x] = block_to_right_offset_buffer[threadIdx_x + 1]; - } else { - block_to_left_offset[conflict_free_threadIdx_x] = 0; - block_to_right_offset[conflict_free_threadIdx_x] = 0; + block_to_left_offset = block_to_left_offset_buffer[threadIdx_x + 1]; + block_to_right_offset = block_to_right_offset_buffer[threadIdx_x + 1]; } + block_to_left_offset = ShufflePrefixSum(block_to_left_offset, shared_mem_buffer); __syncthreads(); - PrefixSumConflictFree(block_to_left_offset, num_blocks_aligned); - PrefixSumConflictFree(block_to_right_offset, num_blocks_aligned); + block_to_right_offset = ShufflePrefixSum(block_to_right_offset, shared_mem_buffer); + if (threadIdx.x == blockDim.x - 1) { + to_left_total_count = block_to_left_offset; + } __syncthreads(); - const uint32_t to_left_total_count = block_to_left_offset[CONFLICT_FREE_INDEX(num_blocks_aligned)]; if (threadIdx_x < static_cast(num_blocks)) { - block_to_left_offset_buffer[threadIdx_x + 1] = block_to_left_offset[conflict_free_threadIdx_x_plus_1]; - block_to_right_offset_buffer[threadIdx_x + 1] = block_to_right_offset[conflict_free_threadIdx_x_plus_1] + to_left_total_count; + block_to_left_offset_buffer[threadIdx_x + 1] = block_to_left_offset; + block_to_right_offset_buffer[threadIdx_x + 1] = block_to_right_offset + to_left_total_count; } if (threadIdx_x == 0) { block_to_right_offset_buffer[0] = to_left_total_count; @@ -1075,68 +1054,28 @@ __global__ void SplitTreeStructureKernel(const int left_leaf_index, __global__ void SplitInnerKernel(const int left_leaf_index, const int right_leaf_index, const data_size_t* cuda_leaf_data_start, const data_size_t* cuda_leaf_num_data, - const data_size_t* cuda_data_indices, const uint8_t* split_to_left_bit_vector, + const data_size_t* cuda_data_indices, const data_size_t* block_to_left_offset_buffer, const data_size_t* block_to_right_offset_buffer, - data_size_t* out_data_indices_in_leaf, const int split_indices_block_size_data_partition) { - __shared__ uint16_t thread_to_left_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 1 + - (SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION + 2) / NUM_BANKS_DATA_PARTITION]; - __shared__ uint16_t thread_to_right_pos[SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION]; - uint8_t first_to_left = 0; - uint8_t second_to_left = 0; + const uint16_t* block_to_left_offset, data_size_t* out_data_indices_in_leaf) { const data_size_t leaf_num_data_offset = cuda_leaf_data_start[left_leaf_index]; const data_size_t num_data_in_leaf_ref = 
cuda_leaf_num_data[left_leaf_index] + cuda_leaf_num_data[right_leaf_index]; const unsigned int threadIdx_x = threadIdx.x; const unsigned int blockDim_x = blockDim.x; - const unsigned int conflict_free_threadIdx_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + 1); - const unsigned int global_thread_index = blockIdx.x * blockDim_x * 2 + threadIdx_x; + const unsigned int global_thread_index = blockIdx.x * blockDim_x + threadIdx_x; const data_size_t* cuda_data_indices_in_leaf = cuda_data_indices + leaf_num_data_offset; - if (global_thread_index < num_data_in_leaf_ref) { - const uint8_t bit = split_to_left_bit_vector[global_thread_index]; - first_to_left = bit; - thread_to_left_pos[conflict_free_threadIdx_x_plus_1] = bit; - } else { - first_to_left = 0; - thread_to_left_pos[conflict_free_threadIdx_x_plus_1] = 0; - } - const unsigned int conflict_free_threadIdx_x_plus_blockDim_x_plus_1 = CONFLICT_FREE_INDEX(threadIdx_x + blockDim_x + 1); - const unsigned int global_thread_index_plus_blockDim_x = global_thread_index + blockDim_x; - if (global_thread_index_plus_blockDim_x < num_data_in_leaf_ref) { - const uint8_t bit = split_to_left_bit_vector[global_thread_index_plus_blockDim_x]; - second_to_left = bit; - thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] = bit; - } else { - second_to_left = 0; - thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1] = 0; - } - __syncthreads(); + const uint16_t* block_to_left_offset_ptr = block_to_left_offset + blockIdx.x * blockDim_x; const uint32_t to_right_block_offset = block_to_right_offset_buffer[blockIdx.x]; const uint32_t to_left_block_offset = block_to_left_offset_buffer[blockIdx.x]; - if (threadIdx_x == 0) { - thread_to_left_pos[0] = 0; - thread_to_right_pos[0] = 0; - } - __syncthreads(); - PrefixSumConflictFree(thread_to_left_pos, split_indices_block_size_data_partition); - __syncthreads(); - if (threadIdx_x > 0) { - thread_to_right_pos[threadIdx_x] = (threadIdx_x - thread_to_left_pos[conflict_free_threadIdx_x_plus_1]); - } - thread_to_right_pos[threadIdx_x + blockDim_x] = (threadIdx_x + blockDim_x - thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]); - __syncthreads(); data_size_t* left_out_data_indices_in_leaf = out_data_indices_in_leaf + to_left_block_offset; data_size_t* right_out_data_indices_in_leaf = out_data_indices_in_leaf + to_right_block_offset; - if (global_thread_index < num_data_in_leaf_ref) { - if (first_to_left == 1) { - left_out_data_indices_in_leaf[thread_to_left_pos[conflict_free_threadIdx_x_plus_1]] = cuda_data_indices_in_leaf[global_thread_index]; - } else { - right_out_data_indices_in_leaf[thread_to_right_pos[threadIdx_x]] = cuda_data_indices_in_leaf[global_thread_index]; - } - } - if (global_thread_index_plus_blockDim_x < num_data_in_leaf_ref) { - if (second_to_left == 1) { - left_out_data_indices_in_leaf[thread_to_left_pos[conflict_free_threadIdx_x_plus_blockDim_x_plus_1]] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x]; + if (static_cast(global_thread_index) < num_data_in_leaf_ref) { + const uint32_t thread_to_left_offset = (threadIdx_x == 0 ? 
0 : block_to_left_offset_ptr[threadIdx_x - 1]); + const bool to_left = block_to_left_offset_ptr[threadIdx_x] > thread_to_left_offset; + if (to_left) { + left_out_data_indices_in_leaf[thread_to_left_offset] = cuda_data_indices_in_leaf[global_thread_index]; } else { - right_out_data_indices_in_leaf[thread_to_right_pos[threadIdx_x + blockDim_x]] = cuda_data_indices_in_leaf[global_thread_index_plus_blockDim_x]; + const uint32_t thread_to_right_offset = threadIdx.x - thread_to_left_offset; + right_out_data_indices_in_leaf[thread_to_right_offset] = cuda_data_indices_in_leaf[global_thread_index]; } } } @@ -1204,10 +1143,10 @@ void CUDADataPartition::LaunchSplitInnerKernel( global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); global_timer.Start("CUDADataPartition::SplitInnerKernel"); - SplitInnerKernel<<>>( - left_leaf_index, right_leaf_index, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_data_to_left_, - cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, - cuda_out_data_indices_in_leaf_, split_indices_block_size_data_partition_aligned); + SplitInnerKernel<<>>( + left_leaf_index, right_leaf_index, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, + cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_block_to_left_offset_, + cuda_out_data_indices_in_leaf_); global_timer.Stop("CUDADataPartition::SplitInnerKernel"); global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); @@ -1229,8 +1168,8 @@ void CUDADataPartition::LaunchSplitInnerKernel( const double* cpu_sum_hessians_info = reinterpret_cast(cpu_split_info_buffer.data() + 8); global_timer.Start("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); CopyFromCUDADeviceToHostAsyncOuter(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 12, cuda_streams_[0], __FILE__, __LINE__); - global_timer.Stop("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + global_timer.Stop("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); const data_size_t left_leaf_num_data = cpu_split_info_buffer[1]; const data_size_t left_leaf_data_start = cpu_split_info_buffer[2]; const data_size_t right_leaf_num_data = cpu_split_info_buffer[4]; diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 4551e8714c31..8a0f1713dfe5 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -258,8 +258,7 @@ class CUDADataPartition { double* cuda_leaf_output_; // split data algorithm related - /*! \brief marks whether each data goes to left or right, 1 for left, and 0 for right */ - uint8_t* cuda_data_to_left_; + uint16_t* cuda_block_to_left_offset_; /*! \brief maps data index to leaf index, for adding scores to training data set */ int* cuda_data_index_to_leaf_index_; /*! 
\brief prefix sum of number of data going to left in all blocks */ diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 7e58325127ef..79739ec035b3 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -281,10 +281,9 @@ __global__ void FixHistogramKernel( const int* cuda_need_fix_histogram_features, const uint32_t* cuda_need_fix_histogram_features_num_bin_aligned, const CUDALeafSplitsStruct* cuda_smaller_leaf_splits) { + __shared__ hist_t shared_mem_buffer[32]; const unsigned int blockIdx_x = blockIdx.x; const int feature_index = cuda_need_fix_histogram_features[blockIdx_x]; - __shared__ double hist_gradients[FIX_HISTOGRAM_SHARED_MEM_SIZE + 1]; - __shared__ double hist_hessians[FIX_HISTOGRAM_SHARED_MEM_SIZE + 1]; const uint32_t num_bin_aligned = cuda_need_fix_histogram_features_num_bin_aligned[blockIdx_x]; const uint32_t feature_hist_offset = cuda_feature_hist_offsets[feature_index]; const uint32_t most_freq_bin = cuda_feature_most_freq_bins[feature_index]; @@ -294,25 +293,13 @@ __global__ void FixHistogramKernel( const unsigned int threadIdx_x = threadIdx.x; const uint32_t num_bin = cuda_feature_num_bins[feature_index]; const uint32_t hist_pos = threadIdx_x << 1; - if (threadIdx_x < num_bin) { - if (threadIdx_x == most_freq_bin) { - hist_gradients[threadIdx_x] = 0.0f; - hist_hessians[threadIdx_x] = 0.0f; - } else { - hist_gradients[threadIdx_x] = feature_hist[hist_pos]; - hist_hessians[threadIdx_x] = feature_hist[hist_pos + 1]; - } - } else { - hist_gradients[threadIdx_x] = 0.0f; - hist_hessians[threadIdx_x] = 0.0f; - } - __syncthreads(); - ReduceSum(hist_gradients, num_bin_aligned); - ReduceSum(hist_hessians, num_bin_aligned); - __syncthreads(); - if (threadIdx_x == most_freq_bin) { - feature_hist[hist_pos] = leaf_sum_gradients - hist_gradients[0]; - feature_hist[hist_pos + 1] = leaf_sum_hessians - hist_hessians[0]; + const hist_t bin_gradient = (threadIdx_x < num_bin && threadIdx_x != most_freq_bin) ? feature_hist[hist_pos] : 0.0f; + const hist_t bin_hessian = (threadIdx_x < num_bin && threadIdx_x != most_freq_bin) ? feature_hist[hist_pos + 1] : 0.0f; + const hist_t sum_gradient = ShuffleReduceSum(bin_gradient, shared_mem_buffer, num_bin_aligned); + const hist_t sum_hessian = ShuffleReduceSum(bin_hessian, shared_mem_buffer, num_bin_aligned); + if (threadIdx_x == 0) { + feature_hist[most_freq_bin << 1] = leaf_sum_gradients - sum_gradient; + feature_hist[(most_freq_bin << 1) + 1] = leaf_sum_hessians - sum_hessian; } } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 25a0f3ff686a..2719cf77ece2 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -106,7 +106,7 @@ class CUDAHistogramConstructor { /*! \brief aligned number of bins of the features whose histograms need to be fixed */ std::vector need_fix_histogram_features_num_bin_aligend_; /*! 
\brief minimum number of blocks allowed in the y dimension */ - const int min_grid_dim_y_ = 10; + const int min_grid_dim_y_ = 160; // CUDA memory, held by this object diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 6335ba98a519..7f721501b1ce 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -69,17 +69,6 @@ void NewCUDATreeLearner::BeforeTrain() { larger_leaf_index_ = -1; } -void NewCUDATreeLearner::FindBestSplits(const Tree* /*tree*/) {} - -void NewCUDATreeLearner::ConstructHistograms(const std::vector& /*is_feature_used*/, - bool /*use_subtract*/) {} - -void NewCUDATreeLearner::FindBestSplitsFromHistograms(const std::vector& /*is_feature_used*/, - bool /*use_subtract*/, const Tree* /*tree*/) {} - -void NewCUDATreeLearner::Split(Tree* /*tree*/, int /*best_leaf*/, - int* /*left_leaf*/, int* /*right_leaf*/) {} - void NewCUDATreeLearner::AddPredictionToScore(const Tree* tree, double* out_score) const { CHECK(tree->is_cuda_tree()); const CUDATree* cuda_tree = reinterpret_cast(tree); @@ -205,21 +194,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, SynchronizeCUDADeviceOuter(__FILE__, __LINE__); const auto end = std::chrono::steady_clock::now(); const double duration = (static_cast>(end - start)).count(); - /*Log::Warning("Train time %f", duration); - Log::Warning("before train time %f", static_cast>(before_train_end - before_train_start).count()); - Log::Warning("construct histogram time %f", construct_histogram_time); - Log::Warning("find best split time %f", find_best_split_time); - Log::Warning("find best split time from all leaves %f", find_best_split_from_all_leaves_time); - Log::Warning("split data indices time %f", split_data_indices_time);*/ tree->ToHost(); - double max_abs_leaf_output = 0.0f; - for (int leaf_index = 0; leaf_index < tree->num_leaves(); ++leaf_index) { - //Log::Warning("leaf_index %d leaf_value %f", leaf_index, tree->LeafOutput(leaf_index)); - if (std::fabs(tree->LeafOutput(leaf_index)) > std::fabs(max_abs_leaf_output)) { - max_abs_leaf_output = tree->LeafOutput(leaf_index); - } - } - Log::Warning("max_abs_leaf_output = %f", max_abs_leaf_output); return tree.release(); } diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index 4ee40ffdb668..d74ec9b33020 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -37,14 +37,6 @@ class NewCUDATreeLearner: public SerialTreeLearner { const double* score, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override; protected: - void FindBestSplits(const Tree* tree) override; - - void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; - - void FindBestSplitsFromHistograms(const std::vector& is_feature_used, bool use_subtract, const Tree* tree) override; - - void Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) override; - void BeforeTrain() override; // number of GPUs diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 28be19d1b714..5fd872d61eeb 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -205,14 +205,6 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians } Log::Debug("Trained a tree with leaves = %d and depth = %d", tree->num_leaves(), cur_depth); - 
double max_abs_leaf_output = 0.0f; - for (int leaf_index = 0; leaf_index < tree->num_leaves(); ++leaf_index) { - Log::Warning("leaf_index %d leaf_value %f", leaf_index, tree->LeafOutput(leaf_index)); - if (std::fabs(tree->LeafOutput(leaf_index)) > std::fabs(max_abs_leaf_output)) { - max_abs_leaf_output = tree->LeafOutput(leaf_index); - } - } - Log::Warning("max_abs_leaf_output = %f", max_abs_leaf_output); return tree.release(); } From 883ed157785057c0251f3dc4b7f7e6574b201687 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 13 Sep 2021 15:24:56 +0000 Subject: [PATCH 071/166] clean up cuda_algorithms.hpp --- include/LightGBM/cuda/cuda_algorithms.hpp | 106 ---------------------- src/cuda/cuda_algorithms.cu | 16 ---- 2 files changed, 122 deletions(-) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index b006dcc10270..5c2ff2cddd4e 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -29,80 +29,6 @@ namespace LightGBM { -template -__device__ void PrefixSum(T* values, size_t n) { - unsigned int offset = 1; - unsigned int threadIdx_x = static_cast(threadIdx.x); - const T last_element = values[n - 1]; - __syncthreads(); - for (int d = (n >> 1); d > 0; d >>= 1) { - if (threadIdx_x < d) { - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - values[dst_pos] += values[src_pos]; - } - offset <<= 1; - __syncthreads(); - } - if (threadIdx_x == 0) { - values[n - 1] = 0; - } - __syncthreads(); - for (int d = 1; d < n; d <<= 1) { - offset >>= 1; - if (threadIdx_x < d) { - const unsigned int dst_pos = offset * (2 * threadIdx_x + 2) - 1; - const unsigned int src_pos = offset * (2 * threadIdx_x + 1) - 1; - const T src_val = values[src_pos]; - values[src_pos] = values[dst_pos]; - values[dst_pos] += src_val; - } - __syncthreads(); - } - if (threadIdx.x == 0) { - values[n] = values[n - 1] + last_element; - } - __syncthreads(); -} - -template -__device__ __forceinline__ void PrefixSumConflictFree(T* values, size_t n) { - size_t offset = 1; - unsigned int threadIdx_x = threadIdx.x; - const size_t conflict_free_n_minus_1 = CONFLICT_FREE_INDEX(n - 1); - const T last_element = values[conflict_free_n_minus_1]; - __syncthreads(); - for (int d = (n >> 1); d > 0; d >>= 1) { - if (threadIdx_x < d) { - const size_t src_pos = offset * (2 * threadIdx_x + 1) - 1; - const size_t dst_pos = offset * (2 * threadIdx_x + 2) - 1; - values[CONFLICT_FREE_INDEX(dst_pos)] += values[CONFLICT_FREE_INDEX(src_pos)]; - } - offset <<= 1; - __syncthreads(); - } - if (threadIdx_x == 0) { - values[conflict_free_n_minus_1] = 0; - } - __syncthreads(); - for (int d = 1; d < n; d <<= 1) { - offset >>= 1; - if (threadIdx_x < d) { - const size_t dst_pos = offset * (2 * threadIdx_x + 2) - 1; - const size_t src_pos = offset * (2 * threadIdx_x + 1) - 1; - const size_t conflict_free_dst_pos = CONFLICT_FREE_INDEX(dst_pos); - const size_t conflict_free_src_pos = CONFLICT_FREE_INDEX(src_pos); - const T src_val = values[conflict_free_src_pos]; - values[conflict_free_src_pos] = values[conflict_free_dst_pos]; - values[conflict_free_dst_pos] += src_val; - } - __syncthreads(); - } - if (threadIdx_x == 0) { - values[CONFLICT_FREE_INDEX(n)] = values[conflict_free_n_minus_1] + last_element; - } -} - template __device__ __forceinline__ T ShufflePrefixSum(T value, T* shared_mem_buffer) { const uint32_t mask = 0xffffffff; @@ -165,38 +91,6 @@ __device__ __forceinline__ T 
ShuffleReduceSum(T value, T* shared_mem_buffer, con return value; } -template -__device__ __forceinline__ T ShuffleReduceMaxWarp(T value, const data_size_t len) { - if (len > 0) { - // TODO(shiyu1994): check how mask works - const uint32_t mask = 0xffffffff; - for (int offset = warpSize / 2; offset > 0; offset >>= 1) { - const T other_value = __shfl_down_sync(mask, value, offset); - value = (other_value > value) ? other_value : value; - } - } - return value; -} - -// reduce values from an 1-dimensional block (block size must be no greather than 1024) -template -__device__ __forceinline__ T ShuffleReduceMax(T value, T* shared_mem_buffer, const size_t len) { - const uint32_t warpLane = threadIdx.x % warpSize; - const uint32_t warpID = threadIdx.x / warpSize; - const data_size_t warp_len = min(static_cast(warpSize), static_cast(len) - static_cast(warpID * warpSize)); - value = ShuffleReduceMaxWarp(value, warp_len); - if (warpLane == 0) { - shared_mem_buffer[warpID] = value; - } - __syncthreads(); - const data_size_t num_warp = static_cast((len + warpSize - 1) / warpSize); - if (warpID == 0) { - value = (warpLane < num_warp ? shared_mem_buffer[warpLane] : shared_mem_buffer[0]); - value = ShuffleReduceMaxWarp(value, num_warp); - } - return value; -} - } // namespace LightGBM #endif // USE_CUDA diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index cbb7e89cbd20..7b168c10fa17 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -7,20 +7,4 @@ namespace LightGBM { -template -__global__ void BlockReduceMax(T* block_buffer, const data_size_t num_blocks) { - __shared__ T shared_buffer[32]; - T thread_max = 0; - for (data_size_t block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { - const T value = block_buffer[block_index]; - if (value > thread_max) { - thread_max = value; - } - } - thread_max = ShuffleReduceMax(thread_max, shared_buffer, blockDim.x); - if (threadIdx.x == 0) { - block_buffer[0] = thread_max; - } -} - } // namespace LightGBM From e7ffc3fea222ef1d905e688fd3ba1987380f826a Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 15 Sep 2021 05:17:02 +0000 Subject: [PATCH 072/166] add copy subset on CUDA --- src/boosting/cuda/gbdt.cu | 33 ++ src/boosting/gbdt.cpp | 17 +- src/boosting/gbdt.h | 8 + .../cuda/cuda_best_split_finder.cu | 54 +-- .../cuda/cuda_best_split_finder.hpp | 2 +- src/treelearner/cuda/cuda_data_partition.cpp | 35 +- src/treelearner/cuda/cuda_data_partition.cu | 362 ++++++++---------- src/treelearner/cuda/cuda_data_partition.hpp | 24 +- .../cuda/new_cuda_tree_learner.cpp | 2 - 9 files changed, 248 insertions(+), 289 deletions(-) create mode 100644 src/boosting/cuda/gbdt.cu diff --git a/src/boosting/cuda/gbdt.cu b/src/boosting/cuda/gbdt.cu new file mode 100644 index 000000000000..9a029f75d941 --- /dev/null +++ b/src/boosting/cuda/gbdt.cu @@ -0,0 +1,33 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#include "../gbdt.h" + +#define COPY_SUBSAMPLE_GRADIENTS_BLOCK_SIZE (1024) + +namespace LightGBM { + +__global__ void CopySubsampleGradientsKernel( + score_t* dst_grad, score_t* dst_hess, + const score_t* src_grad, const score_t* src_hess, + const data_size_t* bag_data_indices, + const data_size_t bag_data_cnt) { + const data_size_t local_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (local_data_index < bag_data_cnt) { + const data_size_t global_data_index = bag_data_indices[local_data_index]; + dst_grad[local_data_index] = src_grad[global_data_index]; + dst_hess[local_data_index] = src_hess[global_data_index]; + } +} + +void GBDT::LaunchCopySubsampleGradientsKernel( +score_t* dst_grad, score_t* dst_hess, +const score_t* src_grad, const score_t* src_hess) { + const int num_blocks = (bag_data_cnt_ + COPY_SUBSAMPLE_GRADIENTS_BLOCK_SIZE - 1) / COPY_SUBSAMPLE_GRADIENTS_BLOCK_SIZE; + CopySubsampleGradientsKernel<<>>( + dst_grad, dst_hess, src_grad, src_hess, bag_data_indices_.data(), bag_data_cnt_); +} + +} // namespace LightGBM diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 4e3dec289bac..a83decb550a3 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -416,10 +416,13 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto hess = hessians + offset; // need to copy gradients for bagging subset. if (is_use_subset_ && bag_data_cnt_ < num_data_) { - for (int i = 0; i < bag_data_cnt_; ++i) { - // TODO(shiyu1994): bagging is not supported, the copy operation should be done in GPU - gradients_pointer_[offset + i] = grad[bag_data_indices_[i]]; - gradients_pointer_[offset + i] = hess[bag_data_indices_[i]]; + if (config_->device_type == std::string("cuda")) { + CopySubsampleGradientsCUDA(gradients_pointer_ + offset, hessians_pointer_ + offset, grad, hess); + } else { + for (int i = 0; i < bag_data_cnt_; ++i) { + gradients_pointer_[offset + i] = grad[bag_data_indices_[i]]; + gradients_pointer_[offset + i] = hess[bag_data_indices_[i]]; + } } grad = gradients_pointer_ + offset; hess = hessians_pointer_ + offset; @@ -890,4 +893,10 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { } } +void GBDT::CopySubsampleGradientsCUDA( +score_t* dst_grad, score_t* dst_hess, +const score_t* src_grad, const score_t* src_hess) { + LaunchCopySubsampleGradientsKernel(dst_grad, dst_hess, src_grad, src_hess); +} + } // namespace LightGBM diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 1ed115caf201..c707d67b8413 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -465,6 +465,14 @@ class GBDT : public GBDTBase { double BoostFromAverage(int class_id, bool update_scorer); + void CopySubsampleGradientsCUDA( + score_t* dst_grad, score_t* dst_hess, + const score_t* src_grad, const score_t* src_hess); + + void LaunchCopySubsampleGradientsKernel( + score_t* dst_grad, score_t* dst_hess, + const score_t* src_grad, const score_t* src_hess); + /*! \brief current iteration */ int iter_; /*! 
\brief Pointer to training data */ diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index e7c8e959de8a..55e309eb0a5c 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -228,9 +228,6 @@ __device__ void FindBestSplitsForLeafKernelInner( const unsigned int bin_offset = threadIdx_x << 1; local_grad_hist = feature_hist_ptr[bin_offset]; local_hess_hist = feature_hist_ptr[bin_offset + 1]; - } else { - local_grad_hist = 0.0f; - local_hess_hist = 0.0f; } } else { if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { @@ -238,9 +235,6 @@ __device__ void FindBestSplitsForLeafKernelInner( const unsigned int bin_offset = read_index << 1; local_grad_hist = feature_hist_ptr[bin_offset]; local_hess_hist = feature_hist_ptr[bin_offset + 1]; - } else { - local_grad_hist = 0.0f; - local_hess_hist = 0.0f; } } __syncthreads(); @@ -266,21 +260,15 @@ __device__ void FindBestSplitsForLeafKernelInner( sum_right_hessian, lambda_l1, use_l1, lambda_l2); // gain with split is worse than without split - if (current_gain <= min_gain_shift) { - threshold_found = false; - } else { + if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; threshold_value = static_cast(feature_num_bin - 2 - threadIdx_x); threshold_found = true; } - } else { - threshold_found = false; } - } else { - threshold_found = true; } } else { - if (threadIdx_x <= feature_num_bin_minus_offset - 2 && !skip_sum) { + if (threadIdx_x <= feature_num_bin_minus_offset - 2/* && !skip_sum*/) { const double sum_left_gradient = local_grad_hist; const double sum_left_hessian = local_hess_hist; const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); @@ -294,18 +282,12 @@ __device__ void FindBestSplitsForLeafKernelInner( sum_right_hessian, lambda_l1, use_l1, lambda_l2); // gain with split is worse than without split - if (current_gain <= min_gain_shift) { - threshold_found = false; - } else { + if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; threshold_value = static_cast(threadIdx_x + feature_mfb_offset); threshold_found = true; } - } else { - threshold_found = false; } - } else { - threshold_found = false; } } __syncthreads(); @@ -541,36 +523,38 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i const int num_blocks_per_leaf, const bool larger_only, const int num_leaves) { - + __shared__ double shared_gain_buffer[32]; + __shared__ bool shared_found_buffer[32]; + __shared__ uint32_t shared_thread_index_buffer[32]; const uint32_t threadIdx_x = threadIdx.x; const uint32_t blockIdx_x = blockIdx.x; - __shared__ bool best_found[NUM_TASKS_PER_SYNC_BLOCK]; - __shared__ double best_gain[NUM_TASKS_PER_SYNC_BLOCK]; - __shared__ uint32_t shared_read_index[NUM_TASKS_PER_SYNC_BLOCK]; + bool best_found = false; + double best_gain = kMinScore; + uint32_t shared_read_index = 0; const bool is_smaller = (blockIdx_x < static_cast(num_blocks_per_leaf) && !larger_only); const uint32_t leaf_block_index = (is_smaller || larger_only) ? blockIdx_x : (blockIdx_x - static_cast(num_blocks_per_leaf)); const int task_index = static_cast(leaf_block_index * blockDim.x + threadIdx_x); const uint32_t read_index = is_smaller ? 
static_cast(task_index) : static_cast(task_index + num_tasks); if (task_index < num_tasks) { - best_found[threadIdx_x] = cuda_best_split_info[read_index].is_valid; - best_gain[threadIdx_x] = cuda_best_split_info[read_index].gain; - shared_read_index[threadIdx_x] = read_index; + best_found = cuda_best_split_info[read_index].is_valid; + best_gain = cuda_best_split_info[read_index].gain; + shared_read_index = read_index; } else { - best_found[threadIdx_x] = false; + best_found = false; } __syncthreads(); - ReduceBestSplit(best_found, best_gain, shared_read_index, NUM_TASKS_PER_SYNC_BLOCK); + const uint32_t best_read_index = ReduceBestGain(best_gain, best_found, shared_read_index, + shared_gain_buffer, shared_found_buffer, shared_thread_index_buffer); if (threadIdx.x == 0) { const int leaf_index_ref = is_smaller ? smaller_leaf_index : larger_leaf_index; const unsigned buffer_write_pos = static_cast(leaf_index_ref) + leaf_block_index * num_leaves; - const uint32_t best_read_index = shared_read_index[0]; CUDASplitInfo* cuda_split_info = cuda_leaf_best_split_info + buffer_write_pos; const CUDASplitInfo* best_split_info = cuda_best_split_info + best_read_index; - if (best_found[0]) { - cuda_split_info->gain = best_gain[0]; + if (best_split_info->is_valid) { + /*cuda_split_info->gain = best_split_info->gain; cuda_split_info->inner_feature_index = is_smaller ? cuda_task_feature_index[best_read_index] : cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; cuda_split_info->default_left = best_split_info->default_left; @@ -585,6 +569,10 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i cuda_split_info->right_count = best_split_info->right_count; cuda_split_info->right_gain = best_split_info->right_gain; cuda_split_info->right_value = best_split_info->right_value; + cuda_split_info->is_valid = true;*/ + *cuda_split_info = *best_split_info; + cuda_split_info->inner_feature_index = is_smaller ? 
cuda_task_feature_index[best_read_index] : + cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; cuda_split_info->is_valid = true; } else { cuda_split_info->gain = kMinScore; diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index b5f6a052c46a..6411e24eeb18 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -19,7 +19,7 @@ #define MAX_NUM_BIN_IN_FEATURE (256) #define NUM_THREADS_FIND_BEST_LEAF (256) -#define NUM_TASKS_PER_SYNC_BLOCK (1024) +#define NUM_TASKS_PER_SYNC_BLOCK (32) namespace LightGBM { diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 095c1cb2b879..36650626a9a7 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -23,8 +23,8 @@ CUDADataPartition::CUDADataPartition( num_leaves_(num_leaves), num_threads_(num_threads), cuda_hist_(cuda_hist) { - max_num_split_indices_blocks_ = (num_data_ + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / - SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION; + CalcBlockDim(num_data_); + max_num_split_indices_blocks_ = grid_dim_; cur_num_leaves_ = 1; bin_upper_bounds_.resize(num_features_); feature_num_bins_.resize(num_features_); @@ -125,9 +125,8 @@ void CUDADataPartition::Split( data_size_t* right_leaf_start, double* left_leaf_sum_of_hessians, double* right_leaf_sum_of_hessians) { + CalcBlockDim(num_data_in_leaf); global_timer.Start("GenDataToLeftBitVector"); - global_timer.Start("SplitInner Copy CUDA To Host"); - global_timer.Stop("SplitInner Copy CUDA To Host"); GenDataToLeftBitVector(num_data_in_leaf, leaf_best_split_feature, leaf_best_split_threshold, @@ -204,29 +203,7 @@ void CUDADataPartition::UpdateTrainScore(const double* leaf_value, double* cuda_ LaunchAddPredictionToScoreKernel(leaf_value, cuda_scores); } -void CUDADataPartition::CalcBlockDim(const data_size_t num_data_in_leaf, - int* grid_dim, - int* block_dim) { - const int num_threads_per_block = SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION; - const int min_grid_dim = num_data_in_leaf <= 100 ? 1 : 10; - const int num_data_per_block = (num_threads_per_block * 8); - const int num_blocks = std::max(min_grid_dim, (num_data_in_leaf + num_data_per_block - 1) / num_data_per_block); - const int num_threads_per_block_final = (num_data_in_leaf + (num_blocks * 8) - 1) / (num_blocks * 8); - int num_threads_per_block_final_ref = num_threads_per_block_final - 1; - CHECK_GT(num_threads_per_block_final_ref, 0); - int num_threads_per_block_final_aligned = 1; - while (num_threads_per_block_final_ref > 0) { - num_threads_per_block_final_aligned <<= 1; - num_threads_per_block_final_ref >>= 1; - } - const int num_blocks_final = (num_data_in_leaf + (num_threads_per_block_final_aligned * 8) - 1) / (num_threads_per_block_final_aligned * 8); - *grid_dim = num_blocks_final; - *block_dim = num_threads_per_block_final_aligned; -} - -void CUDADataPartition::CalcBlockDimInCopy(const data_size_t num_data_in_leaf, - int* grid_dim, - int* block_dim) { +void CUDADataPartition::CalcBlockDim(const data_size_t num_data_in_leaf) { const int min_num_blocks = num_data_in_leaf <= 100 ? 
1 : 80; const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; @@ -237,8 +214,8 @@ void CUDADataPartition::CalcBlockDimInCopy(const data_size_t num_data_in_leaf, split_indices_block_size_data_partition >>= 1; } const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; - *grid_dim = num_blocks_final; - *block_dim = split_indices_block_size_data_partition_aligned; + grid_dim_ = num_blocks_final; + block_dim_ = split_indices_block_size_data_partition_aligned; } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 472a00ac7566..7e427b3b2d89 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -27,10 +27,9 @@ void CUDADataPartition::LaunchFillDataIndicesBeforeTrain() { FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_, cuda_data_index_to_leaf_index_); } -__device__ void PrepareOffset(const data_size_t num_data_in_leaf_ref, uint16_t* block_to_left_offset, +__device__ __forceinline__ void PrepareOffset(const data_size_t num_data_in_leaf_ref, uint16_t* block_to_left_offset, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - const uint16_t thread_to_left_offset_cnt) { - __shared__ uint16_t shared_mem_buffer[32]; + const uint16_t thread_to_left_offset_cnt, uint16_t* shared_mem_buffer) { const unsigned int threadIdx_x = threadIdx.x; const unsigned int blockDim_x = blockDim.x; const uint16_t thread_to_left_offset = ShufflePrefixSum(thread_to_left_offset_cnt, shared_mem_buffer); @@ -111,139 +110,138 @@ void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel( const uint32_t t_zero_bin, const uint32_t max_bin_ref, const uint32_t min_bin_ref, int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, const int default_leaf_index, const int missing_default_leaf_index, - const bool missing_is_zero, const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, const bool max_to_left, - const int num_blocks, const int block_size) { + const bool missing_is_zero, const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, const bool max_to_left) { if (min_bin_ref < max_bin_ref) { if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && 
mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if 
(missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } } else { if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + 
UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - 
UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && 
max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); } } } @@ -260,6 +258,7 @@ __global__ void GenDataToLeftBitVectorKernel0(const int best_split_feature_ref, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, const int default_leaf_index, const int missing_default_leaf_index) { + __shared__ uint16_t shared_mem_buffer[32]; uint16_t thread_to_left_offset_cnt = 0; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -280,7 +279,7 @@ __global__ void GenDataToLeftBitVectorKernel0(const int best_split_feature_ref, } __syncthreads(); PrepareOffset(num_data_in_leaf, block_to_left_offset + blockIdx.x * blockDim.x, block_to_left_offset_buffer, block_to_right_offset_buffer, - thread_to_left_offset_cnt); + thread_to_left_offset_cnt, shared_mem_buffer); } // min_bin_ref == max_bin_ref @@ -295,6 +294,7 @@ __global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, const int default_leaf_index, const int missing_default_leaf_index) { + __shared__ uint16_t shared_mem_buffer[32]; uint16_t thread_to_left_offset_cnt = 0; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { @@ -311,20 +311,14 @@ __global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, } else { if (MISSING_IS_NA && !MFB_IS_NA) { thread_to_left_offset_cnt = split_missing_default_to_left; - } else { - if (MAX_TO_LEFT) { - thread_to_left_offset_cnt = 1; - } else { - thread_to_left_offset_cnt = 0; - } + } else if (MAX_TO_LEFT) { + thread_to_left_offset_cnt = 1; } } - } else { - thread_to_left_offset_cnt = 0; } __syncthreads(); - PrepareOffset(num_data_in_leaf, block_to_left_offset, block_to_left_offset_buffer, block_to_right_offset_buffer, - thread_to_left_offset_cnt); + PrepareOffset(num_data_in_leaf, block_to_left_offset + blockIdx.x * blockDim.x, block_to_left_offset_buffer, block_to_right_offset_buffer, + thread_to_left_offset_cnt, shared_mem_buffer); } #define GenBitVector_ARGS \ @@ -342,8 +336,6 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( const bool mfb_is_na, const bool max_bin_to_left, const int column_index, - const int num_blocks_final, - const int split_indices_block_size_data_partition_aligned, const int split_feature_index, const data_size_t leaf_data_start, const data_size_t num_data_in_leaf, @@ -362,100 +354,100 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( const data_size_t* data_indices_in_leaf = cuda_data_indices_ + leaf_data_start; if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if 
(!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - 
GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && 
max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); } } @@ -466,8 +458,6 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( const bool mfb_is_zero, const bool mfb_is_na, const int column_index, - const int num_blocks_final, - const int split_indices_block_size_data_partition_aligned, const int split_feature_index, const data_size_t leaf_data_start, const data_size_t num_data_in_leaf, @@ -486,100 +476,100 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( const data_size_t* data_indices_in_leaf = cuda_data_indices_ + leaf_data_start; if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - 
GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { const 
BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); } } @@ -589,15 +579,6 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num const int split_feature_index, const uint32_t split_threshold, const uint8_t split_default_left, const data_size_t leaf_data_start, const int left_leaf_index, const int right_leaf_index) { - const int min_num_blocks = num_data_in_leaf <= 100 ? 
1 : 80; - const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); - int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; - int split_indices_block_size_data_partition_aligned = 1; - while (split_indices_block_size_data_partition > 0) { - split_indices_block_size_data_partition_aligned <<= 1; - split_indices_block_size_data_partition >>= 1; - } - const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; const uint8_t missing_is_zero = cuda_column_data_->feature_missing_is_zero(split_feature_index); const uint8_t missing_is_na = cuda_column_data_->feature_missing_is_na(split_feature_index); const uint8_t mfb_is_zero = cuda_column_data_->feature_mfb_is_zero(split_feature_index); @@ -642,8 +623,6 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num mfb_is_zero, mfb_is_na, column_index, - num_blocks_final, - split_indices_block_size_data_partition_aligned, split_feature_index, leaf_data_start, num_data_in_leaf, @@ -665,8 +644,6 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num mfb_is_zero, mfb_is_na, column_index, - num_blocks_final, - split_indices_block_size_data_partition_aligned, split_feature_index, leaf_data_start, num_data_in_leaf, @@ -688,8 +665,6 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num mfb_is_zero, mfb_is_na, column_index, - num_blocks_final, - split_indices_block_size_data_partition_aligned, split_feature_index, leaf_data_start, num_data_in_leaf, @@ -714,8 +689,6 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num mfb_is_na, max_bin_to_left, column_index, - num_blocks_final, - split_indices_block_size_data_partition_aligned, split_feature_index, leaf_data_start, num_data_in_leaf, @@ -738,8 +711,6 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num mfb_is_na, max_bin_to_left, column_index, - num_blocks_final, - split_indices_block_size_data_partition_aligned, split_feature_index, leaf_data_start, num_data_in_leaf, @@ -762,8 +733,6 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num mfb_is_na, max_bin_to_left, column_index, - num_blocks_final, - split_indices_block_size_data_partition_aligned, split_feature_index, leaf_data_start, num_data_in_leaf, @@ -791,9 +760,7 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num static_cast(missing_is_na), static_cast(mfb_is_zero), static_cast(mfb_is_na), - max_bin_to_left, - num_blocks_final, - split_indices_block_size_data_partition_aligned); + max_bin_to_left); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(column_data_pointer); LaunchUpdateDataIndexToLeafIndexKernel(num_data_in_leaf, @@ -803,9 +770,7 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num static_cast(missing_is_na), static_cast(mfb_is_zero), static_cast(mfb_is_na), - max_bin_to_left, - num_blocks_final, - split_indices_block_size_data_partition_aligned); + max_bin_to_left); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(column_data_pointer); LaunchUpdateDataIndexToLeafIndexKernel(num_data_in_leaf, @@ -815,9 +780,7 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num static_cast(missing_is_na), 
static_cast(mfb_is_zero), static_cast(mfb_is_na), - max_bin_to_left, - num_blocks_final, - split_indices_block_size_data_partition_aligned); + max_bin_to_left); } } @@ -860,9 +823,6 @@ __global__ void AggregateBlockOffsetKernel0( if (thread_start_block_index < thread_end_block_index && thread_start_block_index > 1) { block_to_left_offset = block_to_left_offset_buffer[thread_start_block_index - 1]; block_to_right_offset = block_to_right_offset_buffer[thread_start_block_index - 1]; - } else { - block_to_left_offset = 0; - block_to_right_offset = 0; } block_to_left_offset = ShufflePrefixSum(block_to_left_offset, shared_mem_buffer); __syncthreads(); @@ -894,7 +854,7 @@ __global__ void AggregateBlockOffsetKernel1( data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, data_size_t* cuda_leaf_data_start, data_size_t* cuda_leaf_data_end, data_size_t* cuda_leaf_num_data, const data_size_t* cuda_data_indices, - const data_size_t num_blocks, const data_size_t num_blocks_aligned) { + const data_size_t num_blocks) { __shared__ uint32_t shared_mem_buffer[32]; __shared__ uint32_t to_left_total_count; const data_size_t num_data_in_leaf = cuda_leaf_num_data[left_leaf_index]; @@ -940,9 +900,7 @@ __global__ void SplitTreeStructureKernel(const int left_leaf_index, CUDALeafSplitsStruct* smaller_leaf_splits, CUDALeafSplitsStruct* larger_leaf_splits, const int num_total_bin, - hist_t* cuda_hist, hist_t** cuda_hist_pool, const int split_indices_block_size_data_partition, - - const double* cuda_bin_upper_bounds, const int* cuda_feature_num_bin_offsets, + hist_t* cuda_hist, hist_t** cuda_hist_pool, double* cuda_leaf_output, int* cuda_split_info_buffer) { const unsigned int to_left_total_cnt = cuda_leaf_num_data[left_leaf_index]; @@ -1072,9 +1030,17 @@ __global__ void SplitInnerKernel(const int left_leaf_index, const int right_leaf const uint32_t thread_to_left_offset = (threadIdx_x == 0 ? 0 : block_to_left_offset_ptr[threadIdx_x - 1]); const bool to_left = block_to_left_offset_ptr[threadIdx_x] > thread_to_left_offset; if (to_left) { + if (static_cast(thread_to_left_offset) >= block_to_left_offset_buffer[blockIdx.x + 1] - block_to_left_offset_buffer[blockIdx.x]) { + printf("error: thread_to_left_offset = %d, block_to_left_offset_buffer[%d] - block_to_left_offset_buffer[%d] = %d\n", + thread_to_left_offset, blockIdx.x + 1, blockIdx.x, block_to_left_offset_buffer[blockIdx.x + 1] - block_to_left_offset_buffer[blockIdx.x]); + } left_out_data_indices_in_leaf[thread_to_left_offset] = cuda_data_indices_in_leaf[global_thread_index]; } else { const uint32_t thread_to_right_offset = threadIdx.x - thread_to_left_offset; + if (static_cast(thread_to_right_offset) >= block_to_right_offset_buffer[blockIdx.x + 1] - block_to_right_offset_buffer[blockIdx.x]) { + printf("error: thread_to_right_offset = %d, block_to_right_offset_buffer[%d] - block_to_right_offset_buffer[%d] = %d\n", + thread_to_right_offset, blockIdx.x + 1, blockIdx.x, block_to_right_offset_buffer[blockIdx.x + 1] - block_to_right_offset_buffer[blockIdx.x]); + } right_out_data_indices_in_leaf[thread_to_right_offset] = cuda_data_indices_in_leaf[global_thread_index]; } } @@ -1105,16 +1071,7 @@ void CUDADataPartition::LaunchSplitInnerKernel( data_size_t* right_leaf_start_ref, double* left_leaf_sum_of_hessians_ref, double* right_leaf_sum_of_hessians_ref) { - const int min_num_blocks = num_data_in_leaf <= 100 ? 
1 : 80; - const int num_blocks = std::max(min_num_blocks, (num_data_in_leaf + SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION); - int split_indices_block_size_data_partition = (num_data_in_leaf + num_blocks - 1) / num_blocks - 1; - int split_indices_block_size_data_partition_aligned = 1; - while (split_indices_block_size_data_partition > 0) { - split_indices_block_size_data_partition_aligned <<= 1; - split_indices_block_size_data_partition >>= 1; - } - const int num_blocks_final = (num_data_in_leaf + split_indices_block_size_data_partition_aligned - 1) / split_indices_block_size_data_partition_aligned; - int num_blocks_final_ref = num_blocks_final - 1; + int num_blocks_final_ref = grid_dim_ - 1; int num_blocks_final_aligned = 1; while (num_blocks_final_ref > 0) { num_blocks_final_aligned <<= 1; @@ -1122,14 +1079,14 @@ void CUDADataPartition::LaunchSplitInnerKernel( } global_timer.Start("CUDADataPartition::AggregateBlockOffsetKernel"); - if (num_blocks_final > AGGREGATE_BLOCK_SIZE_DATA_PARTITION) { + if (grid_dim_ > AGGREGATE_BLOCK_SIZE_DATA_PARTITION) { AggregateBlockOffsetKernel0<<<1, AGGREGATE_BLOCK_SIZE_DATA_PARTITION, 0, cuda_streams_[0]>>>( left_leaf_index, right_leaf_index, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, - num_blocks_final); + grid_dim_); } else { AggregateBlockOffsetKernel1<<<1, num_blocks_final_aligned, 0, cuda_streams_[0]>>>( left_leaf_index, @@ -1137,17 +1094,18 @@ void CUDADataPartition::LaunchSplitInnerKernel( cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_leaf_data_start_, cuda_leaf_data_end_, cuda_leaf_num_data_, cuda_data_indices_, - num_blocks_final, num_blocks_final_aligned); + grid_dim_); } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); global_timer.Start("CUDADataPartition::SplitInnerKernel"); - SplitInnerKernel<<>>( + SplitInnerKernel<<>>( left_leaf_index, right_leaf_index, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_block_to_left_offset_, cuda_out_data_indices_in_leaf_); global_timer.Stop("CUDADataPartition::SplitInnerKernel"); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); SplitTreeStructureKernel<<<4, 5, 0, cuda_streams_[0]>>>(left_leaf_index, right_leaf_index, @@ -1159,9 +1117,7 @@ void CUDADataPartition::LaunchSplitInnerKernel( larger_leaf_splits, num_total_bin_, cuda_hist_, - cuda_hist_pool_, split_indices_block_size_data_partition_aligned, - - cuda_bin_upper_bounds_, cuda_feature_num_bin_offsets_, + cuda_hist_pool_, cuda_leaf_output_, cuda_split_info_buffer_); global_timer.Stop("CUDADataPartition::SplitTreeStructureKernel"); std::vector cpu_split_info_buffer(12); @@ -1174,7 +1130,7 @@ void CUDADataPartition::LaunchSplitInnerKernel( const data_size_t left_leaf_data_start = cpu_split_info_buffer[2]; const data_size_t right_leaf_num_data = cpu_split_info_buffer[4]; global_timer.Start("CUDADataPartition::CopyDataIndicesKernel"); - CopyDataIndicesKernel<<>>( + CopyDataIndicesKernel<<>>( left_leaf_num_data + right_leaf_num_data, cuda_out_data_indices_in_leaf_, cuda_data_indices_ + left_leaf_data_start); global_timer.Stop("CUDADataPartition::CopyDataIndicesKernel"); const data_size_t right_leaf_data_start = cpu_split_info_buffer[5]; diff --git 
a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 8a0f1713dfe5..04d7cf2e094d 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -18,7 +18,7 @@ // TODO(shiyu1994): adjust these values according to different CUDA and GPU versions #define FILL_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) -#define SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION (512) +#define SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) #define AGGREGATE_BLOCK_SIZE_DATA_PARTITION (1024) namespace LightGBM { @@ -66,15 +66,7 @@ class CUDADataPartition { const data_size_t* cuda_leaf_data_start() const { return cuda_leaf_data_start_; } private: - void CalcBlockDim( - const data_size_t num_data_in_leaf, - int* grid_dim, - int* block_dim); - - void CalcBlockDimInCopy( - const data_size_t num_data_in_leaf, - int* grid_dim, - int* block_dim); + void CalcBlockDim(const data_size_t num_data_in_leaf); void GenDataToLeftBitVector( const data_size_t num_data_in_leaf, @@ -139,8 +131,6 @@ class CUDADataPartition { const bool mfb_is_na, const bool max_bin_to_left, const int column_index, - const int num_blocks_final, - const int split_indices_block_size_data_partition_aligned, const int split_feature_index, const data_size_t leaf_data_start, const data_size_t num_data_in_leaf, @@ -163,8 +153,6 @@ class CUDADataPartition { const bool mfb_is_zero, const bool mfb_is_na, const int column_index, - const int num_blocks_final, - const int split_indices_block_size_data_partition_aligned, const int split_feature_index, const data_size_t leaf_data_start, const data_size_t num_data_in_leaf, @@ -199,9 +187,7 @@ class CUDADataPartition { const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_to_left, - const int num_blocks, - const int block_size); + const bool max_to_left); void LaunchAddPredictionToScoreKernel(const double* leaf_value, double* cuda_scores); @@ -221,6 +207,10 @@ class CUDADataPartition { std::vector feature_num_bins_; /*! \brief bin data stored by column */ const CUDAColumnData* cuda_column_data_; + /*! \brief grid dimension when splitting one leaf */ + int grid_dim_; + /*! \brief block dimension when splitting one leaf */ + int block_dim_; // config information /*! 
\brief maximum number of leaves in a tree */ diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 7f721501b1ce..bd879f9fa388 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -52,8 +52,6 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia void NewCUDATreeLearner::BeforeTrain() { cuda_data_partition_->BeforeTrain(nullptr); - global_timer.Start("CUDACentralizedInfo::BeforeTrain"); - global_timer.Stop("CUDACentralizedInfo::BeforeTrain"); cuda_smaller_leaf_splits_->InitValues( gradients_, hessians_, From d7c4bb4fe070e74ac34d010c375f112cc5eb5c71 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 15 Sep 2021 07:40:00 +0000 Subject: [PATCH 073/166] add bagging for CUDA --- include/LightGBM/cuda/cuda_column_data.hpp | 7 ++ src/io/cuda/cuda_column_data.cpp | 93 +++++++++++++++---- src/io/cuda/cuda_column_data.cu | 56 +++++++++++ src/io/dataset.cpp | 7 ++ src/treelearner/cuda/cuda_data_partition.cpp | 33 +++++-- src/treelearner/cuda/cuda_data_partition.cu | 6 +- src/treelearner/cuda/cuda_data_partition.hpp | 20 +++- src/treelearner/cuda/cuda_leaf_splits.cpp | 6 +- src/treelearner/cuda/cuda_leaf_splits.cu | 5 +- src/treelearner/cuda/cuda_leaf_splits.hpp | 5 +- .../cuda/new_cuda_tree_learner.cpp | 18 +++- .../cuda/new_cuda_tree_learner.hpp | 2 + 12 files changed, 215 insertions(+), 43 deletions(-) create mode 100644 src/io/cuda/cuda_column_data.cu diff --git a/include/LightGBM/cuda/cuda_column_data.hpp b/include/LightGBM/cuda/cuda_column_data.hpp index efffd9dcae20..4be1c06149c2 100644 --- a/include/LightGBM/cuda/cuda_column_data.hpp +++ b/include/LightGBM/cuda/cuda_column_data.hpp @@ -38,6 +38,8 @@ class CUDAColumnData { const void* GetColumnData(const int column_index) const { return data_by_column_[column_index]; } + void CopySubrow(const CUDAColumnData* full_set, const data_size_t* used_indices, const data_size_t num_used_indices); + void* const* cuda_data_by_column() const { return cuda_data_by_column_; } uint32_t feature_min_bin(const int feature_index) const { return feature_min_bin_[feature_index]; } @@ -88,6 +90,10 @@ class CUDAColumnData { template void InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, void** out_column_data_pointer); + void LaunchCopySubrowKernel(void* const* in_cuda_data_by_column, const data_size_t num_used_indices); + + void InitColumnMetaInfo(); + int num_threads_; data_size_t num_data_; int num_columns_; @@ -116,6 +122,7 @@ class CUDAColumnData { uint8_t* cuda_feature_mfb_is_zero_; uint8_t* cuda_feature_mfb_is_na_; int* cuda_feature_to_column_; + data_size_t* cuda_used_indices_; }; } // namespace LightGBM diff --git a/src/io/cuda/cuda_column_data.cpp b/src/io/cuda/cuda_column_data.cpp index 73e0dca6252d..3fddc01f8e07 100644 --- a/src/io/cuda/cuda_column_data.cpp +++ b/src/io/cuda/cuda_column_data.cpp @@ -15,6 +15,8 @@ CUDAColumnData::CUDAColumnData(const data_size_t num_data, const int gpu_device_ } else { CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); } + cuda_used_indices_ = nullptr; + cuda_data_by_column_ = nullptr; } CUDAColumnData::~CUDAColumnData() {} @@ -82,8 +84,10 @@ void CUDAColumnData::Init(const int num_columns, feature_mfb_is_zero_ = feature_mfb_is_zero; feature_mfb_is_na_ = feature_mfb_is_na; data_by_column_.resize(num_columns_, nullptr); + OMP_INIT_EX(); #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int column_index = 0; column_index < num_columns_; 
++column_index) { + OMP_LOOP_EX_BEGIN(); const int8_t bit_type = column_bit_type[column_index]; if (column_data[column_index] != nullptr) { // is dense column @@ -111,61 +115,113 @@ void CUDAColumnData::Init(const int num_columns, Log::Fatal("Unknow column bit type %d", bit_type); } } + OMP_LOOP_EX_END(); } + OMP_THROW_EX(); feature_to_column_ = feature_to_column; InitCUDAMemoryFromHostMemoryOuter(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__); + InitColumnMetaInfo(); +} + +void CUDAColumnData::CopySubrow( + const CUDAColumnData* full_set, + const data_size_t* used_indices, + const data_size_t num_used_indices) { + num_threads_ = full_set->num_threads_; + num_columns_ = full_set->num_columns_; + column_bit_type_ = full_set->column_bit_type_; + feature_min_bin_ = full_set->feature_min_bin_; + feature_max_bin_ = full_set->feature_max_bin_; + feature_offset_ = full_set->feature_offset_; + feature_most_freq_bin_ = full_set->feature_most_freq_bin_; + feature_default_bin_ = full_set->feature_default_bin_; + feature_missing_is_zero_ = full_set->feature_missing_is_zero_; + feature_missing_is_na_ = full_set->feature_missing_is_na_; + feature_mfb_is_zero_ = full_set->feature_mfb_is_zero_; + feature_mfb_is_na_ = full_set->feature_mfb_is_na_; + if (cuda_used_indices_ == nullptr) { + // initialize the subset cuda column data + const size_t full_set_num_data = static_cast(full_set->num_data_); + AllocateCUDAMemoryOuter(&cuda_used_indices_, full_set_num_data, __FILE__, __LINE__); + CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); + data_by_column_.resize(num_columns_, nullptr); + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (int column_index = 0; column_index < num_columns_; ++column_index) { + const uint8_t bit_type = column_bit_type_[column_index]; + if (bit_type == 8) { + uint8_t* column_data = nullptr; + AllocateCUDAMemoryOuter(&column_data, full_set_num_data, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } else if (bit_type == 16) { + uint16_t* column_data = nullptr; + AllocateCUDAMemoryOuter(&column_data, full_set_num_data, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } else if (bit_type == 32) { + uint32_t* column_data = nullptr; + AllocateCUDAMemoryOuter(&column_data, full_set_num_data, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } + } + InitCUDAMemoryFromHostMemoryOuter(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__); + InitColumnMetaInfo(); + } + LaunchCopySubrowKernel(full_set->cuda_data_by_column(), num_used_indices); +} + +void CUDAColumnData::InitColumnMetaInfo() { InitCUDAMemoryFromHostMemoryOuter(&cuda_column_bit_type_, column_bit_type_.data(), column_bit_type_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_max_bin_, - feature_max_bin.data(), - feature_max_bin.size(), + feature_max_bin_.data(), + feature_max_bin_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_min_bin_, - feature_min_bin.data(), - feature_min_bin.size(), + feature_min_bin_.data(), + feature_min_bin_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_offset_, - feature_offset.data(), - feature_offset.size(), + feature_offset_.data(), + feature_offset_.size(), __FILE__, __LINE__); 
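// InitColumnMetaInfo() uploads the per-column metadata cached in the members above (bit types,
// min/max/default/most-frequent bins, missing-value flags, and the feature-to-column map) to the
// device, so that the partition and histogram kernels can resolve bin semantics without host round trips.
// It is shared by Init() and CopySubrow(): during bagging, Dataset::CopySubrow() calls
//   cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices);
// which lazily allocates the per-column subset buffers on the first call, re-runs this metadata
// upload, and then gathers the selected rows with LaunchCopySubrowKernel().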
InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_most_freq_bin_, - feature_most_freq_bin.data(), - feature_most_freq_bin.size(), + feature_most_freq_bin_.data(), + feature_most_freq_bin_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_default_bin_, - feature_default_bin.data(), - feature_default_bin.size(), + feature_default_bin_.data(), + feature_default_bin_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_missing_is_zero_, - feature_missing_is_zero.data(), - feature_missing_is_zero.size(), + feature_missing_is_zero_.data(), + feature_missing_is_zero_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_missing_is_na_, - feature_missing_is_na.data(), - feature_missing_is_na.size(), + feature_missing_is_na_.data(), + feature_missing_is_na_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_mfb_is_zero_, - feature_mfb_is_zero.data(), - feature_mfb_is_zero.size(), + feature_mfb_is_zero_.data(), + feature_mfb_is_zero_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_mfb_is_na_, - feature_mfb_is_na.data(), - feature_mfb_is_na.size(), + feature_mfb_is_na_.data(), + feature_mfb_is_na_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_to_column_, @@ -173,7 +229,6 @@ void CUDAColumnData::Init(const int num_columns, feature_to_column_.size(), __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } } // namespace LightGBM diff --git a/src/io/cuda/cuda_column_data.cu b/src/io/cuda/cuda_column_data.cu new file mode 100644 index 000000000000..e5d982248b72 --- /dev/null +++ b/src/io/cuda/cuda_column_data.cu @@ -0,0 +1,56 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#include <LightGBM/cuda/cuda_column_data.hpp> + +#define COPY_SUBROW_BLOCK_SIZE_COLUMN_DATA (1024) + +namespace LightGBM { + +__global__ void CopySubrowKernel_ColumnData( + void* const* in_cuda_data_by_column, + const uint8_t* cuda_column_bit_type, + const data_size_t* cuda_used_indices, + const data_size_t num_used_indices, + void** out_cuda_data_by_column) { + const int column_index = static_cast<int>(blockIdx.x); + const void* in_column_data = in_cuda_data_by_column[column_index]; + void* out_column_data = out_cuda_data_by_column[column_index]; + const uint8_t bit_type = cuda_column_bit_type[column_index]; + const data_size_t local_data_index_start = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x); + if (bit_type == 8) { + const uint8_t* true_in_column_data = reinterpret_cast<const uint8_t*>(in_column_data); + uint8_t* true_out_column_data = reinterpret_cast<uint8_t*>(out_column_data); + for (data_size_t local_data_index = local_data_index_start; local_data_index < num_used_indices; local_data_index += static_cast<data_size_t>(blockDim.x)) { + const data_size_t global_data_index = cuda_used_indices[local_data_index]; + true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; + } + } else if (bit_type == 16) { + const uint16_t* true_in_column_data = reinterpret_cast<const uint16_t*>(in_column_data); + uint16_t* true_out_column_data = reinterpret_cast<uint16_t*>(out_column_data); + for (data_size_t local_data_index = local_data_index_start; local_data_index < num_used_indices; local_data_index += static_cast<data_size_t>(blockDim.x)) { + const data_size_t global_data_index = cuda_used_indices[local_data_index]; + true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; + } + } else if (bit_type == 32) { + const uint32_t* true_in_column_data = reinterpret_cast<const uint32_t*>(in_column_data); + uint32_t* true_out_column_data = reinterpret_cast<uint32_t*>(out_column_data); + for (data_size_t local_data_index = local_data_index_start; local_data_index < num_used_indices; local_data_index += static_cast<data_size_t>(blockDim.x)) { + const data_size_t global_data_index = cuda_used_indices[local_data_index]; + true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; + } + } +} + +void CUDAColumnData::LaunchCopySubrowKernel(void* const* in_cuda_data_by_column, const data_size_t num_used_indices) { + CopySubrowKernel_ColumnData<<>>( + in_cuda_data_by_column, + cuda_column_bit_type_, + cuda_used_indices_, + num_used_indices, + cuda_data_by_column_); +} + +} // namespace LightGBM diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 7fe17c9a01ca..0ef5eb044667 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -427,6 +427,7 @@ void Dataset::Construct(std::vector<std::unique_ptr<BinMapper>>* bin_mappers, } device_type_ = io_config.device_type; gpu_device_id_ = io_config.gpu_device_id; + gpu_device_id_ = -1; } void Dataset::FinishLoad() { @@ -842,6 +843,12 @@ void Dataset::CopySubrow(const Dataset* fullset, } } } + // update CUDA storage for column data and metadata + if (device_type_ == std::string("cuda")) { + cuda_column_data_.reset(new CUDAColumnData(num_used_indices, gpu_device_id_)); + cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); + metadata_.CreateCUDAMetadata(gpu_device_id_); + } } bool Dataset::SetFloatField(const char* field_name, const float* field_data, diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 36650626a9a7..cc35da52a762 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -86,23 +86,26 @@ void
CUDADataPartition::Init() { InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_num_bin_offsets_, feature_num_bin_offsets.data(), feature_num_bin_offsets.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_bin_upper_bounds_, flatten_bin_upper_bounds.data(), flatten_bin_upper_bounds.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_num_data_, &num_data_, 1, __FILE__, __LINE__); + use_bagging_ = false; } -void CUDADataPartition::BeforeTrain(const data_size_t* data_indices) { - if (data_indices == nullptr) { - // no bagging +void CUDADataPartition::BeforeTrain() { + if (!use_bagging_) { LaunchFillDataIndicesBeforeTrain(); - SetCUDAMemoryOuter(cuda_leaf_num_data_, 0, static_cast(num_leaves_), __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_leaf_data_start_, 0, static_cast(num_leaves_), __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_leaf_data_end_, 0, static_cast(num_leaves_), __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + } + SetCUDAMemoryOuter(cuda_leaf_num_data_, 0, static_cast(num_leaves_), __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_leaf_data_start_, 0, static_cast(num_leaves_), __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_leaf_data_end_, 0, static_cast(num_leaves_), __FILE__, __LINE__); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + if (!use_bagging_) { CopyFromCUDADeviceToCUDADeviceOuter(cuda_leaf_num_data_, cuda_num_data_, 1, __FILE__, __LINE__); CopyFromCUDADeviceToCUDADeviceOuter(cuda_leaf_data_end_, cuda_num_data_, 1, __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CopyFromHostToCUDADeviceOuter(cuda_hist_pool_, &cuda_hist_, 1, __FILE__, __LINE__); } else { - Log::Fatal("bagging is not supported by GPU"); + CopyFromHostToCUDADeviceOuter(cuda_leaf_num_data_, &num_used_indices_, 1, __FILE__, __LINE__); + CopyFromHostToCUDADeviceOuter(cuda_leaf_data_end_, &num_used_indices_, 1, __FILE__, __LINE__); } + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + CopyFromHostToCUDADeviceOuter(cuda_hist_pool_, &cuda_hist_, 1, __FILE__, __LINE__); } void CUDADataPartition::Split( @@ -218,6 +221,16 @@ void CUDADataPartition::CalcBlockDim(const data_size_t num_data_in_leaf) { block_dim_ = split_indices_block_size_data_partition_aligned; } +void CUDADataPartition::SetUsedDataIndices(const data_size_t* used_indices, const data_size_t num_used_indices) { + use_bagging_ = true; + num_used_indices_ = num_used_indices; + CopyFromHostToCUDADeviceOuter(cuda_data_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); +} + +void CUDADataPartition::SetUseBagging(const bool use_bagging) { + use_bagging_ = use_bagging; +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 7e427b3b2d89..02becba3e78f 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -23,8 +23,10 @@ __global__ void FillDataIndicesBeforeTrainKernel(const data_size_t* cuda_num_dat } void CUDADataPartition::LaunchFillDataIndicesBeforeTrain() { - const int num_blocks = (num_data_ + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; - FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_, cuda_data_index_to_leaf_index_); + if (used_indices == nullptr) { + const int num_blocks = (num_data_ + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; + FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, 
cuda_data_indices_, cuda_data_index_to_leaf_index_); + } } __device__ __forceinline__ void PrepareOffset(const data_size_t num_data_in_leaf_ref, uint16_t* block_to_left_offset, diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 04d7cf2e094d..1f4351e34807 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -34,7 +34,7 @@ class CUDADataPartition { void Init(); - void BeforeTrain(const data_size_t* data_indices); + void BeforeTrain(); void Split( // input best split info @@ -59,6 +59,18 @@ class CUDADataPartition { void UpdateTrainScore(const double* leaf_value, double* cuda_scores); + void SetUsedDataIndices(const data_size_t* used_indices, const data_size_t num_used_indices); + + void SetUseBagging(const bool use_bagging); + + data_size_t root_num_data() const { + if (use_bagging_) { + return num_used_indices_; + } else { + return num_data_; + } + } + const data_size_t* cuda_data_indices() const { return cuda_data_indices_; } const data_size_t* cuda_leaf_num_data() const { return cuda_leaf_num_data_; } @@ -218,6 +230,12 @@ class CUDADataPartition { /*! \brief number of threads */ const int num_threads_; + // per iteration information + /*! \brief whether bagging is used in this iteration */ + bool use_bagging_; + /*! \brief number of used data indices in this iteration */ + data_size_t num_used_indices_; + // tree structure information /*! \brief current number of leaves in tree */ int cur_num_leaves_; diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index dfb9a74e68d2..4a6c7c514b2a 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -33,13 +33,13 @@ void CUDALeafSplits::InitValues() { void CUDALeafSplits::InitValues( const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, - double* root_sum_hessians) { + const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, + hist_t* cuda_hist_in_leaf, double* root_sum_hessians) { cuda_gradients_ = cuda_gradients; cuda_hessians_ = cuda_hessians; SetCUDAMemoryOuter(cuda_sum_of_gradients_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); SetCUDAMemoryOuter(cuda_sum_of_hessians_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); - LaunchInitValuesKernal(cuda_data_indices_in_leaf, cuda_hist_in_leaf); + LaunchInitValuesKernal(cuda_data_indices_in_leaf, num_used_indices, cuda_hist_in_leaf); CopyFromCUDADeviceToHostOuter(root_sum_hessians, cuda_sum_of_hessians_buffer_, 1, __FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index 1072dee37b04..08d5de65ef8f 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -82,16 +82,17 @@ void CUDALeafSplits::LaunchInitValuesEmptyKernel() { void CUDALeafSplits::LaunchInitValuesKernal( const data_size_t* cuda_data_indices_in_leaf, + const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf) { CUDAInitValuesKernel1<<>>( - cuda_gradients_, cuda_hessians_, num_data_, cuda_sum_of_gradients_buffer_, + cuda_gradients_, cuda_hessians_, num_used_indices, cuda_sum_of_gradients_buffer_, cuda_sum_of_hessians_buffer_); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); CUDAInitValuesKernel2<<<1, 1>>>( 
num_blocks_init_from_gradients_, cuda_sum_of_gradients_buffer_, cuda_sum_of_hessians_buffer_, - num_data_, + num_used_indices, cuda_data_indices_in_leaf, cuda_hist_in_leaf, cuda_struct_); diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index 39d239303422..f44dcc50bc63 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -41,8 +41,8 @@ class CUDALeafSplits { void InitValues( const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, - double* root_sum_hessians); + const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, + hist_t* cuda_hist_in_leaf, double* root_sum_hessians); void InitValues(); @@ -54,6 +54,7 @@ class CUDALeafSplits { void LaunchInitValuesEmptyKernel(); void LaunchInitValuesKernal(const data_size_t* cuda_data_indices_in_leaf, + const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf); // Host memory diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index bd879f9fa388..5dbcfcf9f4b2 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -51,17 +51,18 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia } void NewCUDATreeLearner::BeforeTrain() { - cuda_data_partition_->BeforeTrain(nullptr); + cuda_data_partition_->BeforeTrain(); cuda_smaller_leaf_splits_->InitValues( gradients_, hessians_, cuda_data_partition_->cuda_data_indices(), + cuda_data_partition_->root_num_data(), cuda_histogram_constructor_->cuda_hist_pointer(), &leaf_sum_hessians_[0]); + leaf_num_data_[0] = cuda_data_partition_->root_num_data(); cuda_larger_leaf_splits_->InitValues(); cuda_histogram_constructor_->BeforeTrain(gradients_, hessians_); cuda_best_split_finder_->BeforeTrain(); - leaf_num_data_[0] = num_data_; leaf_data_start_[0] = 0; smaller_leaf_index_ = 0; larger_leaf_index_ = -1; @@ -199,8 +200,13 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, void NewCUDATreeLearner::ResetTrainingData(const Dataset* /*train_data*/, bool /*is_constant_hessian*/) {} -void NewCUDATreeLearner::SetBaggingData(const Dataset* /*subset*/, - const data_size_t* /*used_indices*/, data_size_t /*num_data*/) {} +void NewCUDATreeLearner::SetBaggingData(const Dataset* subset, + const data_size_t* used_indices, data_size_t num_data) { + cuda_data_partition_->SetUsedDataIndices(used_indices, num_data); + if (subset == nullptr) { + + } +} void NewCUDATreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function /*residual_getter*/, const double* score, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { @@ -214,6 +220,10 @@ void NewCUDATreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* ob cuda_tree->cuda_leaf_value_ref()); } +void NewCUDATreeLearner::AfterTrain() { + cuda_data_partition_->SetUseBagging(false); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index d74ec9b33020..feb5653c7b53 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -39,6 +39,8 @@ class NewCUDATreeLearner: public SerialTreeLearner { protected: void BeforeTrain() override; + void AfterTrain(); + // number of GPUs int num_gpus_; // number of threads on 
CPU From d9bf3e5c13466c188f00331a6b13050b9e11d039 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 15 Sep 2021 09:02:51 +0000 Subject: [PATCH 074/166] clean up code --- include/LightGBM/boosting.h | 7 - include/LightGBM/cuda/cuda_tree.hpp | 27 +- include/LightGBM/metric.h | 4 - include/LightGBM/objective_function.h | 7 - include/LightGBM/tree_learner.h | 2 +- src/application/cuda/cuda_predictor.cpp | 275 ------------------ src/application/cuda/cuda_predictor.cu | 127 -------- src/application/cuda/cuda_predictor.hpp | 72 ----- src/application/predictor.hpp | 18 +- src/boosting/cuda/cuda_score_updater.cpp | 69 ----- src/boosting/cuda/cuda_score_updater.cu | 43 --- src/boosting/cuda/cuda_score_updater.hpp | 48 --- src/boosting/cuda/gbdt.cu | 33 --- src/boosting/gbdt.cpp | 114 ++------ src/boosting/gbdt.h | 30 +- src/boosting/rf.hpp | 2 +- src/boosting/score_updater.hpp | 14 +- src/c_api.cpp | 35 ++- src/io/cuda/cuda_column_data.cu | 8 +- src/io/cuda/cuda_row_data.cpp | 3 - src/io/cuda/cuda_tree.cpp | 19 +- src/io/cuda/cuda_tree.cu | 61 ---- src/io/dataset.cpp | 1 - src/metric/binary_metric.hpp | 6 +- src/metric/metric.cpp | 1 - src/metric/multiclass_metric.hpp | 4 +- src/metric/rank_metric.hpp | 2 +- src/metric/regression_metric.hpp | 2 +- src/metric/xentropy_metric.hpp | 6 +- src/objective/binary_objective.hpp | 2 +- src/objective/multiclass_objective.hpp | 4 +- src/objective/objective_function.cpp | 1 - src/objective/rank_objective.hpp | 12 +- src/objective/regression_objective.hpp | 2 - src/objective/xentropy_objective.hpp | 4 +- src/treelearner/cuda/cuda_data_partition.cu | 8 +- .../cuda/new_cuda_tree_learner.cpp | 50 +--- .../cuda/new_cuda_tree_learner.hpp | 7 +- src/treelearner/serial_tree_learner.cpp | 3 +- src/treelearner/serial_tree_learner.h | 2 +- 40 files changed, 115 insertions(+), 1020 deletions(-) delete mode 100644 src/application/cuda/cuda_predictor.cpp delete mode 100644 src/application/cuda/cuda_predictor.cu delete mode 100644 src/application/cuda/cuda_predictor.hpp delete mode 100644 src/boosting/cuda/cuda_score_updater.cpp delete mode 100644 src/boosting/cuda/cuda_score_updater.cu delete mode 100644 src/boosting/cuda/cuda_score_updater.hpp delete mode 100644 src/boosting/cuda/gbdt.cu diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h index 8cb416cd8169..ddbcdbc18e44 100644 --- a/include/LightGBM/boosting.h +++ b/include/LightGBM/boosting.h @@ -7,7 +7,6 @@ #include #include -#include #include #include @@ -315,12 +314,6 @@ class LIGHTGBM_EXPORT Boosting { static Boosting* CreateBoosting(const std::string& type, const char* filename); virtual bool IsLinear() const { return false; } - - virtual const std::vector>& models() const = 0; - - virtual int num_tree_per_iteration() const = 0; - - virtual std::function GetCUDAConvertOutputFunc() const = 0; }; class GBDTBase : public Boosting { diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index b874758c81e2..0f6374da6344 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -43,27 +43,6 @@ class CUDATree : public Tree { const MissingType missing_type, const CUDASplitInfo* cuda_split_info); - /*! - * \brief Adding prediction value of this tree model to scores - * \param data The dataset - * \param num_data Number of total data - * \param score Will add prediction to score - */ - void AddPredictionToScore(const Dataset* data, - data_size_t num_data, - double* score) const override; - - /*! 
- * \brief Adding prediction value of this tree model to scores - * \param data The dataset - * \param used_data_indices Indices of used data - * \param num_data Number of total data - * \param score Will add prediction to score - */ - void AddPredictionToScore(const Dataset* data, - const data_size_t* used_data_indices, - data_size_t num_data, double* score) const override; - const int* cuda_left_child() const { return cuda_left_child_; } const int* cuda_right_child() const { return cuda_right_child_; } @@ -88,6 +67,8 @@ class CUDATree : public Tree { void ToHost(); + void SyncLeafOutputFromHostToCUDA(); + private: void InitCUDAMemory(); @@ -99,10 +80,6 @@ class CUDATree : public Tree { const MissingType missing_type, const CUDASplitInfo* cuda_split_info); - void LaunchAddPredictionToScoreKernel(const Dataset* data, - const data_size_t* used_data_indices, - data_size_t num_data, double* score) const; - void LaunchShrinkageKernel(const double rate); void LaunchAddBiasKernel(const double val); diff --git a/include/LightGBM/metric.h b/include/LightGBM/metric.h index c3d62b5a2aa3..cffc270ce675 100644 --- a/include/LightGBM/metric.h +++ b/include/LightGBM/metric.h @@ -135,10 +135,6 @@ class DCGCalculator { */ inline static double GetDiscount(data_size_t k) { return discount_[k]; } - inline static const std::vector& label_gain() { return label_gain_; } - - inline static const std::vector& discount() { return discount_; } - private: /*! \brief store gains for different label */ static std::vector label_gain_; diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index c5f16769d4b2..5ea838dece23 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -48,9 +48,6 @@ class ObjectiveFunction { const data_size_t*, data_size_t) const { return ori_output; } - virtual void RenewTreeOutputCUDA(const double* /*score*/, const data_size_t* /*data_indices_in_leaf*/, const data_size_t* /*num_data_in_leaf*/, - const data_size_t* /*data_start_in_leaf*/, const int /*num_leaves*/, double* /*leaf_value*/) const {} - virtual double BoostFromScore(int /*class_id*/) const { return 0.0; } virtual bool ClassNeedTrain(int /*class_id*/) const { return true; } @@ -91,10 +88,6 @@ class ObjectiveFunction { * \brief Load objective function from string object */ LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& str); - - virtual std::function GetCUDAConvertOutputFunc() const { - return [] (data_size_t, const double*, double*) {}; - } }; } // namespace LightGBM diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index f65fc591200d..55343ad714ee 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -86,7 +86,7 @@ class TreeLearner { virtual void AddPredictionToScore(const Tree* tree, double* out_score) const = 0; virtual void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, - const double* score, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const = 0; + data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const = 0; TreeLearner() = default; /*! \brief Disable copy */ diff --git a/src/application/cuda/cuda_predictor.cpp b/src/application/cuda/cuda_predictor.cpp deleted file mode 100644 index 7675a5e341fc..000000000000 --- a/src/application/cuda/cuda_predictor.cpp +++ /dev/null @@ -1,275 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. 
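The cuda_tree.hpp hunk above replaces the device-side AddPredictionToScore overloads with a SyncLeafOutputFromHostToCUDA hook, so leaf values that are adjusted on the host can be pushed back to the device copy of the tree. A minimal sketch of what such a sync has to do (the patch itself uses the project's CopyFromHostToCUDADeviceOuter helper; the free function and names below are only illustrative):

#include <cuda_runtime.h>
#include <vector>

// Push host-side leaf values back to the device array after they are renewed on the CPU.
// A single blocking copy is fine here: leaf arrays are tiny compared with the training data.
inline void SyncLeafValuesToDevice(const std::vector<double>& host_leaf_value,
                                   double* device_leaf_value) {
  cudaMemcpy(device_leaf_value, host_leaf_value.data(),
             host_leaf_value.size() * sizeof(double), cudaMemcpyHostToDevice);
}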
All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - -#include "cuda_predictor.hpp" -#include - -namespace LightGBM { - -CUDAPredictor::CUDAPredictor(Boosting* boosting, int start_iteration, int num_iteration, bool is_raw_score, - bool predict_leaf_index, bool predict_contrib, bool early_stop, - int early_stop_freq, double early_stop_margin): - Predictor(boosting, start_iteration, num_iteration, is_raw_score, predict_leaf_index, predict_contrib, early_stop, early_stop_freq, early_stop_margin), - is_raw_score_(is_raw_score), predict_leaf_index_(predict_leaf_index), predict_contrib_(predict_contrib) { - if (predict_contrib_) { - Log::Fatal("pred_contrib=True is not supported by CUDA version yet."); - } - InitCUDAModel(start_iteration, num_iteration); - num_pred_in_one_row_ = static_cast(boosting_->NumPredictOneRow(start_iteration, num_iteration, predict_leaf_index, predict_contrib)); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); -} - -CUDAPredictor::~CUDAPredictor() {} - -void CUDAPredictor::Predict(const char* data_filename, const char* result_filename, bool header, bool disable_shape_check, bool precise_float_parser) { - if (predict_leaf_index_) { - CHECK_EQ(num_pred_in_one_row_, static_cast(num_iteration_)); - } - auto label_idx = header ? -1 : boosting_->LabelIdx(); - auto parser = std::unique_ptr(Parser::CreateParser(data_filename, header, boosting_->MaxFeatureIdx() + 1, label_idx, precise_float_parser)); - if (parser == nullptr) { - Log::Fatal("Could not recognize the data format of data file %s", data_filename); - } - if (!header && !disable_shape_check && parser->NumFeatures() != boosting_->MaxFeatureIdx() + 1) { - Log::Fatal("The number of features in data (%d) is not the same as it was in training data (%d).\n" \ - "You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.", - parser->NumFeatures(), boosting_->MaxFeatureIdx() + 1); - } - TextReader predict_data_reader(data_filename, header); - std::vector feature_remapper(parser->NumFeatures(), -1); - bool need_adjust = false; - if (header) { - std::string first_line = predict_data_reader.first_line(); - std::vector header_words = Common::Split(first_line.c_str(), "\t,"); - std::unordered_map header_mapper; - for (int i = 0; i < static_cast(header_words.size()); ++i) { - if (header_mapper.count(header_words[i]) > 0) { - Log::Fatal("Feature (%s) appears more than one time.", header_words[i].c_str()); - } - header_mapper[header_words[i]] = i; - } - const auto& fnames = boosting_->FeatureNames(); - for (int i = 0; i < static_cast(fnames.size()); ++i) { - if (header_mapper.count(fnames[i]) <= 0) { - Log::Warning("Feature (%s) is missed in data file. 
If it is weight/query/group/ignore_column, you can ignore this warning.", fnames[i].c_str()); - } else { - feature_remapper[header_mapper.at(fnames[i])] = i; - } - } - for (int i = 0; i < static_cast(feature_remapper.size()); ++i) { - if (feature_remapper[i] >= 0 && i != feature_remapper[i]) { - need_adjust = true; - break; - } - } - } - // function for parse data - std::function>*)> parser_fun; - double tmp_label; - parser_fun = [&parser, &feature_remapper, &tmp_label, need_adjust] - (const char* buffer, std::vector>* feature) { - parser->ParseOneLine(buffer, feature, &tmp_label); - if (need_adjust) { - int i = 0, j = static_cast(feature->size()); - while (i < j) { - if (feature_remapper[(*feature)[i].first] >= 0) { - (*feature)[i].first = feature_remapper[(*feature)[i].first]; - ++i; - } else { - // move the non-used features to the end of the feature vector - std::swap((*feature)[i], (*feature)[--j]); - } - } - feature->resize(i); - } - }; - auto writer = VirtualFileWriter::Make(result_filename); - if (!writer->Init()) { - Log::Fatal("Prediction results file %s cannot be found", result_filename); - } - PredictWithParserFun(parser_fun, &predict_data_reader, writer.get()); -} - -void CUDAPredictor::PredictWithParserFun(std::function>*)> parser_fun, - TextReader* predict_data_reader, - VirtualFileWriter* writer) { - // use lager buffer size to reduce the time spent in copying from Host to CUDA - // TODO(shiyu1994): optimize the pipeline and asynchronization behavior - const data_size_t buffer_size = 50000; - AllocateCUDAMemoryOuter(&cuda_data_, static_cast(buffer_size) * static_cast(num_feature_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_result_buffer_, static_cast(buffer_size) * static_cast(num_pred_in_one_row_), __FILE__, __LINE__); - std::vector buffer(buffer_size * num_feature_, 0.0f); - std::vector result_buffer(buffer_size * num_pred_in_one_row_, 0.0f); - auto process_fun = [&parser_fun, &writer, &buffer, &result_buffer, buffer_size, this] - (data_size_t /*start_index*/, const std::vector& lines) { - std::vector> oneline_features; - std::vector result_to_write(lines.size()); - const data_size_t num_lines = static_cast(lines.size()); - const int num_blocks = (num_lines + buffer_size - 1) / buffer_size; - for (int block_index = 0; block_index < num_blocks; ++block_index) { - const data_size_t block_start = block_index * buffer_size; - const data_size_t block_end = std::min(block_start + buffer_size, num_lines); - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) firstprivate(oneline_features) - for (data_size_t i = block_start; i < block_end; ++i) { - OMP_LOOP_EX_BEGIN(); - oneline_features.clear(); - // parser - parser_fun(lines[i].c_str(), &oneline_features); - // predict - const data_size_t index_in_block = i - block_start; - double* one_row_data = buffer.data() + index_in_block * num_feature_; - for (int feature_index = 0; feature_index < num_feature_; ++feature_index) { - one_row_data[feature_index] = 0.0f; - } - for (const auto& pair : oneline_features) { - one_row_data[pair.first] = pair.second; - } - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - CopyFromHostToCUDADeviceAsyncOuter(cuda_data_, buffer.data(), static_cast(buffer_size * num_feature_), cuda_stream_, __FILE__, __LINE__); - LaunchPredictKernelAsync(buffer_size, false); - CopyFromCUDADeviceToHostAsyncOuter(result_buffer.data(), - cuda_result_buffer_, - static_cast(buffer_size) * static_cast(num_pred_in_one_row_), - cuda_stream_, - __FILE__, - __LINE__); - SynchronizeCUDADeviceOuter(cuda_stream_, 
__FILE__, __LINE__); - { - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) - for (data_size_t i = block_start; i < block_end; ++i) { - OMP_LOOP_EX_BEGIN(); - const data_size_t index_in_block = i - block_start; - const double* begin = result_buffer.data() + index_in_block * num_pred_in_one_row_; - const double* end = begin + num_pred_in_one_row_; - result_to_write[i] = Common::Join(std::vector(begin, end), "\t"); - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - } - } - for (data_size_t i = 0; i < static_cast(result_to_write.size()); ++i) { - writer->Write(result_to_write[i].c_str(), result_to_write[i].size()); - writer->Write("\n", 1); - } - }; - predict_data_reader->ReadAllAndProcessParallel(process_fun); -} - -void CUDAPredictor::Predict(const data_size_t num_data, - const int64_t num_pred_in_one_row, - const std::function>(int row_idx)>& get_row_fun, - double* out_result) { - const data_size_t buffer_size = 50000; - CHECK_EQ(num_pred_in_one_row_, num_pred_in_one_row); - if (predict_leaf_index_) { - CHECK_EQ(num_pred_in_one_row_, static_cast(num_iteration_)); - } - AllocateCUDAMemoryOuter(&cuda_data_, static_cast(buffer_size) * static_cast(num_feature_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_result_buffer_, static_cast(buffer_size) * static_cast(num_pred_in_one_row_), __FILE__, __LINE__); - std::vector buffer(buffer_size * num_feature_, 0.0f); - const int num_blocks = (num_data + buffer_size - 1) / buffer_size; - data_size_t block_offset = 0; - for (int block_index = 0; block_index < num_blocks; ++block_index) { - Threading::For(0, buffer_size, 512, - [block_offset, get_row_fun, &buffer, this] (int /*thread_index*/, data_size_t start, data_size_t end) { - std::vector> oneline_feature; - for (data_size_t i = start; i < end; ++i) { - oneline_feature = get_row_fun(i + block_offset); - double* one_row_data = buffer.data() + i * num_feature_; - for (int feature_index = 0; feature_index < num_feature_; ++feature_index) { - one_row_data[feature_index] = 0.0f; - } - for (const auto& pair : oneline_feature) { - one_row_data[pair.first] = pair.second; - } - } - }); - CopyFromHostToCUDADeviceAsyncOuter(cuda_data_, buffer.data(), static_cast(buffer_size * num_feature_), cuda_stream_, __FILE__, __LINE__); - LaunchPredictKernelAsync(buffer_size, false); - CopyFromCUDADeviceToHostAsyncOuter(out_result + static_cast(block_offset) * static_cast(num_pred_in_one_row_), - cuda_result_buffer_, - static_cast(buffer_size) * static_cast(num_pred_in_one_row_), - cuda_stream_, - __FILE__, - __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - block_offset += buffer_size; - } -} - -void CUDAPredictor::InitCUDAModel(const int start_iteration, const int num_iteration) { - const std::vector>& models = boosting_->models(); - cuda_convert_output_function_ = boosting_->GetCUDAConvertOutputFunc(); - const int num_tree_per_iteration = boosting_->num_tree_per_iteration(); - num_iteration_ = static_cast(models.size()) / num_tree_per_iteration; - start_iteration_ = std::max(start_iteration, 0); - start_iteration_ = std::min(start_iteration_, num_iteration_); - if (num_iteration > 0) { - num_iteration_ = std::min(num_iteration, num_iteration_ - start_iteration_); - } else { - num_iteration_ = num_iteration_ - start_iteration_; - } - std::vector tree_num_leaves(num_iteration_, 0); - std::vector tree_left_child(num_iteration_, nullptr); - std::vector tree_right_child(num_iteration_, nullptr); - std::vector tree_leaf_value(num_iteration_, nullptr); - std::vector tree_threshold(num_iteration_, 
nullptr); - std::vector tree_decision_type(num_iteration_, nullptr); - std::vector tree_split_feature_index(num_iteration_, nullptr); - const int num_threads = OMP_NUM_THREADS(); - #pragma omp parallel for schedule(static) num_threads(num_threads) if (num_iteration_ >= 1024) - for (int tree_index = 0; tree_index < num_iteration_; ++tree_index) { - CHECK(models[tree_index]->is_cuda_tree()); - const CUDATree* cuda_tree = reinterpret_cast(models[tree_index + start_iteration_].get()); - tree_num_leaves[tree_index] = cuda_tree->num_leaves(); - tree_left_child[tree_index] = cuda_tree->cuda_left_child(); - tree_right_child[tree_index] = cuda_tree->cuda_right_child(); - tree_leaf_value[tree_index] = cuda_tree->cuda_leaf_value(); - tree_threshold[tree_index] = cuda_tree->cuda_threshold(); - tree_decision_type[tree_index] = cuda_tree->cuda_decision_type(); - tree_split_feature_index[tree_index] = cuda_tree->cuda_split_feature(); - } - InitCUDAMemoryFromHostMemoryOuter(&cuda_tree_num_leaves_, - tree_num_leaves.data(), - tree_num_leaves.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_left_child_, - tree_left_child.data(), - tree_left_child.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_right_child_, - tree_right_child.data(), - tree_right_child.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_leaf_value_, - tree_leaf_value.data(), - tree_leaf_value.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_threshold_, - tree_threshold.data(), - tree_threshold.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_decision_type_, - tree_decision_type.data(), - tree_decision_type.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_split_feature_index_, - tree_split_feature_index.data(), - tree_split_feature_index.size(), - __FILE__, - __LINE__); -} - -} // namespace LightGBM diff --git a/src/application/cuda/cuda_predictor.cu b/src/application/cuda/cuda_predictor.cu deleted file mode 100644 index 24a2f8c94846..000000000000 --- a/src/application/cuda/cuda_predictor.cu +++ /dev/null @@ -1,127 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
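The predictor files deleted in this patch (cuda_predictor.cpp above, cuda_predictor.cu below) implemented batched GPU prediction: a block of rows is staged in a host buffer, copied to the device, and a kernel walks every tree for each row, accumulating leaf values into a score buffer. A minimal, self-contained sketch of the per-row traversal, with illustrative array names but the same leaf encoding the deleted kernel uses (internal nodes have non-negative indices, a leaf is reached when the child index goes negative, and ~node recovers the leaf index):

#include <cuda_runtime.h>

// Walk one flattened decision tree for each data row and accumulate its leaf value.
// data is row-major (num_rows x num_features); the tree arrays are indexed by node.
__global__ void PredictRowsKernel(const double* data, int num_rows, int num_features,
                                  const int* left_child, const int* right_child,
                                  const double* threshold, const int* split_feature,
                                  const double* leaf_value, double* out_score) {
  const int row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row >= num_rows) return;
  const double* features = data + static_cast<size_t>(row) * num_features;
  int node = 0;
  while (node >= 0) {  // non-negative index: still at an internal node
    node = features[split_feature[node]] <= threshold[node] ? left_child[node]
                                                            : right_child[node];
  }
  out_score[row] += leaf_value[~node];  // negative index encodes ~leaf_index
}

The real kernel additionally stages each tree's arrays in shared memory and handles missing values via the per-node decision type; the sketch keeps only the traversal itself.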
- */ - -#include "cuda_predictor.hpp" - -namespace LightGBM { - -template -__global__ void PredictKernel(const data_size_t num_data, - const int num_feature, - const int* num_leaves, - const int** left_child, - const int** right_child, - const double** threshold, - const int8_t** decision_type, - const double** leaf_value, - const int** split_feature_index, - const int num_trees, - double* data, - double* cuda_result_buffer) { - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const unsigned int thread_index = threadIdx.x; - double* data_pointer = nullptr; - if (data_index < num_data) { - data_pointer = data + data_index * num_feature; - if (!PREDICT_LEAF_INDEX) { - cuda_result_buffer[data_index] = 0.0f; - } - } - __shared__ double shared_tree_threshold[CUDA_PREDICTOR_MAX_TREE_SIZE]; - __shared__ int shared_tree_left_child[CUDA_PREDICTOR_MAX_TREE_SIZE]; - __shared__ int shared_tree_right_child[CUDA_PREDICTOR_MAX_TREE_SIZE]; - __shared__ int8_t shared_tree_decision_type[CUDA_PREDICTOR_MAX_TREE_SIZE]; - __shared__ double shared_tree_leaf_value[CUDA_PREDICTOR_MAX_TREE_SIZE]; - __shared__ int shared_tree_split_feature_index[CUDA_PREDICTOR_MAX_TREE_SIZE]; - for (int tree_index = 0; tree_index < num_trees; ++tree_index) { - const int tree_num_leaves = num_leaves[tree_index]; - const int* tree_left_child = left_child[tree_index]; - const int* tree_right_child = right_child[tree_index]; - const double* tree_threshold = threshold[tree_index]; - const double* tree_leaf_value = leaf_value[tree_index]; - const int8_t* tree_decision_type = decision_type[tree_index]; - const int* tree_split_feature_index = split_feature_index[tree_index]; - for (int leaf_index = static_cast(thread_index); leaf_index < tree_num_leaves; leaf_index += static_cast(blockDim.x)) { - shared_tree_threshold[leaf_index] = tree_threshold[leaf_index]; - shared_tree_left_child[leaf_index] = tree_left_child[leaf_index]; - shared_tree_right_child[leaf_index] = tree_right_child[leaf_index]; - shared_tree_leaf_value[leaf_index] = tree_leaf_value[leaf_index]; - shared_tree_decision_type[leaf_index] = tree_decision_type[leaf_index]; - shared_tree_split_feature_index[leaf_index] = tree_split_feature_index[leaf_index]; - } - __syncthreads(); - if (data_index < num_data) { - int node = 0; - while (node >= 0) { - const double node_threshold = shared_tree_threshold[node]; - const int node_split_feature_index = shared_tree_split_feature_index[node]; - const int8_t node_decision_type = shared_tree_decision_type[node]; - double value = data_pointer[node_split_feature_index]; - uint8_t missing_type = GetMissingTypeCUDA(node_decision_type); - if (isnan(value) && missing_type != MissingType::NaN) { - value = 0.0f; - } - if ((missing_type == MissingType::Zero && IsZeroCUDA(value)) || - (missing_type == MissingType::NaN && isnan(value))) { - if (GetDecisionTypeCUDA(node_decision_type, kDefaultLeftMask)) { - node = shared_tree_left_child[node]; - } else { - node = shared_tree_right_child[node]; - } - } else { - if (value <= node_threshold) { - node = shared_tree_left_child[node]; - } else { - node = shared_tree_right_child[node]; - } - } - } - if (PREDICT_LEAF_INDEX) { - cuda_result_buffer[data_index * num_trees + tree_index] = ~node; - } else { - cuda_result_buffer[data_index] += shared_tree_leaf_value[~node]; - } - } - __syncthreads(); - } -} - -#define PREDICT_KERNEL_ARGS \ - num_data, \ - num_feature_, \ - cuda_tree_num_leaves_, \ - cuda_left_child_, \ - cuda_right_child_, \ - cuda_threshold_, \ - 
cuda_decision_type_, \ - cuda_leaf_value_, \ - cuda_split_feature_index_, \ - num_iteration_, \ - cuda_data_, \ - cuda_result_buffer_ - -void CUDAPredictor::LaunchPredictKernelAsync(const data_size_t num_data, const bool is_csr) { - const int num_blocks = (num_data + CUAA_PREDICTOR_PREDICT_BLOCK_SIZE - 1) / CUAA_PREDICTOR_PREDICT_BLOCK_SIZE; - if (is_csr) { - if (predict_leaf_index_) { - PredictKernel<<>>(PREDICT_KERNEL_ARGS); - } else { - PredictKernel<<>>(PREDICT_KERNEL_ARGS); - } - } else { - if (predict_leaf_index_) { - PredictKernel<<>>(PREDICT_KERNEL_ARGS); - } else { - PredictKernel<<>>(PREDICT_KERNEL_ARGS); - } - } - if (!is_raw_score_ && !predict_leaf_index_) { - cuda_convert_output_function_(num_data, cuda_result_buffer_, cuda_result_buffer_); - } -} - -#undef PREDICT_KERNEL_ARGS - -} // namespace LightGBM diff --git a/src/application/cuda/cuda_predictor.hpp b/src/application/cuda/cuda_predictor.hpp deleted file mode 100644 index f795dfa7dfb8..000000000000 --- a/src/application/cuda/cuda_predictor.hpp +++ /dev/null @@ -1,72 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifndef LIGHTGBM_APPLICATION_CUDA_CUDA_PREDICTOR_HPP_ -#define LIGHTGBM_APPLICATION_CUDA_CUDA_PREDICTOR_HPP_ - -#include -#include -#include -#include - -#include "../predictor.hpp" - -#define CUDA_PREDICTOR_MAX_TREE_SIZE (1024) -#define CUAA_PREDICTOR_PREDICT_BLOCK_SIZE (1024) - -namespace LightGBM { - -class CUDAPredictor : public Predictor { - public: - CUDAPredictor(Boosting* boosting, int start_iteration, int num_iteration, bool is_raw_score, - bool predict_leaf_index, bool predict_contrib, bool early_stop, - int early_stop_freq, double early_stop_margin); - - ~CUDAPredictor(); - - virtual void Predict(const char* data_filename, const char* result_filename, bool header, bool disable_shape_check, bool precise_float_parser) override; - - virtual void Predict(const data_size_t num_data, - const int64_t num_pred_in_one_row, - const std::function>(int row_idx)>& get_row_fun, - double* out_result) override; - - private: - void InitCUDAModel(const int start_iteration, const int num_iteration); - - void LaunchPredictKernelAsync(const data_size_t num_data, const bool is_csr); - - void PredictWithParserFun(std::function>*)> parser_fun, - TextReader* predict_data_reader, - VirtualFileWriter* writer); - - std::function>*)> GetParserFun(const char* data_filename, - const bool header, - const bool disable_shape_check); - - double* cuda_result_buffer_; - double* cuda_data_; - - int* cuda_tree_num_leaves_; - const int** cuda_left_child_; - const int** cuda_right_child_; - const double** cuda_threshold_; - const int8_t** cuda_decision_type_; - const double** cuda_leaf_value_; - const int** cuda_split_feature_index_; - - cudaStream_t cuda_stream_; - - int start_iteration_; - int num_iteration_; - int64_t num_pred_in_one_row_; - const bool is_raw_score_; - const bool predict_leaf_index_; - const bool predict_contrib_; - std::function cuda_convert_output_function_; -}; - -} // namespace LightGBM - -#endif // LIGHTGBM_APPLICATION_CUDA_CUDA_PREDICTOR_HPP_ diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp index ba6e9f726f9b..836ecb6dbf16 100644 --- a/src/application/predictor.hpp +++ b/src/application/predictor.hpp @@ -253,23 +253,7 @@ class Predictor { predict_data_reader.ReadAllAndProcessParallel(process_fun); } - virtual void Predict(const data_size_t num_data, - const 
int64_t num_pred_in_one_row, - const std::function>(int row_idx)>& get_row_fun, - double* out_result) { - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) - for (int i = 0; i < num_data; ++i) { - OMP_LOOP_EX_BEGIN(); - auto one_row = get_row_fun(i); - auto pred_wrt_ptr = out_result + static_cast(num_pred_in_one_row) * i; - predict_fun_(one_row, pred_wrt_ptr); - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - } - - protected: + private: void CopyToPredictBuffer(double* pred_buf, const std::vector>& features) { for (const auto &feature : features) { if (feature.first < num_feature_) { diff --git a/src/boosting/cuda/cuda_score_updater.cpp b/src/boosting/cuda/cuda_score_updater.cpp deleted file mode 100644 index 336842ea5fa8..000000000000 --- a/src/boosting/cuda/cuda_score_updater.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - -#include "cuda_score_updater.hpp" - -namespace LightGBM { - -CUDAScoreUpdater::CUDAScoreUpdater(const Dataset* data, int num_tree_per_iteration): - ScoreUpdater(data, num_tree_per_iteration), num_threads_per_block_(1024) { - num_data_ = data->num_data(); - int64_t total_size = static_cast(num_data_) * num_tree_per_iteration; - InitCUDA(total_size); - has_init_score_ = false; - const double* init_score = data->metadata().init_score(); - // if exists initial score, will start from it - if (init_score != nullptr) { - if ((data->metadata().num_init_score() % num_data_) != 0 - || (data->metadata().num_init_score() / num_data_) != num_tree_per_iteration) { - Log::Fatal("Number of class for initial score error"); - } - has_init_score_ = true; - CopyFromHostToCUDADeviceOuter(cuda_score_, init_score, total_size, __FILE__, __LINE__); - } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -void CUDAScoreUpdater::InitCUDA(const size_t total_size) { - AllocateCUDAMemoryOuter(&cuda_score_, total_size, __FILE__, __LINE__); -} - -CUDAScoreUpdater::~CUDAScoreUpdater() { - DeallocateCUDAMemoryOuter(&cuda_score_, __FILE__, __LINE__); -} - -inline void CUDAScoreUpdater::AddScore(double val, int cur_tree_id) { - Common::FunctionTimer fun_timer("CUDAScoreUpdater::AddScore", global_timer); - const size_t offset = static_cast(num_data_) * cur_tree_id; - LaunchAddScoreConstantKernel(val, offset); -} - -inline void CUDAScoreUpdater::AddScore(const Tree* tree, int cur_tree_id) { - Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); - const size_t offset = static_cast(num_data_) * cur_tree_id; - tree->AddPredictionToScore(data_, num_data_, cuda_score_ + offset); -} - -inline void CUDAScoreUpdater::AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) { - Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); - const size_t offset = static_cast(num_data_) * cur_tree_id; - tree_learner->AddPredictionToScore(tree, cuda_score_ + offset); -} - -inline void CUDAScoreUpdater::AddScore(const Tree* tree, const data_size_t* data_indices, - data_size_t data_cnt, int cur_tree_id) { - // TODO(shiyu1994): bagging is not supported yet - Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); - const size_t offset = static_cast(num_data_) * cur_tree_id; - tree->AddPredictionToScore(data_, data_indices, data_cnt, cuda_score_ + offset); -} - -inline void CUDAScoreUpdater::MultiplyScore(double val, int cur_tree_id) { - Common::FunctionTimer 
fun_timer("CUDAScoreUpdater::MultiplyScore", global_timer); - const size_t offset = static_cast(num_data_) * cur_tree_id; - LaunchMultiplyScoreConstantKernel(val, offset); -} - -} // namespace LightGBM diff --git a/src/boosting/cuda/cuda_score_updater.cu b/src/boosting/cuda/cuda_score_updater.cu deleted file mode 100644 index 009a1873bd2d..000000000000 --- a/src/boosting/cuda/cuda_score_updater.cu +++ /dev/null @@ -1,43 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - -#include "cuda_score_updater.hpp" - -namespace LightGBM { - -__global__ void AddScoreConstantKernel( - const double val, - const size_t offset, - const data_size_t num_data, - double* score) { - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (data_index < num_data) { - score[data_index + offset] += val; - } -} - -void CUDAScoreUpdater::LaunchAddScoreConstantKernel(const double val, const size_t offset) { - const int num_blocks = (num_data_ + num_threads_per_block_) / num_threads_per_block_; - Log::Warning("adding init score = %f", val); - AddScoreConstantKernel<<>>(val, offset, num_data_, cuda_score_); -} - -__global__ void MultiplyScoreConstantKernel( - const double val, - const size_t offset, - const data_size_t num_data, - double* score) { - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (data_index < num_data) { - score[data_index] *= val; - } -} - -void CUDAScoreUpdater::LaunchMultiplyScoreConstantKernel(const double val, const size_t offset) { - const int num_blocks = (num_data_ + num_threads_per_block_) / num_threads_per_block_; - MultiplyScoreConstantKernel<<>>(val, offset, num_data_, cuda_score_); -} - -} diff --git a/src/boosting/cuda/cuda_score_updater.hpp b/src/boosting/cuda/cuda_score_updater.hpp deleted file mode 100644 index 623df1c84740..000000000000 --- a/src/boosting/cuda/cuda_score_updater.hpp +++ /dev/null @@ -1,48 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - -#include - -#include "../score_updater.hpp" - -namespace LightGBM { - -class CUDAScoreUpdater: public ScoreUpdater { - public: - CUDAScoreUpdater(const Dataset* data, int num_tree_per_iteration); - - ~CUDAScoreUpdater(); - - inline void AddScore(double val, int cur_tree_id) override; - - inline void AddScore(const Tree* tree, int cur_tree_id) override; - - inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) override; - - inline void AddScore(const Tree* tree, const data_size_t* data_indices, - data_size_t data_cnt, int cur_tree_id) override; - - inline void MultiplyScore(double val, int cur_tree_id) override; - - inline const double* score() const override { return cuda_score_; } - - /*! 
\brief Disable copy */ - CUDAScoreUpdater& operator=(const CUDAScoreUpdater&) = delete; - - CUDAScoreUpdater(const CUDAScoreUpdater&) = delete; - - private: - void InitCUDA(const size_t total_size); - - void LaunchAddScoreConstantKernel(const double val, const size_t offset); - - void LaunchMultiplyScoreConstantKernel(const double val, const size_t offset); - - double* cuda_score_; - - const int num_threads_per_block_; -}; - -} // namespace LightGBM diff --git a/src/boosting/cuda/gbdt.cu b/src/boosting/cuda/gbdt.cu deleted file mode 100644 index 9a029f75d941..000000000000 --- a/src/boosting/cuda/gbdt.cu +++ /dev/null @@ -1,33 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - -#include "../gbdt.h" - -#define COPY_SUBSAMPLE_GRADIENTS_BLOCK_SIZE (1024) - -namespace LightGBM { - -__global__ void CopySubsampleGradientsKernel( - score_t* dst_grad, score_t* dst_hess, - const score_t* src_grad, const score_t* src_hess, - const data_size_t* bag_data_indices, - const data_size_t bag_data_cnt) { - const data_size_t local_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (local_data_index < bag_data_cnt) { - const data_size_t global_data_index = bag_data_indices[local_data_index]; - dst_grad[local_data_index] = src_grad[global_data_index]; - dst_hess[local_data_index] = src_hess[global_data_index]; - } -} - -void GBDT::LaunchCopySubsampleGradientsKernel( -score_t* dst_grad, score_t* dst_hess, -const score_t* src_grad, const score_t* src_hess) { - const int num_blocks = (bag_data_cnt_ + COPY_SUBSAMPLE_GRADIENTS_BLOCK_SIZE - 1) / COPY_SUBSAMPLE_GRADIENTS_BLOCK_SIZE; - CopySubsampleGradientsKernel<<>>( - dst_grad, dst_hess, src_grad, src_hess, bag_data_indices_.data(), bag_data_cnt_); -} - -} // namespace LightGBM diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index a83decb550a3..d393d46d5133 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -67,8 +67,6 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective if (config_->device_type == std::string("cuda")) { LGBM_config_::current_learner = use_cuda_learner; - const int gpu_device_id = config_->gpu_device_id >= 0 ? 
config_->gpu_device_id : 0; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); } // load forced_splits file @@ -105,26 +103,14 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective } training_metrics_.shrink_to_fit(); - if (config_->device_type == std::string("cuda")) { - train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_)); - } else { - train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_)); - } + train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_)); num_data_ = train_data_->num_data(); // create buffer for gradients and Hessians - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - if (config_->device_type == std::string("cuda")) { - AllocateCUDAMemoryOuter(&gradients_pointer_, total_size, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&hessians_pointer_, total_size, __FILE__, __LINE__); - } if (objective_function_ != nullptr) { + size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; gradients_.resize(total_size); hessians_.resize(total_size); - if (config_->device_type == std::string("cpu")) { - gradients_pointer_ = gradients_.data(); - hessians_pointer_ = hessians_.data(); - } } // get max feature index max_feature_idx_ = train_data_->num_total_features() - 1; @@ -157,9 +143,7 @@ void GBDT::AddValidDataset(const Dataset* valid_data, Log::Fatal("Cannot add validation data, since it has different bin mappers with training data"); } // for a validation dataset, we need its score and metric - auto new_score_updater = config_->device_type == std::string("cuda") ? - std::unique_ptr(new CUDAScoreUpdater(valid_data, num_tree_per_iteration_)) : - std::unique_ptr(new ScoreUpdater(valid_data, num_tree_per_iteration_)); + auto new_score_updater = std::unique_ptr(new ScoreUpdater(valid_data, num_tree_per_iteration_)); // update score for (int i = 0; i < iter_; ++i) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { @@ -327,8 +311,8 @@ void GBDT::RefitTree(const std::vector>& tree_leaf_prediction) CHECK_LT(leaf_pred[i], models_[model_index]->num_leaves()); } size_t offset = static_cast(tree_id) * num_data_; - auto grad = gradients_pointer_ + offset; - auto hess = hessians_pointer_ + offset; + auto grad = gradients_.data() + offset; + auto hess = hessians_.data() + offset; auto new_tree = tree_learner_->FitByExistingTree(models_[model_index].get(), leaf_pred, grad, hess); train_score_updater_->AddScore(tree_learner_.get(), new_tree, tree_id); models_[model_index].reset(new_tree); @@ -391,22 +375,12 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true); } Boosting(); - } - if (config_->device_type == std::string("cuda")) { - const size_t total_size = static_cast(num_data_ * num_class_); - const score_t* host_gradients = gradients == nullptr ? gradients_.data() : gradients; - const score_t* host_hessians = hessians == nullptr ? 
hessians_.data() : hessians; - global_timer.Start("Copy gradients from Host to CUDA"); - CopyFromHostToCUDADeviceOuter(gradients_pointer_, host_gradients, total_size, __FILE__, __LINE__); - CopyFromHostToCUDADeviceOuter(hessians_pointer_, host_hessians, total_size, __FILE__, __LINE__); - global_timer.Stop("Copy gradients from Host to CUDA"); - } - if (gradients == nullptr || hessians == nullptr || config_->device_type == std::string("cuda")) { - gradients = gradients_pointer_; - hessians = hessians_pointer_; + gradients = gradients_.data(); + hessians = hessians_.data(); } // bagging logic Bagging(iter_); + bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { const size_t offset = static_cast(cur_tree_id) * num_data_; @@ -416,16 +390,12 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto hess = hessians + offset; // need to copy gradients for bagging subset. if (is_use_subset_ && bag_data_cnt_ < num_data_) { - if (config_->device_type == std::string("cuda")) { - CopySubsampleGradientsCUDA(gradients_pointer_ + offset, hessians_pointer_ + offset, grad, hess); - } else { - for (int i = 0; i < bag_data_cnt_; ++i) { - gradients_pointer_[offset + i] = grad[bag_data_indices_[i]]; - gradients_pointer_[offset + i] = hess[bag_data_indices_[i]]; - } + for (int i = 0; i < bag_data_cnt_; ++i) { + gradients_[offset + i] = grad[bag_data_indices_[i]]; + hessians_[offset + i] = hess[bag_data_indices_[i]]; } - grad = gradients_pointer_ + offset; - hess = hessians_pointer_ + offset; + grad = gradients_.data() + offset; + hess = hessians_.data() + offset; } bool is_first_tree = models_.size() < static_cast(num_tree_per_iteration_); new_tree.reset(tree_learner_->Train(grad, hess, is_first_tree)); @@ -436,7 +406,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto score_ptr = train_score_updater_->score() + offset; auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast(label[i]) - score_ptr[i]; }; tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter, - score_ptr, num_data_, bag_data_indices_.data(), bag_data_cnt_); + num_data_, bag_data_indices_.data(), bag_data_cnt_); // shrinkage by learning rate new_tree->Shrinkage(shrinkage_rate_); // update score @@ -457,7 +427,6 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } new_tree->AsConstantTree(output); // updates scores - // TODO(shiyu1994): check here, default score has been added in BoostFromAverage ? 
train_score_updater_->AddScore(output, cur_tree_id); for (auto& score_updater : valid_score_updater_) { score_updater->AddScore(output, cur_tree_id); @@ -541,15 +510,8 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { } } -std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const { - if (config_->device_type == std::string("cuda")) { - metric_temp_score_.resize(num_data * num_class_, 0.0f); - CopyFromCUDADeviceToHostOuter(metric_temp_score_.data(), score, static_cast(num_data * num_class_), __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - return metric->Eval(metric_temp_score_.data(), objective_function_); - } else { - return metric->Eval(score, objective_function_); - } +std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score) const { + return metric->Eval(score, objective_function_); } std::string GBDT::OutputMetric(int iter) { @@ -561,7 +523,7 @@ std::string GBDT::OutputMetric(int iter) { if (need_output) { for (auto& sub_metric : training_metrics_) { auto name = sub_metric->GetName(); - auto scores = EvalOneMetric(sub_metric, train_score_updater_->score(), train_data_->num_data()); + auto scores = EvalOneMetric(sub_metric, train_score_updater_->score()); for (size_t k = 0; k < name.size(); ++k) { std::stringstream tmp_buf; tmp_buf << "Iteration:" << iter @@ -578,7 +540,7 @@ std::string GBDT::OutputMetric(int iter) { if (need_output || early_stopping_round_ > 0) { for (size_t i = 0; i < valid_metrics_.size(); ++i) { for (size_t j = 0; j < valid_metrics_[i].size(); ++j) { - auto test_scores = EvalOneMetric(valid_metrics_[i][j], valid_score_updater_[i]->score(), valid_score_updater_[i]->num_data()); + auto test_scores = EvalOneMetric(valid_metrics_[i][j], valid_score_updater_[i]->score()); auto name = valid_metrics_[i][j]->GetName(); for (size_t k = 0; k < name.size(); ++k) { std::stringstream tmp_buf; @@ -618,7 +580,7 @@ std::vector GBDT::GetEvalAt(int data_idx) const { std::vector ret; if (data_idx == 0) { for (auto& sub_metric : training_metrics_) { - auto scores = EvalOneMetric(sub_metric, train_score_updater_->score(), train_score_updater_->num_data()); + auto scores = EvalOneMetric(sub_metric, train_score_updater_->score()); for (auto score : scores) { ret.push_back(score); } @@ -626,7 +588,7 @@ std::vector GBDT::GetEvalAt(int data_idx) const { } else { auto used_idx = data_idx - 1; for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) { - auto test_scores = EvalOneMetric(valid_metrics_[used_idx][j], valid_score_updater_[used_idx]->score(), valid_score_updater_[used_idx]->num_data()); + auto test_scores = EvalOneMetric(valid_metrics_[used_idx][j], valid_score_updater_[used_idx]->score()); for (auto score : test_scores) { ret.push_back(score); } @@ -638,13 +600,7 @@ std::vector GBDT::GetEvalAt(int data_idx) const { /*! 
\brief Get training scores result */ const double* GBDT::GetTrainingScore(int64_t* out_len) { *out_len = static_cast(train_score_updater_->num_data()) * num_class_; - if (config_->device_type == std::string("cpu")) { - return train_score_updater_->score(); - } else if (config_->device_type == std::string("cuda")) { - training_temp_score_.resize(*out_len); - CopyFromCUDADeviceToHostOuter(training_temp_score_.data(), train_score_updater_->score(), *out_len, __FILE__, __LINE__); - return training_temp_score_.data(); - } + return train_score_updater_->score(); } void GBDT::PredictContrib(const double* features, double* output) const { @@ -766,15 +722,8 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* // create buffer for gradients and hessians if (objective_function_ != nullptr) { size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - if (config_->device_type == std::string("cuda")) { - AllocateCUDAMemoryOuter(&gradients_pointer_, total_size, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&hessians_pointer_, total_size, __FILE__, __LINE__); - } else { - gradients_.resize(total_size); - hessians_.resize(total_size); - gradients_pointer_ = gradients_.data(); - hessians_pointer_ = hessians_.data(); - } + gradients_.resize(total_size); + hessians_.resize(total_size); } max_feature_idx_ = train_data_->num_total_features() - 1; @@ -874,15 +823,8 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { if (is_use_subset_ && bag_data_cnt_ < num_data_) { if (objective_function_ == nullptr) { size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - if (config_->device_type == std::string("cuda")) { - AllocateCUDAMemoryOuter(&gradients_pointer_, total_size, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&hessians_pointer_, total_size, __FILE__, __LINE__); - } else { - gradients_.resize(total_size); - hessians_.resize(total_size); - gradients_pointer_ = gradients_.data(); - hessians_pointer_ = hessians_.data(); - } + gradients_.resize(total_size); + hessians_.resize(total_size); } } } else { @@ -893,10 +835,4 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { } } -void GBDT::CopySubsampleGradientsCUDA( -score_t* dst_grad, score_t* dst_hess, -const score_t* src_grad, const score_t* src_hess) { - LaunchCopySubsampleGradientsKernel(dst_grad, dst_hess, src_grad, src_hess); -} - } // namespace LightGBM diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index c707d67b8413..9ef33ca53ff7 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -24,7 +23,6 @@ #include #include -#include "cuda/cuda_score_updater.hpp" #include "score_updater.hpp" namespace LightGBM { @@ -396,18 +394,6 @@ class GBDT : public GBDTBase { bool IsLinear() const override { return linear_tree_; } - const std::vector>& models() const override { return models_; } - - int num_tree_per_iteration() const override { return num_tree_per_iteration_; } - - virtual std::function GetCUDAConvertOutputFunc() const { - if (objective_function_ != nullptr) { - return objective_function_->GetCUDAConvertOutputFunc(); - } else { - return [] (data_size_t, const double*, double*) {}; - } - } - protected: virtual bool GetIsConstHessian(const ObjectiveFunction* objective_function) { if (objective_function != nullptr) { @@ -454,7 +440,7 @@ class GBDT : public GBDTBase { * \brief eval results for one metric */ - virtual std::vector EvalOneMetric(const 
Metric* metric, const double* score, const data_size_t num_data) const; + virtual std::vector EvalOneMetric(const Metric* metric, const double* score) const; /*! * \brief Print metric result of current iteration @@ -465,14 +451,6 @@ class GBDT : public GBDTBase { double BoostFromAverage(int class_id, bool update_scorer); - void CopySubsampleGradientsCUDA( - score_t* dst_grad, score_t* dst_hess, - const score_t* src_grad, const score_t* src_hess); - - void LaunchCopySubsampleGradientsKernel( - score_t* dst_grad, score_t* dst_hess, - const score_t* src_grad, const score_t* src_hess); - /*! \brief current iteration */ int iter_; /*! \brief Pointer to training data */ @@ -517,8 +495,6 @@ class GBDT : public GBDTBase { /*! \brief Second order derivative of training data */ std::vector> hessians_; #endif - score_t* gradients_pointer_; - score_t* hessians_pointer_; /*! \brief Store the indices of in-bag data */ std::vector> bag_data_indices_; /*! \brief Number of in-bag data */ @@ -557,10 +533,6 @@ class GBDT : public GBDTBase { ParallelPartitionRunner bagging_runner_; Json forced_splits_json_; bool linear_tree_; - /*! \brief temporary storage on CPU for the evaluation of metric when CUDA tree learner is used */ - mutable std::vector metric_temp_score_; - /*! \brief temporary storage on CPU for training data when CUDA tree learner is used */ - std::vector training_temp_score_; }; } // namespace LightGBM diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 35646c6a170e..5a9eb226fef5 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -132,7 +132,7 @@ class RF : public GBDT { double pred = init_scores_[cur_tree_id]; auto residual_getter = [pred](const label_t* label, int i) {return static_cast(label[i]) - pred; }; tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter, - train_score_updater_->score(), num_data_, bag_data_indices_.data(), bag_data_cnt_); + num_data_, bag_data_indices_.data(), bag_data_cnt_); if (std::fabs(init_scores_[cur_tree_id]) > kEpsilon) { new_tree->AddBias(init_scores_[cur_tree_id]); } diff --git a/src/boosting/score_updater.hpp b/src/boosting/score_updater.hpp index 0e79ed762736..7446691a4709 100644 --- a/src/boosting/score_updater.hpp +++ b/src/boosting/score_updater.hpp @@ -51,7 +51,7 @@ class ScoreUpdater { inline bool has_init_score() const { return has_init_score_; } - virtual inline void AddScore(double val, int cur_tree_id) { + inline void AddScore(double val, int cur_tree_id) { Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); const size_t offset = static_cast(num_data_) * cur_tree_id; #pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024) @@ -60,7 +60,7 @@ class ScoreUpdater { } } - virtual inline void MultiplyScore(double val, int cur_tree_id) { + inline void MultiplyScore(double val, int cur_tree_id) { const size_t offset = static_cast(num_data_) * cur_tree_id; #pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024) for (int i = 0; i < num_data_; ++i) { @@ -73,7 +73,7 @@ class ScoreUpdater { * \param tree Trained tree model * \param cur_tree_id Current tree for multiclass training */ - virtual inline void AddScore(const Tree* tree, int cur_tree_id) { + inline void AddScore(const Tree* tree, int cur_tree_id) { Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); const size_t offset = static_cast(num_data_) * cur_tree_id; tree->AddPredictionToScore(data_, num_data_, score_.data() + offset); @@ -85,7 +85,7 @@ class ScoreUpdater { * \param 
tree_learner * \param cur_tree_id Current tree for multiclass training */ - virtual inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) { + inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) { Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); const size_t offset = static_cast(num_data_) * cur_tree_id; tree_learner->AddPredictionToScore(tree, score_.data() + offset); @@ -98,14 +98,14 @@ class ScoreUpdater { * \param data_cnt Number of data that will be processed * \param cur_tree_id Current tree for multiclass training */ - virtual inline void AddScore(const Tree* tree, const data_size_t* data_indices, + inline void AddScore(const Tree* tree, const data_size_t* data_indices, data_size_t data_cnt, int cur_tree_id) { Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); const size_t offset = static_cast(num_data_) * cur_tree_id; tree->AddPredictionToScore(data_, data_indices, data_cnt, score_.data() + offset); } /*! \brief Pointer of score */ - virtual inline const double* score() const { return score_.data(); } + inline const double* score() const { return score_.data(); } inline data_size_t num_data() const { return num_data_; } @@ -114,7 +114,7 @@ class ScoreUpdater { /*! \brief Disable copy */ ScoreUpdater(const ScoreUpdater&) = delete; - protected: + private: /*! \brief Number of total data */ data_size_t num_data_; /*! \brief Pointer of data set */ diff --git a/src/c_api.cpp b/src/c_api.cpp index d81ea3ef5220..3f11223c8bcb 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -26,7 +26,6 @@ #include #include -#include "application/cuda/cuda_predictor.hpp" #include "application/predictor.hpp" #include #include @@ -403,7 +402,7 @@ class Booster { *out_len = single_row_predictor->num_pred_in_one_row; } - Predictor* CreatePredictor(int start_iteration, int num_iteration, int predict_type, int ncol, const Config& config) const { + Predictor CreatePredictor(int start_iteration, int num_iteration, int predict_type, int ncol, const Config& config) const { if (!config.predict_disable_shape_check && ncol != boosting_->MaxFeatureIdx() + 1) { Log::Fatal("The number of features in data (%d) is not the same as it was in training data (%d).\n" \ "You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.", ncol, boosting_->MaxFeatureIdx() + 1); @@ -420,7 +419,8 @@ class Booster { } else { is_raw_score = false; } - return new Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, + + return Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); } @@ -429,7 +429,7 @@ class Booster { const Config& config, double* out_result, int64_t* out_len) const { SHARED_LOCK(mutex_); - auto predictor = std::unique_ptr(CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config)); + auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config); bool is_predict_leaf = false; bool predict_contrib = false; if (predict_type == C_API_PREDICT_LEAF_INDEX) { @@ -438,7 +438,17 @@ class Booster { predict_contrib = true; } int64_t num_pred_in_one_row = boosting_->NumPredictOneRow(start_iteration, num_iteration, is_predict_leaf, predict_contrib); - predictor->Predict(nrow, num_pred_in_one_row, get_row_fun, out_result); + auto pred_fun = 
predictor.GetPredictFunction(); + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) + for (int i = 0; i < nrow; ++i) { + OMP_LOOP_EX_BEGIN(); + auto one_row = get_row_fun(i); + auto pred_wrt_ptr = out_result + static_cast(num_pred_in_one_row) * i; + pred_fun(one_row, pred_wrt_ptr); + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); *out_len = num_pred_in_one_row * nrow; } @@ -448,8 +458,8 @@ class Booster { std::vector>>* agg_ptr, int32_t** out_indices, void** out_data, int data_type, bool* is_data_float32_ptr, int num_matrices) const { - auto predictor = std::unique_ptr(CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config)); - auto pred_sparse_fun = predictor->GetPredictSparseFunction(); + auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config); + auto pred_sparse_fun = predictor.GetPredictSparseFunction(); std::vector>>& agg = *agg_ptr; OMP_INIT_EX(); #pragma omp parallel for schedule(static) @@ -583,8 +593,8 @@ class Booster { SHARED_LOCK(mutex_); // Get the number of trees per iteration (for multiclass scenario we output multiple sparse matrices) int num_matrices = boosting_->NumModelPerIteration(); - auto predictor = std::unique_ptr(CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config)); - auto pred_sparse_fun = predictor->GetPredictSparseFunction(); + auto predictor = CreatePredictor(start_iteration, num_iteration, predict_type, ncol, config); + auto pred_sparse_fun = predictor.GetPredictSparseFunction(); bool is_col_ptr_int32 = false; bool is_data_float32 = false; int num_output_cols = ncol + 1; @@ -700,11 +710,10 @@ class Booster { } else { is_raw_score = false; } - std::unique_ptr predictor; - predictor.reset(new Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, - config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin)); + Predictor predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, + config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); bool bool_data_has_header = data_has_header > 0 ? 
true : false; - predictor->Predict(data_filename, result_filename, bool_data_has_header, config.predict_disable_shape_check, + predictor.Predict(data_filename, result_filename, bool_data_has_header, config.predict_disable_shape_check, config.precise_float_parser); } diff --git a/src/io/cuda/cuda_column_data.cu b/src/io/cuda/cuda_column_data.cu index e5d982248b72..12aee9bc9b4e 100644 --- a/src/io/cuda/cuda_column_data.cu +++ b/src/io/cuda/cuda_column_data.cu @@ -22,21 +22,21 @@ __global__ void CopySubrowKernel_ColumnData( const data_size_t local_data_index_start = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (bit_type == 8) { const uint8_t* true_in_column_data = reinterpret_cast(in_column_data); - uint8_t* true_out_column_data = reinterpret_cast(out_column_data); + uint8_t* true_out_column_data = reinterpret_cast(out_column_data); for (data_size_t local_data_index = local_data_index_start; local_data_index < num_used_indices; local_data_index += static_cast(blockDim.x)) { const data_size_t global_data_index = cuda_used_indices[local_data_index]; true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; } } else if (bit_type == 16) { const uint16_t* true_in_column_data = reinterpret_cast(in_column_data); - uint16_t* true_out_column_data = reinterpret_cast(out_column_data); + uint16_t* true_out_column_data = reinterpret_cast(out_column_data); for (data_size_t local_data_index = local_data_index_start; local_data_index < num_used_indices; local_data_index += static_cast(blockDim.x)) { const data_size_t global_data_index = cuda_used_indices[local_data_index]; true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; } } else if (bit_type == 32) { const uint32_t* true_in_column_data = reinterpret_cast(in_column_data); - uint32_t* true_out_column_data = reinterpret_cast(out_column_data); + uint32_t* true_out_column_data = reinterpret_cast(out_column_data); for (data_size_t local_data_index = local_data_index_start; local_data_index < num_used_indices; local_data_index += static_cast(blockDim.x)) { const data_size_t global_data_index = cuda_used_indices[local_data_index]; true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; @@ -44,7 +44,7 @@ __global__ void CopySubrowKernel_ColumnData( } } -void CUDAColumnData::LanchCopySubrowKernel(void* const* in_cuda_data_by_column, const data_size_t num_used_indices) { +void CUDAColumnData::LaunchCopySubrowKernel(void* const* in_cuda_data_by_column, const data_size_t num_used_indices) { CopySubrowKernel_ColumnData<<>>( in_cuda_data_by_column, cuda_column_bit_type_, diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index c909c1a371c4..2e7facb4a608 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -29,7 +29,6 @@ void CUDARowData::Init(const Dataset* train_data, TrainingShareStates* train_sha const void* host_row_ptr = nullptr; row_ptr_bit_type_ = 0; const void* host_data = train_share_state->GetRowWiseData(&bit_type_, &total_size, &is_sparse_, &host_row_ptr, &row_ptr_bit_type_); - Log::Warning("bit_type_ = %d, is_sparse_ = %d, row_ptr_bit_type_ = %d", bit_type_, static_cast(is_sparse_), row_ptr_bit_type_); if (bit_type_ == 8) { if (!is_sparse_) { std::vector partitioned_data; @@ -215,8 +214,6 @@ void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingSha __FILE__, __LINE__); - Log::Warning("num_columns_ = %d", column_index); - Log::Warning("column_hist_offsets_.size() = %d", 
column_hist_offsets_.size()); InitCUDAMemoryFromHostMemoryOuter(&cuda_column_hist_offsets_, column_hist_offsets_.data(), column_hist_offsets_.size(), diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index bf768540394c..28bbdc139de0 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -154,21 +154,6 @@ int CUDATree::Split(const int leaf_index, return num_leaves_ - 1; } -void CUDATree::AddPredictionToScore(const Dataset* data, - data_size_t num_data, - double* score) const { - LaunchAddPredictionToScoreKernel(data, nullptr, num_data, score); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -void CUDATree::AddPredictionToScore(const Dataset* data, - const data_size_t* used_data_indices, - data_size_t num_data, double* score) const { - // TODO(shiyu1994): used_data_indices should reside on GPU - LaunchAddPredictionToScoreKernel(data, used_data_indices, num_data, score); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - inline void CUDATree::Shrinkage(double rate) { Tree::Shrinkage(rate); LaunchShrinkageKernel(rate); @@ -217,4 +202,8 @@ void CUDATree::ToHost() { SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } +void CUDATree::SyncLeafOutputFromHostToCUDA() { + CopyFromHostToCUDADeviceOuter(cuda_leaf_value_, leaf_value_.data(), leaf_value_.size(), __FILE__, __LINE__); +} + } // namespace LightGBM diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index ded06eb875f3..143d734866bd 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -170,7 +170,6 @@ __global__ void AddPredictionToScoreKernel( const data_size_t data_index = USE_INDICES ? cuda_used_indices[inner_data_index] : inner_data_index; if (data_index < num_data) { int node = 0; - int iter = 0; while (node >= 0) { const int split_feature_inner = cuda_split_feature_inner[node]; const int column = cuda_feature_to_column[split_feature_inner]; @@ -210,71 +209,11 @@ __global__ void AddPredictionToScoreKernel( node = cuda_right_child[node]; } } - ++iter; - if (iter > 1000) { - printf("error iter = %d, node = %d, cuda_left_child[%d] = %d, cuda_right_child[%d] = %d\n", - iter, node, node, cuda_left_child[node], node, cuda_right_child[node]); - } } score[data_index] += cuda_leaf_value[~node]; } } -void CUDATree::LaunchAddPredictionToScoreKernel( - const Dataset* data, - const data_size_t* used_data_indices, - data_size_t num_data, - double* score) const { - const CUDAColumnData* cuda_column_data = data->cuda_column_data(); - const int num_blocks = (num_data + num_threads_per_block_add_prediction_to_score_ - 1) / num_threads_per_block_add_prediction_to_score_; - if (used_data_indices == nullptr) { - AddPredictionToScoreKernel<<>>( - // dataset information - num_data, - cuda_column_data->cuda_data_by_column(), - cuda_column_data->cuda_column_bit_type(), - cuda_column_data->cuda_feature_min_bin(), - cuda_column_data->cuda_feature_max_bin(), - cuda_column_data->cuda_feature_offset(), - cuda_column_data->cuda_feature_default_bin(), - cuda_column_data->cuda_feature_most_freq_bin(), - cuda_column_data->cuda_feature_to_column(), - nullptr, - // tree information - cuda_threshold_in_bin_, - cuda_decision_type_, - cuda_split_feature_inner_, - cuda_left_child_, - cuda_right_child_, - cuda_leaf_value_, - // output - score); - } else { - AddPredictionToScoreKernel<<>>( - // dataset information - num_data, - cuda_column_data->cuda_data_by_column(), - cuda_column_data->cuda_column_bit_type(), - cuda_column_data->cuda_feature_min_bin(), - cuda_column_data->cuda_feature_max_bin(), - 
cuda_column_data->cuda_feature_offset(), - cuda_column_data->cuda_feature_default_bin(), - cuda_column_data->cuda_feature_most_freq_bin(), - cuda_column_data->cuda_feature_to_column(), - used_data_indices, - // tree information - cuda_threshold_in_bin_, - cuda_decision_type_, - cuda_split_feature_inner_, - cuda_left_child_, - cuda_right_child_, - cuda_leaf_value_, - // output - score); - } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - __global__ void ShrinkageKernel(const double rate, double* cuda_leaf_value, const int num_leaves) { const int leaf_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); if (leaf_index < num_leaves) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 0ef5eb044667..07458eb61486 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1524,7 +1524,6 @@ void Dataset::CreateCUDAColumnData() { std::vector feature_missing_is_na(num_features_, 0); std::vector feature_mfb_is_zero(num_features_, 0); std::vector feature_mfb_is_na(num_features_, 0); - Log::Warning("num_groups_ = %d", num_groups_); for (int feature_group_index = 0; feature_group_index < num_groups_; ++feature_group_index) { if (feature_groups_[feature_group_index]->is_multi_val_) { for (int sub_feature_index = 0; sub_feature_index < feature_groups_[feature_group_index]->num_feature_; ++sub_feature_index) { diff --git a/src/metric/binary_metric.hpp b/src/metric/binary_metric.hpp index cc4e990a96b7..f70a4ef4ac14 100644 --- a/src/metric/binary_metric.hpp +++ b/src/metric/binary_metric.hpp @@ -96,7 +96,7 @@ class BinaryMetric: public Metric { return std::vector(1, loss); } - protected: + private: /*! \brief Number of data */ data_size_t num_data_; /*! \brief Pointer of label */ @@ -250,7 +250,7 @@ class AUCMetric: public Metric { return std::vector(1, auc); } - protected: + private: /*! \brief Number of data */ data_size_t num_data_; /*! \brief Pointer of label */ @@ -371,7 +371,7 @@ class AveragePrecisionMetric: public Metric { return std::vector(1, ap); } - protected: + private: /*! \brief Number of data */ data_size_t num_data_; /*! \brief Pointer of label */ diff --git a/src/metric/metric.cpp b/src/metric/metric.cpp index 321399405aa8..a7104c2a7880 100644 --- a/src/metric/metric.cpp +++ b/src/metric/metric.cpp @@ -61,7 +61,6 @@ Metric* Metric::CreateMetric(const std::string& type, const Config& config) { } else if (type == std::string("tweedie")) { return new TweedieMetric(config); } - Log::Fatal("Unknown metric type name: %s", type.c_str()); return nullptr; } diff --git a/src/metric/multiclass_metric.hpp b/src/metric/multiclass_metric.hpp index c30dfb9aaa3a..c83b2c842790 100644 --- a/src/metric/multiclass_metric.hpp +++ b/src/metric/multiclass_metric.hpp @@ -118,7 +118,7 @@ class MulticlassMetric: public Metric { return std::vector(1, loss); } - protected: + private: /*! \brief Number of data */ data_size_t num_data_; /*! \brief Pointer of label */ @@ -339,7 +339,7 @@ class AucMuMetric : public Metric { return std::vector(1, ans); } - protected: + private: /*! \brief Number of data*/ data_size_t num_data_; /*! \brief Pointer to label*/ diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp index dd9663c8abf8..888849950be3 100644 --- a/src/metric/rank_metric.hpp +++ b/src/metric/rank_metric.hpp @@ -143,7 +143,7 @@ class NDCGMetric:public Metric { return result; } - protected: + private: /*! \brief Number of data */ data_size_t num_data_; /*! 
\brief Pointer of label */ diff --git a/src/metric/regression_metric.hpp b/src/metric/regression_metric.hpp index 9ae13189c39d..d9631811d780 100644 --- a/src/metric/regression_metric.hpp +++ b/src/metric/regression_metric.hpp @@ -101,7 +101,7 @@ class RegressionMetric: public Metric { inline static void CheckLabel(label_t) { } - protected: + private: /*! \brief Number of data */ data_size_t num_data_; /*! \brief Pointer of label */ diff --git a/src/metric/xentropy_metric.hpp b/src/metric/xentropy_metric.hpp index 49e4274cebc2..241b0a856efe 100644 --- a/src/metric/xentropy_metric.hpp +++ b/src/metric/xentropy_metric.hpp @@ -146,7 +146,7 @@ class CrossEntropyMetric : public Metric { return -1.0f; // negative means smaller loss is better, positive means larger loss is better } - protected: + private: /*! \brief Number of data points */ data_size_t num_data_; /*! \brief Pointer to label */ @@ -232,7 +232,7 @@ class CrossEntropyLambdaMetric : public Metric { return -1.0f; } - protected: + private: /*! \brief Number of data points */ data_size_t num_data_; /*! \brief Pointer to label */ @@ -338,7 +338,7 @@ class KullbackLeiblerDivergence : public Metric { return -1.0f; } - protected: + private: /*! \brief Number of data points */ data_size_t num_data_; /*! \brief Pointer to label */ diff --git a/src/objective/binary_objective.hpp b/src/objective/binary_objective.hpp index 12230632b0fd..52be93eeeca1 100644 --- a/src/objective/binary_objective.hpp +++ b/src/objective/binary_objective.hpp @@ -189,7 +189,7 @@ class BinaryLogloss: public ObjectiveFunction { data_size_t NumPositiveData() const override { return num_pos_data_; } - protected: + private: /*! \brief Number of data */ data_size_t num_data_; /*! \brief Number of positive samples */ diff --git a/src/objective/multiclass_objective.hpp b/src/objective/multiclass_objective.hpp index 5379caec1199..cc4d2c849a54 100644 --- a/src/objective/multiclass_objective.hpp +++ b/src/objective/multiclass_objective.hpp @@ -165,7 +165,7 @@ class MulticlassSoftmax: public ObjectiveFunction { } } - protected: + private: double factor_; /*! \brief Number of data */ data_size_t num_data_; @@ -266,7 +266,7 @@ class MulticlassOVA: public ObjectiveFunction { return binary_loss_[class_id]->ClassNeedTrain(0); } - protected: + private: MulticlassOVA() {} /*! 
\brief Number of data */ diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index d58b6f26aa4c..193353d935c3 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -53,7 +53,6 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& } ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& str) { - // TODO(shiyu1994): consider the case for CUDA auto strs = Common::Split(str.c_str(), ' '); auto type = strs[0]; if (type == std::string("regression")) { diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 189573441342..9d68116fd9c4 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -62,10 +62,6 @@ class RankingObjective : public ObjectiveFunction { } } } - const int num_show = 1000; - for (int i = 0; i < num_show; ++i) { - Log::Warning("gradients[%d] = %f, hessians[%d] = %f, score[%d] = %f", i, gradients[i], i, hessians[i], i, score[i]); - } } virtual void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt, @@ -259,7 +255,7 @@ class LambdarankNDCG : public RankingObjective { const char* GetName() const override { return "lambdarank"; } - protected: + private: /*! \brief Simgoid param */ double sigmoid_; /*! \brief Normalize the lambdas or not */ @@ -356,13 +352,13 @@ class RankXENDCG : public RankingObjective { } } - const char* GetName() const override { return "rank_xendcg"; } - - protected: double Phi(const label_t l, double g) const { return Common::Pow(2, static_cast(l)) - g; } + const char* GetName() const override { return "rank_xendcg"; } + + private: mutable std::vector rands_; }; diff --git a/src/objective/regression_objective.hpp b/src/objective/regression_objective.hpp index 538d9cb1228a..e711da012066 100644 --- a/src/objective/regression_objective.hpp +++ b/src/objective/regression_objective.hpp @@ -620,7 +620,6 @@ class RegressionMAPELOSS : public RegressionL1loss { #pragma omp parallel for schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { const double diff = score[i] - label_[i]; - // TODO(shiyu1994): sample weight should be considered in the gradient calculation gradients[i] = static_cast(Common::Sign(diff) * label_weight_[i]); hessians[i] = weights_[i]; } @@ -663,7 +662,6 @@ class RegressionMAPELOSS : public RegressionL1loss { } bool IsConstantHessian() const override { - // TODO(shiyu1994): true only when weights is constant return true; } diff --git a/src/objective/xentropy_objective.hpp b/src/objective/xentropy_objective.hpp index 2f955ae21f48..22f9b4d33cbb 100644 --- a/src/objective/xentropy_objective.hpp +++ b/src/objective/xentropy_objective.hpp @@ -136,7 +136,7 @@ class CrossEntropy: public ObjectiveFunction { return initscore; } - protected: + private: /*! \brief Number of data points */ data_size_t num_data_; /*! \brief Pointer for label */ @@ -264,7 +264,7 @@ class CrossEntropyLambda: public ObjectiveFunction { return initscore; } - protected: + private: /*! \brief Number of data points */ data_size_t num_data_; /*! 
\brief Pointer for label */ diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 02becba3e78f..269fed965702 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -23,10 +23,8 @@ __global__ void FillDataIndicesBeforeTrainKernel(const data_size_t* cuda_num_dat } void CUDADataPartition::LaunchFillDataIndicesBeforeTrain() { - if (used_indices == nullptr) { - const int num_blocks = (num_data_ + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; - FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_, cuda_data_index_to_leaf_index_); - } + const int num_blocks = (num_data_ + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; + FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_, cuda_data_index_to_leaf_index_); } __device__ __forceinline__ void PrepareOffset(const data_size_t num_data_in_leaf_ref, uint16_t* block_to_left_offset, @@ -1155,7 +1153,7 @@ __global__ void AddPredictionToScoreKernel( if (data_index < num_data) { const int leaf_index = cuda_data_index_to_leaf_index[data_index]; const double leaf_prediction_value = leaf_value[leaf_index]; - cuda_scores[data_index] += leaf_prediction_value; + cuda_scores[data_index] = leaf_prediction_value; } } diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 5dbcfcf9f4b2..3db58d5a6f0c 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -48,6 +48,9 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia leaf_num_data_.resize(config_->num_leaves, 0); leaf_data_start_.resize(config_->num_leaves, 0); leaf_sum_hessians_.resize(config_->num_leaves, 0.0f); + + AllocateCUDAMemoryOuter(&cuda_add_train_score_, static_cast(num_data_), __FILE__, __LINE__); + add_train_score_.resize(num_data_, 0.0f); } void NewCUDATreeLearner::BeforeTrain() { @@ -71,28 +74,28 @@ void NewCUDATreeLearner::BeforeTrain() { void NewCUDATreeLearner::AddPredictionToScore(const Tree* tree, double* out_score) const { CHECK(tree->is_cuda_tree()); const CUDATree* cuda_tree = reinterpret_cast(tree); - cuda_data_partition_->UpdateTrainScore(cuda_tree->cuda_leaf_value(), out_score); + cuda_data_partition_->UpdateTrainScore(cuda_tree->cuda_leaf_value(), cuda_add_train_score_); + CopyFromCUDADeviceToHostOuter(add_train_score_.data(), + cuda_add_train_score_, static_cast(cuda_data_partition_->root_num_data()), __FILE__, __LINE__); + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (data_size_t data_index = 0; data_index < cuda_data_partition_->root_num_data(); ++data_index) { + out_score[data_index] += add_train_score_[data_index]; + } + OMP_THROW_EX(); } Tree* NewCUDATreeLearner::Train(const score_t* gradients, const score_t* hessians, bool /*is_first_tree*/) { gradients_ = gradients; hessians_ = hessians; - const auto start = std::chrono::steady_clock::now(); - auto before_train_start = std::chrono::steady_clock::now(); global_timer.Start("NewCUDATreeLearner::BeforeTrain"); BeforeTrain(); global_timer.Stop("NewCUDATreeLearner::BeforeTrain"); - auto before_train_end = std::chrono::steady_clock::now(); - double construct_histogram_time = 0.0f; - double find_best_split_time = 0.0f; - double find_best_split_from_all_leaves_time = 0.0f; - double split_data_indices_time = 0.0f; const bool 
track_branch_features = !(config_->interaction_constraints_vector.empty()); std::unique_ptr tree(new CUDATree(config_->num_leaves, track_branch_features, config_->linear_tree, config_->gpu_device_id)); for (int i = 0; i < config_->num_leaves - 1; ++i) { global_timer.Start("NewCUDATreeLearner::ConstructHistogramForLeaf"); - auto start = std::chrono::steady_clock::now(); const data_size_t num_data_in_smaller_leaf = leaf_num_data_[smaller_leaf_index_]; const data_size_t num_data_in_larger_leaf = larger_leaf_index_ < 0 ? 0 : leaf_num_data_[larger_leaf_index_]; const double sum_hessians_in_smaller_leaf = leaf_sum_hessians_[smaller_leaf_index_]; @@ -104,23 +107,15 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, num_data_in_larger_leaf, sum_hessians_in_smaller_leaf, sum_hessians_in_larger_leaf); - auto end = std::chrono::steady_clock::now(); - auto duration = static_cast>(end - start); global_timer.Stop("NewCUDATreeLearner::ConstructHistogramForLeaf"); - construct_histogram_time += duration.count(); global_timer.Start("NewCUDATreeLearner::FindBestSplitsForLeaf"); - start = std::chrono::steady_clock::now(); cuda_best_split_finder_->FindBestSplitsForLeaf( cuda_smaller_leaf_splits_->GetCUDAStruct(), cuda_larger_leaf_splits_->GetCUDAStruct(), smaller_leaf_index_, larger_leaf_index_, num_data_in_smaller_leaf, num_data_in_larger_leaf, sum_hessians_in_smaller_leaf, sum_hessians_in_larger_leaf); - end = std::chrono::steady_clock::now(); - duration = static_cast>(end - start); global_timer.Stop("NewCUDATreeLearner::FindBestSplitsForLeaf"); - find_best_split_time += duration.count(); - start = std::chrono::steady_clock::now(); global_timer.Start("NewCUDATreeLearner::FindBestFromAllSplits"); const CUDASplitInfo* best_split_info = nullptr; if (larger_leaf_index_ >= 0) { @@ -149,9 +144,6 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, &best_leaf_index_); } global_timer.Stop("NewCUDATreeLearner::FindBestFromAllSplits"); - end = std::chrono::steady_clock::now(); - duration = static_cast>(end - start); - find_best_split_from_all_leaves_time += duration.count(); if (best_leaf_index_ == -1) { Log::Warning("No further splits with positive gain, training stopped with %d leaves.", (i + 1)); @@ -159,7 +151,6 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, } global_timer.Start("NewCUDATreeLearner::Split"); - start = std::chrono::steady_clock::now(); int right_leaf_index = tree->Split(best_leaf_index_, train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]), train_data_->RealThreshold(leaf_best_split_feature_[best_leaf_index_], @@ -185,14 +176,9 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, &leaf_sum_hessians_[right_leaf_index]); smaller_leaf_index_ = (leaf_num_data_[best_leaf_index_] < leaf_num_data_[right_leaf_index] ? best_leaf_index_ : right_leaf_index); larger_leaf_index_ = (smaller_leaf_index_ == best_leaf_index_ ? 
right_leaf_index : best_leaf_index_); - end = std::chrono::steady_clock::now(); - duration = static_cast>(end - start); global_timer.Stop("NewCUDATreeLearner::Split"); - split_data_indices_time += duration.count(); } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - const auto end = std::chrono::steady_clock::now(); - const double duration = (static_cast>(end - start)).count(); tree->ToHost(); return tree.release(); } @@ -208,16 +194,12 @@ void NewCUDATreeLearner::SetBaggingData(const Dataset* subset, } } -void NewCUDATreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function /*residual_getter*/, - const double* score, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { +void NewCUDATreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, + data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { CHECK(tree->is_cuda_tree()); CUDATree* cuda_tree = reinterpret_cast(tree); - obj->RenewTreeOutputCUDA(score, - cuda_data_partition_->cuda_data_indices(), - cuda_data_partition_->cuda_leaf_num_data(), - cuda_data_partition_->cuda_leaf_data_start(), - tree->num_leaves(), - cuda_tree->cuda_leaf_value_ref()); + SerialTreeLearner::RenewTreeOutput(tree, obj, residual_getter, total_num_data, bag_indices, bag_cnt); + cuda_tree->SyncLeafOutputFromHostToCUDA(); } void NewCUDATreeLearner::AfterTrain() { diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index feb5653c7b53..0586f9b978c3 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -34,7 +34,7 @@ class NewCUDATreeLearner: public SerialTreeLearner { void AddPredictionToScore(const Tree* tree, double* out_score) const override; void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, - const double* score, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override; + data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override; protected: void BeforeTrain() override; @@ -66,6 +66,11 @@ class NewCUDATreeLearner: public SerialTreeLearner { int smaller_leaf_index_; int larger_leaf_index_; int best_leaf_index_; + + // added train score buffer in CUDA + double* cuda_add_train_score_; + // add train score buffer in host + mutable std::vector add_train_score_; }; } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 5fd872d61eeb..8b1725a64992 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -573,6 +573,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, } *left_leaf = best_leaf; auto next_leaf_id = tree->NextLeafId(); + // update before tree split constraints_->BeforeSplit(best_leaf, next_leaf_id, best_split_info.monotone_type); @@ -681,7 +682,7 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, } void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, - const double* score, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { + data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { if (obj != nullptr && obj->IsRenewTreeOutput()) { CHECK_LE(tree->num_leaves(), data_partition_->num_leaves()); 
const data_size_t* bag_mapper = nullptr; diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 31875afb0cf8..0466cafa43b3 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -113,7 +113,7 @@ class SerialTreeLearner: public TreeLearner { } void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, - const double* score, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override; + data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override; /*! \brief Get output of parent node, used for path smoothing */ double GetParentOutput(const Tree* tree, const LeafSplits* leaf_splits) const; From 95fd61a3a635880c9046de01e686efae27dba811 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 15 Sep 2021 09:20:23 +0000 Subject: [PATCH 075/166] copy gradients from host to device --- src/treelearner/cuda/new_cuda_tree_learner.cpp | 15 ++++++++++----- src/treelearner/cuda/new_cuda_tree_learner.hpp | 8 ++++++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 3db58d5a6f0c..fb81f35d136f 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -50,21 +50,26 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia leaf_sum_hessians_.resize(config_->num_leaves, 0.0f); AllocateCUDAMemoryOuter(&cuda_add_train_score_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_gradients_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_hessians_, static_cast(num_data_), __FILE__, __LINE__); add_train_score_.resize(num_data_, 0.0f); } void NewCUDATreeLearner::BeforeTrain() { + const data_size_t root_num_data = cuda_data_partition_->root_num_data(); + CopyFromHostToCUDADeviceOuter(cuda_gradients_, gradients_, static_cast(root_num_data), __FILE__, __LINE__); + CopyFromHostToCUDADeviceOuter(cuda_hessians_, hessians_, static_cast(root_num_data), __FILE__, __LINE__); cuda_data_partition_->BeforeTrain(); cuda_smaller_leaf_splits_->InitValues( - gradients_, - hessians_, + cuda_gradients_, + cuda_hessians_, cuda_data_partition_->cuda_data_indices(), - cuda_data_partition_->root_num_data(), + root_num_data, cuda_histogram_constructor_->cuda_hist_pointer(), &leaf_sum_hessians_[0]); - leaf_num_data_[0] = cuda_data_partition_->root_num_data(); + leaf_num_data_[0] = root_num_data; cuda_larger_leaf_splits_->InitValues(); - cuda_histogram_constructor_->BeforeTrain(gradients_, hessians_); + cuda_histogram_constructor_->BeforeTrain(cuda_gradients_, cuda_hessians_); cuda_best_split_finder_->BeforeTrain(); leaf_data_start_[0] = 0; smaller_leaf_index_ = 0; diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index 0586f9b978c3..cba818a91419 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -67,10 +67,14 @@ class NewCUDATreeLearner: public SerialTreeLearner { int larger_leaf_index_; int best_leaf_index_; - // added train score buffer in CUDA + /*! \brief added train score buffer in CUDA */ double* cuda_add_train_score_; - // add train score buffer in host + /*! \brief add train score buffer in host */ mutable std::vector add_train_score_; + /*! 
\brief gradients on CUDA */ + score_t* cuda_gradients_; + /*! \brief hessians on CUDA */ + score_t* cuda_hessians_; }; } // namespace LightGBM From 285c2d637e946257a3e841a0812553fe65545298 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 15 Sep 2021 16:09:53 +0000 Subject: [PATCH 076/166] support bagging without using subset --- src/boosting/gbdt.cpp | 1 + src/io/dataset.cpp | 8 +-- .../cuda/cuda_best_split_finder.cu | 10 ++-- src/treelearner/cuda/cuda_data_partition.cpp | 37 +++++++++++++- src/treelearner/cuda/cuda_data_partition.cu | 50 +++++++++++++++---- src/treelearner/cuda/cuda_data_partition.hpp | 12 ++++- src/treelearner/cuda/cuda_leaf_splits.cu | 21 +++++--- .../cuda/new_cuda_tree_learner.cpp | 22 +++----- .../cuda/new_cuda_tree_learner.hpp | 4 -- 9 files changed, 117 insertions(+), 48 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index d393d46d5133..57040c646db9 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -390,6 +390,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto hess = hessians + offset; // need to copy gradients for bagging subset. if (is_use_subset_ && bag_data_cnt_ < num_data_) { + Log::Warning("use subset !!!"); for (int i = 0; i < bag_data_cnt_; ++i) { gradients_[offset + i] = grad[bag_data_indices_[i]]; hessians_[offset + i] = hess[bag_data_indices_[i]]; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 07458eb61486..a05e83968237 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1152,10 +1152,10 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, template void Dataset::ConstructHistogramsInner( - const std::vector& is_feature_used, const data_size_t* data_indices, - data_size_t num_data, const score_t* gradients, const score_t* hessians, - score_t* ordered_gradients, score_t* ordered_hessians, - TrainingShareStates* share_state, hist_t* hist_data) const { + const std::vector& is_feature_used, const data_size_t* data_indices, + data_size_t num_data, const score_t* gradients, const score_t* hessians, + score_t* ordered_gradients, score_t* ordered_hessians, + TrainingShareStates* share_state, hist_t* hist_data) const { if (!share_state->is_col_wise) { return ConstructHistogramsMultiVal( data_indices, num_data, gradients, hessians, share_state, hist_data); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 55e309eb0a5c..03116e786474 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -554,7 +554,7 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i CUDASplitInfo* cuda_split_info = cuda_leaf_best_split_info + buffer_write_pos; const CUDASplitInfo* best_split_info = cuda_best_split_info + best_read_index; if (best_split_info->is_valid) { - /*cuda_split_info->gain = best_split_info->gain; + cuda_split_info->gain = best_split_info->gain; cuda_split_info->inner_feature_index = is_smaller ? 
cuda_task_feature_index[best_read_index] : cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; cuda_split_info->default_left = best_split_info->default_left; @@ -569,11 +569,11 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i cuda_split_info->right_count = best_split_info->right_count; cuda_split_info->right_gain = best_split_info->right_gain; cuda_split_info->right_value = best_split_info->right_value; - cuda_split_info->is_valid = true;*/ - *cuda_split_info = *best_split_info; - cuda_split_info->inner_feature_index = is_smaller ? cuda_task_feature_index[best_read_index] : - cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; cuda_split_info->is_valid = true; + //*cuda_split_info = *best_split_info; + //cuda_split_info->inner_feature_index = is_smaller ? cuda_task_feature_index[best_read_index] : + // cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; + //cuda_split_info->is_valid = true; } else { cuda_split_info->gain = kMinScore; cuda_split_info->is_valid = false; diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index cc35da52a762..c7d7f5c25628 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -86,7 +86,10 @@ void CUDADataPartition::Init() { InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_num_bin_offsets_, feature_num_bin_offsets.data(), feature_num_bin_offsets.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_bin_upper_bounds_, flatten_bin_upper_bounds.data(), flatten_bin_upper_bounds.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_num_data_, &num_data_, 1, __FILE__, __LINE__); + add_train_score_.resize(num_data_, 0.0f); + AllocateCUDAMemoryOuter(&cuda_add_train_score_, static_cast(num_data_), __FILE__, __LINE__); use_bagging_ = false; + used_indices_ = nullptr; } void CUDADataPartition::BeforeTrain() { @@ -202,8 +205,33 @@ void CUDADataPartition::SplitInner( ++cur_num_leaves_; } -void CUDADataPartition::UpdateTrainScore(const double* leaf_value, double* cuda_scores) { - LaunchAddPredictionToScoreKernel(leaf_value, cuda_scores); +void CUDADataPartition::UpdateTrainScore(const Tree* tree, double* scores) { + CHECK(tree->is_cuda_tree()); + const CUDATree* cuda_tree = reinterpret_cast(tree); + const data_size_t num_data_in_root = root_num_data(); + CopyFromHostToCUDADeviceOuter(cuda_data_indices_, used_indices_, static_cast(num_used_indices_), __FILE__, __LINE__); + LaunchAddPredictionToScoreKernel(cuda_tree->cuda_leaf_value(), cuda_add_train_score_); + CopyFromCUDADeviceToHostOuter(add_train_score_.data(), + cuda_add_train_score_, static_cast(num_data_in_root), __FILE__, __LINE__); + if (!use_bagging_) { + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (data_size_t data_index = 0; data_index < num_data_in_root; ++data_index) { + OMP_LOOP_EX_BEGIN(); + scores[data_index] += add_train_score_[data_index]; + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + } else { + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (data_size_t data_index = 0; data_index < num_data_in_root; ++data_index) { + OMP_LOOP_EX_BEGIN(); + scores[used_indices_[data_index]] += add_train_score_[data_index]; + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + } } void CUDADataPartition::CalcBlockDim(const data_size_t num_data_in_leaf) { @@ -224,11 +252,16 @@ void CUDADataPartition::CalcBlockDim(const data_size_t 
num_data_in_leaf) { void CUDADataPartition::SetUsedDataIndices(const data_size_t* used_indices, const data_size_t num_used_indices) { use_bagging_ = true; num_used_indices_ = num_used_indices; + used_indices_ = used_indices; CopyFromHostToCUDADeviceOuter(cuda_data_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); + LaunchFillDataIndexToLeafIndex(); } void CUDADataPartition::SetUseBagging(const bool use_bagging) { use_bagging_ = use_bagging; + if (!use_bagging_) { + used_indices_ = nullptr; + } } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 269fed965702..5f960d4097fc 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -22,11 +22,27 @@ __global__ void FillDataIndicesBeforeTrainKernel(const data_size_t* cuda_num_dat } } +__global__ void FillDataIndexToLeafIndexKernel( + const data_size_t num_data, + const data_size_t* data_indices, + int* data_index_to_leaf_index) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data) { + data_index_to_leaf_index[data_indices[data_index]] = 0; + } +} + void CUDADataPartition::LaunchFillDataIndicesBeforeTrain() { const int num_blocks = (num_data_ + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_, cuda_data_index_to_leaf_index_); } +void CUDADataPartition::LaunchFillDataIndexToLeafIndex() { + const data_size_t num_data_in_root = root_num_data(); + const int num_blocks = (num_data_in_root + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; + FillDataIndexToLeafIndexKernel<<>>(num_data_in_root, cuda_data_indices_, cuda_data_index_to_leaf_index_); +} + __device__ __forceinline__ void PrepareOffset(const data_size_t num_data_in_leaf_ref, uint16_t* block_to_left_offset, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, const uint16_t thread_to_left_offset_cnt, uint16_t* shared_mem_buffer) { @@ -1142,26 +1158,40 @@ void CUDADataPartition::LaunchSplitInnerKernel( *right_leaf_sum_of_hessians_ref = cpu_sum_hessians_info[1]; } +template __global__ void AddPredictionToScoreKernel( - const data_size_t* num_data_in_leaf, const data_size_t* data_indices_in_leaf, - const data_size_t* leaf_data_start, const double* leaf_value, double* cuda_scores, + const data_size_t* data_indices_in_leaf, + const double* leaf_value, double* cuda_scores, const int* cuda_data_index_to_leaf_index, const data_size_t num_data) { const unsigned int threadIdx_x = threadIdx.x; const unsigned int blockIdx_x = blockIdx.x; const unsigned int blockDim_x = blockDim.x; - const int data_index = static_cast(blockIdx_x * blockDim_x + threadIdx_x); - if (data_index < num_data) { - const int leaf_index = cuda_data_index_to_leaf_index[data_index]; - const double leaf_prediction_value = leaf_value[leaf_index]; - cuda_scores[data_index] = leaf_prediction_value; + const data_size_t local_data_index = static_cast(blockIdx_x * blockDim_x + threadIdx_x); + if (local_data_index < num_data) { + if (USE_BAGGING) { + const data_size_t global_data_index = data_indices_in_leaf[local_data_index]; + const int leaf_index = cuda_data_index_to_leaf_index[global_data_index]; + const double leaf_prediction_value = leaf_value[leaf_index]; + cuda_scores[local_data_index] = leaf_prediction_value; + } else { + const int 
leaf_index = cuda_data_index_to_leaf_index[local_data_index]; + const double leaf_prediction_value = leaf_value[leaf_index]; + cuda_scores[local_data_index] = leaf_prediction_value; + } } } void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double* leaf_value, double* cuda_scores) { global_timer.Start("CUDADataPartition::AddPredictionToScoreKernel"); - const int num_blocks = (num_data_ + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; - AddPredictionToScoreKernel<<>>( - cuda_leaf_num_data_, cuda_data_indices_, cuda_leaf_data_start_, leaf_value, cuda_scores, cuda_data_index_to_leaf_index_, num_data_); + const data_size_t num_data_in_root = root_num_data(); + const int num_blocks = (num_data_in_root + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; + if (use_bagging_) { + AddPredictionToScoreKernel<<>>( + cuda_data_indices_, leaf_value, cuda_scores, cuda_data_index_to_leaf_index_, num_data_in_root); + } else { + AddPredictionToScoreKernel<<>>( + cuda_data_indices_, leaf_value, cuda_scores, cuda_data_index_to_leaf_index_, num_data_in_root); + } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDADataPartition::AddPredictionToScoreKernel"); } diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 1f4351e34807..7201d42001bc 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -15,6 +15,7 @@ #include "cuda_leaf_splits.hpp" #include +#include // TODO(shiyu1994): adjust these values according to different CUDA and GPU versions #define FILL_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) @@ -57,7 +58,7 @@ class CUDADataPartition { double* left_leaf_sum_of_hessians, double* right_leaf_sum_of_hessians); - void UpdateTrainScore(const double* leaf_value, double* cuda_scores); + void UpdateTrainScore(const Tree* tree, double* cuda_scores); void SetUsedDataIndices(const data_size_t* used_indices, const data_size_t num_used_indices); @@ -203,6 +204,7 @@ class CUDADataPartition { void LaunchAddPredictionToScoreKernel(const double* leaf_value, double* cuda_scores); + void LaunchFillDataIndexToLeafIndex(); // Host memory @@ -223,6 +225,10 @@ class CUDADataPartition { int grid_dim_; /*! \brief block dimension when splitting one leaf */ int block_dim_; + /*! \brief add train score buffer in host */ + mutable std::vector add_train_score_; + /*! \brief data indices used in this iteration */ + const data_size_t* used_indices_; // config information /*! \brief maximum number of leaves in a tree */ @@ -288,6 +294,10 @@ class CUDADataPartition { /*! \brief number of data in training set, for intialization of cuda_leaf_num_data_ and cuda_leaf_data_end_ */ data_size_t* cuda_num_data_; + // for train score update + /*! 
\brief added train score buffer in CUDA */ + double* cuda_add_train_score_; + // CUDA memory, held by other object diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index 08d5de65ef8f..78796a0ff13d 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -10,8 +10,10 @@ namespace LightGBM { +template __global__ void CUDAInitValuesKernel1(const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t num_data, double* cuda_sum_of_gradients, double* cuda_sum_of_hessians) { + const data_size_t num_data, const data_size_t* data_indices_in_leaf, + double* cuda_sum_of_gradients, double* cuda_sum_of_hessians) { __shared__ score_t shared_gradients[NUM_THRADS_PER_BLOCK_LEAF_SPLITS]; __shared__ score_t shared_hessians[NUM_THRADS_PER_BLOCK_LEAF_SPLITS]; const unsigned int tid = threadIdx.x; @@ -21,8 +23,9 @@ __global__ void CUDAInitValuesKernel1(const score_t* cuda_gradients, const score __syncthreads(); for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_LEAF_SPLITS; ++j) { if (i + j < num_data) { - shared_gradients[tid] += cuda_gradients[i + j]; - shared_hessians[tid] += cuda_hessians[i + j]; + const data_size_t data_index = USE_INDICES ? data_indices_in_leaf[i + j] : static_cast(i + j); + shared_gradients[tid] += cuda_gradients[data_index]; + shared_hessians[tid] += cuda_hessians[data_index]; } } __syncthreads(); @@ -84,9 +87,15 @@ void CUDALeafSplits::LaunchInitValuesKernal( const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf) { - CUDAInitValuesKernel1<<>>( - cuda_gradients_, cuda_hessians_, num_used_indices, cuda_sum_of_gradients_buffer_, - cuda_sum_of_hessians_buffer_); + if (num_used_indices == num_data_) { + CUDAInitValuesKernel1<<>>( + cuda_gradients_, cuda_hessians_, num_used_indices, cuda_data_indices_in_leaf, cuda_sum_of_gradients_buffer_, + cuda_sum_of_hessians_buffer_); + } else { + CUDAInitValuesKernel1<<>>( + cuda_gradients_, cuda_hessians_, num_used_indices, cuda_data_indices_in_leaf, cuda_sum_of_gradients_buffer_, + cuda_sum_of_hessians_buffer_); + } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); CUDAInitValuesKernel2<<<1, 1>>>( num_blocks_init_from_gradients_, diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index fb81f35d136f..87aa4d7036e3 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -49,16 +49,14 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia leaf_data_start_.resize(config_->num_leaves, 0); leaf_sum_hessians_.resize(config_->num_leaves, 0.0f); - AllocateCUDAMemoryOuter(&cuda_add_train_score_, static_cast(num_data_), __FILE__, __LINE__); AllocateCUDAMemoryOuter(&cuda_gradients_, static_cast(num_data_), __FILE__, __LINE__); AllocateCUDAMemoryOuter(&cuda_hessians_, static_cast(num_data_), __FILE__, __LINE__); - add_train_score_.resize(num_data_, 0.0f); } void NewCUDATreeLearner::BeforeTrain() { const data_size_t root_num_data = cuda_data_partition_->root_num_data(); - CopyFromHostToCUDADeviceOuter(cuda_gradients_, gradients_, static_cast(root_num_data), __FILE__, __LINE__); - CopyFromHostToCUDADeviceOuter(cuda_hessians_, hessians_, static_cast(root_num_data), __FILE__, __LINE__); + CopyFromHostToCUDADeviceOuter(cuda_gradients_, gradients_, static_cast(num_data_), __FILE__, __LINE__); + CopyFromHostToCUDADeviceOuter(cuda_hessians_, hessians_, 
static_cast(num_data_), __FILE__, __LINE__); cuda_data_partition_->BeforeTrain(); cuda_smaller_leaf_splits_->InitValues( cuda_gradients_, @@ -77,17 +75,7 @@ void NewCUDATreeLearner::BeforeTrain() { } void NewCUDATreeLearner::AddPredictionToScore(const Tree* tree, double* out_score) const { - CHECK(tree->is_cuda_tree()); - const CUDATree* cuda_tree = reinterpret_cast(tree); - cuda_data_partition_->UpdateTrainScore(cuda_tree->cuda_leaf_value(), cuda_add_train_score_); - CopyFromCUDADeviceToHostOuter(add_train_score_.data(), - cuda_add_train_score_, static_cast(cuda_data_partition_->root_num_data()), __FILE__, __LINE__); - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (data_size_t data_index = 0; data_index < cuda_data_partition_->root_num_data(); ++data_index) { - out_score[data_index] += add_train_score_[data_index]; - } - OMP_THROW_EX(); + cuda_data_partition_->UpdateTrainScore(tree, out_score); } Tree* NewCUDATreeLearner::Train(const score_t* gradients, @@ -184,6 +172,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, global_timer.Stop("NewCUDATreeLearner::Split"); } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + //AfterTrain(); tree->ToHost(); return tree.release(); } @@ -193,8 +182,9 @@ void NewCUDATreeLearner::ResetTrainingData(const Dataset* /*train_data*/, void NewCUDATreeLearner::SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) { - cuda_data_partition_->SetUsedDataIndices(used_indices, num_data); if (subset == nullptr) { + cuda_data_partition_->SetUsedDataIndices(used_indices, num_data); + } else { } } diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index cba818a91419..fe4efd49d024 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -67,10 +67,6 @@ class NewCUDATreeLearner: public SerialTreeLearner { int larger_leaf_index_; int best_leaf_index_; - /*! \brief added train score buffer in CUDA */ - double* cuda_add_train_score_; - /*! \brief add train score buffer in host */ - mutable std::vector add_train_score_; /*! \brief gradients on CUDA */ score_t* cuda_gradients_; /*! 
\brief hessians on CUDA */ From 1a09c192c30e20649d30199c9e857709b00d2d81 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 17 Sep 2021 08:07:54 +0000 Subject: [PATCH 077/166] add support of bagging with subset for CUDAColumnData --- include/LightGBM/cuda/cuda_column_data.hpp | 8 +- include/LightGBM/cuda/cuda_row_data.hpp | 13 + src/boosting/gbdt.cpp | 5 +- src/io/cuda/cuda_column_data.cpp | 57 ++- src/io/cuda/cuda_column_data.cu | 6 +- src/io/cuda/cuda_row_data.cpp | 54 +++ src/io/cuda/cuda_row_data.cu | 48 +++ src/io/dataset.cpp | 13 +- .../cuda/cuda_best_split_finder.cu | 12 +- .../cuda/cuda_best_split_finder.hpp | 2 +- src/treelearner/cuda/cuda_data_partition.cpp | 47 ++- src/treelearner/cuda/cuda_data_partition.cu | 12 +- src/treelearner/cuda/cuda_data_partition.hpp | 12 +- .../cuda/cuda_histogram_constructor.cpp | 12 + .../cuda/cuda_histogram_constructor.cu | 328 ++++++++++++------ .../cuda/cuda_histogram_constructor.hpp | 10 +- src/treelearner/cuda/cuda_leaf_splits.cpp | 21 +- src/treelearner/cuda/cuda_leaf_splits.cu | 86 +++-- src/treelearner/cuda/cuda_leaf_splits.hpp | 9 +- .../cuda/new_cuda_tree_learner.cpp | 36 +- .../cuda/new_cuda_tree_learner.hpp | 2 - 21 files changed, 587 insertions(+), 206 deletions(-) create mode 100644 src/io/cuda/cuda_row_data.cu diff --git a/include/LightGBM/cuda/cuda_column_data.hpp b/include/LightGBM/cuda/cuda_column_data.hpp index 4be1c06149c2..d590b502a09b 100644 --- a/include/LightGBM/cuda/cuda_column_data.hpp +++ b/include/LightGBM/cuda/cuda_column_data.hpp @@ -90,10 +90,12 @@ class CUDAColumnData { template void InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, void** out_column_data_pointer); - void LaunchCopySubrowKernel(void* const* in_cuda_data_by_column, const data_size_t num_used_indices); + void LaunchCopySubrowKernel(void* const* in_cuda_data_by_column); void InitColumnMetaInfo(); + void ResizeWhenCopySubrow(const data_size_t num_used_indices); + int num_threads_; data_size_t num_data_; int num_columns_; @@ -122,7 +124,11 @@ class CUDAColumnData { uint8_t* cuda_feature_mfb_is_zero_; uint8_t* cuda_feature_mfb_is_na_; int* cuda_feature_to_column_; + + // used when bagging with subset data_size_t* cuda_used_indices_; + data_size_t num_used_indices_; + data_size_t cur_subset_buffer_size_; }; } // namespace LightGBM diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index 9e86ab3b634b..14b6d1c58b4c 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -26,6 +26,8 @@ class CUDARowData { void Init(const Dataset* train_data, TrainingShareStates* train_share_state); + void CopySubrow(const CUDARowData* full_set, const data_size_t* used_indices, const data_size_t num_used_indices); + int num_feature_partitions() const { return num_feature_partitions_; } int max_num_column_per_partition() const { return max_num_column_per_partition_; } @@ -80,6 +82,11 @@ class CUDARowData { ROW_PTR_TYPE** cuda_row_ptr, ROW_PTR_TYPE** cuda_partition_ptr); + void ResizeWhenCopySubrow(const data_size_t num_used_indices); + + + void LaunchCopySubrowKernel(const CUDARowData* full_set); + /*! \brief number of threads to use */ int num_threads_; /*! \brief number of training data */ @@ -106,6 +113,10 @@ class CUDARowData { int max_num_column_per_partition_; /*! \brief number of partitions */ int num_feature_partitions_; + /*! \brief used when bagging with subset, number of used indice */ + data_size_t num_used_indices_; + /*! 
\brief used when bagging with subset, the size of buffer for copy subrow */ + data_size_t cur_subset_buffer_size_; // CUDA memory @@ -133,6 +144,8 @@ class CUDARowData { uint32_t* cuda_column_hist_offsets_; /*! \brief hisotgram offset of each partition */ uint32_t* cuda_partition_hist_offsets_; + /*! \brief used when bagging with subset, used indice */ + data_size_t* cuda_used_indices_; }; } // namespace LightGBM diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 57040c646db9..8d74ea125b3a 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -253,10 +253,14 @@ void GBDT::Bagging(int iter) { } else { // get subset tmp_subset_->ReSize(bag_data_cnt_); + global_timer.Start("GBDT::CopySubrow"); tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); + global_timer.Stop("GBDT::CopySubrow"); + global_timer.Start("GBDT::SetBaggingData"); tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), bag_data_cnt_); + global_timer.Stop("GBDT::SetBaggingData"); } } } @@ -390,7 +394,6 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto hess = hessians + offset; // need to copy gradients for bagging subset. if (is_use_subset_ && bag_data_cnt_ < num_data_) { - Log::Warning("use subset !!!"); for (int i = 0; i < bag_data_cnt_; ++i) { gradients_[offset + i] = grad[bag_data_indices_[i]]; hessians_[offset + i] = hess[bag_data_indices_[i]]; diff --git a/src/io/cuda/cuda_column_data.cpp b/src/io/cuda/cuda_column_data.cpp index 3fddc01f8e07..7ad60e213060 100644 --- a/src/io/cuda/cuda_column_data.cpp +++ b/src/io/cuda/cuda_column_data.cpp @@ -143,34 +143,77 @@ void CUDAColumnData::CopySubrow( feature_missing_is_na_ = full_set->feature_missing_is_na_; feature_mfb_is_zero_ = full_set->feature_mfb_is_zero_; feature_mfb_is_na_ = full_set->feature_mfb_is_na_; + feature_to_column_ = full_set->feature_to_column_; if (cuda_used_indices_ == nullptr) { // initialize the subset cuda column data - const size_t full_set_num_data = static_cast(full_set->num_data_); - AllocateCUDAMemoryOuter(&cuda_used_indices_, full_set_num_data, __FILE__, __LINE__); - CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); + const size_t num_used_indices_size = static_cast(num_used_indices); + AllocateCUDAMemoryOuter(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__); data_by_column_.resize(num_columns_, nullptr); OMP_INIT_EX(); #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int column_index = 0; column_index < num_columns_; ++column_index) { + OMP_LOOP_EX_BEGIN(); const uint8_t bit_type = column_bit_type_[column_index]; if (bit_type == 8) { uint8_t* column_data = nullptr; - AllocateCUDAMemoryOuter(&column_data, full_set_num_data, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&column_data, num_used_indices_size, __FILE__, __LINE__); data_by_column_[column_index] = reinterpret_cast(column_data); } else if (bit_type == 16) { uint16_t* column_data = nullptr; - AllocateCUDAMemoryOuter(&column_data, full_set_num_data, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&column_data, num_used_indices_size, __FILE__, __LINE__); data_by_column_[column_index] = reinterpret_cast(column_data); } else if (bit_type == 32) { uint32_t* column_data = nullptr; - AllocateCUDAMemoryOuter(&column_data, full_set_num_data, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&column_data, num_used_indices_size, __FILE__, __LINE__); data_by_column_[column_index] = 
reinterpret_cast(column_data); } + OMP_LOOP_EX_END(); } + OMP_THROW_EX(); InitCUDAMemoryFromHostMemoryOuter(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__); InitColumnMetaInfo(); + cur_subset_buffer_size_ = num_used_indices; + } else { + if (num_used_indices > cur_subset_buffer_size_) { + ResizeWhenCopySubrow(num_used_indices); + cur_subset_buffer_size_ = num_used_indices; + } + } + CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); + num_used_indices_ = num_used_indices; + LaunchCopySubrowKernel(full_set->cuda_data_by_column()); +} + +void CUDAColumnData::ResizeWhenCopySubrow(const data_size_t num_used_indices) { + const size_t num_used_indices_size = static_cast(num_used_indices); + DeallocateCUDAMemoryOuter(&cuda_used_indices_, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__); + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (int column_index = 0; column_index < num_columns_; ++column_index) { + OMP_LOOP_EX_BEGIN(); + const uint8_t bit_type = column_bit_type_[column_index]; + if (bit_type == 8) { + uint8_t* column_data = reinterpret_cast(data_by_column_[column_index]); + DeallocateCUDAMemoryOuter(&column_data, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } else if (bit_type == 16) { + uint16_t* column_data = reinterpret_cast(data_by_column_[column_index]); + DeallocateCUDAMemoryOuter(&column_data, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } else if (bit_type == 32) { + uint32_t* column_data = reinterpret_cast(data_by_column_[column_index]); + DeallocateCUDAMemoryOuter(&column_data, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&column_data, num_used_indices_size, __FILE__, __LINE__); + data_by_column_[column_index] = reinterpret_cast(column_data); + } + OMP_LOOP_EX_END(); } - LaunchCopySubrowKernel(full_set->cuda_data_by_column(), num_used_indices); + OMP_THROW_EX(); + DeallocateCUDAMemoryOuter(&cuda_data_by_column_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__); } void CUDAColumnData::InitColumnMetaInfo() { diff --git a/src/io/cuda/cuda_column_data.cu b/src/io/cuda/cuda_column_data.cu index 12aee9bc9b4e..c6b9b0738fe1 100644 --- a/src/io/cuda/cuda_column_data.cu +++ b/src/io/cuda/cuda_column_data.cu @@ -19,7 +19,7 @@ __global__ void CopySubrowKernel_ColumnData( const void* in_column_data = in_cuda_data_by_column[column_index]; void* out_column_data = out_cuda_data_by_column[column_index]; const uint8_t bit_type = cuda_column_bit_type[column_index]; - const data_size_t local_data_index_start = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const data_size_t local_data_index_start = static_cast(threadIdx.x); if (bit_type == 8) { const uint8_t* true_in_column_data = reinterpret_cast(in_column_data); uint8_t* true_out_column_data = reinterpret_cast(out_column_data); @@ -44,12 +44,12 @@ __global__ void CopySubrowKernel_ColumnData( } } -void CUDAColumnData::LaunchCopySubrowKernel(void* const* in_cuda_data_by_column, const data_size_t num_used_indices) { +void CUDAColumnData::LaunchCopySubrowKernel(void* const* 
in_cuda_data_by_column) { CopySubrowKernel_ColumnData<<>>( in_cuda_data_by_column, cuda_column_bit_type_, cuda_used_indices_, - num_used_indices, + num_used_indices_, cuda_data_by_column_); } diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index 2e7facb4a608..9f2bb226fd13 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -20,6 +20,7 @@ CUDARowData::CUDARowData(const Dataset* train_data, } else { CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); } + cuda_used_indices_ = nullptr; } void CUDARowData::Init(const Dataset* train_data, TrainingShareStates* train_share_state) { @@ -333,4 +334,57 @@ void CUDARowData::InitSparseData(const BIN_TYPE* host_data, } } +void CUDARowData::CopySubrow( + const CUDARowData* full_set, + const data_size_t* used_indices, + const data_size_t num_used_indices) { + num_used_indices_ = num_used_indices; + if (cuda_used_indices_ == nullptr) { + // initialize meta information + bit_type_ = full_set->bit_type_; + row_ptr_bit_type_ = full_set->row_ptr_bit_type_; + is_sparse_ = full_set->is_sparse_; + feature_partition_column_index_offsets_ = full_set->feature_partition_column_index_offsets_; + column_hist_offsets_ = full_set->column_hist_offsets_; + partition_hist_offsets_ = full_set->partition_hist_offsets_; + max_num_column_per_partition_ = full_set->max_num_column_per_partition_; + num_feature_partitions_ = full_set->num_feature_partitions_; + + InitCUDAMemoryFromHostMemoryOuter( + &cuda_feature_partition_column_index_offsets_, + feature_partition_column_index_offsets_.data(), + feature_partition_column_index_offsets_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_column_hist_offsets_, + column_hist_offsets_.data(), column_hist_offsets_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_partition_hist_offsets_, + partition_hist_offsets_.data(), partition_hist_offsets_.size(), __FILE__, __LINE__); + + cur_subset_buffer_size_ = num_used_indices_; + InitCUDAMemoryFromHostMemoryOuter(&cuda_used_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); + if (!is_sparse_) { + const int num_column = feature_partition_column_index_offsets_.back(); + size_t total_size = static_cast(num_used_indices_ * num_column); + if (bit_type_ == 8) { + AllocateCUDAMemoryOuter(&cuda_data_uint8_t_, total_size, __FILE__, __LINE__); + } else if (bit_type_ == 16) { + AllocateCUDAMemoryOuter(&cuda_data_uint16_t_, total_size, __FILE__, __LINE__); + } else if (bit_type_ == 32) { + AllocateCUDAMemoryOuter(&cuda_data_uint32_t_, total_size, __FILE__, __LINE__); + } + } else { + // TODO(shiyu1994): copy subrow for sparse data + } + } else { + if (num_used_indices_ > cur_subset_buffer_size_) { + ResizeWhenCopySubrow(num_used_indices_); + cur_subset_buffer_size_ = num_used_indices_; + } + CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_used_indices_), __FILE__, __LINE__); + LaunchCopySubrowKernel(full_set); + } +} + +// TODO(shiyu1994): implement this +void CUDARowData::ResizeWhenCopySubrow(const data_size_t /*num_used_indices*/) {} + } // namespace LightGBM diff --git a/src/io/cuda/cuda_row_data.cu b/src/io/cuda/cuda_row_data.cu new file mode 100644 index 000000000000..22e7a9905934 --- /dev/null +++ b/src/io/cuda/cuda_row_data.cu @@ -0,0 +1,48 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#include + +#define COPY_SUBROW_BLOCK_SIZE_ROW_DATA (1024) + +namespace LightGBM { + +template +__global__ void CopySubrowDenseKernel(const BIN_TYPE* full_set_bin_data, const int num_column, const data_size_t num_used_indices, + const data_size_t* used_indices, BIN_TYPE* bin_data) { + const data_size_t local_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (local_data_index < num_used_indices) { + const data_size_t global_data_index = used_indices[local_data_index]; + const BIN_TYPE* src = full_set_bin_data + global_data_index * num_column; + BIN_TYPE* dst = bin_data + local_data_index * num_column; + for (int column_index = 0; column_index < num_column; ++column_index) { + dst[column_index] = src[column_index]; + } + } +} + +void CUDARowData::LaunchCopySubrowKernel(const CUDARowData* full_set) { + const int num_column = feature_partition_column_index_offsets_.back(); + if (!is_sparse_) { + const int num_blocks = (num_used_indices_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; + if (bit_type_ == 8) { + const uint8_t* full_set_bin_data = full_set->cuda_data_uint8_t_; + CopySubrowDenseKernel<<>>( + full_set_bin_data, num_column, num_used_indices_, cuda_used_indices_, cuda_data_uint8_t_); + } else if (bit_type_ == 16) { + const uint16_t* full_set_bin_data = full_set->cuda_data_uint16_t_; + CopySubrowDenseKernel<<>>( + full_set_bin_data, num_column, num_used_indices_, cuda_used_indices_, cuda_data_uint16_t_); + } else if (bit_type_ == 32) { + const uint32_t* full_set_bin_data = full_set->cuda_data_uint32_t_; + CopySubrowDenseKernel<<>>( + full_set_bin_data, num_column, num_used_indices_, cuda_used_indices_, cuda_data_uint32_t_); + } + } else { + // TODO(shiyu1994): copy subrow for sparse data + } +} + +} // namespace LightGBM diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index a05e83968237..260ee17ac694 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -427,7 +427,6 @@ void Dataset::Construct(std::vector>* bin_mappers, } device_type_ = io_config.device_type; gpu_device_id_ = io_config.gpu_device_id; - gpu_device_id_ = -1; } void Dataset::FinishLoad() { @@ -844,10 +843,18 @@ void Dataset::CopySubrow(const Dataset* fullset, } } // update CUDA storage for column data and metadata + device_type_ = fullset->device_type_; + gpu_device_id_ = fullset->gpu_device_id_; if (device_type_ == std::string("cuda")) { - cuda_column_data_.reset(new CUDAColumnData(num_used_indices, gpu_device_id_)); + global_timer.Start("prepare subset cuda column data"); + if (cuda_column_data_ == nullptr) { + cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_)); + metadata_.CreateCUDAMetadata(gpu_device_id_); + } + global_timer.Start("copy subset cuda column data"); cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); - metadata_.CreateCUDAMetadata(gpu_device_id_); + global_timer.Stop("copy subset cuda column data"); + global_timer.Stop("prepare subset cuda column data"); } } diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 03116e786474..e8c6ec9c8ef8 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -268,7 +268,7 @@ __device__ void FindBestSplitsForLeafKernelInner( } } } else { - if (threadIdx_x <= feature_num_bin_minus_offset - 2/* && !skip_sum*/) { + if (threadIdx_x <= feature_num_bin_minus_offset - 2 && !skip_sum) { const double sum_left_gradient = 
local_grad_hist; const double sum_left_hessian = local_hess_hist; const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); @@ -554,7 +554,7 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i CUDASplitInfo* cuda_split_info = cuda_leaf_best_split_info + buffer_write_pos; const CUDASplitInfo* best_split_info = cuda_best_split_info + best_read_index; if (best_split_info->is_valid) { - cuda_split_info->gain = best_split_info->gain; + /*cuda_split_info->gain = best_split_info->gain; cuda_split_info->inner_feature_index = is_smaller ? cuda_task_feature_index[best_read_index] : cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; cuda_split_info->default_left = best_split_info->default_left; @@ -569,11 +569,11 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i cuda_split_info->right_count = best_split_info->right_count; cuda_split_info->right_gain = best_split_info->right_gain; cuda_split_info->right_value = best_split_info->right_value; + cuda_split_info->is_valid = true;*/ + *cuda_split_info = *best_split_info; + cuda_split_info->inner_feature_index = is_smaller ? cuda_task_feature_index[best_read_index] : + cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; cuda_split_info->is_valid = true; - //*cuda_split_info = *best_split_info; - //cuda_split_info->inner_feature_index = is_smaller ? cuda_task_feature_index[best_read_index] : - // cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; - //cuda_split_info->is_valid = true; } else { cuda_split_info->gain = kMinScore; cuda_split_info->is_valid = false; diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 6411e24eeb18..b5f6a052c46a 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -19,7 +19,7 @@ #define MAX_NUM_BIN_IN_FEATURE (256) #define NUM_THREADS_FIND_BEST_LEAF (256) -#define NUM_TASKS_PER_SYNC_BLOCK (32) +#define NUM_TASKS_PER_SYNC_BLOCK (1024) namespace LightGBM { diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index c7d7f5c25628..dc6b3af5e3fd 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -89,11 +89,12 @@ void CUDADataPartition::Init() { add_train_score_.resize(num_data_, 0.0f); AllocateCUDAMemoryOuter(&cuda_add_train_score_, static_cast(num_data_), __FILE__, __LINE__); use_bagging_ = false; + use_bagging_subset_ = false; used_indices_ = nullptr; } void CUDADataPartition::BeforeTrain() { - if (!use_bagging_) { + if (!use_bagging_ || use_bagging_subset_) { LaunchFillDataIndicesBeforeTrain(); } SetCUDAMemoryOuter(cuda_leaf_num_data_, 0, static_cast(num_leaves_), __FILE__, __LINE__); @@ -209,7 +210,9 @@ void CUDADataPartition::UpdateTrainScore(const Tree* tree, double* scores) { CHECK(tree->is_cuda_tree()); const CUDATree* cuda_tree = reinterpret_cast(tree); const data_size_t num_data_in_root = root_num_data(); - CopyFromHostToCUDADeviceOuter(cuda_data_indices_, used_indices_, static_cast(num_used_indices_), __FILE__, __LINE__); + if (use_bagging_) { + CopyFromHostToCUDADeviceOuter(cuda_data_indices_, used_indices_, static_cast(num_used_indices_), __FILE__, __LINE__); + } LaunchAddPredictionToScoreKernel(cuda_tree->cuda_leaf_value(), cuda_add_train_score_); CopyFromCUDADeviceToHostOuter(add_train_score_.data(), cuda_add_train_score_, 
static_cast(num_data_in_root), __FILE__, __LINE__); @@ -251,19 +254,51 @@ void CUDADataPartition::CalcBlockDim(const data_size_t num_data_in_leaf) { void CUDADataPartition::SetUsedDataIndices(const data_size_t* used_indices, const data_size_t num_used_indices) { use_bagging_ = true; + use_bagging_subset_ = false; num_used_indices_ = num_used_indices; used_indices_ = used_indices; CopyFromHostToCUDADeviceOuter(cuda_data_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); LaunchFillDataIndexToLeafIndex(); } -void CUDADataPartition::SetUseBagging(const bool use_bagging) { - use_bagging_ = use_bagging; - if (!use_bagging_) { - used_indices_ = nullptr; +void CUDADataPartition::ResetTrainingData(const Dataset* train_data) { + const data_size_t old_num_data = num_data_; + num_data_ = train_data->num_data(); + if (num_data_ > old_num_data) { + CalcBlockDim(num_data_); + const int old_max_num_split_indices_blocks = max_num_split_indices_blocks_; + max_num_split_indices_blocks_ = grid_dim_; + if (max_num_split_indices_blocks_ > old_max_num_split_indices_blocks) { + DeallocateCUDAMemoryOuter(&cuda_block_data_to_left_offset_, __FILE__, __LINE__); + DeallocateCUDAMemoryOuter(&cuda_block_data_to_right_offset_, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_block_data_to_left_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_block_data_to_right_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_block_data_to_left_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_block_data_to_right_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + } + DeallocateCUDAMemoryOuter(&cuda_data_indices_, __FILE__, __LINE__); + DeallocateCUDAMemoryOuter(&cuda_block_to_left_offset_, __FILE__, __LINE__); + DeallocateCUDAMemoryOuter(&cuda_data_index_to_leaf_index_, __FILE__, __LINE__); + DeallocateCUDAMemoryOuter(&cuda_out_data_indices_in_leaf_, __FILE__, __LINE__); + DeallocateCUDAMemoryOuter(&cuda_add_train_score_, __FILE__, __LINE__); + add_train_score_.resize(num_data_, 0.0f); + + AllocateCUDAMemoryOuter(&cuda_data_indices_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_block_to_left_offset_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_data_index_to_leaf_index_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_out_data_indices_in_leaf_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_add_train_score_, static_cast(num_data_), __FILE__, __LINE__); } } +void CUDADataPartition::SetBaggingSubset(const Dataset* subset) { + num_used_indices_ = subset->num_data(); + used_indices_ = nullptr; + use_bagging_ = true; + use_bagging_subset_ = true; + cuda_column_data_ = subset->cuda_column_data(); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 5f960d4097fc..e109e183ffb9 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -12,11 +12,10 @@ namespace LightGBM { -__global__ void FillDataIndicesBeforeTrainKernel(const data_size_t* cuda_num_data, +__global__ void FillDataIndicesBeforeTrainKernel(const data_size_t num_data, data_size_t* data_indices, int* cuda_data_index_to_leaf_index) { - const data_size_t num_data_ref = 
*cuda_num_data; const unsigned int data_index = threadIdx.x + blockIdx.x * blockDim.x; - if (data_index < num_data_ref) { + if (data_index < num_data) { data_indices[data_index] = data_index; cuda_data_index_to_leaf_index[data_index] = 0; } @@ -33,8 +32,9 @@ __global__ void FillDataIndexToLeafIndexKernel( } void CUDADataPartition::LaunchFillDataIndicesBeforeTrain() { - const int num_blocks = (num_data_ + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; - FillDataIndicesBeforeTrainKernel<<>>(cuda_num_data_, cuda_data_indices_, cuda_data_index_to_leaf_index_); + const data_size_t num_data_in_root = root_num_data(); + const int num_blocks = (num_data_in_root + FILL_INDICES_BLOCK_SIZE_DATA_PARTITION - 1) / FILL_INDICES_BLOCK_SIZE_DATA_PARTITION; + FillDataIndicesBeforeTrainKernel<<>>(num_data_in_root, cuda_data_indices_, cuda_data_index_to_leaf_index_); } void CUDADataPartition::LaunchFillDataIndexToLeafIndex() { @@ -603,7 +603,6 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num const uint32_t most_freq_bin = cuda_column_data_->feature_most_freq_bin(split_feature_index); const uint32_t min_bin = cuda_column_data_->feature_min_bin(split_feature_index); const uint32_t max_bin = cuda_column_data_->feature_max_bin(split_feature_index); - uint32_t th = split_threshold + min_bin; uint32_t t_zero_bin = min_bin + default_bin; if (most_freq_bin == 0) { @@ -1115,7 +1114,6 @@ void CUDADataPartition::LaunchSplitInnerKernel( SynchronizeCUDADeviceOuter(__FILE__, __LINE__); global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); global_timer.Start("CUDADataPartition::SplitInnerKernel"); - SplitInnerKernel<<>>( left_leaf_index, right_leaf_index, cuda_leaf_data_start_, cuda_leaf_num_data_, cuda_data_indices_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_block_to_left_offset_, diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 7201d42001bc..62ec205706b9 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -62,7 +62,9 @@ class CUDADataPartition { void SetUsedDataIndices(const data_size_t* used_indices, const data_size_t num_used_indices); - void SetUseBagging(const bool use_bagging); + void SetBaggingSubset(const Dataset* subset); + + void ResetTrainingData(const Dataset* train_data); data_size_t root_num_data() const { if (use_bagging_) { @@ -78,6 +80,10 @@ class CUDADataPartition { const data_size_t* cuda_leaf_data_start() const { return cuda_leaf_data_start_; } + bool use_bagging() const { return use_bagging_; } + + bool use_bagging_subset() const { return use_bagging_subset_; } + private: void CalcBlockDim(const data_size_t num_data_in_leaf); @@ -210,7 +216,7 @@ class CUDADataPartition { // dataset information /*! \brief number of training data */ - const data_size_t num_data_; + data_size_t num_data_; /*! \brief number of features in training data */ const int num_features_; /*! \brief number of total bins in training data */ @@ -239,6 +245,8 @@ class CUDADataPartition { // per iteration information /*! \brief whether bagging is used in this iteration */ bool use_bagging_; + /*! \brief whether use subset data for bagging in this iteration */ + bool use_bagging_subset_; /*! 
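// Illustrative sketch (not part of the patch): ResetTrainingData above, like
// CUDALeafSplits::Resize later in this patch, reallocates device buffers only
// when the new row count exceeds the previously allocated capacity. A minimal
// grow-only buffer helper using the raw CUDA runtime API (the patch itself uses
// the AllocateCUDAMemoryOuter/DeallocateCUDAMemoryOuter wrappers):
#include <cuda_runtime.h>
#include <cstddef>

template <typename T>
void GrowOnlyDeviceBuffer(T** buffer, std::size_t* capacity, std::size_t new_size) {
  if (new_size <= *capacity) {
    return;  // existing allocation is large enough, keep it
  }
  if (*buffer != nullptr) {
    cudaFree(*buffer);
  }
  cudaMalloc(reinterpret_cast<void**>(buffer), new_size * sizeof(T));
  *capacity = new_size;
}
// Shrinking is deliberately skipped: later bagging iterations with smaller
// subsets can reuse the same allocation without extra cudaMalloc calls.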
\brief number of used data indices in this iteration */ data_size_t num_used_indices_; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index fbd8c6f99e99..64b735d5cf90 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -83,6 +83,7 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStat InitCUDAMemoryFromHostMemoryOuter(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_need_fix_histogram_features_num_bin_aligned_, need_fix_histogram_features_num_bin_aligend_.data(), need_fix_histogram_features_num_bin_aligend_.size(), __FILE__, __LINE__); + cuda_used_indices_ = nullptr; } void CUDAHistogramConstructor::ConstructHistogramForLeaf( @@ -116,6 +117,17 @@ void CUDAHistogramConstructor::CalcConstructHistogramKernelDim( ((num_data_in_smaller_leaf + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + (*block_dim_y) - 1) / (*block_dim_y)); } +void CUDAHistogramConstructor::ResetTrainingData(const Dataset* train_data) { + num_data_ = train_data->num_data(); +} + +void CUDAHistogramConstructor::SetUsedDataIndices(const data_size_t* used_indices, const data_size_t num_data) { + if (cuda_used_indices_ == nullptr) { + AllocateCUDAMemoryOuter(&cuda_used_indices_, static_cast(num_data_), __FILE__, __LINE__); + } + CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_data), __FILE__, __LINE__); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 79739ec035b3..bb712f15fb54 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -11,7 +11,7 @@ namespace LightGBM { -template +template __global__ void CUDAConstructHistogramDenseKernel( const CUDALeafSplitsStruct* smaller_leaf_splits, const score_t* cuda_gradients, @@ -20,7 +20,8 @@ __global__ void CUDAConstructHistogramDenseKernel( const uint32_t* column_hist_offsets, const uint32_t* column_hist_offsets_full, const int* feature_partition_column_index_offsets, - const data_size_t num_data) { + const data_size_t num_data, + const data_size_t* used_indices) { const int dim_y = static_cast(gridDim.y * blockDim.y); const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; @@ -55,7 +56,9 @@ __global__ void CUDAConstructHistogramDenseKernel( const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; const score_t grad = cuda_gradients[data_index]; const score_t hess = cuda_hessians[data_index]; - const uint32_t bin = static_cast(data_ptr[data_index * num_columns_in_partition + threadIdx.x]); + const uint32_t bin = USE_SUBSET ? 
+ static_cast(data_ptr[used_indices[data_index] * num_columns_in_partition + threadIdx.x]) : + static_cast(data_ptr[data_index * num_columns_in_partition + threadIdx.x]); const uint32_t pos = bin << 1; float* pos_ptr = shared_hist_ptr + pos; atomicAdd_block(pos_ptr, grad); @@ -70,7 +73,7 @@ __global__ void CUDAConstructHistogramDenseKernel( } } -template +template __global__ void CUDAConstructHistogramSparseKernel( const CUDALeafSplitsStruct* smaller_leaf_splits, const score_t* cuda_gradients, @@ -79,7 +82,8 @@ __global__ void CUDAConstructHistogramSparseKernel( const DATA_PTR_TYPE* row_ptr, const DATA_PTR_TYPE* partition_ptr, const uint32_t* column_hist_offsets_full, - const data_size_t num_data) { + const data_size_t num_data, + const data_size_t* used_indices) { const int dim_y = static_cast(gridDim.y * blockDim.y); const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; @@ -107,8 +111,8 @@ __global__ void CUDAConstructHistogramSparseKernel( data_size_t inner_data_index = static_cast(threadIdx_y); for (data_size_t i = 0; i < num_iteration_this; ++i) { const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; - const DATA_PTR_TYPE row_start = block_row_ptr[data_index]; - const DATA_PTR_TYPE row_end = block_row_ptr[data_index + 1]; + const DATA_PTR_TYPE row_start = USE_SUBSET ? block_row_ptr[used_indices[data_index]] : block_row_ptr[data_index]; + const DATA_PTR_TYPE row_end = USE_SUBSET ? block_row_ptr[used_indices[data_index] + 1] : block_row_ptr[data_index + 1]; const DATA_PTR_TYPE row_size = row_end - row_start; if (threadIdx.x < row_size) { const score_t grad = cuda_gradients[data_index]; @@ -138,124 +142,246 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_in_smaller_leaf); dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); - if (cuda_row_data_->is_sparse()) { - if (cuda_row_data_->bit_type() == 8) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); + if (cuda_used_indices_ == nullptr) { + if (cuda_row_data_->is_sparse()) { + if (cuda_row_data_->bit_type() == 8) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint16(), + 
cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, nullptr); + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, nullptr); + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, nullptr); + } + } else if (cuda_row_data_->bit_type() == 16) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, nullptr); + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, nullptr); + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, nullptr); + } + } else if (cuda_row_data_->bit_type() == 32) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, nullptr); + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, nullptr); + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, nullptr); + } } - } else if (cuda_row_data_->bit_type() == 16) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( + } else { + if (cuda_row_data_->bit_type() == 8) { + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_data_uint8(), + 
cuda_row_data_->cuda_column_hist_offsets(), cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_, nullptr); + } else if (cuda_row_data_->bit_type() == 16) { + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_column_hist_offsets(), cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_, nullptr); + } else if (cuda_row_data_->bit_type() == 32) { + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_column_hist_offsets(), cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_, nullptr); + } + } + } else { + if (cuda_row_data_->is_sparse()) { + if (cuda_row_data_->bit_type() == 8) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, cuda_used_indices_); + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, cuda_used_indices_); + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, cuda_used_indices_); + } + } else if (cuda_row_data_->bit_type() == 16) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, cuda_used_indices_); + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, cuda_used_indices_); + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint16(), 
+ cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, cuda_used_indices_); + } + } else if (cuda_row_data_->bit_type() == 32) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, cuda_used_indices_); + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, cuda_used_indices_); + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, cuda_used_indices_); + } } - } else if (cuda_row_data_->bit_type() == 32) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( + } else { + if (cuda_row_data_->bit_type() == 8) { + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_column_hist_offsets(), cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_, cuda_used_indices_); + } else if (cuda_row_data_->bit_type() == 16) { + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_column_hist_offsets(), cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_, cuda_used_indices_); + } else if (cuda_row_data_->bit_type() == 32) { + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_column_hist_offsets(), cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_, cuda_used_indices_); } } - } else { - if (cuda_row_data_->bit_type() == 8) { - CUDAConstructHistogramDenseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - 
num_data_); - } else if (cuda_row_data_->bit_type() == 16) { - CUDAConstructHistogramDenseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_); - } else if (cuda_row_data_->bit_type() == 32) { - CUDAConstructHistogramDenseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_); - } } } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 2719cf77ece2..5ca6fb4047e3 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -51,6 +51,10 @@ class CUDAHistogramConstructor { const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf); + void SetUsedDataIndices(const data_size_t* used_indices, const data_size_t num_data); + + void ResetTrainingData(const Dataset* train_data); + void BeforeTrain(const score_t* gradients, const score_t* hessians); const hist_t* cuda_hist() const { return cuda_hist_; } @@ -78,7 +82,7 @@ class CUDAHistogramConstructor { // Host memory /*! \brief size of training data */ - const data_size_t num_data_; + data_size_t num_data_; /*! \brief number of features in training data */ const int num_features_; /*! \brief maximum number of leaves */ @@ -113,6 +117,8 @@ class CUDAHistogramConstructor { /*! \brief CUDA row wise data */ std::unique_ptr cuda_row_data_; + /*! \brief CUDA row wise data, used when bagging with subset */ + std::unique_ptr cuda_row_data_subset_; /*! \brief number of bins per feature */ uint32_t* cuda_feature_num_bins_; /*! \brief offsets in histogram of all features */ @@ -125,6 +131,8 @@ class CUDAHistogramConstructor { int* cuda_need_fix_histogram_features_; /*! \brief aligned number of bins of the features whose histograms need to be fixed */ uint32_t* cuda_need_fix_histogram_features_num_bin_aligned_; + /*! 
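// Illustrative sketch (not part of the patch): the histogram kernels above take
// USE_SUBSET as a bool template parameter, so the used_indices indirection is
// compiled away on the full-data path and the launcher picks the specialization
// at runtime. A small self-contained example of that dispatch style with a
// hypothetical kernel:
#include <cuda_runtime.h>

template <bool USE_SUBSET>
__global__ void SumSelectedKernel(const float* values, const int* used_indices,
                                  int n, float* out) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // with USE_SUBSET == false the compiler removes the index remap entirely
    const int data_index = USE_SUBSET ? used_indices[i] : i;
    atomicAdd(out, values[data_index]);
  }
}

void LaunchSumSelected(const float* values, const int* used_indices, int n, float* out) {
  const int block_size = 256;
  const int num_blocks = (n + block_size - 1) / block_size;
  if (used_indices == nullptr) {
    SumSelectedKernel<false><<<num_blocks, block_size>>>(values, nullptr, n, out);
  } else {
    SumSelectedKernel<true><<<num_blocks, block_size>>>(values, used_indices, n, out);
  }
}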
\brief used data indices when using subset for bagging */ + data_size_t* cuda_used_indices_; // CUDA memory, held by other object diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index 4a6c7c514b2a..43ddb427d359 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -16,7 +16,7 @@ num_data_(num_data) { } void CUDALeafSplits::Init() { - num_blocks_init_from_gradients_ = (num_data_ + INIT_SUM_BLOCK_SIZE_LEAF_SPLITS - 1) / INIT_SUM_BLOCK_SIZE_LEAF_SPLITS; + num_blocks_init_from_gradients_ = (num_data_ + NUM_THRADS_PER_BLOCK_LEAF_SPLITS - 1) / NUM_THRADS_PER_BLOCK_LEAF_SPLITS; // allocate more memory for sum reduction in CUDA // only the first element records the final sum @@ -33,17 +33,30 @@ void CUDALeafSplits::InitValues() { void CUDALeafSplits::InitValues( const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, - hist_t* cuda_hist_in_leaf, double* root_sum_hessians) { + const data_size_t* cuda_bagging_data_indices, const data_size_t* cuda_data_indices_in_leaf, + const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf, double* root_sum_hessians) { cuda_gradients_ = cuda_gradients; cuda_hessians_ = cuda_hessians; SetCUDAMemoryOuter(cuda_sum_of_gradients_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); SetCUDAMemoryOuter(cuda_sum_of_hessians_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); - LaunchInitValuesKernal(cuda_data_indices_in_leaf, num_used_indices, cuda_hist_in_leaf); + LaunchInitValuesKernal(cuda_bagging_data_indices, cuda_data_indices_in_leaf, num_used_indices, cuda_hist_in_leaf); CopyFromCUDADeviceToHostOuter(root_sum_hessians, cuda_sum_of_hessians_buffer_, 1, __FILE__, __LINE__); SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } +void CUDALeafSplits::Resize(const data_size_t num_data) { + if (num_data > num_data_) { + DeallocateCUDAMemoryOuter(&cuda_sum_of_gradients_buffer_, __FILE__, __LINE__); + DeallocateCUDAMemoryOuter(&cuda_sum_of_hessians_buffer_, __FILE__, __LINE__); + num_blocks_init_from_gradients_ = (num_data + NUM_THRADS_PER_BLOCK_LEAF_SPLITS - 1) / NUM_THRADS_PER_BLOCK_LEAF_SPLITS; + AllocateCUDAMemoryOuter(&cuda_sum_of_gradients_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_sum_of_hessians_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); + } else { + num_blocks_init_from_gradients_ = (num_data + NUM_THRADS_PER_BLOCK_LEAF_SPLITS - 1) / NUM_THRADS_PER_BLOCK_LEAF_SPLITS; + } + num_data_ = num_data; +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index 78796a0ff13d..d67357dc2c30 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -4,41 +4,29 @@ * license information. 
*/ -#ifdef USE_CUDA - #include "cuda_leaf_splits.hpp" +#include namespace LightGBM { template __global__ void CUDAInitValuesKernel1(const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t num_data, const data_size_t* data_indices_in_leaf, + const data_size_t num_data, const data_size_t* cuda_bagging_data_indices, double* cuda_sum_of_gradients, double* cuda_sum_of_hessians) { - __shared__ score_t shared_gradients[NUM_THRADS_PER_BLOCK_LEAF_SPLITS]; - __shared__ score_t shared_hessians[NUM_THRADS_PER_BLOCK_LEAF_SPLITS]; - const unsigned int tid = threadIdx.x; - const unsigned int i = (blockIdx.x * blockDim.x + tid) * NUM_DATA_THREAD_ADD_LEAF_SPLITS; - shared_gradients[tid] = 0.0f; - shared_hessians[tid] = 0.0f; - __syncthreads(); - for (unsigned int j = 0; j < NUM_DATA_THREAD_ADD_LEAF_SPLITS; ++j) { - if (i + j < num_data) { - const data_size_t data_index = USE_INDICES ? data_indices_in_leaf[i + j] : static_cast(i + j); - shared_gradients[tid] += cuda_gradients[data_index]; - shared_hessians[tid] += cuda_hessians[data_index]; - } + __shared__ double shared_mem_buffer[32]; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + double gradient = 0.0f; + double hessian = 0.0f; + if (data_index < num_data) { + gradient = USE_INDICES ? cuda_gradients[cuda_bagging_data_indices[data_index]] : cuda_gradients[data_index]; + hessian = USE_INDICES ? cuda_hessians[cuda_bagging_data_indices[data_index]] : cuda_hessians[data_index]; } + const double block_sum_gradient = ShuffleReduceSum(gradient, shared_mem_buffer, blockDim.x); __syncthreads(); - for (unsigned int s = 1; s < blockDim.x; s *= 2) { - if (tid % (2 * s) == 0 && (tid + s) < NUM_THRADS_PER_BLOCK_LEAF_SPLITS) { - shared_gradients[tid] += shared_gradients[tid + s]; - shared_hessians[tid] += shared_hessians[tid + s]; - } - __syncthreads(); - } - if (tid == 0) { - cuda_sum_of_gradients[blockIdx.x] += shared_gradients[0]; - cuda_sum_of_hessians[blockIdx.x] += shared_hessians[0]; + const double block_sum_hessian = ShuffleReduceSum(hessian, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + cuda_sum_of_gradients[blockIdx.x] += block_sum_gradient; + cuda_sum_of_hessians[blockIdx.x] += block_sum_hessian; } } @@ -50,22 +38,27 @@ __global__ void CUDAInitValuesKernel2( const data_size_t* cuda_data_indices_in_leaf, hist_t* cuda_hist_in_leaf, CUDALeafSplitsStruct* cuda_struct) { - double sum_of_gradients = 0.0f; - double sum_of_hessians = 0.0f; - for (unsigned int i = 0; i < num_blocks_to_reduce; ++i) { - sum_of_gradients += cuda_sum_of_gradients[i]; - sum_of_hessians += cuda_sum_of_hessians[i]; + __shared__ double shared_mem_buffer[32]; + double thread_sum_of_gradients = 0.0f; + double thread_sum_of_hessians = 0.0f; + for (int block_index = static_cast(threadIdx.x); block_index < num_blocks_to_reduce; block_index += static_cast(blockDim.x)) { + thread_sum_of_gradients += cuda_sum_of_gradients[block_index]; + thread_sum_of_hessians += cuda_sum_of_hessians[block_index]; + } + const double sum_of_gradients = ShuffleReduceSum(thread_sum_of_gradients, shared_mem_buffer, blockDim.x); + __syncthreads(); + const double sum_of_hessians = ShuffleReduceSum(thread_sum_of_hessians, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + cuda_sum_of_hessians[0] = sum_of_hessians; + cuda_struct->leaf_index = 0; + cuda_struct->sum_of_gradients = sum_of_gradients; + cuda_struct->sum_of_hessians = sum_of_hessians; + cuda_struct->num_data_in_leaf = num_data; + cuda_struct->gain = 0.0f; + 
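// Illustrative sketch (not part of the patch): the rewritten kernels above rely
// on a ShuffleReduceSum helper from cuda_algorithms.hpp. A common way to build
// such a block-wide sum is a warp shuffle reduction followed by a second pass
// over the per-warp partials; the version below is an assumed, simplified
// illustration, not necessarily the library's exact implementation:
#include <cuda_runtime.h>

__device__ __forceinline__ double WarpReduceSum(double value) {
  // sum across the 32 lanes of a warp using register shuffles
  for (int offset = 16; offset > 0; offset >>= 1) {
    value += __shfl_down_sync(0xffffffff, value, offset);
  }
  return value;
}

__device__ __forceinline__ double BlockReduceSum(double value, double* shared_mem_buffer) {
  const unsigned int warp_id = threadIdx.x / warpSize;
  const unsigned int lane_id = threadIdx.x % warpSize;
  value = WarpReduceSum(value);
  if (lane_id == 0) {
    shared_mem_buffer[warp_id] = value;  // one partial sum per warp
  }
  __syncthreads();
  const unsigned int num_warps = (blockDim.x + warpSize - 1) / warpSize;
  value = (threadIdx.x < num_warps) ? shared_mem_buffer[lane_id] : 0.0;
  if (warp_id == 0) {
    value = WarpReduceSum(value);  // final result is valid in thread 0
  }
  return value;
}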
cuda_struct->leaf_value = 0.0f; + cuda_struct->data_indices_in_leaf = cuda_data_indices_in_leaf; + cuda_struct->hist_in_leaf = cuda_hist_in_leaf; } - cuda_sum_of_gradients[0] = sum_of_gradients; - cuda_sum_of_hessians[0] = sum_of_hessians; - cuda_struct->leaf_index = 0; - cuda_struct->sum_of_gradients = sum_of_gradients; - cuda_struct->sum_of_hessians = sum_of_hessians; - cuda_struct->num_data_in_leaf = num_data; - cuda_struct->gain = 0.0f; - cuda_struct->leaf_value = 0.0f; - cuda_struct->data_indices_in_leaf = cuda_data_indices_in_leaf; - cuda_struct->hist_in_leaf = cuda_hist_in_leaf; } __global__ void InitValuesEmptyKernel(CUDALeafSplitsStruct* cuda_struct) { @@ -84,20 +77,21 @@ void CUDALeafSplits::LaunchInitValuesEmptyKernel() { } void CUDALeafSplits::LaunchInitValuesKernal( + const data_size_t* cuda_bagging_data_indices, const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf) { - if (num_used_indices == num_data_) { + if (cuda_bagging_data_indices == nullptr) { CUDAInitValuesKernel1<<>>( - cuda_gradients_, cuda_hessians_, num_used_indices, cuda_data_indices_in_leaf, cuda_sum_of_gradients_buffer_, + cuda_gradients_, cuda_hessians_, num_used_indices, nullptr, cuda_sum_of_gradients_buffer_, cuda_sum_of_hessians_buffer_); } else { CUDAInitValuesKernel1<<>>( - cuda_gradients_, cuda_hessians_, num_used_indices, cuda_data_indices_in_leaf, cuda_sum_of_gradients_buffer_, + cuda_gradients_, cuda_hessians_, num_used_indices, cuda_bagging_data_indices, cuda_sum_of_gradients_buffer_, cuda_sum_of_hessians_buffer_); } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CUDAInitValuesKernel2<<<1, 1>>>( + CUDAInitValuesKernel2<<<1, NUM_THRADS_PER_BLOCK_LEAF_SPLITS>>>( num_blocks_init_from_gradients_, cuda_sum_of_gradients_buffer_, cuda_sum_of_hessians_buffer_, @@ -109,5 +103,3 @@ void CUDALeafSplits::LaunchInitValuesKernal( } } // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index f44dcc50bc63..9dbd0404a679 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -13,7 +13,6 @@ #include #include -#define INIT_SUM_BLOCK_SIZE_LEAF_SPLITS (6144) #define NUM_THRADS_PER_BLOCK_LEAF_SPLITS (1024) #define NUM_DATA_THREAD_ADD_LEAF_SPLITS (6) @@ -41,6 +40,7 @@ class CUDALeafSplits { void InitValues( const score_t* cuda_gradients, const score_t* cuda_hessians, + const data_size_t* cuda_bagging_data_indices, const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf, double* root_sum_hessians); @@ -50,15 +50,18 @@ class CUDALeafSplits { CUDALeafSplitsStruct* GetCUDAStructRef() { return cuda_struct_; } + void Resize(const data_size_t num_data); + private: void LaunchInitValuesEmptyKernel(); - void LaunchInitValuesKernal(const data_size_t* cuda_data_indices_in_leaf, + void LaunchInitValuesKernal(const data_size_t* cuda_bagging_data_indices, + const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf); // Host memory - const int num_data_; + data_size_t num_data_; int num_blocks_init_from_gradients_; // CUDA memory, held by this object diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index 87aa4d7036e3..0732dccf5cc2 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -31,7 +31,7 @@ void 
NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia cuda_larger_leaf_splits_->Init(); cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, this->config_->num_leaves, num_threads_, share_state_->feature_hist_offsets(), - config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf, config_->gpu_device_id)); + config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf, gpu_device_id)); cuda_histogram_constructor_->Init(train_data_, share_state_.get()); cuda_data_partition_.reset(new CUDADataPartition( @@ -55,12 +55,16 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia void NewCUDATreeLearner::BeforeTrain() { const data_size_t root_num_data = cuda_data_partition_->root_num_data(); - CopyFromHostToCUDADeviceOuter(cuda_gradients_, gradients_, static_cast(num_data_), __FILE__, __LINE__); - CopyFromHostToCUDADeviceOuter(cuda_hessians_, hessians_, static_cast(num_data_), __FILE__, __LINE__); + const size_t num_gradients_to_copy = cuda_data_partition_->use_bagging_subset() ? static_cast(root_num_data) : static_cast(num_data_); + CopyFromHostToCUDADeviceOuter(cuda_gradients_, gradients_, num_gradients_to_copy, __FILE__, __LINE__); + CopyFromHostToCUDADeviceOuter(cuda_hessians_, hessians_, num_gradients_to_copy, __FILE__, __LINE__); + const data_size_t* leaf_splits_init_indices = + (cuda_data_partition_->use_bagging_subset() || !cuda_data_partition_->use_bagging()) ? nullptr : cuda_data_partition_->cuda_data_indices(); cuda_data_partition_->BeforeTrain(); cuda_smaller_leaf_splits_->InitValues( cuda_gradients_, cuda_hessians_, + leaf_splits_init_indices, cuda_data_partition_->cuda_data_indices(), root_num_data, cuda_histogram_constructor_->cuda_hist_pointer(), @@ -172,20 +176,34 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, global_timer.Stop("NewCUDATreeLearner::Split"); } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - //AfterTrain(); tree->ToHost(); return tree.release(); } -void NewCUDATreeLearner::ResetTrainingData(const Dataset* /*train_data*/, - bool /*is_constant_hessian*/) {} +void NewCUDATreeLearner::ResetTrainingData( + const Dataset* train_data, + bool is_constant_hessian) { + // TODO(shiyu1994): separte logic of reset training data and set bagging data + train_data_ = train_data; + num_data_ = train_data_->num_data(); + CHECK_EQ(num_features_, train_data_->num_features()); + //cuda_data_partition_->ResetTrainingData(train_data); + cuda_histogram_constructor_->ResetTrainingData(train_data); + cuda_smaller_leaf_splits_->Resize(num_data_); + cuda_larger_leaf_splits_->Resize(num_data_); + CHECK_EQ(is_constant_hessian, share_state_->is_constant_hessian); +} void NewCUDATreeLearner::SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) { if (subset == nullptr) { cuda_data_partition_->SetUsedDataIndices(used_indices, num_data); } else { - + cuda_histogram_constructor_->SetUsedDataIndices(used_indices, num_data); + train_data_ = subset; + num_data_ = train_data_->num_data(); + CHECK_EQ(num_features_, train_data_->num_features()); + cuda_data_partition_->SetBaggingSubset(subset); } } @@ -197,10 +215,6 @@ void NewCUDATreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* ob cuda_tree->SyncLeafOutputFromHostToCUDA(); } -void NewCUDATreeLearner::AfterTrain() { - cuda_data_partition_->SetUseBagging(false); -} - } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp 
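// Illustrative sketch (not part of the patch): SetBaggingData above separates
// two bagging modes. Without a materialized subset Dataset, only the sampled
// indices are handed to the data partition; with a subset, the learner trains on
// the smaller dataset and root_num_data() shrinks accordingly. A self-contained
// toy model of that decision, with hypothetical types that only mirror the shape
// of the logic:
#include <cstdio>

struct ToyPartition {
  bool use_bagging = false;
  bool use_bagging_subset = false;
  int num_used = 0;
  void SetUsedDataIndices(int n) { use_bagging = true; use_bagging_subset = false; num_used = n; }
  void SetBaggingSubset(int subset_rows) { use_bagging = true; use_bagging_subset = true; num_used = subset_rows; }
  int root_num_data(int full_rows) const { return use_bagging ? num_used : full_rows; }
};

int main() {
  ToyPartition p;
  // mode 1: indices only, kernels still read rows of the full dataset
  p.SetUsedDataIndices(600);
  std::printf("indices-only bagging, root rows = %d of 1000\n", p.root_num_data(1000));
  // mode 2: a materialized 600-row subset replaces the training data
  p.SetBaggingSubset(600);
  std::printf("subset bagging, root rows = %d\n", p.root_num_data(1000));
  return 0;
}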
b/src/treelearner/cuda/new_cuda_tree_learner.hpp index fe4efd49d024..652b9b9ee699 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -39,8 +39,6 @@ class NewCUDATreeLearner: public SerialTreeLearner { protected: void BeforeTrain() override; - void AfterTrain(); - // number of GPUs int num_gpus_; // number of threads on CPU From 740f853aef308a01c6a057d964dbda46ff82bc50 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sat, 18 Sep 2021 03:49:31 +0000 Subject: [PATCH 078/166] add support of bagging with subset for dense CUDARowData --- include/LightGBM/cuda/cuda_algorithms.hpp | 3 + include/LightGBM/cuda/cuda_row_data.hpp | 30 +- src/cuda/cuda_algorithms.cu | 68 ++++ src/io/cuda/cuda_column_data.cu | 52 +-- src/io/cuda/cuda_row_data.cpp | 160 +++++++- src/io/cuda/cuda_row_data.cu | 263 ++++++++++++- .../cuda/cuda_histogram_constructor.cpp | 19 +- .../cuda/cuda_histogram_constructor.cu | 346 ++++++------------ .../cuda/cuda_histogram_constructor.hpp | 7 +- .../cuda/new_cuda_tree_learner.cpp | 2 +- 10 files changed, 638 insertions(+), 312 deletions(-) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 5c2ff2cddd4e..2e5bc049edec 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -60,6 +60,9 @@ __device__ __forceinline__ T ShufflePrefixSum(T value, T* shared_mem_buffer) { return warp_base + value; } +template +void ShufflePrefixSumGlobal(T* values, size_t len, T* block_prefix_sum_buffer); + template __device__ __forceinline__ T ShuffleReduceSumWarp(T value, const data_size_t len) { if (len > 0) { diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index 14b6d1c58b4c..73307671738b 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -15,6 +15,7 @@ #include "../train_share_states.h" #define SHRAE_HIST_SIZE (6144 * 2) +#define COPY_SUBROW_BLOCK_SIZE_ROW_DATA (1024) namespace LightGBM { @@ -23,6 +24,8 @@ class CUDARowData { CUDARowData(const Dataset* train_data, const TrainingShareStates* train_share_state, const int gpu_device_id); + CUDARowData(); + void Init(const Dataset* train_data, TrainingShareStates* train_share_state); @@ -82,10 +85,17 @@ class CUDARowData { ROW_PTR_TYPE** cuda_row_ptr, ROW_PTR_TYPE** cuda_partition_ptr); - void ResizeWhenCopySubrow(const data_size_t num_used_indices); + void CopyDenseSubrowData(const CUDARowData* full_set, const data_size_t num_used_indices, const data_size_t* used_indices); + + void CopySparseSubrowData(const CUDARowData* full_set, const data_size_t num_used_indices, const data_size_t* used_indices); + + uint64_t CalcTotalNumberOfElements(const CUDARowData* full_set); + uint64_t LaunchCalcTotalNumberOfElementsKernel(const CUDARowData* full_set); - void LaunchCopySubrowKernel(const CUDARowData* full_set); + void LaunchCopyDenseSubrowKernel(const CUDARowData* full_set); + + void LaunchCopySparseSubrowKernel(const CUDARowData* full_set); /*! \brief number of threads to use */ int num_threads_; @@ -115,8 +125,16 @@ class CUDARowData { int num_feature_partitions_; /*! \brief used when bagging with subset, number of used indice */ data_size_t num_used_indices_; + /*! \brief used when bagging with subset, number of total elements */ + uint64_t num_total_elements_; /*! \brief used when bagging with subset, the size of buffer for copy subrow */ data_size_t cur_subset_buffer_size_; + /*! 
\brief used when bagging with subset, the size of buffer for copy subrow */ + uint64_t cur_total_elements_buffer_size_; + /*! \brief used when bagging with subset, block buffer when reducing the number of elements in the subset */ + uint64_t* cuda_block_sum_buffer_; + /*! \brief CUDA device ID */ + int gpu_device_id_; // CUDA memory @@ -146,6 +164,14 @@ class CUDARowData { uint32_t* cuda_partition_hist_offsets_; /*! \brief used when bagging with subset, used indice */ data_size_t* cuda_used_indices_; + /*! \brief block buffer when calculating prefix sum */ + uint16_t* cuda_block_buffer_uint16_t_; + /*! \brief block buffer when calculating prefix sum */ + uint32_t* cuda_block_buffer_uint32_t_; + /*! \brief block buffer when calculating prefix sum */ + uint64_t* cuda_block_buffer_uint64_t_; + /*! \brief partition ptr buffer */ + uint64_t* cuda_partition_ptr_buffer_; }; } // namespace LightGBM diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 7b168c10fa17..d1fa48c91fe4 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -7,4 +7,72 @@ namespace LightGBM { +template +__global__ void ShufflePrefixSumGlobalKernel(T* values, size_t len, T* block_prefix_sum_buffer) { + __shared__ T shared_mem_buffer[32]; + const size_t index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + T value = 0; + if (index < len) { + value = values[index]; + } + const T prefix_sum_value = ShufflePrefixSum(value, shared_mem_buffer); + values[index] = prefix_sum_value; + if (threadIdx.x == blockDim.x - 1) { + block_prefix_sum_buffer[blockIdx.x] = prefix_sum_value; + } +} + +template +__global__ void ShufflePrefixSumGlobalReduceBlockKernel(T* block_prefix_sum_buffer, int num_blocks) { + __shared__ T shared_mem_buffer[32]; + const int num_blocks_per_thread = (num_blocks + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 2) / (GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1); + int thread_block_start = threadIdx.x == 0 ? 0 : (threadIdx.x - 1) * num_blocks_per_thread; + int thread_block_end = threadIdx.x == 0 ? 0 : min(thread_block_start + num_blocks_per_thread, num_blocks); + T base = 0; + for (int block_index = thread_block_start; block_index < thread_block_end; ++block_index) { + base += block_prefix_sum_buffer[block_index]; + } + base = ShufflePrefixSum(base, shared_mem_buffer); + thread_block_start = threadIdx.x == blockDim.x - 1 ? 0 : threadIdx.x * num_blocks_per_thread; + thread_block_end = threadIdx.x == blockDim.x - 1 ? 0 : min(thread_block_start + num_blocks_per_thread, num_blocks); + for (int block_index = thread_block_start + 1; block_index < thread_block_end; ++block_index) { + block_prefix_sum_buffer[block_index] += block_prefix_sum_buffer[block_index - 1]; + } + for (int block_index = thread_block_start; block_index < thread_block_end; ++block_index) { + block_prefix_sum_buffer[block_index] += base; + } +} + +template +__global__ void ShufflePrefixSumGlobalAddBase(size_t len, const T* block_prefix_sum_buffer, T* values) { + const T base = blockIdx.x == 0 ? 
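// Illustrative sketch (not part of the patch): the three kernels around here
// implement a block-wise prefix sum over a global array (per-block scan, scan of
// the block totals, then adding each block's base). In this patch it is
// presumably used to turn per-row element counts of the bagging subset into a
// row_ptr array for the sparse format; a host-side analogue of that assumed use:
#include <cstdint>
#include <vector>

std::vector<uint64_t> BuildRowPtrFromCounts(const std::vector<uint32_t>& row_counts) {
  // one extra leading zero, mirroring the SetCUDAMemoryOuter(row_ptr, 0, 1) call
  // in CopySparseSubrowData later in this patch
  std::vector<uint64_t> row_ptr(row_counts.size() + 1, 0);
  for (std::size_t i = 0; i < row_counts.size(); ++i) {
    row_ptr[i + 1] = row_ptr[i] + row_counts[i];
  }
  return row_ptr;  // row_ptr.back() is the total number of elements in the subset
}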
0 : block_prefix_sum_buffer[blockIdx.x - 1]; + const size_t index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (index < len) { + values[index] += base; + } +} + +template +void ShufflePrefixSumGlobalInner(T* values, size_t len, T* block_prefix_sum_buffer) { + const int num_blocks = (static_cast(len) + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE; + ShufflePrefixSumGlobalKernel<<>>(values, len, block_prefix_sum_buffer); + ShufflePrefixSumGlobalReduceBlockKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_prefix_sum_buffer, num_blocks); + ShufflePrefixSumGlobalAddBase<<>>(len, block_prefix_sum_buffer, values); +} + +template <> +void ShufflePrefixSumGlobal(uint16_t* values, size_t len, uint16_t* block_prefix_sum_buffer) { + ShufflePrefixSumGlobalInner(values, len, block_prefix_sum_buffer); +} + +template <> +void ShufflePrefixSumGlobal(uint32_t* values, size_t len, uint32_t* block_prefix_sum_buffer) { + ShufflePrefixSumGlobalInner(values, len, block_prefix_sum_buffer); +} + +template <> +void ShufflePrefixSumGlobal(uint64_t* values, size_t len, uint64_t* block_prefix_sum_buffer) { + ShufflePrefixSumGlobalInner(values, len, block_prefix_sum_buffer); +} + } // namespace LightGBM diff --git a/src/io/cuda/cuda_column_data.cu b/src/io/cuda/cuda_column_data.cu index c6b9b0738fe1..881cb91a9676 100644 --- a/src/io/cuda/cuda_column_data.cu +++ b/src/io/cuda/cuda_column_data.cu @@ -14,42 +14,42 @@ __global__ void CopySubrowKernel_ColumnData( const uint8_t* cuda_column_bit_type, const data_size_t* cuda_used_indices, const data_size_t num_used_indices, + const int num_column, void** out_cuda_data_by_column) { - const int column_index = static_cast(blockIdx.x); - const void* in_column_data = in_cuda_data_by_column[column_index]; - void* out_column_data = out_cuda_data_by_column[column_index]; - const uint8_t bit_type = cuda_column_bit_type[column_index]; - const data_size_t local_data_index_start = static_cast(threadIdx.x); - if (bit_type == 8) { - const uint8_t* true_in_column_data = reinterpret_cast(in_column_data); - uint8_t* true_out_column_data = reinterpret_cast(out_column_data); - for (data_size_t local_data_index = local_data_index_start; local_data_index < num_used_indices; local_data_index += static_cast(blockDim.x)) { - const data_size_t global_data_index = cuda_used_indices[local_data_index]; - true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; - } - } else if (bit_type == 16) { - const uint16_t* true_in_column_data = reinterpret_cast(in_column_data); - uint16_t* true_out_column_data = reinterpret_cast(out_column_data); - for (data_size_t local_data_index = local_data_index_start; local_data_index < num_used_indices; local_data_index += static_cast(blockDim.x)) { - const data_size_t global_data_index = cuda_used_indices[local_data_index]; - true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; - } - } else if (bit_type == 32) { - const uint32_t* true_in_column_data = reinterpret_cast(in_column_data); - uint32_t* true_out_column_data = reinterpret_cast(out_column_data); - for (data_size_t local_data_index = local_data_index_start; local_data_index < num_used_indices; local_data_index += static_cast(blockDim.x)) { - const data_size_t global_data_index = cuda_used_indices[local_data_index]; - true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; + const data_size_t local_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (local_data_index < num_used_indices) { + 
for (int column_index = 0; column_index < num_column; ++column_index) { + const void* in_column_data = in_cuda_data_by_column[column_index]; + void* out_column_data = out_cuda_data_by_column[column_index]; + const uint8_t bit_type = cuda_column_bit_type[column_index]; + if (bit_type == 8) { + const uint8_t* true_in_column_data = reinterpret_cast(in_column_data); + uint8_t* true_out_column_data = reinterpret_cast(out_column_data); + const data_size_t global_data_index = cuda_used_indices[local_data_index]; + true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; + } else if (bit_type == 16) { + const uint16_t* true_in_column_data = reinterpret_cast(in_column_data); + uint16_t* true_out_column_data = reinterpret_cast(out_column_data); + const data_size_t global_data_index = cuda_used_indices[local_data_index]; + true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; + } else if (bit_type == 32) { + const uint32_t* true_in_column_data = reinterpret_cast(in_column_data); + uint32_t* true_out_column_data = reinterpret_cast(out_column_data); + const data_size_t global_data_index = cuda_used_indices[local_data_index]; + true_out_column_data[local_data_index] = true_in_column_data[global_data_index]; + } } } } void CUDAColumnData::LaunchCopySubrowKernel(void* const* in_cuda_data_by_column) { - CopySubrowKernel_ColumnData<<>>( + const int num_blocks = (num_used_indices_ + COPY_SUBROW_BLOCK_SIZE_COLUMN_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_COLUMN_DATA; + CopySubrowKernel_ColumnData<<>>( in_cuda_data_by_column, cuda_column_bit_type_, cuda_used_indices_, num_used_indices_, + num_columns_, cuda_data_by_column_); } diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index 9f2bb226fd13..e6aaa5bc8c57 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -9,7 +9,7 @@ namespace LightGBM { CUDARowData::CUDARowData(const Dataset* train_data, const TrainingShareStates* train_share_state, - const int gpu_device_id) { + const int gpu_device_id): gpu_device_id_(gpu_device_id) { num_threads_ = OMP_NUM_THREADS(); num_data_ = train_data->num_data(); num_total_bin_ = static_cast(train_share_state->feature_hist_offsets().back()); @@ -21,6 +21,14 @@ CUDARowData::CUDARowData(const Dataset* train_data, CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); } cuda_used_indices_ = nullptr; + cur_subset_buffer_size_ = 0; + cur_total_elements_buffer_size_ = 0; +} + +CUDARowData::CUDARowData() { + cuda_used_indices_ = nullptr; + cur_subset_buffer_size_ = 0; + cur_total_elements_buffer_size_ = 0; } void CUDARowData::Init(const Dataset* train_data, TrainingShareStates* train_share_state) { @@ -338,11 +346,22 @@ void CUDARowData::CopySubrow( const CUDARowData* full_set, const data_size_t* used_indices, const data_size_t num_used_indices) { - num_used_indices_ = num_used_indices; if (cuda_used_indices_ == nullptr) { + CHECK_EQ(cur_subset_buffer_size_, 0); + CHECK_EQ(cur_total_elements_buffer_size_, 0); // initialize meta information + num_threads_ = full_set->num_threads_; + num_data_ = full_set->num_data_; + num_total_bin_ = full_set->num_total_bin_; + num_feature_group_ = full_set->num_feature_group_; + num_feature_ = full_set->num_feature_; + gpu_device_id_ = full_set->gpu_device_id_; + if (gpu_device_id_ >= 0) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id_)); + } else { + CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + } bit_type_ = full_set->bit_type_; - row_ptr_bit_type_ = full_set->row_ptr_bit_type_; is_sparse_ = full_set->is_sparse_; 
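// Illustrative sketch (not part of the patch): the sparse path below sizes its
// data buffer with CalcTotalNumberOfElements, i.e. the summed lengths of the
// selected rows. Stated on the host against a CSR-style row pointer array:
#include <cstdint>
#include <vector>

uint64_t CountSubsetElements(const std::vector<uint64_t>& full_row_ptr,
                             const std::vector<int>& used_indices) {
  uint64_t total = 0;
  for (const int row : used_indices) {
    // length of one selected row in the full sparse row-wise data
    total += full_row_ptr[row + 1] - full_row_ptr[row];
  }
  return total;
}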
feature_partition_column_index_offsets_ = full_set->feature_partition_column_index_offsets_; column_hist_offsets_ = full_set->column_hist_offsets_; @@ -358,33 +377,134 @@ void CUDARowData::CopySubrow( column_hist_offsets_.data(), column_hist_offsets_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_partition_hist_offsets_, partition_hist_offsets_.data(), partition_hist_offsets_.size(), __FILE__, __LINE__); + } + if (!full_set->is_sparse_) { + CopyDenseSubrowData(full_set, num_used_indices, used_indices); + } else { + CopySparseSubrowData(full_set, num_used_indices, used_indices); + } +} - cur_subset_buffer_size_ = num_used_indices_; - InitCUDAMemoryFromHostMemoryOuter(&cuda_used_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); - if (!is_sparse_) { - const int num_column = feature_partition_column_index_offsets_.back(); - size_t total_size = static_cast(num_used_indices_ * num_column); +uint64_t CUDARowData::CalcTotalNumberOfElements(const CUDARowData* full_set) { + return LaunchCalcTotalNumberOfElementsKernel(full_set); +} + +void CUDARowData::CopyDenseSubrowData(const CUDARowData* full_set, const data_size_t num_used_indices, const data_size_t* used_indices) { + num_used_indices_ = num_used_indices; + if (num_used_indices_ > cur_subset_buffer_size_) { + // allocate cuda memory + if (cur_subset_buffer_size_ == 0) { + CHECK_EQ(cuda_used_indices_, nullptr); + CHECK_EQ(cur_total_elements_buffer_size_, 0); + } else { + DeallocateCUDAMemoryOuter(&cuda_used_indices_, __FILE__, __LINE__); if (bit_type_ == 8) { - AllocateCUDAMemoryOuter(&cuda_data_uint8_t_, total_size, __FILE__, __LINE__); + DeallocateCUDAMemoryOuter(&cuda_data_uint8_t_, __FILE__, __LINE__); } else if (bit_type_ == 16) { - AllocateCUDAMemoryOuter(&cuda_data_uint16_t_, total_size, __FILE__, __LINE__); + DeallocateCUDAMemoryOuter(&cuda_data_uint16_t_, __FILE__, __LINE__); } else if (bit_type_ == 32) { - AllocateCUDAMemoryOuter(&cuda_data_uint32_t_, total_size, __FILE__, __LINE__); + DeallocateCUDAMemoryOuter(&cuda_data_uint32_t_, __FILE__, __LINE__); } + } + AllocateCUDAMemoryOuter(&cuda_used_indices_, static_cast(num_used_indices_), __FILE__, __LINE__); + const int num_column = feature_partition_column_index_offsets_.back(); + size_t total_size = static_cast(num_used_indices_ * num_column); + if (bit_type_ == 8) { + AllocateCUDAMemoryOuter(&cuda_data_uint8_t_, total_size, __FILE__, __LINE__); + } else if (bit_type_ == 16) { + AllocateCUDAMemoryOuter(&cuda_data_uint16_t_, total_size, __FILE__, __LINE__); + } else if (bit_type_ == 32) { + AllocateCUDAMemoryOuter(&cuda_data_uint32_t_, total_size, __FILE__, __LINE__); + } + cur_subset_buffer_size_ = num_used_indices_; + } + CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); + LaunchCopyDenseSubrowKernel(full_set); +} + +void CUDARowData::CopySparseSubrowData(const CUDARowData* full_set, const data_size_t num_used_indices, const data_size_t* used_indices) { + num_used_indices_ = num_used_indices; + bool need_reallocate_row_ptr = false; + bool need_reallocate_data = false; + if (num_used_indices_ > cur_subset_buffer_size_) { + if (cur_subset_buffer_size_ == 0) { + CHECK_EQ(cur_total_elements_buffer_size_, 0); + CHECK_EQ(cuda_used_indices_, nullptr); + const int num_blocks = (num_data_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; + AllocateCUDAMemoryOuter(&cuda_block_sum_buffer_, static_cast(num_blocks * num_feature_partitions_) + 1, __FILE__, 
__LINE__); + AllocateCUDAMemoryOuter(&cuda_partition_ptr_uint16_t_, static_cast(num_feature_partitions_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_partition_ptr_uint32_t_, static_cast(num_feature_partitions_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_partition_ptr_uint64_t_, static_cast(num_feature_partitions_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_partition_ptr_buffer_, static_cast(num_feature_partitions_) + 1, __FILE__, __LINE__); } else { - // TODO(shiyu1994): copy subrow for sparse data + DeallocateCUDAMemoryOuter(&cuda_used_indices_, __FILE__, __LINE__); + } + AllocateCUDAMemoryOuter(&cuda_used_indices_, static_cast(num_used_indices_), __FILE__, __LINE__); + need_reallocate_row_ptr = true; + } + num_total_elements_ = CalcTotalNumberOfElements(full_set); + if (num_total_elements_ > cur_total_elements_buffer_size_) { + need_reallocate_data = true; + } + if (num_total_elements_ <= std::numeric_limits::max()) { + if (row_ptr_bit_type_ != 16) { + need_reallocate_row_ptr = true; + } + } else if (num_total_elements_ <= std::numeric_limits::max()) { + if (row_ptr_bit_type_ != 32) { + need_reallocate_row_ptr = true; } } else { - if (num_used_indices_ > cur_subset_buffer_size_) { - ResizeWhenCopySubrow(num_used_indices_); - cur_subset_buffer_size_ = num_used_indices_; + if (row_ptr_bit_type_ != 64) { + need_reallocate_row_ptr = true; + } + } + if (need_reallocate_row_ptr) { + if (cur_subset_buffer_size_ > 0) { + if (row_ptr_bit_type_ == 16) { + DeallocateCUDAMemoryOuter(&cuda_row_ptr_uint16_t_, __FILE__, __LINE__); + } else if (row_ptr_bit_type_ == 32) { + DeallocateCUDAMemoryOuter(&cuda_row_ptr_uint32_t_, __FILE__, __LINE__); + } else if (row_ptr_bit_type_ == 64) { + DeallocateCUDAMemoryOuter(&cuda_row_ptr_uint64_t_, __FILE__, __LINE__); + } } - CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_used_indices_), __FILE__, __LINE__); - LaunchCopySubrowKernel(full_set); + if (num_total_elements_ <= std::numeric_limits::max()) { + row_ptr_bit_type_ = 16; + AllocateCUDAMemoryOuter(&cuda_row_ptr_uint16_t_, static_cast(num_used_indices_) + 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_row_ptr_uint16_t_, 0, 1, __FILE__, __LINE__); + } else if (num_total_elements_ <= std::numeric_limits::max()) { + row_ptr_bit_type_ = 32; + AllocateCUDAMemoryOuter(&cuda_row_ptr_uint32_t_, static_cast(num_used_indices_) + 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_row_ptr_uint32_t_, 0, 1, __FILE__, __LINE__); + } else { + row_ptr_bit_type_ = 64; + AllocateCUDAMemoryOuter(&cuda_row_ptr_uint64_t_, static_cast(num_used_indices_) + 1, __FILE__, __LINE__); + SetCUDAMemoryOuter(cuda_row_ptr_uint64_t_, 0, 1, __FILE__, __LINE__); + } + cur_subset_buffer_size_ = num_used_indices_; } + if (need_reallocate_data) { + if (cur_total_elements_buffer_size_ > 0) { + if (bit_type_ == 8) { + DeallocateCUDAMemoryOuter(&cuda_data_uint8_t_, __FILE__, __LINE__); + } else if (bit_type_ == 16) { + DeallocateCUDAMemoryOuter(&cuda_data_uint16_t_, __FILE__, __LINE__); + } else if (bit_type_ == 32) { + DeallocateCUDAMemoryOuter(&cuda_data_uint32_t_, __FILE__, __LINE__); + } + } + if (bit_type_ == 8) { + AllocateCUDAMemoryOuter(&cuda_data_uint8_t_, num_total_elements_, __FILE__, __LINE__); + } else if (bit_type_ == 16) { + AllocateCUDAMemoryOuter(&cuda_data_uint16_t_, num_total_elements_, __FILE__, __LINE__); + } else if (bit_type_ == 32) { + AllocateCUDAMemoryOuter(&cuda_data_uint32_t_, num_total_elements_, __FILE__, __LINE__); + } + cur_total_elements_buffer_size_ = 
num_total_elements_; + } + CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_used_indices_), __FILE__, __LINE__); + LaunchCopySparseSubrowKernel(full_set); } -// TODO(shiyu1994): implement this -void CUDARowData::ResizeWhenCopySubrow(const data_size_t /*num_used_indices*/) {} - } // namespace LightGBM diff --git a/src/io/cuda/cuda_row_data.cu b/src/io/cuda/cuda_row_data.cu index 22e7a9905934..c1c1483a5f60 100644 --- a/src/io/cuda/cuda_row_data.cu +++ b/src/io/cuda/cuda_row_data.cu @@ -4,8 +4,7 @@ */ #include - -#define COPY_SUBROW_BLOCK_SIZE_ROW_DATA (1024) +#include namespace LightGBM { @@ -23,25 +22,257 @@ __global__ void CopySubrowDenseKernel(const BIN_TYPE* full_set_bin_data, const i } } -void CUDARowData::LaunchCopySubrowKernel(const CUDARowData* full_set) { +void CUDARowData::LaunchCopyDenseSubrowKernel(const CUDARowData* full_set) { const int num_column = feature_partition_column_index_offsets_.back(); - if (!is_sparse_) { - const int num_blocks = (num_used_indices_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; + const int num_blocks = (num_used_indices_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; + if (bit_type_ == 8) { + const uint8_t* full_set_bin_data = full_set->cuda_data_uint8_t_; + CopySubrowDenseKernel<<>>( + full_set_bin_data, num_column, num_used_indices_, cuda_used_indices_, cuda_data_uint8_t_); + } else if (bit_type_ == 16) { + const uint16_t* full_set_bin_data = full_set->cuda_data_uint16_t_; + CopySubrowDenseKernel<<>>( + full_set_bin_data, num_column, num_used_indices_, cuda_used_indices_, cuda_data_uint16_t_); + } else if (bit_type_ == 32) { + const uint32_t* full_set_bin_data = full_set->cuda_data_uint32_t_; + CopySubrowDenseKernel<<>>( + full_set_bin_data, num_column, num_used_indices_, cuda_used_indices_, cuda_data_uint32_t_); + } +} + +template +__global__ void CalcTotalNumberOfElementsKernel( + const data_size_t num_used_indices, + const data_size_t* cuda_used_indices, + const ROW_PTR_TYPE* cuda_row_ptr, + const int num_feature_partitions, + const data_size_t num_data, + uint64_t* block_sum_buffer) { + __shared__ uint64_t shared_mem_buffer[32]; + const data_size_t local_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const int partition_index = static_cast(blockIdx.y); + const ROW_PTR_TYPE* partition_row_ptr = cuda_row_ptr + partition_index * (num_data + 1); + uint64_t num_elements_in_row = 0; + if (local_data_index < num_used_indices) { + const data_size_t global_data_index = cuda_used_indices[local_data_index]; + const data_size_t row_start = partition_row_ptr[global_data_index]; + const data_size_t row_end = partition_row_ptr[global_data_index + 1]; + num_elements_in_row += static_cast(row_end - row_start); + } + const uint64_t num_elements_in_block = ShuffleReduceSum(num_elements_in_row, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + block_sum_buffer[partition_index * blockDim.x + blockIdx.x] = num_elements_in_block; + } +} + +__global__ void ReduceBlockSumKernel( + const uint64_t* block_sum_buffer, + const int num_blocks, + const int num_feature_partitions, + uint64_t* cuda_partition_ptr_buffer) { + __shared__ uint64_t shared_mem_buffer[32]; + uint64_t thread_sum = 0; + const int partition_index = static_cast(blockIdx.y); + const uint64_t* block_sum_buffer_ptr = block_sum_buffer + partition_index * blockDim.x; + for (data_size_t block_index = static_cast(threadIdx.x); block_index < num_blocks; ++block_index) { + thread_sum += 
block_sum_buffer_ptr[block_index]; + } + const uint64_t num_total_elements = ShuffleReduceSum(thread_sum, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + cuda_partition_ptr_buffer[partition_index + 1] = num_total_elements; + if (blockIdx.x == 0) { + cuda_partition_ptr_buffer[0] = 0; + } + } +} + +__global__ void ComputePartitionPtr( + uint64_t* cuda_partition_ptr_buffer, + const int num_feature_partitions) { + __shared__ uint64_t shared_mem_buffer[32]; + const int num_partitions_per_thread = (num_feature_partitions + blockDim.x - 1) / (blockDim.x - 1); + int start_partition = threadIdx.x == 0 ? 0 : num_partitions_per_thread * static_cast(threadIdx.x - 1); + int end_partition = threadIdx.x == 0 ? 0 : min(start_partition + num_partitions_per_thread, num_feature_partitions + 1); + uint64_t thread_sum = 0; + for (int partition_index = start_partition; partition_index < end_partition; ++partition_index) { + thread_sum += cuda_partition_ptr_buffer[partition_index]; + } + const uint64_t thread_base = ShufflePrefixSum(thread_sum, shared_mem_buffer); + start_partition = threadIdx.x == blockDim.x - 1 ? 0 : num_partitions_per_thread * static_cast(threadIdx.x); + end_partition = threadIdx.x == blockDim.x - 1 ? 0 : min(start_partition + num_partitions_per_thread, num_feature_partitions + 1); + for (int partition_index = start_partition + 1; partition_index < end_partition; ++partition_index) { + cuda_partition_ptr_buffer[partition_index] += cuda_partition_ptr_buffer[partition_index - 1]; + } + for (int partition_index = start_partition; partition_index < end_partition; ++partition_index) { + cuda_partition_ptr_buffer[partition_index] += thread_base; + } + if (threadIdx.x == blockDim.x - 1) { + cuda_partition_ptr_buffer[num_feature_partitions] = thread_sum; + } +} + +uint64_t CUDARowData::LaunchCalcTotalNumberOfElementsKernel(const CUDARowData* full_set) { + const int num_blocks = (num_data_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; + SetCUDAMemoryOuter(cuda_block_sum_buffer_, 0, static_cast(num_blocks * num_feature_partitions_) + 1, __FILE__, __LINE__); + if (full_set->row_ptr_bit_type_ == 16) { + CalcTotalNumberOfElementsKernel<<>>( + num_used_indices_, + cuda_used_indices_, + full_set->cuda_row_ptr_uint16_t_, + num_feature_partitions_, + num_data_, + cuda_block_sum_buffer_); + } else if (full_set->row_ptr_bit_type_ == 32) { + CalcTotalNumberOfElementsKernel<<>>( + num_used_indices_, + cuda_used_indices_, + full_set->cuda_row_ptr_uint32_t_, + num_feature_partitions_, + num_data_, + cuda_block_sum_buffer_); + } else if (full_set->row_ptr_bit_type_ == 64) { + CalcTotalNumberOfElementsKernel<<>>( + num_used_indices_, + cuda_used_indices_, + full_set->cuda_row_ptr_uint64_t_, + num_feature_partitions_, + num_data_, + cuda_block_sum_buffer_); + } + ReduceBlockSumKernel<<>>( + cuda_block_sum_buffer_, num_blocks, num_feature_partitions_, cuda_partition_ptr_buffer_); + ComputePartitionPtr<<<1, COPY_SUBROW_BLOCK_SIZE_ROW_DATA>>>(cuda_partition_ptr_buffer_, num_feature_partitions_); + uint64_t num_total_elements = 0; + CopyFromCUDADeviceToHostOuter(&num_total_elements, cuda_partition_ptr_buffer_, num_feature_partitions_, __FILE__, __LINE__); + return num_total_elements; +} + +template +__global__ void CopyPartitionPtrKernel( + const uint64_t* cuda_partition_ptr_buffer, + const int num_feature_partitions, + ROW_PTR_TYPE* cuda_partition_ptr) { + for (int partition_index = static_cast(threadIdx.x); partition_index < num_feature_partitions + 1; partition_index += 
static_cast(blockDim.x)) { + cuda_partition_ptr[partition_index] = static_cast(cuda_partition_ptr_buffer[partition_index]); + } +} + +template +__global__ void CopySparseSubrowRowPtrKernel( + const IN_ROW_PTR_TYPE* cuda_row_ptr, + const data_size_t num_used_indices, + const data_size_t* cuda_used_indices, + OUT_ROW_PTR_TYPE* out_cuda_row_ptr) { + const data_size_t local_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (local_data_index > num_used_indices) { + const data_size_t global_data_index = cuda_used_indices[local_data_index]; + const IN_ROW_PTR_TYPE row_start = cuda_row_ptr[global_data_index]; + const IN_ROW_PTR_TYPE row_end = cuda_row_ptr[global_data_index + 1]; + const OUT_ROW_PTR_TYPE num_elements_in_row = static_cast(row_end - row_start); + out_cuda_row_ptr[local_data_index + 1] = num_elements_in_row; + } +} + +template +__global__ void CopySparseSubrowDataKernel( + const BIN_TYPE* in_cuda_data, + const ROW_PTR_TYPE* cuda_row_ptr, + const data_size_t num_used_indices, + const data_size_t* cuda_used_indices, + BIN_TYPE* out_cuda_data) { + const data_size_t local_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (local_data_index < num_used_indices) { + const data_size_t global_data_index = cuda_used_indices[local_data_index]; + const ROW_PTR_TYPE row_start = cuda_row_ptr[global_data_index]; + const ROW_PTR_TYPE row_end = cuda_row_ptr[global_data_index + 1]; + const ROW_PTR_TYPE num_elements_in_row = row_end - row_start; + const BIN_TYPE* in_cuda_data_ptr = in_cuda_data + row_start; + BIN_TYPE* out_cuda_data_ptr = out_cuda_data + row_start; + for (ROW_PTR_TYPE element_index = 0; element_index < num_elements_in_row; ++element_index) { + out_cuda_data_ptr[element_index] = in_cuda_data_ptr[element_index]; + } + } +} + +void CUDARowData::LaunchCopySparseSubrowKernel(const CUDARowData* full_set) { + if (row_ptr_bit_type_ == 16) { + CopyPartitionPtrKernel<<<1, COPY_SUBROW_BLOCK_SIZE_ROW_DATA>>>(cuda_partition_ptr_buffer_, num_feature_partitions_, cuda_partition_ptr_uint16_t_); + } else if (row_ptr_bit_type_ == 32) { + CopyPartitionPtrKernel<<<1, COPY_SUBROW_BLOCK_SIZE_ROW_DATA>>>(cuda_partition_ptr_buffer_, num_feature_partitions_, cuda_partition_ptr_uint32_t_); + } else if (row_ptr_bit_type_ == 64) { + CopyPartitionPtrKernel<<<1, COPY_SUBROW_BLOCK_SIZE_ROW_DATA>>>(cuda_partition_ptr_buffer_, num_feature_partitions_, cuda_partition_ptr_uint64_t_); + } + const int num_blocks = (num_used_indices_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; + if (full_set->row_ptr_bit_type_ == 16) { + CHECK_EQ(row_ptr_bit_type_, 16); + CopySparseSubrowRowPtrKernel<<>>( + full_set->cuda_row_ptr_uint16_t_, num_used_indices_, cuda_used_indices_, cuda_row_ptr_uint16_t_); + } else if (full_set->row_ptr_bit_type_ == 32) { + CHECK(row_ptr_bit_type_ == 16 || row_ptr_bit_type_ == 32); + if (row_ptr_bit_type_ == 16) { + CopySparseSubrowRowPtrKernel<<>>( + full_set->cuda_row_ptr_uint32_t_, num_used_indices_, cuda_used_indices_, cuda_row_ptr_uint16_t_); + } else if (row_ptr_bit_type_ == 32) { + CopySparseSubrowRowPtrKernel<<>>( + full_set->cuda_row_ptr_uint32_t_, num_used_indices_, cuda_used_indices_, cuda_row_ptr_uint32_t_); + } + } else if (full_set->row_ptr_bit_type_ == 64) { + if (row_ptr_bit_type_ == 16) { + CopySparseSubrowRowPtrKernel<<>>( + full_set->cuda_row_ptr_uint64_t_, num_used_indices_, cuda_used_indices_, cuda_row_ptr_uint16_t_); + } else if (row_ptr_bit_type_ == 32) { + CopySparseSubrowRowPtrKernel<<>>( + 
full_set->cuda_row_ptr_uint64_t_, num_used_indices_, cuda_used_indices_, cuda_row_ptr_uint32_t_); + } else if (row_ptr_bit_type_ == 64) { + CopySparseSubrowRowPtrKernel<<>>( + full_set->cuda_row_ptr_uint64_t_, num_used_indices_, cuda_used_indices_, cuda_row_ptr_uint64_t_); + } + } + if (row_ptr_bit_type_ == 16) { + ShufflePrefixSumGlobal( + cuda_row_ptr_uint16_t_, + static_cast(num_used_indices_) + 1, + reinterpret_cast(cuda_block_sum_buffer_)); + if (bit_type_ == 8) { + CopySparseSubrowDataKernel<<>>( + full_set->cuda_data_uint8_t_, cuda_row_ptr_uint16_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint8_t_); + } else if (bit_type_ == 16) { + CopySparseSubrowDataKernel<<>>( + full_set->cuda_data_uint16_t_, cuda_row_ptr_uint16_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint16_t_); + } else if (bit_type_ == 32) { + CopySparseSubrowDataKernel<<>>( + full_set->cuda_data_uint32_t_, cuda_row_ptr_uint16_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint32_t_); + } + } else if (row_ptr_bit_type_ == 32) { + ShufflePrefixSumGlobal( + cuda_row_ptr_uint32_t_, + static_cast(num_used_indices_) + 1, + reinterpret_cast(cuda_block_sum_buffer_)); + if (bit_type_ == 8) { + CopySparseSubrowDataKernel<<>>( + full_set->cuda_data_uint8_t_, cuda_row_ptr_uint32_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint8_t_); + } else if (bit_type_ == 16) { + CopySparseSubrowDataKernel<<>>( + full_set->cuda_data_uint16_t_, cuda_row_ptr_uint32_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint16_t_); + } else if (bit_type_ == 32) { + CopySparseSubrowDataKernel<<>>( + full_set->cuda_data_uint32_t_, cuda_row_ptr_uint32_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint32_t_); + } + } else if (row_ptr_bit_type_ == 64) { + ShufflePrefixSumGlobal( + cuda_row_ptr_uint64_t_, + static_cast(num_used_indices_) + 1, + reinterpret_cast(cuda_block_sum_buffer_)); if (bit_type_ == 8) { - const uint8_t* full_set_bin_data = full_set->cuda_data_uint8_t_; - CopySubrowDenseKernel<<>>( - full_set_bin_data, num_column, num_used_indices_, cuda_used_indices_, cuda_data_uint8_t_); + CopySparseSubrowDataKernel<<>>( + full_set->cuda_data_uint8_t_, cuda_row_ptr_uint64_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint8_t_); } else if (bit_type_ == 16) { - const uint16_t* full_set_bin_data = full_set->cuda_data_uint16_t_; - CopySubrowDenseKernel<<>>( - full_set_bin_data, num_column, num_used_indices_, cuda_used_indices_, cuda_data_uint16_t_); + CopySparseSubrowDataKernel<<>>( + full_set->cuda_data_uint16_t_, cuda_row_ptr_uint64_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint16_t_); } else if (bit_type_ == 32) { - const uint32_t* full_set_bin_data = full_set->cuda_data_uint32_t_; - CopySubrowDenseKernel<<>>( - full_set_bin_data, num_column, num_used_indices_, cuda_used_indices_, cuda_data_uint32_t_); + CopySparseSubrowDataKernel<<>>( + full_set->cuda_data_uint32_t_, cuda_row_ptr_uint64_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint32_t_); } - } else { - // TODO(shiyu1994): copy subrow for sparse data } } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 64b735d5cf90..a6374797086e 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -54,6 +54,8 @@ CUDAHistogramConstructor::CUDAHistogramConstructor( } num_total_bin_ = offset; cuda_row_data_.reset(nullptr); + cuda_row_data_subset_.reset(nullptr); + use_bagging_subset_ = false; } void 
CUDAHistogramConstructor::BeforeTrain(const score_t* gradients, const score_t* hessians) { @@ -83,7 +85,6 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStat InitCUDAMemoryFromHostMemoryOuter(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemoryOuter(&cuda_need_fix_histogram_features_num_bin_aligned_, need_fix_histogram_features_num_bin_aligend_.data(), need_fix_histogram_features_num_bin_aligend_.size(), __FILE__, __LINE__); - cuda_used_indices_ = nullptr; } void CUDAHistogramConstructor::ConstructHistogramForLeaf( @@ -110,9 +111,10 @@ void CUDAHistogramConstructor::CalcConstructHistogramKernelDim( int* block_dim_x, int* block_dim_y, const data_size_t num_data_in_smaller_leaf) { - *block_dim_x = cuda_row_data_->max_num_column_per_partition(); - *block_dim_y = NUM_THRADS_PER_BLOCK / cuda_row_data_->max_num_column_per_partition(); - *grid_dim_x = cuda_row_data_->num_feature_partitions(); + const CUDARowData* cuda_row_data = use_bagging_subset_ ? cuda_row_data_subset_.get() : cuda_row_data_.get(); + *block_dim_x = cuda_row_data->max_num_column_per_partition(); + *block_dim_y = NUM_THRADS_PER_BLOCK / cuda_row_data->max_num_column_per_partition(); + *grid_dim_x = cuda_row_data->num_feature_partitions(); *grid_dim_y = std::max(min_grid_dim_y_, ((num_data_in_smaller_leaf + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + (*block_dim_y) - 1) / (*block_dim_y)); } @@ -121,11 +123,12 @@ void CUDAHistogramConstructor::ResetTrainingData(const Dataset* train_data) { num_data_ = train_data->num_data(); } -void CUDAHistogramConstructor::SetUsedDataIndices(const data_size_t* used_indices, const data_size_t num_data) { - if (cuda_used_indices_ == nullptr) { - AllocateCUDAMemoryOuter(&cuda_used_indices_, static_cast(num_data_), __FILE__, __LINE__); +void CUDAHistogramConstructor::SetBaggingSubset(const data_size_t* used_indices, const data_size_t num_data) { + if (cuda_row_data_subset_ == nullptr) { + cuda_row_data_subset_.reset(new CUDARowData()); } - CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_data), __FILE__, __LINE__); + cuda_row_data_subset_->CopySubrow(cuda_row_data_.get(), used_indices, num_data); + use_bagging_subset_ = true; } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index bb712f15fb54..206920ccd2da 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -11,7 +11,7 @@ namespace LightGBM { -template +template __global__ void CUDAConstructHistogramDenseKernel( const CUDALeafSplitsStruct* smaller_leaf_splits, const score_t* cuda_gradients, @@ -20,8 +20,7 @@ __global__ void CUDAConstructHistogramDenseKernel( const uint32_t* column_hist_offsets, const uint32_t* column_hist_offsets_full, const int* feature_partition_column_index_offsets, - const data_size_t num_data, - const data_size_t* used_indices) { + const data_size_t num_data) { const int dim_y = static_cast(gridDim.y * blockDim.y); const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; @@ -56,9 +55,7 @@ __global__ void CUDAConstructHistogramDenseKernel( const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; const score_t grad = cuda_gradients[data_index]; const score_t hess 
= cuda_hessians[data_index]; - const uint32_t bin = USE_SUBSET ? - static_cast(data_ptr[used_indices[data_index] * num_columns_in_partition + threadIdx.x]) : - static_cast(data_ptr[data_index * num_columns_in_partition + threadIdx.x]); + const uint32_t bin = static_cast(data_ptr[data_index * num_columns_in_partition + threadIdx.x]); const uint32_t pos = bin << 1; float* pos_ptr = shared_hist_ptr + pos; atomicAdd_block(pos_ptr, grad); @@ -73,7 +70,7 @@ __global__ void CUDAConstructHistogramDenseKernel( } } -template +template __global__ void CUDAConstructHistogramSparseKernel( const CUDALeafSplitsStruct* smaller_leaf_splits, const score_t* cuda_gradients, @@ -82,8 +79,7 @@ __global__ void CUDAConstructHistogramSparseKernel( const DATA_PTR_TYPE* row_ptr, const DATA_PTR_TYPE* partition_ptr, const uint32_t* column_hist_offsets_full, - const data_size_t num_data, - const data_size_t* used_indices) { + const data_size_t num_data) { const int dim_y = static_cast(gridDim.y * blockDim.y); const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; @@ -111,8 +107,8 @@ __global__ void CUDAConstructHistogramSparseKernel( data_size_t inner_data_index = static_cast(threadIdx_y); for (data_size_t i = 0; i < num_iteration_this; ++i) { const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; - const DATA_PTR_TYPE row_start = USE_SUBSET ? block_row_ptr[used_indices[data_index]] : block_row_ptr[data_index]; - const DATA_PTR_TYPE row_end = USE_SUBSET ? block_row_ptr[used_indices[data_index] + 1] : block_row_ptr[data_index + 1]; + const DATA_PTR_TYPE row_start = block_row_ptr[data_index]; + const DATA_PTR_TYPE row_end = block_row_ptr[data_index + 1]; const DATA_PTR_TYPE row_size = row_end - row_start; if (threadIdx.x < row_size) { const score_t grad = cuda_gradients[data_index]; @@ -142,246 +138,126 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_in_smaller_leaf); dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); - if (cuda_used_indices_ == nullptr) { - if (cuda_row_data_->is_sparse()) { - if (cuda_row_data_->bit_type() == 8) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, nullptr); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, nullptr); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, nullptr); - } - } else if (cuda_row_data_->bit_type() == 16) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( - 
cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, nullptr); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, nullptr); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, nullptr); - } - } else if (cuda_row_data_->bit_type() == 32) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, nullptr); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, nullptr); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, nullptr); - } - } - } else { - if (cuda_row_data_->bit_type() == 8) { - CUDAConstructHistogramDenseKernel<<>>( + const CUDARowData* cuda_row_data = use_bagging_subset_ ? 
cuda_row_data_subset_.get() : cuda_row_data_.get(); + + if (cuda_row_data->is_sparse()) { + if (cuda_row_data->bit_type() == 8) { + if (cuda_row_data->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_, nullptr); - } else if (cuda_row_data_->bit_type() == 16) { - CUDAConstructHistogramDenseKernel<<>>( + cuda_row_data->cuda_data_uint8(), + cuda_row_data->cuda_row_ptr_uint16(), + cuda_row_data->cuda_partition_ptr_uint16(), + cuda_row_data->cuda_partition_hist_offsets(), + num_data_); + } else if (cuda_row_data->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_, nullptr); - } else if (cuda_row_data_->bit_type() == 32) { - CUDAConstructHistogramDenseKernel<<>>( + cuda_row_data->cuda_data_uint8(), + cuda_row_data->cuda_row_ptr_uint32(), + cuda_row_data->cuda_partition_ptr_uint32(), + cuda_row_data->cuda_partition_hist_offsets(), + num_data_); + } else if (cuda_row_data->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_, nullptr); + cuda_row_data->cuda_data_uint8(), + cuda_row_data->cuda_row_ptr_uint64(), + cuda_row_data->cuda_partition_ptr_uint64(), + cuda_row_data->cuda_partition_hist_offsets(), + num_data_); } - } - } else { - if (cuda_row_data_->is_sparse()) { - if (cuda_row_data_->bit_type() == 8) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, cuda_used_indices_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, cuda_used_indices_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, cuda_used_indices_); - } - } else if (cuda_row_data_->bit_type() == 16) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), - 
cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, cuda_used_indices_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, cuda_used_indices_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, cuda_used_indices_); - } - } else if (cuda_row_data_->bit_type() == 32) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, cuda_used_indices_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, cuda_used_indices_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, cuda_used_indices_); - } + } else if (cuda_row_data->bit_type() == 16) { + if (cuda_row_data->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data->cuda_data_uint16(), + cuda_row_data->cuda_row_ptr_uint16(), + cuda_row_data->cuda_partition_ptr_uint16(), + cuda_row_data->cuda_partition_hist_offsets(), + num_data_); + } else if (cuda_row_data->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data->cuda_data_uint16(), + cuda_row_data->cuda_row_ptr_uint32(), + cuda_row_data->cuda_partition_ptr_uint32(), + cuda_row_data->cuda_partition_hist_offsets(), + num_data_); + } else if (cuda_row_data->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data->cuda_data_uint16(), + cuda_row_data->cuda_row_ptr_uint64(), + cuda_row_data->cuda_partition_ptr_uint64(), + cuda_row_data->cuda_partition_hist_offsets(), + num_data_); } - } else { - if (cuda_row_data_->bit_type() == 8) { - CUDAConstructHistogramDenseKernel<<>>( + } else if (cuda_row_data->bit_type() == 32) { + if (cuda_row_data->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), 
- num_data_, cuda_used_indices_); - } else if (cuda_row_data_->bit_type() == 16) { - CUDAConstructHistogramDenseKernel<<>>( + cuda_row_data->cuda_data_uint32(), + cuda_row_data->cuda_row_ptr_uint16(), + cuda_row_data->cuda_partition_ptr_uint16(), + cuda_row_data->cuda_partition_hist_offsets(), + num_data_); + } else if (cuda_row_data->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_, cuda_used_indices_); - } else if (cuda_row_data_->bit_type() == 32) { - CUDAConstructHistogramDenseKernel<<>>( + cuda_row_data->cuda_data_uint32(), + cuda_row_data->cuda_row_ptr_uint32(), + cuda_row_data->cuda_partition_ptr_uint32(), + cuda_row_data->cuda_partition_hist_offsets(), + num_data_); + } else if (cuda_row_data->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_, cuda_used_indices_); + cuda_row_data->cuda_data_uint32(), + cuda_row_data->cuda_row_ptr_uint64(), + cuda_row_data->cuda_partition_ptr_uint64(), + cuda_row_data->cuda_partition_hist_offsets(), + num_data_); } } + } else { + if (cuda_row_data->bit_type() == 8) { + CUDAConstructHistogramDenseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data->cuda_data_uint8(), + cuda_row_data->cuda_column_hist_offsets(), + cuda_row_data->cuda_partition_hist_offsets(), + cuda_row_data->cuda_feature_partition_column_index_offsets(), + num_data_); + } else if (cuda_row_data->bit_type() == 16) { + CUDAConstructHistogramDenseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data->cuda_data_uint16(), + cuda_row_data->cuda_column_hist_offsets(), + cuda_row_data->cuda_partition_hist_offsets(), + cuda_row_data->cuda_feature_partition_column_index_offsets(), + num_data_); + } else if (cuda_row_data->bit_type() == 32) { + CUDAConstructHistogramDenseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data->cuda_data_uint32(), + cuda_row_data->cuda_column_hist_offsets(), + cuda_row_data->cuda_partition_hist_offsets(), + cuda_row_data->cuda_feature_partition_column_index_offsets(), + num_data_); + } } } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 5ca6fb4047e3..aae853b28084 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -51,7 +51,7 @@ class CUDAHistogramConstructor { const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf); - void SetUsedDataIndices(const data_size_t* used_indices, const data_size_t num_data); + void SetBaggingSubset(const data_size_t* used_indices, const data_size_t num_data); void ResetTrainingData(const Dataset* train_data); @@ -111,6 +111,8 @@ class CUDAHistogramConstructor { std::vector need_fix_histogram_features_num_bin_aligend_; /*! \brief minimum number of blocks allowed in the y dimension */ const int min_grid_dim_y_ = 160; + /*! 
\brief whether use bagging with subset */
+  bool use_bagging_subset_;
 
   // CUDA memory, held by this object
 
@@ -131,9 +133,6 @@ class CUDAHistogramConstructor {
   int* cuda_need_fix_histogram_features_;
   /*! \brief aligned number of bins of the features whose histograms need to be fixed */
   uint32_t* cuda_need_fix_histogram_features_num_bin_aligned_;
-  /*! \brief used data indices when using subset for bagging */
-  data_size_t* cuda_used_indices_;
-
   // CUDA memory, held by other object
 
diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp
index 0732dccf5cc2..f7d18bf4f5da 100644
--- a/src/treelearner/cuda/new_cuda_tree_learner.cpp
+++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp
@@ -199,7 +199,7 @@ void NewCUDATreeLearner::SetBaggingData(const Dataset* subset,
   if (subset == nullptr) {
     cuda_data_partition_->SetUsedDataIndices(used_indices, num_data);
   } else {
-    cuda_histogram_constructor_->SetUsedDataIndices(used_indices, num_data);
+    cuda_histogram_constructor_->SetBaggingSubset(used_indices, num_data);
     train_data_ = subset;
     num_data_ = train_data_->num_data();
     CHECK_EQ(num_features_, train_data_->num_features());

From f42e87ed080e7d05861cc994ab4993d1d310ba01 Mon Sep 17 00:00:00 2001
From: Yu Shi
Date: Fri, 24 Sep 2021 03:57:53 +0000
Subject: [PATCH 079/166] refactor copy sparse subrow

---
 include/LightGBM/cuda/cuda_row_data.hpp |  12 +++
 src/io/cuda/cuda_row_data.cpp           |   3 +-
 src/io/cuda/cuda_row_data.cu            | 136 ++++++++++++++----
 3 files changed, 92 insertions(+), 59 deletions(-)

diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp
index 73307671738b..27e7c8b64ba5 100644
--- a/include/LightGBM/cuda/cuda_row_data.hpp
+++ b/include/LightGBM/cuda/cuda_row_data.hpp
@@ -97,6 +97,18 @@ class CUDARowData {
 
   void LaunchCopySparseSubrowKernel(const CUDARowData* full_set);
 
+  template <typename OUT_ROW_PTR_TYPE>
+  void LaunchCopySparseSubrowKernelInner0(
+    const CUDARowData* full_set,
+    OUT_ROW_PTR_TYPE* out_cuda_row_ptr);
+
+  template <typename BIN_TYPE, typename OUT_ROW_PTR_TYPE>
+  void LaunchCopySparseSubrowKernelInner1(
+    const CUDARowData* full_set,
+    const BIN_TYPE* in_cuda_data,
+    const OUT_ROW_PTR_TYPE* out_cuda_row_ptr,
+    BIN_TYPE* out_cuda_data);
+
   /*! \brief number of threads to use */
   int num_threads_;
   /*!
\brief number of training data */ diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index e6aaa5bc8c57..9b42c3267001 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -442,7 +442,9 @@ void CUDARowData::CopySparseSubrowData(const CUDARowData* full_set, const data_s AllocateCUDAMemoryOuter(&cuda_used_indices_, static_cast(num_used_indices_), __FILE__, __LINE__); need_reallocate_row_ptr = true; } + CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_used_indices_), __FILE__, __LINE__); num_total_elements_ = CalcTotalNumberOfElements(full_set); + Log::Warning("num_total_elements_ = %d", num_total_elements_); if (num_total_elements_ > cur_total_elements_buffer_size_) { need_reallocate_data = true; } @@ -503,7 +505,6 @@ void CUDARowData::CopySparseSubrowData(const CUDARowData* full_set, const data_s } cur_total_elements_buffer_size_ = num_total_elements_; } - CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_used_indices_), __FILE__, __LINE__); LaunchCopySparseSubrowKernel(full_set); } diff --git a/src/io/cuda/cuda_row_data.cu b/src/io/cuda/cuda_row_data.cu index c1c1483a5f60..055368283ca6 100644 --- a/src/io/cuda/cuda_row_data.cu +++ b/src/io/cuda/cuda_row_data.cu @@ -57,11 +57,12 @@ __global__ void CalcTotalNumberOfElementsKernel( const data_size_t global_data_index = cuda_used_indices[local_data_index]; const data_size_t row_start = partition_row_ptr[global_data_index]; const data_size_t row_end = partition_row_ptr[global_data_index + 1]; - num_elements_in_row += static_cast(row_end - row_start); + num_elements_in_row = static_cast(row_end - row_start); } const uint64_t num_elements_in_block = ShuffleReduceSum(num_elements_in_row, shared_mem_buffer, blockDim.x); if (threadIdx.x == 0) { - block_sum_buffer[partition_index * blockDim.x + blockIdx.x] = num_elements_in_block; + printf("blockIdx.x = %d, partition_index = %d, num_elements_in_block = %d\n", blockIdx.x, partition_index, num_elements_in_block); + block_sum_buffer[partition_index * gridDim.x + blockIdx.x] = num_elements_in_block; } } @@ -72,13 +73,17 @@ __global__ void ReduceBlockSumKernel( uint64_t* cuda_partition_ptr_buffer) { __shared__ uint64_t shared_mem_buffer[32]; uint64_t thread_sum = 0; - const int partition_index = static_cast(blockIdx.y); - const uint64_t* block_sum_buffer_ptr = block_sum_buffer + partition_index * blockDim.x; - for (data_size_t block_index = static_cast(threadIdx.x); block_index < num_blocks; ++block_index) { + const int partition_index = static_cast(blockIdx.x); + const uint64_t* block_sum_buffer_ptr = block_sum_buffer + partition_index * num_blocks; + for (data_size_t block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { thread_sum += block_sum_buffer_ptr[block_index]; } + if (threadIdx.x == 0) { + printf("thread_sum = %d\n", thread_sum); + } const uint64_t num_total_elements = ShuffleReduceSum(thread_sum, shared_mem_buffer, blockDim.x); if (threadIdx.x == 0) { + printf("partition_index = %d, num_total_elements = %d\n", partition_index, num_total_elements); cuda_partition_ptr_buffer[partition_index + 1] = num_total_elements; if (blockIdx.x == 0) { cuda_partition_ptr_buffer[0] = 0; @@ -112,7 +117,7 @@ __global__ void ComputePartitionPtr( } uint64_t CUDARowData::LaunchCalcTotalNumberOfElementsKernel(const CUDARowData* full_set) { - const int num_blocks = (num_data_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; + 
const int num_blocks = (num_used_indices_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; SetCUDAMemoryOuter(cuda_block_sum_buffer_, 0, static_cast(num_blocks * num_feature_partitions_) + 1, __FILE__, __LINE__); if (full_set->row_ptr_bit_type_ == 16) { CalcTotalNumberOfElementsKernel<<>>( @@ -139,11 +144,13 @@ uint64_t CUDARowData::LaunchCalcTotalNumberOfElementsKernel(const CUDARowData* f num_data_, cuda_block_sum_buffer_); } + Log::Warning("num_feature_partitions_ = %d", num_feature_partitions_); ReduceBlockSumKernel<<>>( cuda_block_sum_buffer_, num_blocks, num_feature_partitions_, cuda_partition_ptr_buffer_); ComputePartitionPtr<<<1, COPY_SUBROW_BLOCK_SIZE_ROW_DATA>>>(cuda_partition_ptr_buffer_, num_feature_partitions_); uint64_t num_total_elements = 0; - CopyFromCUDADeviceToHostOuter(&num_total_elements, cuda_partition_ptr_buffer_, num_feature_partitions_, __FILE__, __LINE__); + CopyFromCUDADeviceToHostOuter(&num_total_elements, cuda_partition_ptr_buffer_ + num_feature_partitions_, 1, __FILE__, __LINE__); + Log::Warning("num_used_indices = %d, num_blocks = %d", num_used_indices_, num_blocks); return num_total_elements; } @@ -164,7 +171,7 @@ __global__ void CopySparseSubrowRowPtrKernel( const data_size_t* cuda_used_indices, OUT_ROW_PTR_TYPE* out_cuda_row_ptr) { const data_size_t local_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (local_data_index > num_used_indices) { + if (local_data_index < num_used_indices) { const data_size_t global_data_index = cuda_used_indices[local_data_index]; const IN_ROW_PTR_TYPE row_start = cuda_row_ptr[global_data_index]; const IN_ROW_PTR_TYPE row_end = cuda_row_ptr[global_data_index + 1]; @@ -173,27 +180,76 @@ __global__ void CopySparseSubrowRowPtrKernel( } } -template +template __global__ void CopySparseSubrowDataKernel( const BIN_TYPE* in_cuda_data, - const ROW_PTR_TYPE* cuda_row_ptr, + const IN_ROW_PTR_TYPE* in_cuda_row_ptr, + const OUT_ROW_PTR_TYPE* out_cuda_row_ptr, const data_size_t num_used_indices, const data_size_t* cuda_used_indices, BIN_TYPE* out_cuda_data) { const data_size_t local_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (local_data_index < num_used_indices) { const data_size_t global_data_index = cuda_used_indices[local_data_index]; - const ROW_PTR_TYPE row_start = cuda_row_ptr[global_data_index]; - const ROW_PTR_TYPE row_end = cuda_row_ptr[global_data_index + 1]; - const ROW_PTR_TYPE num_elements_in_row = row_end - row_start; - const BIN_TYPE* in_cuda_data_ptr = in_cuda_data + row_start; - BIN_TYPE* out_cuda_data_ptr = out_cuda_data + row_start; - for (ROW_PTR_TYPE element_index = 0; element_index < num_elements_in_row; ++element_index) { + const IN_ROW_PTR_TYPE in_row_start = in_cuda_row_ptr[global_data_index]; + const IN_ROW_PTR_TYPE in_row_end = in_cuda_row_ptr[global_data_index + 1]; + const IN_ROW_PTR_TYPE in_num_elements_in_row = in_row_end - in_row_start; + const OUT_ROW_PTR_TYPE out_row_start = out_cuda_row_ptr[local_data_index]; + const OUT_ROW_PTR_TYPE out_row_end = out_cuda_row_ptr[local_data_index + 1]; + const OUT_ROW_PTR_TYPE out_num_elements_in_row = out_row_end - out_row_start; + if (in_num_elements_in_row != out_num_elements_in_row) { + printf("error !!!!!, in_num_elements_in_row = %d, out_num_elements_in_row = %d\n", in_num_elements_in_row, out_num_elements_in_row); + } + if (out_row_end > in_row_end || out_row_start > in_row_start) { + printf("error !!!!!, out_row_end = %d, in_row_end = %d, out_row_start = %d, in_row_start = %d\n", out_row_end, 
in_row_end, out_row_start, in_row_start); + } + const BIN_TYPE* in_cuda_data_ptr = in_cuda_data + in_row_start; + BIN_TYPE* out_cuda_data_ptr = out_cuda_data + out_row_start; + for (IN_ROW_PTR_TYPE element_index = 0; element_index < in_num_elements_in_row; ++element_index) { out_cuda_data_ptr[element_index] = in_cuda_data_ptr[element_index]; } } } +template +void CUDARowData::LaunchCopySparseSubrowKernelInner1( + const CUDARowData* full_set, + const BIN_TYPE* in_cuda_data, + const OUT_ROW_PTR_TYPE* out_cuda_row_ptr, + BIN_TYPE* out_cuda_data) { + CHECK_GE(full_set->row_ptr_bit_type_, row_ptr_bit_type_); + const int num_blocks = (num_used_indices_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; + if (full_set->row_ptr_bit_type_ == 16) { + CopySparseSubrowDataKernel<<>>( + in_cuda_data, full_set->cuda_row_ptr_uint16_t_, out_cuda_row_ptr, num_used_indices_, cuda_used_indices_, out_cuda_data); + } else if (full_set->row_ptr_bit_type_ == 32) { + CopySparseSubrowDataKernel<<>>( + in_cuda_data, full_set->cuda_row_ptr_uint32_t_, out_cuda_row_ptr, num_used_indices_, cuda_used_indices_, out_cuda_data); + } else if (full_set->row_ptr_bit_type_ == 64) { + CopySparseSubrowDataKernel<<>>( + in_cuda_data, full_set->cuda_row_ptr_uint64_t_, out_cuda_row_ptr, num_used_indices_, cuda_used_indices_, out_cuda_data); + } +} + +template +void CUDARowData::LaunchCopySparseSubrowKernelInner0( + const CUDARowData* full_set, + OUT_ROW_PTR_TYPE* out_cuda_row_ptr) { + ShufflePrefixSumGlobal( + out_cuda_row_ptr, + static_cast(num_used_indices_) + 1, + reinterpret_cast(cuda_block_sum_buffer_)); + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + if (bit_type_ == 8) { + LaunchCopySparseSubrowKernelInner1(full_set, full_set->cuda_data_uint8_t_, out_cuda_row_ptr, cuda_data_uint8_t_); + } else if (bit_type_ == 16) { + LaunchCopySparseSubrowKernelInner1(full_set, full_set->cuda_data_uint16_t_, out_cuda_row_ptr, cuda_data_uint16_t_); + } else if (bit_type_ == 32) { + LaunchCopySparseSubrowKernelInner1(full_set, full_set->cuda_data_uint32_t_, out_cuda_row_ptr, cuda_data_uint32_t_); + } + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); +} + void CUDARowData::LaunchCopySparseSubrowKernel(const CUDARowData* full_set) { if (row_ptr_bit_type_ == 16) { CopyPartitionPtrKernel<<<1, COPY_SUBROW_BLOCK_SIZE_ROW_DATA>>>(cuda_partition_ptr_buffer_, num_feature_partitions_, cuda_partition_ptr_uint16_t_); @@ -202,6 +258,7 @@ void CUDARowData::LaunchCopySparseSubrowKernel(const CUDARowData* full_set) { } else if (row_ptr_bit_type_ == 64) { CopyPartitionPtrKernel<<<1, COPY_SUBROW_BLOCK_SIZE_ROW_DATA>>>(cuda_partition_ptr_buffer_, num_feature_partitions_, cuda_partition_ptr_uint64_t_); } + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); const int num_blocks = (num_used_indices_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; if (full_set->row_ptr_bit_type_ == 16) { CHECK_EQ(row_ptr_bit_type_, 16); @@ -228,52 +285,15 @@ void CUDARowData::LaunchCopySparseSubrowKernel(const CUDARowData* full_set) { full_set->cuda_row_ptr_uint64_t_, num_used_indices_, cuda_used_indices_, cuda_row_ptr_uint64_t_); } } + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); if (row_ptr_bit_type_ == 16) { - ShufflePrefixSumGlobal( - cuda_row_ptr_uint16_t_, - static_cast(num_used_indices_) + 1, - reinterpret_cast(cuda_block_sum_buffer_)); - if (bit_type_ == 8) { - CopySparseSubrowDataKernel<<>>( - full_set->cuda_data_uint8_t_, cuda_row_ptr_uint16_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint8_t_); - } else if 
(bit_type_ == 16) { - CopySparseSubrowDataKernel<<>>( - full_set->cuda_data_uint16_t_, cuda_row_ptr_uint16_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint16_t_); - } else if (bit_type_ == 32) { - CopySparseSubrowDataKernel<<>>( - full_set->cuda_data_uint32_t_, cuda_row_ptr_uint16_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint32_t_); - } + LaunchCopySparseSubrowKernelInner0(full_set, cuda_row_ptr_uint16_t_); } else if (row_ptr_bit_type_ == 32) { - ShufflePrefixSumGlobal( - cuda_row_ptr_uint32_t_, - static_cast(num_used_indices_) + 1, - reinterpret_cast(cuda_block_sum_buffer_)); - if (bit_type_ == 8) { - CopySparseSubrowDataKernel<<>>( - full_set->cuda_data_uint8_t_, cuda_row_ptr_uint32_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint8_t_); - } else if (bit_type_ == 16) { - CopySparseSubrowDataKernel<<>>( - full_set->cuda_data_uint16_t_, cuda_row_ptr_uint32_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint16_t_); - } else if (bit_type_ == 32) { - CopySparseSubrowDataKernel<<>>( - full_set->cuda_data_uint32_t_, cuda_row_ptr_uint32_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint32_t_); - } + LaunchCopySparseSubrowKernelInner0(full_set, cuda_row_ptr_uint32_t_); } else if (row_ptr_bit_type_ == 64) { - ShufflePrefixSumGlobal( - cuda_row_ptr_uint64_t_, - static_cast(num_used_indices_) + 1, - reinterpret_cast(cuda_block_sum_buffer_)); - if (bit_type_ == 8) { - CopySparseSubrowDataKernel<<>>( - full_set->cuda_data_uint8_t_, cuda_row_ptr_uint64_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint8_t_); - } else if (bit_type_ == 16) { - CopySparseSubrowDataKernel<<>>( - full_set->cuda_data_uint16_t_, cuda_row_ptr_uint64_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint16_t_); - } else if (bit_type_ == 32) { - CopySparseSubrowDataKernel<<>>( - full_set->cuda_data_uint32_t_, cuda_row_ptr_uint64_t_, num_used_indices_, cuda_used_indices_, cuda_data_uint32_t_); - } + LaunchCopySparseSubrowKernelInner0(full_set, cuda_row_ptr_uint64_t_); } + SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } } // namespace LightGBM From 0b9ca24cb142eef6355febdb9d6feaac97e52862 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sun, 26 Sep 2021 02:22:30 +0000 Subject: [PATCH 080/166] use copy subset for column subset --- include/LightGBM/cuda/cuda_row_data.hpp | 46 +++- src/io/cuda/cuda_row_data.cpp | 295 +++++++++++++++++++++--- src/io/cuda/cuda_row_data.cu | 88 ++++++- 3 files changed, 382 insertions(+), 47 deletions(-) diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index 27e7c8b64ba5..6bf44510960e 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -31,6 +31,11 @@ class CUDARowData { void CopySubrow(const CUDARowData* full_set, const data_size_t* used_indices, const data_size_t num_used_indices); + void CopySubcol(const CUDARowData* full_set, const std::vector& is_feature_used, const Dataset* train_data); + + void CopySubrowAndSubcol(const CUDARowData* full_set, const data_size_t* used_indices, + const data_size_t num_used_indices, const std::vector& is_feature_used, const Dataset* train_data); + int num_feature_partitions() const { return num_feature_partitions_; } int max_num_column_per_partition() const { return max_num_column_per_partition_; } @@ -85,18 +90,34 @@ class CUDARowData { ROW_PTR_TYPE** cuda_row_ptr, ROW_PTR_TYPE** cuda_partition_ptr); + void InitMetaInfoBeforeCopy(const CUDARowData* full_set); + + void PrepareSubsetColumnInfo(const std::vector& 
is_feature_used, const CUDARowData* full_set); + void CopyDenseSubrowData(const CUDARowData* full_set, const data_size_t num_used_indices, const data_size_t* used_indices); void CopySparseSubrowData(const CUDARowData* full_set, const data_size_t num_used_indices, const data_size_t* used_indices); + void CopyDenseSubcolData(const CUDARowData* full_set); + + void CopySparseSubcolData(const CUDARowData* full_set); + uint64_t CalcTotalNumberOfElements(const CUDARowData* full_set); uint64_t LaunchCalcTotalNumberOfElementsKernel(const CUDARowData* full_set); + uint64_t CalcTotalNumberOfElementsSubcol(const CUDARowData* full_set); + + uint64_t LaunchCalcTotalNumberOfElementsSubcolKernel(const CUDARowData* full_set); + void LaunchCopyDenseSubrowKernel(const CUDARowData* full_set); void LaunchCopySparseSubrowKernel(const CUDARowData* full_set); + void LaunchCopyDenseSubcolKernel(const CUDARowData* full_set); + + void BuildBinToColumnMap(const CUDARowData* full_set); + template void LaunchCopySparseSubrowKernelInner0( const CUDARowData* full_set, @@ -141,12 +162,23 @@ class CUDARowData { uint64_t num_total_elements_; /*! \brief used when bagging with subset, the size of buffer for copy subrow */ data_size_t cur_subset_buffer_size_; - /*! \brief used when bagging with subset, the size of buffer for copy subrow */ + /*! \brief used when bagging with subset, the size of buffer for copy subrow of sparse data */ uint64_t cur_total_elements_buffer_size_; - /*! \brief used when bagging with subset, block buffer when reducing the number of elements in the subset */ - uint64_t* cuda_block_sum_buffer_; + /*! \brief used when bagging with column subset, the size of maximum number of feature partitions */ + int cur_num_feature_partition_buffer_size_; /*! \brief CUDA device ID */ int gpu_device_id_; + /*! \brief whether data is initialized */ + bool is_data_initialized_; + + /*! \brief used column indices when copying sub column */ + std::vector used_columns_; + /*! \brief a map from feature_index to column index, used when copying sub column */ + std::vector feature_index_to_column_index_; + /*! \brief complete histogram offset of each column, which equals that in train share states, used in copying sub columns */ + std::vector complete_column_hist_offsets_; + /*! \brief used when bagging with column subset, the size of buffer for columns */ + data_size_t cur_subcol_buffer_size_; // CUDA memory @@ -184,6 +216,14 @@ class CUDARowData { uint64_t* cuda_block_buffer_uint64_t_; /*! \brief partition ptr buffer */ uint64_t* cuda_partition_ptr_buffer_; + /*! \brief used when bagging with subset, block buffer when reducing the number of elements in the subset */ + uint64_t* cuda_block_sum_buffer_; + /*! \brief used with column subset, maps the original column_index to partition index */ + int* cuda_column_index_to_partition_index_; + /*! \brief used column indices */ + int* cuda_used_columns_; + /*! 
\brief maps bin to column index, used when copy column subsets in sparse data */ + int* cuda_bin_to_column_index_; }; } // namespace LightGBM diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index 9b42c3267001..3fd3c7b68362 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -23,12 +23,18 @@ CUDARowData::CUDARowData(const Dataset* train_data, cuda_used_indices_ = nullptr; cur_subset_buffer_size_ = 0; cur_total_elements_buffer_size_ = 0; + cur_subcol_buffer_size_ = 0; + cur_num_feature_partition_buffer_size_ = 0; + is_data_initialized_ = false; } CUDARowData::CUDARowData() { cuda_used_indices_ = nullptr; cur_subset_buffer_size_ = 0; cur_total_elements_buffer_size_ = 0; + cur_subcol_buffer_size_ = 0; + cur_num_feature_partition_buffer_size_ = 0; + is_data_initialized_ = false; } void CUDARowData::Init(const Dataset* train_data, TrainingShareStates* train_share_state) { @@ -135,6 +141,7 @@ void CUDARowData::Init(const Dataset* train_data, TrainingShareStates* train_sha Log::Fatal("Unknow bit type = %d", bit_type_); } SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + is_data_initialized_ = true; } void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state) { @@ -209,6 +216,7 @@ void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingSha } } } + column_hist_offsets_.emplace_back(column_hist_offsets.back() - start_hist_offset); max_num_column_per_partition_ = 0; for (size_t i = 0; i < feature_partition_column_index_offsets_.size() - 1; ++i) { const int num_column = feature_partition_column_index_offsets_[i + 1] - feature_partition_column_index_offsets_[i]; @@ -342,41 +350,46 @@ void CUDARowData::InitSparseData(const BIN_TYPE* host_data, } } +void CUDARowData::InitMetaInfoBeforeCopy(const CUDARowData* full_set) { + CHECK_EQ(cur_subset_buffer_size_, 0); + CHECK_EQ(cur_total_elements_buffer_size_, 0); + // initialize meta information + num_threads_ = full_set->num_threads_; + num_data_ = full_set->num_data_; + num_total_bin_ = full_set->num_total_bin_; + num_feature_group_ = full_set->num_feature_group_; + num_feature_ = full_set->num_feature_; + gpu_device_id_ = full_set->gpu_device_id_; + if (gpu_device_id_ >= 0) { + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id_)); + } else { + CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + } + bit_type_ = full_set->bit_type_; + is_sparse_ = full_set->is_sparse_; + feature_partition_column_index_offsets_ = full_set->feature_partition_column_index_offsets_; + column_hist_offsets_ = full_set->column_hist_offsets_; + partition_hist_offsets_ = full_set->partition_hist_offsets_; + max_num_column_per_partition_ = full_set->max_num_column_per_partition_; + num_feature_partitions_ = full_set->num_feature_partitions_; + + InitCUDAMemoryFromHostMemoryOuter( + &cuda_feature_partition_column_index_offsets_, + feature_partition_column_index_offsets_.data(), + feature_partition_column_index_offsets_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_column_hist_offsets_, + column_hist_offsets_.data(), column_hist_offsets_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_partition_hist_offsets_, + partition_hist_offsets_.data(), partition_hist_offsets_.size(), __FILE__, __LINE__); +} + void CUDARowData::CopySubrow( const CUDARowData* full_set, const data_size_t* used_indices, const data_size_t num_used_indices) { - if (cuda_used_indices_ == nullptr) { - CHECK_EQ(cur_subset_buffer_size_, 0); - 
CHECK_EQ(cur_total_elements_buffer_size_, 0); - // initialize meta information - num_threads_ = full_set->num_threads_; - num_data_ = full_set->num_data_; - num_total_bin_ = full_set->num_total_bin_; - num_feature_group_ = full_set->num_feature_group_; - num_feature_ = full_set->num_feature_; - gpu_device_id_ = full_set->gpu_device_id_; - if (gpu_device_id_ >= 0) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id_)); - } else { - CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); - } - bit_type_ = full_set->bit_type_; - is_sparse_ = full_set->is_sparse_; - feature_partition_column_index_offsets_ = full_set->feature_partition_column_index_offsets_; - column_hist_offsets_ = full_set->column_hist_offsets_; - partition_hist_offsets_ = full_set->partition_hist_offsets_; - max_num_column_per_partition_ = full_set->max_num_column_per_partition_; - num_feature_partitions_ = full_set->num_feature_partitions_; - - InitCUDAMemoryFromHostMemoryOuter( - &cuda_feature_partition_column_index_offsets_, - feature_partition_column_index_offsets_.data(), - feature_partition_column_index_offsets_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_column_hist_offsets_, - column_hist_offsets_.data(), column_hist_offsets_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_partition_hist_offsets_, - partition_hist_offsets_.data(), partition_hist_offsets_.size(), __FILE__, __LINE__); + if (!is_data_initialized_) { + InitMetaInfoBeforeCopy(full_set); + is_data_initialized_ = true; } if (!full_set->is_sparse_) { CopyDenseSubrowData(full_set, num_used_indices, used_indices); @@ -385,6 +398,224 @@ void CUDARowData::CopySubrow( } } +void CUDARowData::PrepareSubsetColumnInfo(const std::vector& is_feature_used, const CUDARowData* full_set) { + // get used columns + const int num_column = feature_partition_column_index_offsets_.back(); + std::vector is_column_used(num_column, 0); + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + for (int feature_index = 0; feature_index < num_feature_; ++feature_index) { + OMP_LOOP_EX_BEGIN(); + if (is_feature_used[feature_index]) { + const int column_index = feature_index_to_column_index_[feature_index]; + is_column_used[column_index] = 1; + } + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + used_columns_.clear(); + for (int column_index = 0; column_index < num_column; ++column_index) { + if (is_column_used[column_index]) { + used_columns_.emplace_back(column_index); + } + } + + std::vector column_index_to_partition_index(full_set->feature_partition_column_index_offsets_.back()); + // get column index to partition index map + for (int partition_index = 0; partition_index < full_set->num_feature_partitions_; ++partition_index) { + const int partition_column_start = full_set->feature_partition_column_index_offsets_[partition_index]; + const int partition_column_end = full_set->feature_partition_column_index_offsets_[partition_index + 1]; + for (int column_index = partition_column_start; column_index < partition_column_end; ++column_index) { + column_index_to_partition_index[column_index] = partition_index; + } + } + InitCUDAMemoryFromHostMemoryOuter(&cuda_column_index_to_partition_index_, + column_index_to_partition_index.data(), + column_index_to_partition_index.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemoryOuter(&cuda_used_columns_, used_columns_.data(), used_columns_.size(), __FILE__, __LINE__); + + const uint32_t max_num_bin_per_partition = SHRAE_HIST_SIZE / 2; + int cur_partition_index = 0; + 
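+  // Greedily pack the used columns into feature partitions: walk the used
+  // columns in order and start a new partition whenever adding the next
+  // column's bins would reach the per-partition budget of
+  // SHRAE_HIST_SIZE / 2 bins. partition_hist_offsets_ records the cumulative
+  // bin offset at each partition boundary, and column_hist_offsets_ stores
+  // each column's bin offset relative to the partition it falls in.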
feature_partition_column_index_offsets_.clear(); + column_hist_offsets_.clear(); + partition_hist_offsets_.clear(); + max_num_column_per_partition_ = 0; + num_feature_partitions_ = 0; + int cur_num_bin_in_partition = 0; + int cur_num_bin = 0; + feature_partition_column_index_offsets_.emplace_back(0); + partition_hist_offsets_.emplace_back(0); + for (int column_index = 0; column_index < static_cast(used_columns_.size()); ++column_index) { + const int real_column_index = used_columns_[column_index]; + const int num_bin_in_column = complete_column_hist_offsets_[real_column_index + 1] - complete_column_hist_offsets_[real_column_index]; + const int next_num_bin_in_partition = cur_num_bin_in_partition + num_bin_in_column; + if (next_num_bin_in_partition >= max_num_bin_per_partition) { + feature_partition_column_index_offsets_.emplace_back(column_index); + partition_hist_offsets_.emplace_back(cur_num_bin); + cur_num_bin_in_partition = num_bin_in_column; + ++num_feature_partitions_; + } + cur_num_bin += num_bin_in_column; + column_hist_offsets_.emplace_back(complete_column_hist_offsets_[column_index] - partition_hist_offsets_.back()); + } + feature_partition_column_index_offsets_.emplace_back(static_cast(used_columns_.size())); + partition_hist_offsets_.emplace_back(cur_num_bin); + ++num_feature_partitions_; + + for (int partition_index = 0; partition_index < num_feature_partitions_; ++partition_index) { + const int num_column_in_partition = feature_partition_column_index_offsets_[partition_index + 1] - feature_partition_column_index_offsets_[partition_index]; + if (num_column_in_partition > max_num_column_per_partition_) { + max_num_column_per_partition_ = num_column_in_partition; + } + } + + CopyFromHostToCUDADeviceOuter(cuda_feature_partition_column_index_offsets_, + feature_partition_column_index_offsets_.data(), + feature_partition_column_index_offsets_.size(), __FILE__, __LINE__); + CopyFromHostToCUDADeviceOuter(cuda_column_hist_offsets_, + column_hist_offsets_.data(), + column_hist_offsets_.size(), __FILE__, __LINE__); + CopyFromHostToCUDADeviceOuter(cuda_partition_hist_offsets_, + partition_hist_offsets_.data(), + partition_hist_offsets_.size(), __FILE__, __LINE__); + +} + +void CUDARowData::CopyDenseSubcolData(const CUDARowData* full_set) { + const data_size_t num_used_column = static_cast(used_columns_.size()); + if (cur_subcol_buffer_size_ > num_used_column) { + const uint64_t num_total_elements = used_columns_.size() * static_cast(num_data_); + if (cur_subcol_buffer_size_ > 0) { + if (bit_type_ == 8) { + DeallocateCUDAMemoryOuter(&cuda_data_uint8_t_, __FILE__, __LINE__); + } else if (bit_type_ == 16) { + DeallocateCUDAMemoryOuter(&cuda_data_uint16_t_, __FILE__, __LINE__); + } else if (bit_type_ == 32) { + DeallocateCUDAMemoryOuter(&cuda_data_uint32_t_, __FILE__, __LINE__); + } + } + if (bit_type_ == 8) { + AllocateCUDAMemoryOuter(&cuda_data_uint8_t_, num_total_elements, __FILE__, __LINE__); + } else if (bit_type_ == 16) { + AllocateCUDAMemoryOuter(&cuda_data_uint16_t_, num_total_elements, __FILE__, __LINE__); + } else if (bit_type_ == 32) { + AllocateCUDAMemoryOuter(&cuda_data_uint32_t_, num_total_elements, __FILE__, __LINE__); + } + } + LaunchCopyDenseSubcolKernel(full_set); +} + +void CUDARowData::BuildBinToColumnMap(const CUDARowData* full_set) { + std::vector bin_to_column(complete_column_hist_offsets_.back()); + const int num_column = full_set->feature_partition_column_index_offsets_.back(); + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(num_threads_) + 
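+  // Invert complete_column_hist_offsets_: for every global bin value, record
+  // the column that owns it. Sparse rows store bin values only, so the
+  // sparse column-subset copy uses this map to tell which column an element
+  // belongs to.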
for (int column_index = 0; column_index < num_column; ++column_index) { + OMP_LOOP_EX_BEGIN(); + const uint32_t column_hist_bin_start = complete_column_hist_offsets_[column_index]; + const uint32_t column_hist_bin_end = complete_column_hist_offsets_[column_index + 1]; + for (uint32_t bin = column_hist_bin_start; bin < column_hist_bin_end; ++bin) { + bin_to_column[bin] = column_index; + } + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + InitCUDAMemoryFromHostMemoryOuter(&cuda_bin_to_column_index_, bin_to_column.data(), bin_to_column.size(), __FILE__, __LINE__); +} + +uint64_t CUDARowData::CalcTotalNumberOfElementsSubcol(const CUDARowData* full_set) { + return LaunchCalcTotalNumberOfElementsSubcolKernel(full_set); +} + +void CUDARowData::CopySparseSubcolData(const CUDARowData* full_set) { + if (!is_data_initialized_) { + const int num_blocks = (num_data_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; + AllocateCUDAMemoryOuter(&cuda_block_sum_buffer_, static_cast(num_blocks * full_set->num_feature_partitions_) + 1, __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_partition_ptr_uint16_t_, static_cast(full_set->num_feature_partitions_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_partition_ptr_uint32_t_, static_cast(full_set->num_feature_partitions_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_partition_ptr_uint64_t_, static_cast(full_set->num_feature_partitions_), __FILE__, __LINE__); + AllocateCUDAMemoryOuter(&cuda_partition_ptr_buffer_, static_cast(full_set->num_feature_partitions_) + 1, __FILE__, __LINE__); + is_data_initialized_ = true; + BuildBinToColumnMap(full_set); + } + num_total_elements_ = CalcTotalNumberOfElementsSubcol(full_set); + if (num_total_elements_ > cur_total_elements_buffer_size_) { + if (cur_total_elements_buffer_size_ > 0) { + if (bit_type_ == 8) { + DeallocateCUDAMemoryOuter(&cuda_data_uint8_t_, __FILE__, __LINE__); + } else if (bit_type_ == 16) { + DeallocateCUDAMemoryOuter(&cuda_data_uint16_t_, __FILE__, __LINE__); + } else if (bit_type_ == 32) { + DeallocateCUDAMemoryOuter(&cuda_data_uint32_t_, __FILE__, __LINE__); + } + } + if (bit_type_ == 8) { + AllocateCUDAMemoryOuter(&cuda_data_uint8_t_, num_total_elements_, __FILE__, __LINE__); + } else if (bit_type_ == 16) { + AllocateCUDAMemoryOuter(&cuda_data_uint16_t_, num_total_elements_, __FILE__, __LINE__); + } else if (bit_type_ == 32) { + AllocateCUDAMemoryOuter(&cuda_data_uint32_t_, num_total_elements_, __FILE__, __LINE__); + } + cur_total_elements_buffer_size_ = num_total_elements_; + } + if (num_feature_partitions_ > cur_num_feature_partition_buffer_size_) { + if (cur_num_feature_partition_buffer_size_ > 0) { + if (row_ptr_bit_type_ == 16) { + DeallocateCUDAMemoryOuter(&cuda_row_ptr_uint16_t_, __FILE__, __LINE__); + } else if (row_ptr_bit_type_ == 32) { + DeallocateCUDAMemoryOuter(&cuda_row_ptr_uint32_t_, __FILE__, __LINE__); + } else if (row_ptr_bit_type_ == 64) { + DeallocateCUDAMemoryOuter(&cuda_row_ptr_uint64_t_, __FILE__, __LINE__); + } + } + const size_t row_ptr_size = static_cast(num_feature_partitions_) * (static_cast(num_data_) + 1); + if (row_ptr_bit_type_ == 16) { + AllocateCUDAMemoryOuter(&cuda_row_ptr_uint16_t_, row_ptr_size, __FILE__, __LINE__); + } else if (row_ptr_bit_type_ == 32) { + AllocateCUDAMemoryOuter(&cuda_row_ptr_uint32_t_, row_ptr_size, __FILE__, __LINE__); + } else if (row_ptr_bit_type_ == 64) { + AllocateCUDAMemoryOuter(&cuda_row_ptr_uint64_t_, row_ptr_size, __FILE__, __LINE__); + } + cur_num_feature_partition_buffer_size_ = 
num_feature_partitions_; + } +} + +void CUDARowData::CopySubcol(const CUDARowData* full_set, const std::vector& is_feature_used, const Dataset* train_data) { + if (!is_data_initialized_) { + InitMetaInfoBeforeCopy(full_set); + is_data_initialized_ = true; + feature_index_to_column_index_.resize(num_feature_, -1); + int cur_group_index = -1; + int cur_column_index = -1; + for (int feature_index = 0; feature_index < num_feature_; ++feature_index) { + const int group_index = train_data->Feature2Group(feature_index); + if (!train_data->IsMultiGroup(group_index)) { + if (group_index != cur_group_index) { + ++cur_column_index; + } + } else { + ++cur_column_index; + } + feature_index_to_column_index_[feature_index] = cur_column_index; + } + complete_column_hist_offsets_ = column_hist_offsets_; + const int num_column = feature_partition_column_index_offsets_.back(); + uint32_t offset = 0; + for (int i = 1; i <= num_column; ++i) { + if (complete_column_hist_offsets_[i] == 0) { + offset = complete_column_hist_offsets_[i - 1]; + } + complete_column_hist_offsets_[i] += offset; + } + } + PrepareSubsetColumnInfo(is_feature_used, full_set); + if (!full_set->is_sparse_) { + CopyDenseSubcolData(full_set); + } else { + CopySparseSubcolData(full_set); + } +} + uint64_t CUDARowData::CalcTotalNumberOfElements(const CUDARowData* full_set) { return LaunchCalcTotalNumberOfElementsKernel(full_set); } @@ -444,7 +675,7 @@ void CUDARowData::CopySparseSubrowData(const CUDARowData* full_set, const data_s } CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_used_indices_), __FILE__, __LINE__); num_total_elements_ = CalcTotalNumberOfElements(full_set); - Log::Warning("num_total_elements_ = %d", num_total_elements_); + //Log::Warning("num_total_elements_ = %d", num_total_elements_); if (num_total_elements_ > cur_total_elements_buffer_size_) { need_reallocate_data = true; } diff --git a/src/io/cuda/cuda_row_data.cu b/src/io/cuda/cuda_row_data.cu index 055368283ca6..b0b5230948f7 100644 --- a/src/io/cuda/cuda_row_data.cu +++ b/src/io/cuda/cuda_row_data.cu @@ -61,7 +61,7 @@ __global__ void CalcTotalNumberOfElementsKernel( } const uint64_t num_elements_in_block = ShuffleReduceSum(num_elements_in_row, shared_mem_buffer, blockDim.x); if (threadIdx.x == 0) { - printf("blockIdx.x = %d, partition_index = %d, num_elements_in_block = %d\n", blockIdx.x, partition_index, num_elements_in_block); + //printf("blockIdx.x = %d, partition_index = %d, num_elements_in_block = %lu\n", blockIdx.x, partition_index, num_elements_in_block); block_sum_buffer[partition_index * gridDim.x + blockIdx.x] = num_elements_in_block; } } @@ -78,12 +78,12 @@ __global__ void ReduceBlockSumKernel( for (data_size_t block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { thread_sum += block_sum_buffer_ptr[block_index]; } - if (threadIdx.x == 0) { - printf("thread_sum = %d\n", thread_sum); - } + /*if (threadIdx.x == 0) { + printf("thread_sum = %lu\n", thread_sum); + }*/ const uint64_t num_total_elements = ShuffleReduceSum(thread_sum, shared_mem_buffer, blockDim.x); if (threadIdx.x == 0) { - printf("partition_index = %d, num_total_elements = %d\n", partition_index, num_total_elements); + //printf("partition_index = %d, num_total_elements = %lu\n", partition_index, num_total_elements); cuda_partition_ptr_buffer[partition_index + 1] = num_total_elements; if (blockIdx.x == 0) { cuda_partition_ptr_buffer[0] = 0; @@ -112,7 +112,7 @@ __global__ void ComputePartitionPtr( 
cuda_partition_ptr_buffer[partition_index] += thread_base; } if (threadIdx.x == blockDim.x - 1) { - cuda_partition_ptr_buffer[num_feature_partitions] = thread_sum; + cuda_partition_ptr_buffer[num_feature_partitions] = thread_base; } } @@ -144,13 +144,13 @@ uint64_t CUDARowData::LaunchCalcTotalNumberOfElementsKernel(const CUDARowData* f num_data_, cuda_block_sum_buffer_); } - Log::Warning("num_feature_partitions_ = %d", num_feature_partitions_); + //Log::Warning("num_feature_partitions_ = %d", num_feature_partitions_); ReduceBlockSumKernel<<>>( cuda_block_sum_buffer_, num_blocks, num_feature_partitions_, cuda_partition_ptr_buffer_); ComputePartitionPtr<<<1, COPY_SUBROW_BLOCK_SIZE_ROW_DATA>>>(cuda_partition_ptr_buffer_, num_feature_partitions_); uint64_t num_total_elements = 0; CopyFromCUDADeviceToHostOuter(&num_total_elements, cuda_partition_ptr_buffer_ + num_feature_partitions_, 1, __FILE__, __LINE__); - Log::Warning("num_used_indices = %d, num_blocks = %d", num_used_indices_, num_blocks); + //Log::Warning("num_used_indices = %d, num_blocks = %d, num_total_elements = %d", num_used_indices_, num_blocks, num_total_elements); return num_total_elements; } @@ -197,12 +197,13 @@ __global__ void CopySparseSubrowDataKernel( const OUT_ROW_PTR_TYPE out_row_start = out_cuda_row_ptr[local_data_index]; const OUT_ROW_PTR_TYPE out_row_end = out_cuda_row_ptr[local_data_index + 1]; const OUT_ROW_PTR_TYPE out_num_elements_in_row = out_row_end - out_row_start; - if (in_num_elements_in_row != out_num_elements_in_row) { - printf("error !!!!!, in_num_elements_in_row = %d, out_num_elements_in_row = %d\n", in_num_elements_in_row, out_num_elements_in_row); + /*if (in_num_elements_in_row != out_num_elements_in_row) { + printf("error !!!!!, in_num_elements_in_row = %d, out_num_elements_in_row = %d\n", static_cast(in_num_elements_in_row), static_cast(out_num_elements_in_row)); } if (out_row_end > in_row_end || out_row_start > in_row_start) { - printf("error !!!!!, out_row_end = %d, in_row_end = %d, out_row_start = %d, in_row_start = %d\n", out_row_end, in_row_end, out_row_start, in_row_start); - } + printf("error !!!!!, out_row_end = %d, in_row_end = %d, out_row_start = %d, in_row_start = %d\n", + static_cast(out_row_end), static_cast(in_row_end), static_cast(out_row_start), static_cast(in_row_start)); + }*/ const BIN_TYPE* in_cuda_data_ptr = in_cuda_data + in_row_start; BIN_TYPE* out_cuda_data_ptr = out_cuda_data + out_row_start; for (IN_ROW_PTR_TYPE element_index = 0; element_index < in_num_elements_in_row; ++element_index) { @@ -296,4 +297,67 @@ void CUDARowData::LaunchCopySparseSubrowKernel(const CUDARowData* full_set) { SynchronizeCUDADeviceOuter(__FILE__, __LINE__); } +template +void __global__ CopyDenseSubcolKernel( + const BIN_TYPE* in_cuda_data, + const int out_num_feature_partitions, + const int* cuda_used_columns, + const int* cuda_column_index_to_partition_index, + const int* in_cuda_feature_partition_column_index_offsets, + const int* out_cuda_feature_partition_column_index_offsets, + const data_size_t num_data, + BIN_TYPE* out_cuda_data) { + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (data_index < num_data) { + for (int out_partition_index = 0; out_partition_index < out_num_feature_partitions; ++out_partition_index) { + const int out_partition_column_start = out_cuda_feature_partition_column_index_offsets[out_partition_index]; + const int out_partition_column_end = out_cuda_feature_partition_column_index_offsets[out_partition_index + 1]; + BIN_TYPE* 
out_cuda_data_ptr = out_cuda_data + out_partition_column_start * num_data + data_index * (out_partition_column_end - out_partition_column_start); + for (int local_column_index = out_partition_column_start; local_column_index < out_partition_column_end; ++local_column_index) { + const int global_column_index = cuda_used_columns[local_column_index]; + const int global_partition_index = cuda_column_index_to_partition_index[global_column_index]; + const int in_partition_column_start = in_cuda_feature_partition_column_index_offsets[global_partition_index]; + const int in_partition_column_end = in_cuda_feature_partition_column_index_offsets[global_partition_index + 1]; + const BIN_TYPE* in_cuda_data_ptr = in_cuda_data + in_partition_column_start * num_data + data_index * (in_partition_column_end - in_partition_column_start); + out_cuda_data_ptr[local_column_index - out_partition_column_start] = in_cuda_data_ptr[global_column_index - in_partition_column_start]; + } + } + } +} + +void CUDARowData::LaunchCopyDenseSubcolKernel(const CUDARowData* full_set) { + const int num_blocks = (num_data_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; + if (bit_type_ == 8) { + CopyDenseSubcolKernel<<>>( + full_set->cuda_data_uint8_t_, + num_feature_partitions_, + cuda_used_columns_, + cuda_column_index_to_partition_index_, + cuda_feature_partition_column_index_offsets_, + full_set->cuda_feature_partition_column_index_offsets_, + num_data_, + cuda_data_uint8_t_); + } else if (bit_type_ == 16) { + CopyDenseSubcolKernel<<>>( + full_set->cuda_data_uint16_t_, + num_feature_partitions_, + cuda_used_columns_, + cuda_column_index_to_partition_index_, + cuda_feature_partition_column_index_offsets_, + full_set->cuda_feature_partition_column_index_offsets_, + num_data_, + cuda_data_uint16_t_); + } else if (bit_type_ == 32) { + CopyDenseSubcolKernel<<>>( + full_set->cuda_data_uint32_t_, + num_feature_partitions_, + cuda_used_columns_, + cuda_column_index_to_partition_index_, + cuda_feature_partition_column_index_offsets_, + full_set->cuda_feature_partition_column_index_offsets_, + num_data_, + cuda_data_uint32_t_); + } +} + } // namespace LightGBM From 9a942402717cdd063546969e40ea38ef2bc0c87c Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sun, 26 Sep 2021 08:37:43 +0000 Subject: [PATCH 081/166] add reset train data and reset config for CUDA tree learner add deconstructors for cuda tree learner --- include/LightGBM/cuda/cuda_row_data.hpp | 70 +-- include/LightGBM/cuda/cuda_utils.h | 48 +- src/boosting/gbdt.cpp | 18 +- src/cuda/cuda_utils.cpp | 11 +- src/io/cuda/cuda_column_data.cpp | 94 ++-- src/io/cuda/cuda_metadata.cpp | 29 +- src/io/cuda/cuda_row_data.cpp | 453 ++---------------- src/io/cuda/cuda_row_data.cu | 363 -------------- src/io/cuda/cuda_tree.cpp | 114 +++-- .../cuda/cuda_best_split_finder.cpp | 141 ++++-- .../cuda/cuda_best_split_finder.cu | 69 +-- .../cuda/cuda_best_split_finder.hpp | 31 +- src/treelearner/cuda/cuda_data_partition.cpp | 199 ++++---- src/treelearner/cuda/cuda_data_partition.cu | 10 +- src/treelearner/cuda/cuda_data_partition.hpp | 26 +- .../cuda/cuda_histogram_constructor.cpp | 103 ++-- .../cuda/cuda_histogram_constructor.cu | 130 +++-- .../cuda/cuda_histogram_constructor.hpp | 24 +- src/treelearner/cuda/cuda_leaf_splits.cpp | 32 +- src/treelearner/cuda/cuda_leaf_splits.cu | 4 +- src/treelearner/cuda/cuda_leaf_splits.hpp | 2 +- .../cuda/new_cuda_tree_learner.cpp | 91 ++-- .../cuda/new_cuda_tree_learner.hpp | 7 +- 23 files changed, 749 insertions(+), 1320 deletions(-) 
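For context on the helper renames and the new destructors in this patch, here is a minimal sketch of the ownership pattern they support, written against the renamed cuda_utils.h helpers shown below (AllocateCUDAMemory, InitCUDAMemoryFromHostMemory, DeallocateCUDAMemory). The wrapper class, its name, and the include path are illustrative assumptions rather than part of the patch, and the sketch presupposes a USE_CUDA build:

#include <vector>

#include <LightGBM/cuda/cuda_utils.h>

namespace LightGBM {

// Illustrative only: a raw device pointer owned by the class is filled through
// InitCUDAMemoryFromHostMemory() and released in the destructor.  Starting
// from nullptr and relying on DeallocateCUDAMemory()'s null check keeps
// destruction safe even when Init() was never called.
class CUDAIndexBuffer {
 public:
  CUDAIndexBuffer() : cuda_indices_(nullptr) {}

  ~CUDAIndexBuffer() {
    DeallocateCUDAMemory<int>(&cuda_indices_, __FILE__, __LINE__);
  }

  void Init(const std::vector<int>& host_indices) {
    InitCUDAMemoryFromHostMemory<int>(
        &cuda_indices_, host_indices.data(), host_indices.size(),
        __FILE__, __LINE__);
  }

 private:
  int* cuda_indices_;
};

}  // namespace LightGBM

This mirrors what the patch does across CUDAColumnData, CUDAMetadata, and CUDARowData: every raw device pointer is set to nullptr on construction and deallocated on destruction, so partially initialized objects clean up without touching unallocated memory.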
delete mode 100644 src/io/cuda/cuda_row_data.cu diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index 6bf44510960e..e4a67ea4b643 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -90,46 +90,6 @@ class CUDARowData { ROW_PTR_TYPE** cuda_row_ptr, ROW_PTR_TYPE** cuda_partition_ptr); - void InitMetaInfoBeforeCopy(const CUDARowData* full_set); - - void PrepareSubsetColumnInfo(const std::vector& is_feature_used, const CUDARowData* full_set); - - void CopyDenseSubrowData(const CUDARowData* full_set, const data_size_t num_used_indices, const data_size_t* used_indices); - - void CopySparseSubrowData(const CUDARowData* full_set, const data_size_t num_used_indices, const data_size_t* used_indices); - - void CopyDenseSubcolData(const CUDARowData* full_set); - - void CopySparseSubcolData(const CUDARowData* full_set); - - uint64_t CalcTotalNumberOfElements(const CUDARowData* full_set); - - uint64_t LaunchCalcTotalNumberOfElementsKernel(const CUDARowData* full_set); - - uint64_t CalcTotalNumberOfElementsSubcol(const CUDARowData* full_set); - - uint64_t LaunchCalcTotalNumberOfElementsSubcolKernel(const CUDARowData* full_set); - - void LaunchCopyDenseSubrowKernel(const CUDARowData* full_set); - - void LaunchCopySparseSubrowKernel(const CUDARowData* full_set); - - void LaunchCopyDenseSubcolKernel(const CUDARowData* full_set); - - void BuildBinToColumnMap(const CUDARowData* full_set); - - template - void LaunchCopySparseSubrowKernelInner0( - const CUDARowData* full_set, - OUT_ROW_PTR_TYPE* out_cuda_row_ptr); - - template - void LaunchCopySparseSubrowKernelInner1( - const CUDARowData* full_set, - const BIN_TYPE* in_cuda_data, - const OUT_ROW_PTR_TYPE* out_cuda_row_ptr, - BIN_TYPE* out_cuda_data); - /*! \brief number of threads to use */ int num_threads_; /*! \brief number of training data */ @@ -160,25 +120,11 @@ class CUDARowData { data_size_t num_used_indices_; /*! \brief used when bagging with subset, number of total elements */ uint64_t num_total_elements_; - /*! \brief used when bagging with subset, the size of buffer for copy subrow */ - data_size_t cur_subset_buffer_size_; - /*! \brief used when bagging with subset, the size of buffer for copy subrow of sparse data */ - uint64_t cur_total_elements_buffer_size_; /*! \brief used when bagging with column subset, the size of maximum number of feature partitions */ int cur_num_feature_partition_buffer_size_; /*! \brief CUDA device ID */ int gpu_device_id_; - /*! \brief whether data is initialized */ - bool is_data_initialized_; - - /*! \brief used column indices when copying sub column */ - std::vector used_columns_; - /*! \brief a map from feature_index to column index, used when copying sub column */ - std::vector feature_index_to_column_index_; - /*! \brief complete histogram offset of each column, which equals that in train share states, used in copying sub columns */ - std::vector complete_column_hist_offsets_; - /*! \brief used when bagging with column subset, the size of buffer for columns */ - data_size_t cur_subcol_buffer_size_; + // CUDA memory @@ -206,24 +152,12 @@ class CUDARowData { uint32_t* cuda_column_hist_offsets_; /*! \brief hisotgram offset of each partition */ uint32_t* cuda_partition_hist_offsets_; - /*! \brief used when bagging with subset, used indice */ - data_size_t* cuda_used_indices_; /*! \brief block buffer when calculating prefix sum */ uint16_t* cuda_block_buffer_uint16_t_; /*! 
\brief block buffer when calculating prefix sum */ uint32_t* cuda_block_buffer_uint32_t_; /*! \brief block buffer when calculating prefix sum */ - uint64_t* cuda_block_buffer_uint64_t_; - /*! \brief partition ptr buffer */ - uint64_t* cuda_partition_ptr_buffer_; - /*! \brief used when bagging with subset, block buffer when reducing the number of elements in the subset */ - uint64_t* cuda_block_sum_buffer_; - /*! \brief used with column subset, maps the original column_index to partition index */ - int* cuda_column_index_to_partition_index_; - /*! \brief used column indices */ - int* cuda_used_columns_; - /*! \brief maps bin to column index, used when copy column subsets in sparse data */ - int* cuda_bin_to_column_index_; + uint64_t* cuda_block_buffer_uint64_t_; }; } // namespace LightGBM diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 3337373827d5..7809e7ac0498 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -26,14 +26,14 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = #define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); } template -void AllocateCUDAMemoryOuter(T** out_ptr, size_t size, const char* file, const int line) { +void AllocateCUDAMemory(T** out_ptr, size_t size, const char* file, const int line) { void* tmp_ptr = nullptr; CUDASUCCESS_OR_FATAL_OUTER(cudaMalloc(&tmp_ptr, size * sizeof(T))); *out_ptr = reinterpret_cast(tmp_ptr); } template -void CopyFromHostToCUDADeviceOuter(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { +void CopyFromHostToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { void* void_dst_ptr = reinterpret_cast(dst_ptr); const void* void_src_ptr = reinterpret_cast(src_ptr); size_t size_in_bytes = size * sizeof(T); @@ -41,27 +41,13 @@ void CopyFromHostToCUDADeviceOuter(T* dst_ptr, const T* src_ptr, size_t size, co } template -void CopyFromHostToCUDADeviceAsyncOuter(T* dst_ptr, const T* src_ptr, size_t size, cudaStream_t stream, const char* file, const int line) { - void* void_dst_ptr = reinterpret_cast(dst_ptr); - const void* void_src_ptr = reinterpret_cast(src_ptr); - size_t size_in_bytes = size * sizeof(T); - CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyHostToDevice, stream)); -} - -template -void InitCUDAMemoryFromHostMemoryOuter(T** dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { - AllocateCUDAMemoryOuter(dst_ptr, size, file, line); - CopyFromHostToCUDADeviceOuter(*dst_ptr, src_ptr, size, file, line); +void InitCUDAMemoryFromHostMemory(T** dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { + AllocateCUDAMemory(dst_ptr, size, file, line); + CopyFromHostToCUDADevice(*dst_ptr, src_ptr, size, file, line); } template -void InitCUDAValueFromConstantOuter(T** dst_ptr, const T value, const char* file, const int line) { - AllocateCUDAMemoryOuter(1, dst_ptr, file, line); - CopyFromHostToCUDADeviceOuter(*dst_ptr, &value, 1, file, line); -} - -template -void CopyFromCUDADeviceToHostOuter(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { +void CopyFromCUDADeviceToHost(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { void* void_dst_ptr = reinterpret_cast(dst_ptr); const void* void_src_ptr = reinterpret_cast(src_ptr); size_t size_in_bytes = size * sizeof(T); @@ -69,7 +55,7 @@ void 
CopyFromCUDADeviceToHostOuter(T* dst_ptr, const T* src_ptr, size_t size, co } template -void CopyFromCUDADeviceToHostAsyncOuter(T* dst_ptr, const T* src_ptr, size_t size, cudaStream_t stream, const char* file, const int line) { +void CopyFromCUDADeviceToHostAsync(T* dst_ptr, const T* src_ptr, size_t size, cudaStream_t stream, const char* file, const int line) { void* void_dst_ptr = reinterpret_cast(dst_ptr); const void* void_src_ptr = reinterpret_cast(src_ptr); size_t size_in_bytes = size * sizeof(T); @@ -77,7 +63,7 @@ void CopyFromCUDADeviceToHostAsyncOuter(T* dst_ptr, const T* src_ptr, size_t siz } template -void CopyFromCUDADeviceToCUDADeviceOuter(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { +void CopyFromCUDADeviceToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { void* void_dst_ptr = reinterpret_cast(dst_ptr); const void* void_src_ptr = reinterpret_cast(src_ptr); size_t size_in_bytes = size * sizeof(T); @@ -85,28 +71,26 @@ void CopyFromCUDADeviceToCUDADeviceOuter(T* dst_ptr, const T* src_ptr, size_t si } template -void CopyFromCUDADeviceToCUDADeviceAsyncOuter(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { +void CopyFromCUDADeviceToCUDADeviceAsync(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { void* void_dst_ptr = reinterpret_cast(dst_ptr); const void* void_src_ptr = reinterpret_cast(src_ptr); size_t size_in_bytes = size * sizeof(T); CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice)); } -void SynchronizeCUDADeviceOuter(const char* file, const int line); - -void SynchronizeCUDADeviceOuter(cudaStream_t cuda_stream, const char* file, const int line); +void SynchronizeCUDADevice(const char* file, const int line); template -void SetCUDAMemoryOuter(T* dst_ptr, int value, size_t size, const char* file, const int line) { +void SetCUDAMemory(T* dst_ptr, int value, size_t size, const char* file, const int line) { CUDASUCCESS_OR_FATAL_OUTER(cudaMemset(reinterpret_cast(dst_ptr), value, size * sizeof(T))); } -void PrintLastCUDAErrorOuter(const char* /*file*/, const int /*line*/); - template -void DeallocateCUDAMemoryOuter(T** ptr, const char* file, const int line) { - CUDASUCCESS_OR_FATAL_OUTER(cudaFree(reinterpret_cast(*ptr))); - *ptr = nullptr; +void DeallocateCUDAMemory(T** ptr, const char* file, const int line) { + if (*ptr != nullptr) { + CUDASUCCESS_OR_FATAL_OUTER(cudaFree(reinterpret_cast(*ptr))); + *ptr = nullptr; + } } } diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 8d74ea125b3a..8f64a87d5714 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -811,15 +811,17 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { double average_bag_rate = (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; is_use_subset_ = false; - const int group_threshold_usesubset = 100; - if (average_bag_rate <= 0.5 - && (train_data_->num_feature_groups() < group_threshold_usesubset)) { - if (tmp_subset_ == nullptr || is_change_dataset) { - tmp_subset_.reset(new Dataset(bag_data_cnt_)); - tmp_subset_->CopyFeatureMapperFrom(train_data_); + if (config_->device_type != std::string("cuda")) { + const int group_threshold_usesubset = 100; + if (average_bag_rate <= 0.5 + && (train_data_->num_feature_groups() < group_threshold_usesubset)) { + if (tmp_subset_ == nullptr || is_change_dataset) { + tmp_subset_.reset(new Dataset(bag_data_cnt_)); + 
tmp_subset_->CopyFeatureMapperFrom(train_data_); + } + is_use_subset_ = true; + Log::Debug("Use subset for bagging"); } - is_use_subset_ = true; - Log::Debug("Use subset for bagging"); } need_re_bagging_ = true; diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp index 1ccff8d76257..902bc6eae4c4 100644 --- a/src/cuda/cuda_utils.cpp +++ b/src/cuda/cuda_utils.cpp @@ -9,19 +9,10 @@ namespace LightGBM { -void SynchronizeCUDADeviceOuter(const char* file, const int line) { +void SynchronizeCUDADevice(const char* file, const int line) { CUDASUCCESS_OR_FATAL_OUTER(cudaDeviceSynchronize()); } -void SynchronizeCUDADeviceOuter(cudaStream_t cuda_stream, const char* file, const int line) { - CUDASUCCESS_OR_FATAL_OUTER(cudaStreamSynchronize(cuda_stream)); -} - -void PrintLastCUDAErrorOuter(const char* /*file*/, const int /*line*/) { - const char* error_name = cudaGetErrorName(cudaGetLastError()); - Log::Warning(error_name); -} - } // namespace LightGBM //#endif // USE_CUDA diff --git a/src/io/cuda/cuda_column_data.cpp b/src/io/cuda/cuda_column_data.cpp index 7ad60e213060..2989074ac392 100644 --- a/src/io/cuda/cuda_column_data.cpp +++ b/src/io/cuda/cuda_column_data.cpp @@ -17,9 +17,39 @@ CUDAColumnData::CUDAColumnData(const data_size_t num_data, const int gpu_device_ } cuda_used_indices_ = nullptr; cuda_data_by_column_ = nullptr; + cuda_column_bit_type_ = nullptr; + cuda_feature_min_bin_ = nullptr; + cuda_feature_max_bin_ = nullptr; + cuda_feature_offset_ = nullptr; + cuda_feature_most_freq_bin_ = nullptr; + cuda_feature_default_bin_ = nullptr; + cuda_feature_missing_is_zero_ = nullptr; + cuda_feature_missing_is_na_ = nullptr; + cuda_feature_mfb_is_zero_ = nullptr; + cuda_feature_mfb_is_na_ = nullptr; + cuda_feature_to_column_ = nullptr; + data_by_column_.clear(); } -CUDAColumnData::~CUDAColumnData() {} +CUDAColumnData::~CUDAColumnData() { + DeallocateCUDAMemory(&cuda_used_indices_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_data_by_column_, __FILE__, __LINE__); + for (size_t i = 0; i < data_by_column_.size(); ++i) { + DeallocateCUDAMemory(&data_by_column_[i], __FILE__, __LINE__); + } + DeallocateCUDAMemory(&cuda_column_bit_type_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_min_bin_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_max_bin_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_offset_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_most_freq_bin_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_default_bin_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_missing_is_zero_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_missing_is_na_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_mfb_is_zero_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_mfb_is_na_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_to_column_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_used_indices_, __FILE__, __LINE__); +} template void CUDAColumnData::InitOneColumnData(const void* in_column_data, BinIterator* bin_iterator, void** out_column_data_pointer) { @@ -31,13 +61,13 @@ void CUDAColumnData::InitOneColumnData(const void* in_column_data, BinIterator* for (data_size_t i = 0; i < num_data_; ++i) { expanded_column_data[i] = static_cast((in_column_data_reintrepreted[i >> 1] >> ((i & 1) << 2)) & 0xf); } - InitCUDAMemoryFromHostMemoryOuter(&cuda_column_data, + InitCUDAMemoryFromHostMemory(&cuda_column_data, expanded_column_data.data(), static_cast(num_data_), __FILE__, __LINE__); } else { - 
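      // Each byte of a 4-bit column packs two bin values; the expression above
      // selects the low or high nibble of byte i >> 1 and widens it to the
      // column's bin type before the expanded column is copied to the device.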
InitCUDAMemoryFromHostMemoryOuter(&cuda_column_data, + InitCUDAMemoryFromHostMemory(&cuda_column_data, reinterpret_cast(in_column_data), static_cast(num_data_), __FILE__, @@ -49,7 +79,7 @@ void CUDAColumnData::InitOneColumnData(const void* in_column_data, BinIterator* for (data_size_t i = 0; i < num_data_; ++i) { expanded_column_data[i] = static_cast(bin_iterator->RawGet(i)); } - InitCUDAMemoryFromHostMemoryOuter(&cuda_column_data, + InitCUDAMemoryFromHostMemory(&cuda_column_data, expanded_column_data.data(), static_cast(num_data_), __FILE__, @@ -119,7 +149,7 @@ void CUDAColumnData::Init(const int num_columns, } OMP_THROW_EX(); feature_to_column_ = feature_to_column; - InitCUDAMemoryFromHostMemoryOuter(&cuda_data_by_column_, + InitCUDAMemoryFromHostMemory(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, @@ -147,7 +177,7 @@ void CUDAColumnData::CopySubrow( if (cuda_used_indices_ == nullptr) { // initialize the subset cuda column data const size_t num_used_indices_size = static_cast(num_used_indices); - AllocateCUDAMemoryOuter(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__); data_by_column_.resize(num_columns_, nullptr); OMP_INIT_EX(); #pragma omp parallel for schedule(static) num_threads(num_threads_) @@ -156,21 +186,21 @@ void CUDAColumnData::CopySubrow( const uint8_t bit_type = column_bit_type_[column_index]; if (bit_type == 8) { uint8_t* column_data = nullptr; - AllocateCUDAMemoryOuter(&column_data, num_used_indices_size, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); data_by_column_[column_index] = reinterpret_cast(column_data); } else if (bit_type == 16) { uint16_t* column_data = nullptr; - AllocateCUDAMemoryOuter(&column_data, num_used_indices_size, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); data_by_column_[column_index] = reinterpret_cast(column_data); } else if (bit_type == 32) { uint32_t* column_data = nullptr; - AllocateCUDAMemoryOuter(&column_data, num_used_indices_size, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); data_by_column_[column_index] = reinterpret_cast(column_data); } OMP_LOOP_EX_END(); } OMP_THROW_EX(); - InitCUDAMemoryFromHostMemoryOuter(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__); InitColumnMetaInfo(); cur_subset_buffer_size_ = num_used_indices; } else { @@ -179,15 +209,15 @@ void CUDAColumnData::CopySubrow( cur_subset_buffer_size_ = num_used_indices; } } - CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_used_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); num_used_indices_ = num_used_indices; LaunchCopySubrowKernel(full_set->cuda_data_by_column()); } void CUDAColumnData::ResizeWhenCopySubrow(const data_size_t num_used_indices) { const size_t num_used_indices_size = static_cast(num_used_indices); - DeallocateCUDAMemoryOuter(&cuda_used_indices_, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_used_indices_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_used_indices_, 
num_used_indices_size, __FILE__, __LINE__); OMP_INIT_EX(); #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int column_index = 0; column_index < num_columns_; ++column_index) { @@ -195,79 +225,79 @@ void CUDAColumnData::ResizeWhenCopySubrow(const data_size_t num_used_indices) { const uint8_t bit_type = column_bit_type_[column_index]; if (bit_type == 8) { uint8_t* column_data = reinterpret_cast(data_by_column_[column_index]); - DeallocateCUDAMemoryOuter(&column_data, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&column_data, num_used_indices_size, __FILE__, __LINE__); + DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); data_by_column_[column_index] = reinterpret_cast(column_data); } else if (bit_type == 16) { uint16_t* column_data = reinterpret_cast(data_by_column_[column_index]); - DeallocateCUDAMemoryOuter(&column_data, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&column_data, num_used_indices_size, __FILE__, __LINE__); + DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); data_by_column_[column_index] = reinterpret_cast(column_data); } else if (bit_type == 32) { uint32_t* column_data = reinterpret_cast(data_by_column_[column_index]); - DeallocateCUDAMemoryOuter(&column_data, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&column_data, num_used_indices_size, __FILE__, __LINE__); + DeallocateCUDAMemory(&column_data, __FILE__, __LINE__); + AllocateCUDAMemory(&column_data, num_used_indices_size, __FILE__, __LINE__); data_by_column_[column_index] = reinterpret_cast(column_data); } OMP_LOOP_EX_END(); } OMP_THROW_EX(); - DeallocateCUDAMemoryOuter(&cuda_data_by_column_, __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_data_by_column_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__); } void CUDAColumnData::InitColumnMetaInfo() { - InitCUDAMemoryFromHostMemoryOuter(&cuda_column_bit_type_, + InitCUDAMemoryFromHostMemory(&cuda_column_bit_type_, column_bit_type_.data(), column_bit_type_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_max_bin_, + InitCUDAMemoryFromHostMemory(&cuda_feature_max_bin_, feature_max_bin_.data(), feature_max_bin_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_min_bin_, + InitCUDAMemoryFromHostMemory(&cuda_feature_min_bin_, feature_min_bin_.data(), feature_min_bin_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_offset_, + InitCUDAMemoryFromHostMemory(&cuda_feature_offset_, feature_offset_.data(), feature_offset_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_most_freq_bin_, + InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bin_, feature_most_freq_bin_.data(), feature_most_freq_bin_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_default_bin_, + InitCUDAMemoryFromHostMemory(&cuda_feature_default_bin_, feature_default_bin_.data(), feature_default_bin_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_missing_is_zero_, + InitCUDAMemoryFromHostMemory(&cuda_feature_missing_is_zero_, feature_missing_is_zero_.data(), feature_missing_is_zero_.size(), __FILE__, __LINE__); - 
InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_missing_is_na_, + InitCUDAMemoryFromHostMemory(&cuda_feature_missing_is_na_, feature_missing_is_na_.data(), feature_missing_is_na_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_mfb_is_zero_, + InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_zero_, feature_mfb_is_zero_.data(), feature_mfb_is_zero_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_mfb_is_na_, + InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_is_na_, feature_mfb_is_na_.data(), feature_mfb_is_na_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_to_column_, + InitCUDAMemoryFromHostMemory(&cuda_feature_to_column_, feature_to_column_.data(), feature_to_column_.size(), __FILE__, diff --git a/src/io/cuda/cuda_metadata.cpp b/src/io/cuda/cuda_metadata.cpp index 781a5d7a5797..28aeade0b919 100644 --- a/src/io/cuda/cuda_metadata.cpp +++ b/src/io/cuda/cuda_metadata.cpp @@ -13,9 +13,22 @@ CUDAMetadata::CUDAMetadata(const int gpu_device_id) { } else { CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); } + cuda_label_ = nullptr; + cuda_weights_ = nullptr; + cuda_query_boundaries_ = nullptr; + cuda_query_weights_ = nullptr; + cuda_init_score_ = nullptr; + cuda_queries_ = nullptr; } -CUDAMetadata::~CUDAMetadata() {} +CUDAMetadata::~CUDAMetadata() { + DeallocateCUDAMemory(&cuda_label_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_weights_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_query_boundaries_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_query_weights_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_init_score_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_queries_, __FILE__, __LINE__); +} void CUDAMetadata::Init(const std::vector& label, const std::vector& weight, @@ -26,34 +39,34 @@ void CUDAMetadata::Init(const std::vector& label, if (label.size() == 0) { cuda_label_ = nullptr; } else { - InitCUDAMemoryFromHostMemoryOuter(&cuda_label_, label.data(), label.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_label_, label.data(), label.size(), __FILE__, __LINE__); } if (weight.size() == 0) { cuda_weights_ = nullptr; } else { - InitCUDAMemoryFromHostMemoryOuter(&cuda_weights_, weight.data(), weight.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_weights_, weight.data(), weight.size(), __FILE__, __LINE__); } if (query_boundaries.size() == 0) { cuda_query_boundaries_ = nullptr; } else { - InitCUDAMemoryFromHostMemoryOuter(&cuda_query_boundaries_, query_boundaries.data(), query_boundaries.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_query_boundaries_, query_boundaries.data(), query_boundaries.size(), __FILE__, __LINE__); } if (query_weights.size() == 0) { cuda_query_weights_ = nullptr; } else { - InitCUDAMemoryFromHostMemoryOuter(&cuda_query_weights_, query_weights.data(), query_weights.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_query_weights_, query_weights.data(), query_weights.size(), __FILE__, __LINE__); } if (init_score.size() == 0) { cuda_init_score_ = nullptr; } else { - InitCUDAMemoryFromHostMemoryOuter(&cuda_init_score_, init_score.data(), init_score.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_init_score_, init_score.data(), init_score.size(), __FILE__, __LINE__); } if (queries.size() == 0) { cuda_queries_ = nullptr; } else { - InitCUDAMemoryFromHostMemoryOuter(&cuda_queries_, queries.data(), queries.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_queries_, 
queries.data(), queries.size(), __FILE__, __LINE__); } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); } } // namespace LightGBM diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index 3fd3c7b68362..2019c939abd4 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -20,21 +20,36 @@ CUDARowData::CUDARowData(const Dataset* train_data, } else { CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); } - cuda_used_indices_ = nullptr; - cur_subset_buffer_size_ = 0; - cur_total_elements_buffer_size_ = 0; - cur_subcol_buffer_size_ = 0; - cur_num_feature_partition_buffer_size_ = 0; - is_data_initialized_ = false; + cuda_data_uint8_t_ = nullptr; + cuda_data_uint16_t_ = nullptr; + cuda_data_uint32_t_ = nullptr; + cuda_row_ptr_uint16_t_ = nullptr; + cuda_row_ptr_uint32_t_ = nullptr; + cuda_row_ptr_uint64_t_ = nullptr; + cuda_partition_ptr_uint16_t_ = nullptr; + cuda_partition_ptr_uint32_t_ = nullptr; + cuda_partition_ptr_uint64_t_ = nullptr; + cuda_feature_partition_column_index_offsets_ = nullptr; + cuda_column_hist_offsets_ = nullptr; + cuda_partition_hist_offsets_ = nullptr; + cuda_block_buffer_uint16_t_ = nullptr; + cuda_block_buffer_uint32_t_ = nullptr; + cuda_block_buffer_uint64_t_ = nullptr; } CUDARowData::CUDARowData() { - cuda_used_indices_ = nullptr; - cur_subset_buffer_size_ = 0; - cur_total_elements_buffer_size_ = 0; - cur_subcol_buffer_size_ = 0; - cur_num_feature_partition_buffer_size_ = 0; - is_data_initialized_ = false; + DeallocateCUDAMemory(&cuda_data_uint8_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_data_uint16_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_data_uint32_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_row_ptr_uint16_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_row_ptr_uint32_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_row_ptr_uint64_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_partition_column_index_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_column_hist_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_partition_hist_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_buffer_uint16_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_buffer_uint32_t_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_buffer_uint64_t_, __FILE__, __LINE__); } void CUDARowData::Init(const Dataset* train_data, TrainingShareStates* train_share_state) { @@ -48,7 +63,7 @@ void CUDARowData::Init(const Dataset* train_data, TrainingShareStates* train_sha if (!is_sparse_) { std::vector partitioned_data; GetDenseDataPartitioned(reinterpret_cast(host_data), &partitioned_data); - InitCUDAMemoryFromHostMemoryOuter(&cuda_data_uint8_t_, partitioned_data.data(), total_size, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_data_uint8_t_, partitioned_data.data(), total_size, __FILE__, __LINE__); } else { if (row_ptr_bit_type_ == 16) { InitSparseData( @@ -79,7 +94,7 @@ void CUDARowData::Init(const Dataset* train_data, TrainingShareStates* train_sha if (!is_sparse_) { std::vector partitioned_data; GetDenseDataPartitioned(reinterpret_cast(host_data), &partitioned_data); - InitCUDAMemoryFromHostMemoryOuter(&cuda_data_uint16_t_, partitioned_data.data(), total_size, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_data_uint16_t_, partitioned_data.data(), total_size, __FILE__, __LINE__); } else { if (row_ptr_bit_type_ == 16) { InitSparseData( @@ -110,7 +125,7 @@ void CUDARowData::Init(const Dataset* 
train_data, TrainingShareStates* train_sha if (!is_sparse_) { std::vector partitioned_data; GetDenseDataPartitioned(reinterpret_cast(host_data), &partitioned_data); - InitCUDAMemoryFromHostMemoryOuter(&cuda_data_uint32_t_, partitioned_data.data(), total_size, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_data_uint32_t_, partitioned_data.data(), total_size, __FILE__, __LINE__); } else { if (row_ptr_bit_type_ == 16) { InitSparseData( @@ -140,8 +155,7 @@ void CUDARowData::Init(const Dataset* train_data, TrainingShareStates* train_sha } else { Log::Fatal("Unknow bit type = %d", bit_type_); } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - is_data_initialized_ = true; + SynchronizeCUDADevice(__FILE__, __LINE__); } void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state) { @@ -225,19 +239,19 @@ void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingSha } } - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_partition_column_index_offsets_, + InitCUDAMemoryFromHostMemory(&cuda_feature_partition_column_index_offsets_, feature_partition_column_index_offsets_.data(), feature_partition_column_index_offsets_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_column_hist_offsets_, + InitCUDAMemoryFromHostMemory(&cuda_column_hist_offsets_, column_hist_offsets_.data(), column_hist_offsets_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_partition_hist_offsets_, + InitCUDAMemoryFromHostMemory(&cuda_partition_hist_offsets_, partition_hist_offsets_.data(), partition_hist_offsets_.size(), __FILE__, @@ -339,404 +353,15 @@ void CUDARowData::InitSparseData(const BIN_TYPE* host_data, std::vector> partitioned_data_ptr; std::vector partition_ptr; GetSparseDataPartitioned(host_data, host_row_ptr, &partitioned_data, &partitioned_data_ptr, &partition_ptr); - InitCUDAMemoryFromHostMemoryOuter(cuda_partition_ptr, partition_ptr.data(), partition_ptr.size(), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(cuda_data, partition_ptr.back(), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(cuda_row_ptr, (num_data_ + 1) * partitioned_data_ptr.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(cuda_partition_ptr, partition_ptr.data(), partition_ptr.size(), __FILE__, __LINE__); + AllocateCUDAMemory(cuda_data, partition_ptr.back(), __FILE__, __LINE__); + AllocateCUDAMemory(cuda_row_ptr, (num_data_ + 1) * partitioned_data_ptr.size(), __FILE__, __LINE__); for (size_t i = 0; i < partitioned_data.size(); ++i) { const std::vector& data_ptr_for_this_partition = partitioned_data_ptr[i]; const std::vector& data_for_this_partition = partitioned_data[i]; - CopyFromHostToCUDADeviceOuter((*cuda_data) + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size(), __FILE__, __LINE__); - CopyFromHostToCUDADeviceOuter((*cuda_row_ptr) + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size(), __FILE__, __LINE__); + CopyFromHostToCUDADevice((*cuda_data) + partition_ptr[i], data_for_this_partition.data(), data_for_this_partition.size(), __FILE__, __LINE__); + CopyFromHostToCUDADevice((*cuda_row_ptr) + i * (num_data_ + 1), data_ptr_for_this_partition.data(), data_ptr_for_this_partition.size(), __FILE__, __LINE__); } } -void CUDARowData::InitMetaInfoBeforeCopy(const CUDARowData* full_set) { - CHECK_EQ(cur_subset_buffer_size_, 0); - CHECK_EQ(cur_total_elements_buffer_size_, 0); - // initialize meta information - num_threads_ = full_set->num_threads_; - num_data_ = 
full_set->num_data_; - num_total_bin_ = full_set->num_total_bin_; - num_feature_group_ = full_set->num_feature_group_; - num_feature_ = full_set->num_feature_; - gpu_device_id_ = full_set->gpu_device_id_; - if (gpu_device_id_ >= 0) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id_)); - } else { - CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); - } - bit_type_ = full_set->bit_type_; - is_sparse_ = full_set->is_sparse_; - feature_partition_column_index_offsets_ = full_set->feature_partition_column_index_offsets_; - column_hist_offsets_ = full_set->column_hist_offsets_; - partition_hist_offsets_ = full_set->partition_hist_offsets_; - max_num_column_per_partition_ = full_set->max_num_column_per_partition_; - num_feature_partitions_ = full_set->num_feature_partitions_; - - InitCUDAMemoryFromHostMemoryOuter( - &cuda_feature_partition_column_index_offsets_, - feature_partition_column_index_offsets_.data(), - feature_partition_column_index_offsets_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_column_hist_offsets_, - column_hist_offsets_.data(), column_hist_offsets_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_partition_hist_offsets_, - partition_hist_offsets_.data(), partition_hist_offsets_.size(), __FILE__, __LINE__); -} - -void CUDARowData::CopySubrow( - const CUDARowData* full_set, - const data_size_t* used_indices, - const data_size_t num_used_indices) { - if (!is_data_initialized_) { - InitMetaInfoBeforeCopy(full_set); - is_data_initialized_ = true; - } - if (!full_set->is_sparse_) { - CopyDenseSubrowData(full_set, num_used_indices, used_indices); - } else { - CopySparseSubrowData(full_set, num_used_indices, used_indices); - } -} - -void CUDARowData::PrepareSubsetColumnInfo(const std::vector& is_feature_used, const CUDARowData* full_set) { - // get used columns - const int num_column = feature_partition_column_index_offsets_.back(); - std::vector is_column_used(num_column, 0); - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) num_threads(num_threads_) - for (int feature_index = 0; feature_index < num_feature_; ++feature_index) { - OMP_LOOP_EX_BEGIN(); - if (is_feature_used[feature_index]) { - const int column_index = feature_index_to_column_index_[feature_index]; - is_column_used[column_index] = 1; - } - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - used_columns_.clear(); - for (int column_index = 0; column_index < num_column; ++column_index) { - if (is_column_used[column_index]) { - used_columns_.emplace_back(column_index); - } - } - - std::vector column_index_to_partition_index(full_set->feature_partition_column_index_offsets_.back()); - // get column index to partition index map - for (int partition_index = 0; partition_index < full_set->num_feature_partitions_; ++partition_index) { - const int partition_column_start = full_set->feature_partition_column_index_offsets_[partition_index]; - const int partition_column_end = full_set->feature_partition_column_index_offsets_[partition_index + 1]; - for (int column_index = partition_column_start; column_index < partition_column_end; ++column_index) { - column_index_to_partition_index[column_index] = partition_index; - } - } - InitCUDAMemoryFromHostMemoryOuter(&cuda_column_index_to_partition_index_, - column_index_to_partition_index.data(), - column_index_to_partition_index.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_used_columns_, used_columns_.data(), used_columns_.size(), __FILE__, __LINE__); - - const uint32_t max_num_bin_per_partition = SHRAE_HIST_SIZE / 2; - 
int cur_partition_index = 0; - feature_partition_column_index_offsets_.clear(); - column_hist_offsets_.clear(); - partition_hist_offsets_.clear(); - max_num_column_per_partition_ = 0; - num_feature_partitions_ = 0; - int cur_num_bin_in_partition = 0; - int cur_num_bin = 0; - feature_partition_column_index_offsets_.emplace_back(0); - partition_hist_offsets_.emplace_back(0); - for (int column_index = 0; column_index < static_cast(used_columns_.size()); ++column_index) { - const int real_column_index = used_columns_[column_index]; - const int num_bin_in_column = complete_column_hist_offsets_[real_column_index + 1] - complete_column_hist_offsets_[real_column_index]; - const int next_num_bin_in_partition = cur_num_bin_in_partition + num_bin_in_column; - if (next_num_bin_in_partition >= max_num_bin_per_partition) { - feature_partition_column_index_offsets_.emplace_back(column_index); - partition_hist_offsets_.emplace_back(cur_num_bin); - cur_num_bin_in_partition = num_bin_in_column; - ++num_feature_partitions_; - } - cur_num_bin += num_bin_in_column; - column_hist_offsets_.emplace_back(complete_column_hist_offsets_[column_index] - partition_hist_offsets_.back()); - } - feature_partition_column_index_offsets_.emplace_back(static_cast(used_columns_.size())); - partition_hist_offsets_.emplace_back(cur_num_bin); - ++num_feature_partitions_; - - for (int partition_index = 0; partition_index < num_feature_partitions_; ++partition_index) { - const int num_column_in_partition = feature_partition_column_index_offsets_[partition_index + 1] - feature_partition_column_index_offsets_[partition_index]; - if (num_column_in_partition > max_num_column_per_partition_) { - max_num_column_per_partition_ = num_column_in_partition; - } - } - - CopyFromHostToCUDADeviceOuter(cuda_feature_partition_column_index_offsets_, - feature_partition_column_index_offsets_.data(), - feature_partition_column_index_offsets_.size(), __FILE__, __LINE__); - CopyFromHostToCUDADeviceOuter(cuda_column_hist_offsets_, - column_hist_offsets_.data(), - column_hist_offsets_.size(), __FILE__, __LINE__); - CopyFromHostToCUDADeviceOuter(cuda_partition_hist_offsets_, - partition_hist_offsets_.data(), - partition_hist_offsets_.size(), __FILE__, __LINE__); - -} - -void CUDARowData::CopyDenseSubcolData(const CUDARowData* full_set) { - const data_size_t num_used_column = static_cast(used_columns_.size()); - if (cur_subcol_buffer_size_ > num_used_column) { - const uint64_t num_total_elements = used_columns_.size() * static_cast(num_data_); - if (cur_subcol_buffer_size_ > 0) { - if (bit_type_ == 8) { - DeallocateCUDAMemoryOuter(&cuda_data_uint8_t_, __FILE__, __LINE__); - } else if (bit_type_ == 16) { - DeallocateCUDAMemoryOuter(&cuda_data_uint16_t_, __FILE__, __LINE__); - } else if (bit_type_ == 32) { - DeallocateCUDAMemoryOuter(&cuda_data_uint32_t_, __FILE__, __LINE__); - } - } - if (bit_type_ == 8) { - AllocateCUDAMemoryOuter(&cuda_data_uint8_t_, num_total_elements, __FILE__, __LINE__); - } else if (bit_type_ == 16) { - AllocateCUDAMemoryOuter(&cuda_data_uint16_t_, num_total_elements, __FILE__, __LINE__); - } else if (bit_type_ == 32) { - AllocateCUDAMemoryOuter(&cuda_data_uint32_t_, num_total_elements, __FILE__, __LINE__); - } - } - LaunchCopyDenseSubcolKernel(full_set); -} - -void CUDARowData::BuildBinToColumnMap(const CUDARowData* full_set) { - std::vector bin_to_column(complete_column_hist_offsets_.back()); - const int num_column = full_set->feature_partition_column_index_offsets_.back(); - OMP_INIT_EX(); - #pragma omp parallel for 
schedule(static) num_threads(num_threads_) - for (int column_index = 0; column_index < num_column; ++column_index) { - OMP_LOOP_EX_BEGIN(); - const uint32_t column_hist_bin_start = complete_column_hist_offsets_[column_index]; - const uint32_t column_hist_bin_end = complete_column_hist_offsets_[column_index + 1]; - for (uint32_t bin = column_hist_bin_start; bin < column_hist_bin_end; ++bin) { - bin_to_column[bin] = column_index; - } - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - InitCUDAMemoryFromHostMemoryOuter(&cuda_bin_to_column_index_, bin_to_column.data(), bin_to_column.size(), __FILE__, __LINE__); -} - -uint64_t CUDARowData::CalcTotalNumberOfElementsSubcol(const CUDARowData* full_set) { - return LaunchCalcTotalNumberOfElementsSubcolKernel(full_set); -} - -void CUDARowData::CopySparseSubcolData(const CUDARowData* full_set) { - if (!is_data_initialized_) { - const int num_blocks = (num_data_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; - AllocateCUDAMemoryOuter(&cuda_block_sum_buffer_, static_cast(num_blocks * full_set->num_feature_partitions_) + 1, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_partition_ptr_uint16_t_, static_cast(full_set->num_feature_partitions_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_partition_ptr_uint32_t_, static_cast(full_set->num_feature_partitions_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_partition_ptr_uint64_t_, static_cast(full_set->num_feature_partitions_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_partition_ptr_buffer_, static_cast(full_set->num_feature_partitions_) + 1, __FILE__, __LINE__); - is_data_initialized_ = true; - BuildBinToColumnMap(full_set); - } - num_total_elements_ = CalcTotalNumberOfElementsSubcol(full_set); - if (num_total_elements_ > cur_total_elements_buffer_size_) { - if (cur_total_elements_buffer_size_ > 0) { - if (bit_type_ == 8) { - DeallocateCUDAMemoryOuter(&cuda_data_uint8_t_, __FILE__, __LINE__); - } else if (bit_type_ == 16) { - DeallocateCUDAMemoryOuter(&cuda_data_uint16_t_, __FILE__, __LINE__); - } else if (bit_type_ == 32) { - DeallocateCUDAMemoryOuter(&cuda_data_uint32_t_, __FILE__, __LINE__); - } - } - if (bit_type_ == 8) { - AllocateCUDAMemoryOuter(&cuda_data_uint8_t_, num_total_elements_, __FILE__, __LINE__); - } else if (bit_type_ == 16) { - AllocateCUDAMemoryOuter(&cuda_data_uint16_t_, num_total_elements_, __FILE__, __LINE__); - } else if (bit_type_ == 32) { - AllocateCUDAMemoryOuter(&cuda_data_uint32_t_, num_total_elements_, __FILE__, __LINE__); - } - cur_total_elements_buffer_size_ = num_total_elements_; - } - if (num_feature_partitions_ > cur_num_feature_partition_buffer_size_) { - if (cur_num_feature_partition_buffer_size_ > 0) { - if (row_ptr_bit_type_ == 16) { - DeallocateCUDAMemoryOuter(&cuda_row_ptr_uint16_t_, __FILE__, __LINE__); - } else if (row_ptr_bit_type_ == 32) { - DeallocateCUDAMemoryOuter(&cuda_row_ptr_uint32_t_, __FILE__, __LINE__); - } else if (row_ptr_bit_type_ == 64) { - DeallocateCUDAMemoryOuter(&cuda_row_ptr_uint64_t_, __FILE__, __LINE__); - } - } - const size_t row_ptr_size = static_cast(num_feature_partitions_) * (static_cast(num_data_) + 1); - if (row_ptr_bit_type_ == 16) { - AllocateCUDAMemoryOuter(&cuda_row_ptr_uint16_t_, row_ptr_size, __FILE__, __LINE__); - } else if (row_ptr_bit_type_ == 32) { - AllocateCUDAMemoryOuter(&cuda_row_ptr_uint32_t_, row_ptr_size, __FILE__, __LINE__); - } else if (row_ptr_bit_type_ == 64) { - AllocateCUDAMemoryOuter(&cuda_row_ptr_uint64_t_, row_ptr_size, __FILE__, __LINE__); - } - 
cur_num_feature_partition_buffer_size_ = num_feature_partitions_; - } -} - -void CUDARowData::CopySubcol(const CUDARowData* full_set, const std::vector& is_feature_used, const Dataset* train_data) { - if (!is_data_initialized_) { - InitMetaInfoBeforeCopy(full_set); - is_data_initialized_ = true; - feature_index_to_column_index_.resize(num_feature_, -1); - int cur_group_index = -1; - int cur_column_index = -1; - for (int feature_index = 0; feature_index < num_feature_; ++feature_index) { - const int group_index = train_data->Feature2Group(feature_index); - if (!train_data->IsMultiGroup(group_index)) { - if (group_index != cur_group_index) { - ++cur_column_index; - } - } else { - ++cur_column_index; - } - feature_index_to_column_index_[feature_index] = cur_column_index; - } - complete_column_hist_offsets_ = column_hist_offsets_; - const int num_column = feature_partition_column_index_offsets_.back(); - uint32_t offset = 0; - for (int i = 1; i <= num_column; ++i) { - if (complete_column_hist_offsets_[i] == 0) { - offset = complete_column_hist_offsets_[i - 1]; - } - complete_column_hist_offsets_[i] += offset; - } - } - PrepareSubsetColumnInfo(is_feature_used, full_set); - if (!full_set->is_sparse_) { - CopyDenseSubcolData(full_set); - } else { - CopySparseSubcolData(full_set); - } -} - -uint64_t CUDARowData::CalcTotalNumberOfElements(const CUDARowData* full_set) { - return LaunchCalcTotalNumberOfElementsKernel(full_set); -} - -void CUDARowData::CopyDenseSubrowData(const CUDARowData* full_set, const data_size_t num_used_indices, const data_size_t* used_indices) { - num_used_indices_ = num_used_indices; - if (num_used_indices_ > cur_subset_buffer_size_) { - // allocate cuda memory - if (cur_subset_buffer_size_ == 0) { - CHECK_EQ(cuda_used_indices_, nullptr); - CHECK_EQ(cur_total_elements_buffer_size_, 0); - } else { - DeallocateCUDAMemoryOuter(&cuda_used_indices_, __FILE__, __LINE__); - if (bit_type_ == 8) { - DeallocateCUDAMemoryOuter(&cuda_data_uint8_t_, __FILE__, __LINE__); - } else if (bit_type_ == 16) { - DeallocateCUDAMemoryOuter(&cuda_data_uint16_t_, __FILE__, __LINE__); - } else if (bit_type_ == 32) { - DeallocateCUDAMemoryOuter(&cuda_data_uint32_t_, __FILE__, __LINE__); - } - } - AllocateCUDAMemoryOuter(&cuda_used_indices_, static_cast(num_used_indices_), __FILE__, __LINE__); - const int num_column = feature_partition_column_index_offsets_.back(); - size_t total_size = static_cast(num_used_indices_ * num_column); - if (bit_type_ == 8) { - AllocateCUDAMemoryOuter(&cuda_data_uint8_t_, total_size, __FILE__, __LINE__); - } else if (bit_type_ == 16) { - AllocateCUDAMemoryOuter(&cuda_data_uint16_t_, total_size, __FILE__, __LINE__); - } else if (bit_type_ == 32) { - AllocateCUDAMemoryOuter(&cuda_data_uint32_t_, total_size, __FILE__, __LINE__); - } - cur_subset_buffer_size_ = num_used_indices_; - } - CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); - LaunchCopyDenseSubrowKernel(full_set); -} - -void CUDARowData::CopySparseSubrowData(const CUDARowData* full_set, const data_size_t num_used_indices, const data_size_t* used_indices) { - num_used_indices_ = num_used_indices; - bool need_reallocate_row_ptr = false; - bool need_reallocate_data = false; - if (num_used_indices_ > cur_subset_buffer_size_) { - if (cur_subset_buffer_size_ == 0) { - CHECK_EQ(cur_total_elements_buffer_size_, 0); - CHECK_EQ(cuda_used_indices_, nullptr); - const int num_blocks = (num_data_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / 
COPY_SUBROW_BLOCK_SIZE_ROW_DATA; - AllocateCUDAMemoryOuter(&cuda_block_sum_buffer_, static_cast(num_blocks * num_feature_partitions_) + 1, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_partition_ptr_uint16_t_, static_cast(num_feature_partitions_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_partition_ptr_uint32_t_, static_cast(num_feature_partitions_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_partition_ptr_uint64_t_, static_cast(num_feature_partitions_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_partition_ptr_buffer_, static_cast(num_feature_partitions_) + 1, __FILE__, __LINE__); - } else { - DeallocateCUDAMemoryOuter(&cuda_used_indices_, __FILE__, __LINE__); - } - AllocateCUDAMemoryOuter(&cuda_used_indices_, static_cast(num_used_indices_), __FILE__, __LINE__); - need_reallocate_row_ptr = true; - } - CopyFromHostToCUDADeviceOuter(cuda_used_indices_, used_indices, static_cast(num_used_indices_), __FILE__, __LINE__); - num_total_elements_ = CalcTotalNumberOfElements(full_set); - //Log::Warning("num_total_elements_ = %d", num_total_elements_); - if (num_total_elements_ > cur_total_elements_buffer_size_) { - need_reallocate_data = true; - } - if (num_total_elements_ <= std::numeric_limits::max()) { - if (row_ptr_bit_type_ != 16) { - need_reallocate_row_ptr = true; - } - } else if (num_total_elements_ <= std::numeric_limits::max()) { - if (row_ptr_bit_type_ != 32) { - need_reallocate_row_ptr = true; - } - } else { - if (row_ptr_bit_type_ != 64) { - need_reallocate_row_ptr = true; - } - } - if (need_reallocate_row_ptr) { - if (cur_subset_buffer_size_ > 0) { - if (row_ptr_bit_type_ == 16) { - DeallocateCUDAMemoryOuter(&cuda_row_ptr_uint16_t_, __FILE__, __LINE__); - } else if (row_ptr_bit_type_ == 32) { - DeallocateCUDAMemoryOuter(&cuda_row_ptr_uint32_t_, __FILE__, __LINE__); - } else if (row_ptr_bit_type_ == 64) { - DeallocateCUDAMemoryOuter(&cuda_row_ptr_uint64_t_, __FILE__, __LINE__); - } - } - if (num_total_elements_ <= std::numeric_limits::max()) { - row_ptr_bit_type_ = 16; - AllocateCUDAMemoryOuter(&cuda_row_ptr_uint16_t_, static_cast(num_used_indices_) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_row_ptr_uint16_t_, 0, 1, __FILE__, __LINE__); - } else if (num_total_elements_ <= std::numeric_limits::max()) { - row_ptr_bit_type_ = 32; - AllocateCUDAMemoryOuter(&cuda_row_ptr_uint32_t_, static_cast(num_used_indices_) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_row_ptr_uint32_t_, 0, 1, __FILE__, __LINE__); - } else { - row_ptr_bit_type_ = 64; - AllocateCUDAMemoryOuter(&cuda_row_ptr_uint64_t_, static_cast(num_used_indices_) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_row_ptr_uint64_t_, 0, 1, __FILE__, __LINE__); - } - cur_subset_buffer_size_ = num_used_indices_; - } - if (need_reallocate_data) { - if (cur_total_elements_buffer_size_ > 0) { - if (bit_type_ == 8) { - DeallocateCUDAMemoryOuter(&cuda_data_uint8_t_, __FILE__, __LINE__); - } else if (bit_type_ == 16) { - DeallocateCUDAMemoryOuter(&cuda_data_uint16_t_, __FILE__, __LINE__); - } else if (bit_type_ == 32) { - DeallocateCUDAMemoryOuter(&cuda_data_uint32_t_, __FILE__, __LINE__); - } - } - if (bit_type_ == 8) { - AllocateCUDAMemoryOuter(&cuda_data_uint8_t_, num_total_elements_, __FILE__, __LINE__); - } else if (bit_type_ == 16) { - AllocateCUDAMemoryOuter(&cuda_data_uint16_t_, num_total_elements_, __FILE__, __LINE__); - } else if (bit_type_ == 32) { - AllocateCUDAMemoryOuter(&cuda_data_uint32_t_, num_total_elements_, __FILE__, __LINE__); - } - cur_total_elements_buffer_size_ = 
num_total_elements_; - } - LaunchCopySparseSubrowKernel(full_set); -} - } // namespace LightGBM diff --git a/src/io/cuda/cuda_row_data.cu b/src/io/cuda/cuda_row_data.cu deleted file mode 100644 index b0b5230948f7..000000000000 --- a/src/io/cuda/cuda_row_data.cu +++ /dev/null @@ -1,363 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - -#include -#include - -namespace LightGBM { - -template -__global__ void CopySubrowDenseKernel(const BIN_TYPE* full_set_bin_data, const int num_column, const data_size_t num_used_indices, - const data_size_t* used_indices, BIN_TYPE* bin_data) { - const data_size_t local_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (local_data_index < num_used_indices) { - const data_size_t global_data_index = used_indices[local_data_index]; - const BIN_TYPE* src = full_set_bin_data + global_data_index * num_column; - BIN_TYPE* dst = bin_data + local_data_index * num_column; - for (int column_index = 0; column_index < num_column; ++column_index) { - dst[column_index] = src[column_index]; - } - } -} - -void CUDARowData::LaunchCopyDenseSubrowKernel(const CUDARowData* full_set) { - const int num_column = feature_partition_column_index_offsets_.back(); - const int num_blocks = (num_used_indices_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; - if (bit_type_ == 8) { - const uint8_t* full_set_bin_data = full_set->cuda_data_uint8_t_; - CopySubrowDenseKernel<<>>( - full_set_bin_data, num_column, num_used_indices_, cuda_used_indices_, cuda_data_uint8_t_); - } else if (bit_type_ == 16) { - const uint16_t* full_set_bin_data = full_set->cuda_data_uint16_t_; - CopySubrowDenseKernel<<>>( - full_set_bin_data, num_column, num_used_indices_, cuda_used_indices_, cuda_data_uint16_t_); - } else if (bit_type_ == 32) { - const uint32_t* full_set_bin_data = full_set->cuda_data_uint32_t_; - CopySubrowDenseKernel<<>>( - full_set_bin_data, num_column, num_used_indices_, cuda_used_indices_, cuda_data_uint32_t_); - } -} - -template -__global__ void CalcTotalNumberOfElementsKernel( - const data_size_t num_used_indices, - const data_size_t* cuda_used_indices, - const ROW_PTR_TYPE* cuda_row_ptr, - const int num_feature_partitions, - const data_size_t num_data, - uint64_t* block_sum_buffer) { - __shared__ uint64_t shared_mem_buffer[32]; - const data_size_t local_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const int partition_index = static_cast(blockIdx.y); - const ROW_PTR_TYPE* partition_row_ptr = cuda_row_ptr + partition_index * (num_data + 1); - uint64_t num_elements_in_row = 0; - if (local_data_index < num_used_indices) { - const data_size_t global_data_index = cuda_used_indices[local_data_index]; - const data_size_t row_start = partition_row_ptr[global_data_index]; - const data_size_t row_end = partition_row_ptr[global_data_index + 1]; - num_elements_in_row = static_cast(row_end - row_start); - } - const uint64_t num_elements_in_block = ShuffleReduceSum(num_elements_in_row, shared_mem_buffer, blockDim.x); - if (threadIdx.x == 0) { - //printf("blockIdx.x = %d, partition_index = %d, num_elements_in_block = %lu\n", blockIdx.x, partition_index, num_elements_in_block); - block_sum_buffer[partition_index * gridDim.x + blockIdx.x] = num_elements_in_block; - } -} - -__global__ void ReduceBlockSumKernel( - const uint64_t* block_sum_buffer, - const int num_blocks, - const int num_feature_partitions, - 
uint64_t* cuda_partition_ptr_buffer) { - __shared__ uint64_t shared_mem_buffer[32]; - uint64_t thread_sum = 0; - const int partition_index = static_cast(blockIdx.x); - const uint64_t* block_sum_buffer_ptr = block_sum_buffer + partition_index * num_blocks; - for (data_size_t block_index = static_cast(threadIdx.x); block_index < num_blocks; block_index += static_cast(blockDim.x)) { - thread_sum += block_sum_buffer_ptr[block_index]; - } - /*if (threadIdx.x == 0) { - printf("thread_sum = %lu\n", thread_sum); - }*/ - const uint64_t num_total_elements = ShuffleReduceSum(thread_sum, shared_mem_buffer, blockDim.x); - if (threadIdx.x == 0) { - //printf("partition_index = %d, num_total_elements = %lu\n", partition_index, num_total_elements); - cuda_partition_ptr_buffer[partition_index + 1] = num_total_elements; - if (blockIdx.x == 0) { - cuda_partition_ptr_buffer[0] = 0; - } - } -} - -__global__ void ComputePartitionPtr( - uint64_t* cuda_partition_ptr_buffer, - const int num_feature_partitions) { - __shared__ uint64_t shared_mem_buffer[32]; - const int num_partitions_per_thread = (num_feature_partitions + blockDim.x - 1) / (blockDim.x - 1); - int start_partition = threadIdx.x == 0 ? 0 : num_partitions_per_thread * static_cast(threadIdx.x - 1); - int end_partition = threadIdx.x == 0 ? 0 : min(start_partition + num_partitions_per_thread, num_feature_partitions + 1); - uint64_t thread_sum = 0; - for (int partition_index = start_partition; partition_index < end_partition; ++partition_index) { - thread_sum += cuda_partition_ptr_buffer[partition_index]; - } - const uint64_t thread_base = ShufflePrefixSum(thread_sum, shared_mem_buffer); - start_partition = threadIdx.x == blockDim.x - 1 ? 0 : num_partitions_per_thread * static_cast(threadIdx.x); - end_partition = threadIdx.x == blockDim.x - 1 ? 
0 : min(start_partition + num_partitions_per_thread, num_feature_partitions + 1); - for (int partition_index = start_partition + 1; partition_index < end_partition; ++partition_index) { - cuda_partition_ptr_buffer[partition_index] += cuda_partition_ptr_buffer[partition_index - 1]; - } - for (int partition_index = start_partition; partition_index < end_partition; ++partition_index) { - cuda_partition_ptr_buffer[partition_index] += thread_base; - } - if (threadIdx.x == blockDim.x - 1) { - cuda_partition_ptr_buffer[num_feature_partitions] = thread_base; - } -} - -uint64_t CUDARowData::LaunchCalcTotalNumberOfElementsKernel(const CUDARowData* full_set) { - const int num_blocks = (num_used_indices_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; - SetCUDAMemoryOuter(cuda_block_sum_buffer_, 0, static_cast(num_blocks * num_feature_partitions_) + 1, __FILE__, __LINE__); - if (full_set->row_ptr_bit_type_ == 16) { - CalcTotalNumberOfElementsKernel<<>>( - num_used_indices_, - cuda_used_indices_, - full_set->cuda_row_ptr_uint16_t_, - num_feature_partitions_, - num_data_, - cuda_block_sum_buffer_); - } else if (full_set->row_ptr_bit_type_ == 32) { - CalcTotalNumberOfElementsKernel<<>>( - num_used_indices_, - cuda_used_indices_, - full_set->cuda_row_ptr_uint32_t_, - num_feature_partitions_, - num_data_, - cuda_block_sum_buffer_); - } else if (full_set->row_ptr_bit_type_ == 64) { - CalcTotalNumberOfElementsKernel<<>>( - num_used_indices_, - cuda_used_indices_, - full_set->cuda_row_ptr_uint64_t_, - num_feature_partitions_, - num_data_, - cuda_block_sum_buffer_); - } - //Log::Warning("num_feature_partitions_ = %d", num_feature_partitions_); - ReduceBlockSumKernel<<>>( - cuda_block_sum_buffer_, num_blocks, num_feature_partitions_, cuda_partition_ptr_buffer_); - ComputePartitionPtr<<<1, COPY_SUBROW_BLOCK_SIZE_ROW_DATA>>>(cuda_partition_ptr_buffer_, num_feature_partitions_); - uint64_t num_total_elements = 0; - CopyFromCUDADeviceToHostOuter(&num_total_elements, cuda_partition_ptr_buffer_ + num_feature_partitions_, 1, __FILE__, __LINE__); - //Log::Warning("num_used_indices = %d, num_blocks = %d, num_total_elements = %d", num_used_indices_, num_blocks, num_total_elements); - return num_total_elements; -} - -template -__global__ void CopyPartitionPtrKernel( - const uint64_t* cuda_partition_ptr_buffer, - const int num_feature_partitions, - ROW_PTR_TYPE* cuda_partition_ptr) { - for (int partition_index = static_cast(threadIdx.x); partition_index < num_feature_partitions + 1; partition_index += static_cast(blockDim.x)) { - cuda_partition_ptr[partition_index] = static_cast(cuda_partition_ptr_buffer[partition_index]); - } -} - -template -__global__ void CopySparseSubrowRowPtrKernel( - const IN_ROW_PTR_TYPE* cuda_row_ptr, - const data_size_t num_used_indices, - const data_size_t* cuda_used_indices, - OUT_ROW_PTR_TYPE* out_cuda_row_ptr) { - const data_size_t local_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (local_data_index < num_used_indices) { - const data_size_t global_data_index = cuda_used_indices[local_data_index]; - const IN_ROW_PTR_TYPE row_start = cuda_row_ptr[global_data_index]; - const IN_ROW_PTR_TYPE row_end = cuda_row_ptr[global_data_index + 1]; - const OUT_ROW_PTR_TYPE num_elements_in_row = static_cast(row_end - row_start); - out_cuda_row_ptr[local_data_index + 1] = num_elements_in_row; - } -} - -template -__global__ void CopySparseSubrowDataKernel( - const BIN_TYPE* in_cuda_data, - const IN_ROW_PTR_TYPE* in_cuda_row_ptr, - const OUT_ROW_PTR_TYPE* 
out_cuda_row_ptr, - const data_size_t num_used_indices, - const data_size_t* cuda_used_indices, - BIN_TYPE* out_cuda_data) { - const data_size_t local_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (local_data_index < num_used_indices) { - const data_size_t global_data_index = cuda_used_indices[local_data_index]; - const IN_ROW_PTR_TYPE in_row_start = in_cuda_row_ptr[global_data_index]; - const IN_ROW_PTR_TYPE in_row_end = in_cuda_row_ptr[global_data_index + 1]; - const IN_ROW_PTR_TYPE in_num_elements_in_row = in_row_end - in_row_start; - const OUT_ROW_PTR_TYPE out_row_start = out_cuda_row_ptr[local_data_index]; - const OUT_ROW_PTR_TYPE out_row_end = out_cuda_row_ptr[local_data_index + 1]; - const OUT_ROW_PTR_TYPE out_num_elements_in_row = out_row_end - out_row_start; - /*if (in_num_elements_in_row != out_num_elements_in_row) { - printf("error !!!!!, in_num_elements_in_row = %d, out_num_elements_in_row = %d\n", static_cast(in_num_elements_in_row), static_cast(out_num_elements_in_row)); - } - if (out_row_end > in_row_end || out_row_start > in_row_start) { - printf("error !!!!!, out_row_end = %d, in_row_end = %d, out_row_start = %d, in_row_start = %d\n", - static_cast(out_row_end), static_cast(in_row_end), static_cast(out_row_start), static_cast(in_row_start)); - }*/ - const BIN_TYPE* in_cuda_data_ptr = in_cuda_data + in_row_start; - BIN_TYPE* out_cuda_data_ptr = out_cuda_data + out_row_start; - for (IN_ROW_PTR_TYPE element_index = 0; element_index < in_num_elements_in_row; ++element_index) { - out_cuda_data_ptr[element_index] = in_cuda_data_ptr[element_index]; - } - } -} - -template -void CUDARowData::LaunchCopySparseSubrowKernelInner1( - const CUDARowData* full_set, - const BIN_TYPE* in_cuda_data, - const OUT_ROW_PTR_TYPE* out_cuda_row_ptr, - BIN_TYPE* out_cuda_data) { - CHECK_GE(full_set->row_ptr_bit_type_, row_ptr_bit_type_); - const int num_blocks = (num_used_indices_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; - if (full_set->row_ptr_bit_type_ == 16) { - CopySparseSubrowDataKernel<<>>( - in_cuda_data, full_set->cuda_row_ptr_uint16_t_, out_cuda_row_ptr, num_used_indices_, cuda_used_indices_, out_cuda_data); - } else if (full_set->row_ptr_bit_type_ == 32) { - CopySparseSubrowDataKernel<<>>( - in_cuda_data, full_set->cuda_row_ptr_uint32_t_, out_cuda_row_ptr, num_used_indices_, cuda_used_indices_, out_cuda_data); - } else if (full_set->row_ptr_bit_type_ == 64) { - CopySparseSubrowDataKernel<<>>( - in_cuda_data, full_set->cuda_row_ptr_uint64_t_, out_cuda_row_ptr, num_used_indices_, cuda_used_indices_, out_cuda_data); - } -} - -template -void CUDARowData::LaunchCopySparseSubrowKernelInner0( - const CUDARowData* full_set, - OUT_ROW_PTR_TYPE* out_cuda_row_ptr) { - ShufflePrefixSumGlobal( - out_cuda_row_ptr, - static_cast(num_used_indices_) + 1, - reinterpret_cast(cuda_block_sum_buffer_)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - if (bit_type_ == 8) { - LaunchCopySparseSubrowKernelInner1(full_set, full_set->cuda_data_uint8_t_, out_cuda_row_ptr, cuda_data_uint8_t_); - } else if (bit_type_ == 16) { - LaunchCopySparseSubrowKernelInner1(full_set, full_set->cuda_data_uint16_t_, out_cuda_row_ptr, cuda_data_uint16_t_); - } else if (bit_type_ == 32) { - LaunchCopySparseSubrowKernelInner1(full_set, full_set->cuda_data_uint32_t_, out_cuda_row_ptr, cuda_data_uint32_t_); - } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -void CUDARowData::LaunchCopySparseSubrowKernel(const CUDARowData* full_set) { - if (row_ptr_bit_type_ == 16) { - 
CopyPartitionPtrKernel<<<1, COPY_SUBROW_BLOCK_SIZE_ROW_DATA>>>(cuda_partition_ptr_buffer_, num_feature_partitions_, cuda_partition_ptr_uint16_t_); - } else if (row_ptr_bit_type_ == 32) { - CopyPartitionPtrKernel<<<1, COPY_SUBROW_BLOCK_SIZE_ROW_DATA>>>(cuda_partition_ptr_buffer_, num_feature_partitions_, cuda_partition_ptr_uint32_t_); - } else if (row_ptr_bit_type_ == 64) { - CopyPartitionPtrKernel<<<1, COPY_SUBROW_BLOCK_SIZE_ROW_DATA>>>(cuda_partition_ptr_buffer_, num_feature_partitions_, cuda_partition_ptr_uint64_t_); - } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - const int num_blocks = (num_used_indices_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; - if (full_set->row_ptr_bit_type_ == 16) { - CHECK_EQ(row_ptr_bit_type_, 16); - CopySparseSubrowRowPtrKernel<<>>( - full_set->cuda_row_ptr_uint16_t_, num_used_indices_, cuda_used_indices_, cuda_row_ptr_uint16_t_); - } else if (full_set->row_ptr_bit_type_ == 32) { - CHECK(row_ptr_bit_type_ == 16 || row_ptr_bit_type_ == 32); - if (row_ptr_bit_type_ == 16) { - CopySparseSubrowRowPtrKernel<<>>( - full_set->cuda_row_ptr_uint32_t_, num_used_indices_, cuda_used_indices_, cuda_row_ptr_uint16_t_); - } else if (row_ptr_bit_type_ == 32) { - CopySparseSubrowRowPtrKernel<<>>( - full_set->cuda_row_ptr_uint32_t_, num_used_indices_, cuda_used_indices_, cuda_row_ptr_uint32_t_); - } - } else if (full_set->row_ptr_bit_type_ == 64) { - if (row_ptr_bit_type_ == 16) { - CopySparseSubrowRowPtrKernel<<>>( - full_set->cuda_row_ptr_uint64_t_, num_used_indices_, cuda_used_indices_, cuda_row_ptr_uint16_t_); - } else if (row_ptr_bit_type_ == 32) { - CopySparseSubrowRowPtrKernel<<>>( - full_set->cuda_row_ptr_uint64_t_, num_used_indices_, cuda_used_indices_, cuda_row_ptr_uint32_t_); - } else if (row_ptr_bit_type_ == 64) { - CopySparseSubrowRowPtrKernel<<>>( - full_set->cuda_row_ptr_uint64_t_, num_used_indices_, cuda_used_indices_, cuda_row_ptr_uint64_t_); - } - } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - if (row_ptr_bit_type_ == 16) { - LaunchCopySparseSubrowKernelInner0(full_set, cuda_row_ptr_uint16_t_); - } else if (row_ptr_bit_type_ == 32) { - LaunchCopySparseSubrowKernelInner0(full_set, cuda_row_ptr_uint32_t_); - } else if (row_ptr_bit_type_ == 64) { - LaunchCopySparseSubrowKernelInner0(full_set, cuda_row_ptr_uint64_t_); - } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); -} - -template -void __global__ CopyDenseSubcolKernel( - const BIN_TYPE* in_cuda_data, - const int out_num_feature_partitions, - const int* cuda_used_columns, - const int* cuda_column_index_to_partition_index, - const int* in_cuda_feature_partition_column_index_offsets, - const int* out_cuda_feature_partition_column_index_offsets, - const data_size_t num_data, - BIN_TYPE* out_cuda_data) { - const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (data_index < num_data) { - for (int out_partition_index = 0; out_partition_index < out_num_feature_partitions; ++out_partition_index) { - const int out_partition_column_start = out_cuda_feature_partition_column_index_offsets[out_partition_index]; - const int out_partition_column_end = out_cuda_feature_partition_column_index_offsets[out_partition_index + 1]; - BIN_TYPE* out_cuda_data_ptr = out_cuda_data + out_partition_column_start * num_data + data_index * (out_partition_column_end - out_partition_column_start); - for (int local_column_index = out_partition_column_start; local_column_index < out_partition_column_end; ++local_column_index) { - const int global_column_index = 
cuda_used_columns[local_column_index]; - const int global_partition_index = cuda_column_index_to_partition_index[global_column_index]; - const int in_partition_column_start = in_cuda_feature_partition_column_index_offsets[global_partition_index]; - const int in_partition_column_end = in_cuda_feature_partition_column_index_offsets[global_partition_index + 1]; - const BIN_TYPE* in_cuda_data_ptr = in_cuda_data + in_partition_column_start * num_data + data_index * (in_partition_column_end - in_partition_column_start); - out_cuda_data_ptr[local_column_index - out_partition_column_start] = in_cuda_data_ptr[global_column_index - in_partition_column_start]; - } - } - } -} - -void CUDARowData::LaunchCopyDenseSubcolKernel(const CUDARowData* full_set) { - const int num_blocks = (num_data_ + COPY_SUBROW_BLOCK_SIZE_ROW_DATA - 1) / COPY_SUBROW_BLOCK_SIZE_ROW_DATA; - if (bit_type_ == 8) { - CopyDenseSubcolKernel<<>>( - full_set->cuda_data_uint8_t_, - num_feature_partitions_, - cuda_used_columns_, - cuda_column_index_to_partition_index_, - cuda_feature_partition_column_index_offsets_, - full_set->cuda_feature_partition_column_index_offsets_, - num_data_, - cuda_data_uint8_t_); - } else if (bit_type_ == 16) { - CopyDenseSubcolKernel<<>>( - full_set->cuda_data_uint16_t_, - num_feature_partitions_, - cuda_used_columns_, - cuda_column_index_to_partition_index_, - cuda_feature_partition_column_index_offsets_, - full_set->cuda_feature_partition_column_index_offsets_, - num_data_, - cuda_data_uint16_t_); - } else if (bit_type_ == 32) { - CopyDenseSubcolKernel<<>>( - full_set->cuda_data_uint32_t_, - num_feature_partitions_, - cuda_used_columns_, - cuda_column_index_to_partition_index_, - cuda_feature_partition_column_index_offsets_, - full_set->cuda_feature_partition_column_index_offsets_, - num_data_, - cuda_data_uint32_t_); - } -} - -} // namespace LightGBM diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index 28bbdc139de0..beefdf90d7ee 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -26,122 +26,140 @@ CUDATree::CUDATree(const Tree* host_tree): InitCUDA(); } -CUDATree::~CUDATree() {} +CUDATree::~CUDATree() { + DeallocateCUDAMemory(&cuda_left_child_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_right_child_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_split_feature_inner_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_split_feature_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_depth_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_parent_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_threshold_in_bin_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_threshold_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_internal_weight_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_internal_value_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_decision_type_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_value_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_count_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_weight_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_internal_count_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_split_gain_, __FILE__, __LINE__); + gpuAssert(cudaStreamDestroy(cuda_stream_), __FILE__, __LINE__); +} void CUDATree::InitCUDAMemory() { - AllocateCUDAMemoryOuter(&cuda_left_child_, + AllocateCUDAMemory(&cuda_left_child_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_right_child_, + AllocateCUDAMemory(&cuda_right_child_, 
static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_split_feature_inner_, + AllocateCUDAMemory(&cuda_split_feature_inner_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_split_feature_, + AllocateCUDAMemory(&cuda_split_feature_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_leaf_depth_, + AllocateCUDAMemory(&cuda_leaf_depth_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_leaf_parent_, + AllocateCUDAMemory(&cuda_leaf_parent_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_threshold_in_bin_, + AllocateCUDAMemory(&cuda_threshold_in_bin_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_threshold_, + AllocateCUDAMemory(&cuda_threshold_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_decision_type_, + AllocateCUDAMemory(&cuda_decision_type_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_leaf_value_, + AllocateCUDAMemory(&cuda_leaf_value_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_internal_weight_, + AllocateCUDAMemory(&cuda_internal_weight_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_internal_value_, + AllocateCUDAMemory(&cuda_internal_value_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_leaf_weight_, + AllocateCUDAMemory(&cuda_leaf_weight_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_leaf_count_, + AllocateCUDAMemory(&cuda_leaf_count_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_internal_count_, + AllocateCUDAMemory(&cuda_internal_count_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_split_gain_, + AllocateCUDAMemory(&cuda_split_gain_, static_cast(max_leaves_), __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_leaf_value_, 0.0f, 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_leaf_weight_, 0.0f, 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_leaf_parent_, -1, 1, __FILE__, __LINE__); + SetCUDAMemory(cuda_leaf_value_, 0.0f, 1, __FILE__, __LINE__); + SetCUDAMemory(cuda_leaf_weight_, 0.0f, 1, __FILE__, __LINE__); + SetCUDAMemory(cuda_leaf_parent_, -1, 1, __FILE__, __LINE__); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); } void CUDATree::InitCUDA() { - InitCUDAMemoryFromHostMemoryOuter(&cuda_left_child_, + InitCUDAMemoryFromHostMemory(&cuda_left_child_, left_child_.data(), left_child_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_right_child_, + InitCUDAMemoryFromHostMemory(&cuda_right_child_, right_child_.data(), right_child_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_split_feature_inner_, + InitCUDAMemoryFromHostMemory(&cuda_split_feature_inner_, split_feature_inner_.data(), split_feature_inner_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_split_feature_, + InitCUDAMemoryFromHostMemory(&cuda_split_feature_, split_feature_.data(), split_feature_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_threshold_in_bin_, + InitCUDAMemoryFromHostMemory(&cuda_threshold_in_bin_, threshold_in_bin_.data(), threshold_in_bin_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_threshold_, + InitCUDAMemoryFromHostMemory(&cuda_threshold_, 
threshold_.data(), threshold_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_decision_type_, + InitCUDAMemoryFromHostMemory(&cuda_decision_type_, decision_type_.data(), decision_type_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_leaf_value_, + InitCUDAMemoryFromHostMemory(&cuda_leaf_value_, leaf_value_.data(), leaf_value_.size(), __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); } int CUDATree::Split(const int leaf_index, @@ -183,27 +201,27 @@ void CUDATree::ToHost() { leaf_depth_.resize(max_leaves_); const size_t num_leaves_size = static_cast(num_leaves_); - CopyFromCUDADeviceToHostOuter(left_child_.data(), cuda_left_child_, num_leaves_size - 1, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(right_child_.data(), cuda_right_child_, num_leaves_size - 1, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(split_feature_inner_.data(), cuda_split_feature_inner_, num_leaves_size - 1, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(split_feature_.data(), cuda_split_feature_, num_leaves_size - 1, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(threshold_in_bin_.data(), cuda_threshold_in_bin_, num_leaves_size - 1, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(threshold_.data(), cuda_threshold_, num_leaves_size - 1, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(decision_type_.data(), cuda_decision_type_, num_leaves_size - 1, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(split_gain_.data(), cuda_split_gain_, num_leaves_size - 1, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(leaf_parent_.data(), cuda_leaf_parent_, num_leaves_size - 1, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(leaf_value_.data(), cuda_leaf_value_, num_leaves_size, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(leaf_weight_.data(), cuda_leaf_weight_, num_leaves_size, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(leaf_count_.data(), cuda_leaf_count_, num_leaves_size, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(internal_value_.data(), cuda_internal_value_, num_leaves_size - 1, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(internal_weight_.data(), cuda_internal_weight_, num_leaves_size - 1, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(internal_count_.data(), cuda_internal_count_, num_leaves_size - 1, __FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(leaf_depth_.data(), cuda_leaf_depth_, num_leaves_size, __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + CopyFromCUDADeviceToHost(left_child_.data(), cuda_left_child_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(right_child_.data(), cuda_right_child_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(split_feature_inner_.data(), cuda_split_feature_inner_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(split_feature_.data(), cuda_split_feature_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(threshold_in_bin_.data(), cuda_threshold_in_bin_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(threshold_.data(), cuda_threshold_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(decision_type_.data(), cuda_decision_type_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(split_gain_.data(), cuda_split_gain_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(leaf_parent_.data(), cuda_leaf_parent_, 
num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(leaf_value_.data(), cuda_leaf_value_, num_leaves_size, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(leaf_weight_.data(), cuda_leaf_weight_, num_leaves_size, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(leaf_count_.data(), cuda_leaf_count_, num_leaves_size, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(internal_value_.data(), cuda_internal_value_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(internal_weight_.data(), cuda_internal_weight_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(internal_count_.data(), cuda_internal_count_, num_leaves_size - 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(leaf_depth_.data(), cuda_leaf_depth_, num_leaves_size, __FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); } void CUDATree::SyncLeafOutputFromHostToCUDA() { - CopyFromHostToCUDADeviceOuter(cuda_leaf_value_, leaf_value_.data(), leaf_value_.size(), __FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_leaf_value_, leaf_value_.data(), leaf_value_.size(), __FILE__, __LINE__); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 983bbe1064ac..52d7597931f5 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -18,7 +18,6 @@ CUDABestSplitFinder::CUDABestSplitFinder( const Config* config): num_features_(train_data->num_features()), num_leaves_(config->num_leaves), - num_total_bin_(feature_hist_offsets.back()), feature_hist_offsets_(feature_hist_offsets), lambda_l1_(config->lambda_l1), lambda_l2_(config->lambda_l2), @@ -26,10 +25,47 @@ CUDABestSplitFinder::CUDABestSplitFinder( min_sum_hessian_in_leaf_(config->min_sum_hessian_in_leaf), min_gain_to_split_(config->min_gain_to_split), cuda_hist_(cuda_hist) { - feature_missing_type_.resize(num_features_); - feature_mfb_offsets_.resize(num_features_); - feature_default_bins_.resize(num_features_); - feature_num_bins_.resize(num_features_); + InitFeatureMetaInfo(train_data); + cuda_leaf_best_split_info_ = nullptr; + cuda_best_split_info_ = nullptr; + cuda_feature_hist_offsets_ = nullptr; + cuda_feature_mfb_offsets_ = nullptr; + cuda_feature_default_bins_ = nullptr; + cuda_feature_num_bins_ = nullptr; + cuda_best_split_info_buffer_ = nullptr; + cuda_task_feature_index_ = nullptr; + cuda_task_reverse_ = nullptr; + cuda_task_skip_default_bin_ = nullptr; + cuda_task_na_as_missing_ = nullptr; + cuda_task_out_default_left_ = nullptr; + cuda_is_feature_used_bytree_ = nullptr; +} + +CUDABestSplitFinder::~CUDABestSplitFinder() { + DeallocateCUDAMemory(&cuda_leaf_best_split_info_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_best_split_info_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_hist_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_mfb_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_default_bins_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_num_bins_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_best_split_info_buffer_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_task_feature_index_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_task_reverse_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_task_skip_default_bin_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_task_na_as_missing_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_task_out_default_left_, __FILE__, __LINE__); + 
DeallocateCUDAMemory(&cuda_is_feature_used_bytree_, __FILE__, __LINE__); + gpuAssert(cudaStreamDestroy(cuda_streams_[0]), __FILE__, __LINE__); + gpuAssert(cudaStreamDestroy(cuda_streams_[1]), __FILE__, __LINE__); + cuda_streams_.clear(); + cuda_streams_.shrink_to_fit(); +} + +void CUDABestSplitFinder::InitFeatureMetaInfo(const Dataset* train_data) { + feature_missing_type_.clear(); + feature_mfb_offsets_.clear(); + feature_default_bins_.clear(); + feature_num_bins_.clear(); max_num_bin_in_feature_ = 0; for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { const BinMapper* bin_mapper = train_data->FeatureBinMapper(inner_feature_index); @@ -49,38 +85,35 @@ CUDABestSplitFinder::CUDABestSplitFinder( } void CUDABestSplitFinder::Init() { - AllocateCUDAMemoryOuter(&cuda_feature_hist_offsets_, - feature_hist_offsets_.size() * 2, - __FILE__, - __LINE__); - CopyFromHostToCUDADeviceOuter(cuda_feature_hist_offsets_, + InitCUDAFeatureMetaInfo(); + cuda_streams_.resize(2); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); + AllocateCUDAMemory(&cuda_best_split_info_buffer_, 7, __FILE__, __LINE__); +} + +void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { + InitCUDAMemoryFromHostMemory(&cuda_feature_hist_offsets_, feature_hist_offsets_.data(), feature_hist_offsets_.size(), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_feature_mfb_offsets_, - feature_mfb_offsets_.size(), - __FILE__, - __LINE__); - CopyFromHostToCUDADeviceOuter(cuda_feature_mfb_offsets_, + InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_offsets_, feature_mfb_offsets_.data(), feature_mfb_offsets_.size(), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_feature_default_bins_, - feature_default_bins_.size(), - __FILE__, - __LINE__); - CopyFromHostToCUDADeviceOuter(cuda_feature_default_bins_, + InitCUDAMemoryFromHostMemory(&cuda_feature_default_bins_, feature_default_bins_.data(), feature_default_bins_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_num_bins_, + InitCUDAMemoryFromHostMemory(&cuda_feature_num_bins_, feature_num_bins_.data(), static_cast(num_features_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_is_feature_used_bytree_, static_cast(num_features_), __FILE__, __LINE__); num_tasks_ = 0; for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { const uint32_t num_bin = feature_num_bins_[inner_feature_index]; @@ -128,46 +161,84 @@ void CUDABestSplitFinder::Init() { const int num_task_blocks = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; const size_t cuda_best_leaf_split_info_buffer_size = static_cast(num_task_blocks) * static_cast(num_leaves_); - AllocateCUDAMemoryOuter(&cuda_leaf_best_split_info_, + AllocateCUDAMemory(&cuda_leaf_best_split_info_, cuda_best_leaf_split_info_buffer_size, __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_task_feature_index_, + InitCUDAMemoryFromHostMemory(&cuda_task_feature_index_, host_task_feature_index_.data(), host_task_feature_index_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_task_reverse_, + InitCUDAMemoryFromHostMemory(&cuda_task_reverse_, host_task_reverse_.data(), host_task_reverse_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_task_skip_default_bin_, + InitCUDAMemoryFromHostMemory(&cuda_task_skip_default_bin_, host_task_skip_default_bin_.data(), host_task_skip_default_bin_.size(), __FILE__, __LINE__); - 
InitCUDAMemoryFromHostMemoryOuter(&cuda_task_na_as_missing_, + InitCUDAMemoryFromHostMemory(&cuda_task_na_as_missing_, host_task_na_as_missing_.data(), host_task_na_as_missing_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_task_out_default_left_, + InitCUDAMemoryFromHostMemory(&cuda_task_out_default_left_, host_task_out_default_left_.data(), host_task_out_default_left_.size(), __FILE__, __LINE__); const size_t output_buffer_size = 2 * static_cast(num_tasks_); - AllocateCUDAMemoryOuter(&cuda_best_split_info_, output_buffer_size, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_best_split_info_, output_buffer_size, __FILE__, __LINE__); +} - AllocateCUDAMemoryOuter(&cuda_best_split_info_buffer_, 7, __FILE__, __LINE__); - cuda_streams_.resize(2); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); +void CUDABestSplitFinder::ResetTrainingData( + const hist_t* cuda_hist, + const Dataset* train_data, + const std::vector& feature_hist_offsets) { + cuda_hist_ = cuda_hist; + num_features_ = train_data->num_features(); + feature_hist_offsets_ = feature_hist_offsets; + InitFeatureMetaInfo(train_data); + DeallocateCUDAMemory(&cuda_feature_hist_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_hist_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_mfb_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_default_bins_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_num_bins_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_is_feature_used_bytree_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_best_split_info_, __FILE__, __LINE__); + host_task_reverse_.clear(); + host_task_skip_default_bin_.clear(); + host_task_na_as_missing_.clear(); + host_task_feature_index_.clear(); + host_task_out_default_left_.clear(); + InitCUDAFeatureMetaInfo(); } -void CUDABestSplitFinder::BeforeTrain() {} +void CUDABestSplitFinder::ResetConfig(const Config* config) { + num_leaves_ = config->num_leaves; + lambda_l1_ = config->lambda_l1; + lambda_l2_ = config->lambda_l2; + min_data_in_leaf_ = config->min_data_in_leaf; + min_sum_hessian_in_leaf_ = config->min_sum_hessian_in_leaf; + min_gain_to_split_ = config->min_gain_to_split; + const int num_task_blocks = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; + const size_t cuda_best_leaf_split_info_buffer_size = static_cast(num_task_blocks) * static_cast(num_leaves_); + DeallocateCUDAMemory(&cuda_leaf_best_split_info_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_best_split_info_, + cuda_best_leaf_split_info_buffer_size, + __FILE__, + __LINE__); +} + +void CUDABestSplitFinder::BeforeTrain(const std::vector& is_feature_used_bytree) { + CopyFromHostToCUDADevice(cuda_is_feature_used_bytree_, + is_feature_used_bytree.data(), + is_feature_used_bytree.size(), __FILE__, __LINE__); +} void CUDABestSplitFinder::FindBestSplitsForLeaf( const CUDALeafSplitsStruct* smaller_leaf_splits, @@ -186,7 +257,7 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf( smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); global_timer.Stop("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); } @@ -212,7 
+283,7 @@ const CUDASplitInfo* CUDABestSplitFinder::FindBestFromAllSplits( larger_leaf_best_split_threshold, larger_leaf_best_split_default_left, best_leaf_index); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); return cuda_leaf_best_split_info_ + (*best_leaf_index); } diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index e8c6ec9c8ef8..d91326117e23 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -357,6 +357,7 @@ __global__ void FindBestSplitsForLeafKernel( const uint8_t* feature_mfb_offsets, const uint32_t* feature_default_bins, const uint32_t* feature_num_bins, + const int8_t* is_feature_used_bytree, // input task information const bool larger_only, const int num_tasks, @@ -392,32 +393,36 @@ __global__ void FindBestSplitsForLeafKernel( const double num_data = is_larger ? larger_leaf_splits->num_data_in_leaf : smaller_leaf_splits->num_data_in_leaf; const unsigned int output_offset = is_larger ? (task_index + num_tasks) : task_index; CUDASplitInfo* out = cuda_best_split_info + output_offset; - const hist_t* hist_ptr = (is_larger ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + feature_hist_offsets[inner_feature_index] * 2; - FindBestSplitsForLeafKernelInner( - // input feature information - hist_ptr, - feature_num_bins[inner_feature_index], - feature_mfb_offsets[inner_feature_index], - feature_default_bins[inner_feature_index], - inner_feature_index, - // input config parameter values - lambda_l1, - lambda_l2, - min_data_in_leaf, - min_sum_hessian_in_leaf, - min_gain_to_split, - // input parent node information - parent_gain, - sum_gradients, - sum_hessians, - num_data, - // input task information - reverse, - skip_default_bin, - na_as_missing, - assume_out_default_left, - // output parameters - out); + //if (is_feature_used_bytree[inner_feature_index]) { + const hist_t* hist_ptr = (is_larger ? 
larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + feature_hist_offsets[inner_feature_index] * 2; + FindBestSplitsForLeafKernelInner( + // input feature information + hist_ptr, + feature_num_bins[inner_feature_index], + feature_mfb_offsets[inner_feature_index], + feature_default_bins[inner_feature_index], + inner_feature_index, + // input config parameter values + lambda_l1, + lambda_l2, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients, + sum_hessians, + num_data, + // input task information + reverse, + skip_default_bin, + na_as_missing, + assume_out_default_left, + // output parameters + out); + /*} else { + out->is_valid = false; + }*/ } void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( @@ -441,6 +446,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_feature_mfb_offsets_, cuda_feature_default_bins_, cuda_feature_num_bins_, + cuda_is_feature_used_bytree_, // input task information larger_only, num_tasks_, @@ -463,7 +469,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( // output parameters cuda_best_split_info_); } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); if (larger_leaf_index >= 0) { FindBestSplitsForLeafKernel<<>>( // input feature information @@ -471,6 +477,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_feature_mfb_offsets_, cuda_feature_default_bins_, cuda_feature_num_bins_, + cuda_is_feature_used_bytree_, // input task information true, num_tasks_, @@ -682,7 +689,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( cuda_leaf_best_split_info_, false); } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); SyncBestSplitForLeafKernel<<>>( host_smaller_leaf_index, host_larger_leaf_index, @@ -719,7 +726,7 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( larger_only, num_leaves_); if (num_blocks_per_leaf > 1) { - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); SyncBestSplitForLeafKernelAllBlocks<<<1, 1>>>( host_smaller_leaf_index, host_larger_leaf_index, @@ -794,8 +801,8 @@ void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int cur_num_le cuda_best_split_info_buffer_, cuda_leaf_best_split_info_); std::vector host_leaf_best_split_info_buffer(7); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CopyFromCUDADeviceToHostOuter(host_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 7, __FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); + CopyFromCUDADeviceToHost(host_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 7, __FILE__, __LINE__); *smaller_leaf_best_split_feature = host_leaf_best_split_info_buffer[0]; *smaller_leaf_best_split_threshold = static_cast(host_leaf_best_split_info_buffer[1]); *smaller_leaf_best_split_default_left = static_cast(host_leaf_best_split_info_buffer[2]); diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index b5f6a052c46a..3445e21d67bd 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -31,9 +31,15 @@ class CUDABestSplitFinder { const std::vector& feature_hist_offsets, const Config* config); + ~CUDABestSplitFinder(); + + void InitFeatureMetaInfo(const Dataset* train_data); + void Init(); - void BeforeTrain(); + void 
InitCUDAFeatureMetaInfo(); + + void BeforeTrain(const std::vector& is_feature_used_bytree); void FindBestSplitsForLeaf( const CUDALeafSplitsStruct* smaller_leaf_splits, @@ -57,6 +63,13 @@ class CUDABestSplitFinder { uint8_t* larger_leaf_best_split_default_left, int* best_leaf_index); + void ResetTrainingData( + const hist_t* cuda_hist, + const Dataset* train_data, + const std::vector& feature_hist_offsets); + + void ResetConfig(const Config* config); + private: void LaunchFindBestSplitsForLeafKernel(const CUDALeafSplitsStruct* smaller_leaf_splits, const CUDALeafSplitsStruct* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, @@ -79,20 +92,19 @@ class CUDABestSplitFinder { int* best_leaf_index); // Host memory - const int num_features_; - const int num_leaves_; - const int num_total_bin_; + int num_features_; + int num_leaves_; int max_num_bin_in_feature_; std::vector feature_hist_offsets_; std::vector feature_mfb_offsets_; std::vector feature_default_bins_; std::vector feature_num_bins_; std::vector feature_missing_type_; - const double lambda_l1_; - const double lambda_l2_; - const data_size_t min_data_in_leaf_; - const double min_sum_hessian_in_leaf_; - const double min_gain_to_split_; + double lambda_l1_; + double lambda_l2_; + data_size_t min_data_in_leaf_; + double min_sum_hessian_in_leaf_; + double min_gain_to_split_; std::vector cuda_streams_; // for best split find tasks std::vector host_task_feature_index_; @@ -120,6 +132,7 @@ class CUDABestSplitFinder { uint8_t* cuda_task_skip_default_bin_; uint8_t* cuda_task_na_as_missing_; uint8_t* cuda_task_out_default_left_; + int8_t* cuda_is_feature_used_bytree_; // CUDA memory, held by other object const hist_t* cuda_hist_; diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index dc6b3af5e3fd..416a4c0dc897 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -26,90 +26,98 @@ CUDADataPartition::CUDADataPartition( CalcBlockDim(num_data_); max_num_split_indices_blocks_ = grid_dim_; cur_num_leaves_ = 1; - bin_upper_bounds_.resize(num_features_); - feature_num_bins_.resize(num_features_); - int cur_group = 0; - uint32_t prev_group_bins = 0; - for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - const int group = train_data->Feature2Group(feature_index); - if (cur_group != group) { - prev_group_bins += static_cast(train_data->FeatureGroupNumBin(cur_group)); - cur_group = group; - } - const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); - bin_upper_bounds_[feature_index] = bin_mapper->bin_upper_bound(); - feature_num_bins_[feature_index] = bin_mapper->num_bin(); - } - cuda_column_data_ = train_data->cuda_column_data(); + + cuda_data_indices_ = nullptr; + cuda_leaf_data_start_ = nullptr; + cuda_leaf_data_end_ = nullptr; + cuda_leaf_num_data_ = nullptr; + cuda_hist_pool_ = nullptr; + cuda_leaf_output_ = nullptr; + cuda_block_to_left_offset_ = nullptr; + cuda_data_index_to_leaf_index_ = nullptr; + cuda_block_data_to_left_offset_ = nullptr; + cuda_block_data_to_right_offset_ = nullptr; + cuda_out_data_indices_in_leaf_ = nullptr; + cuda_split_info_buffer_ = nullptr; + cuda_num_data_ = nullptr; + cuda_add_train_score_ = nullptr; +} + +CUDADataPartition::~CUDADataPartition() { + DeallocateCUDAMemory(&cuda_data_indices_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_data_start_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_data_end_, __FILE__, 
__LINE__); + DeallocateCUDAMemory(&cuda_leaf_num_data_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_hist_pool_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_output_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_to_left_offset_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_data_index_to_leaf_index_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_data_to_left_offset_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_data_to_right_offset_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_out_data_indices_in_leaf_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_split_info_buffer_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_num_data_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_add_train_score_, __FILE__, __LINE__); + CUDASUCCESS_OR_FATAL(cudaStreamDestroy(cuda_streams_[0])); + CUDASUCCESS_OR_FATAL(cudaStreamDestroy(cuda_streams_[1])); + CUDASUCCESS_OR_FATAL(cudaStreamDestroy(cuda_streams_[2])); + CUDASUCCESS_OR_FATAL(cudaStreamDestroy(cuda_streams_[3])); + cuda_streams_.clear(); + cuda_streams_.shrink_to_fit(); } void CUDADataPartition::Init() { // allocate CUDA memory - AllocateCUDAMemoryOuter(&cuda_data_indices_, static_cast(num_data_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_leaf_data_start_, static_cast(num_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_leaf_data_end_, static_cast(num_leaves_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_leaf_num_data_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_data_indices_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_data_start_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_data_end_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_num_data_, static_cast(num_leaves_), __FILE__, __LINE__); // leave some space for alignment - AllocateCUDAMemoryOuter(&cuda_block_to_left_offset_, static_cast(num_data_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_data_index_to_leaf_index_, static_cast(num_data_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_block_data_to_left_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_block_data_to_right_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_block_data_to_left_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_block_data_to_right_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_out_data_indices_in_leaf_, static_cast(num_data_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_hist_pool_, static_cast(num_leaves_), __FILE__, __LINE__); - CopyFromHostToCUDADeviceOuter(cuda_hist_pool_, &cuda_hist_, 1, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_block_to_left_offset_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_data_index_to_leaf_index_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_block_data_to_left_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_block_data_to_right_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + SetCUDAMemory(cuda_block_data_to_left_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + SetCUDAMemory(cuda_block_data_to_right_offset_, 0, 
static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_out_data_indices_in_leaf_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_hist_pool_, static_cast(num_leaves_), __FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_split_info_buffer_, 12, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_split_info_buffer_, 12, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_leaf_output_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_output_, static_cast(num_leaves_), __FILE__, __LINE__); cuda_streams_.resize(4); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[2])); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[3])); + gpuAssert(cudaStreamCreate(&cuda_streams_[0]), __FILE__, __LINE__); + gpuAssert(cudaStreamCreate(&cuda_streams_[1]), __FILE__, __LINE__); + gpuAssert(cudaStreamCreate(&cuda_streams_[2]), __FILE__, __LINE__); + gpuAssert(cudaStreamCreate(&cuda_streams_[3]), __FILE__, __LINE__); - std::vector flatten_bin_upper_bounds; - std::vector feature_num_bin_offsets; - int offset = 0; - feature_num_bin_offsets.emplace_back(offset); - for (size_t i = 0; i < bin_upper_bounds_.size(); ++i) { - CHECK_EQ(static_cast(feature_num_bins_[i]), bin_upper_bounds_[i].size()); - for (const auto value : bin_upper_bounds_[i]) { - flatten_bin_upper_bounds.emplace_back(value); - } - offset += feature_num_bins_[i]; - feature_num_bin_offsets.emplace_back(offset); - } - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_num_bin_offsets_, feature_num_bin_offsets.data(), feature_num_bin_offsets.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_bin_upper_bounds_, flatten_bin_upper_bounds.data(), flatten_bin_upper_bounds.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_num_data_, &num_data_, 1, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_num_data_, &num_data_, 1, __FILE__, __LINE__); add_train_score_.resize(num_data_, 0.0f); - AllocateCUDAMemoryOuter(&cuda_add_train_score_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_add_train_score_, static_cast(num_data_), __FILE__, __LINE__); use_bagging_ = false; - use_bagging_subset_ = false; used_indices_ = nullptr; } void CUDADataPartition::BeforeTrain() { - if (!use_bagging_ || use_bagging_subset_) { + if (!use_bagging_) { LaunchFillDataIndicesBeforeTrain(); } - SetCUDAMemoryOuter(cuda_leaf_num_data_, 0, static_cast(num_leaves_), __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_leaf_data_start_, 0, static_cast(num_leaves_), __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_leaf_data_end_, 0, static_cast(num_leaves_), __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SetCUDAMemory(cuda_leaf_num_data_, 0, static_cast(num_leaves_), __FILE__, __LINE__); + SetCUDAMemory(cuda_leaf_data_start_, 0, static_cast(num_leaves_), __FILE__, __LINE__); + SetCUDAMemory(cuda_leaf_data_end_, 0, static_cast(num_leaves_), __FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); if (!use_bagging_) { - CopyFromCUDADeviceToCUDADeviceOuter(cuda_leaf_num_data_, cuda_num_data_, 1, __FILE__, __LINE__); - CopyFromCUDADeviceToCUDADeviceOuter(cuda_leaf_data_end_, cuda_num_data_, 1, __FILE__, __LINE__); + CopyFromCUDADeviceToCUDADevice(cuda_leaf_num_data_, cuda_num_data_, 1, __FILE__, 
__LINE__); + CopyFromCUDADeviceToCUDADevice(cuda_leaf_data_end_, cuda_num_data_, 1, __FILE__, __LINE__); } else { - CopyFromHostToCUDADeviceOuter(cuda_leaf_num_data_, &num_used_indices_, 1, __FILE__, __LINE__); - CopyFromHostToCUDADeviceOuter(cuda_leaf_data_end_, &num_used_indices_, 1, __FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_leaf_num_data_, &num_used_indices_, 1, __FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_leaf_data_end_, &num_used_indices_, 1, __FILE__, __LINE__); } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); - CopyFromHostToCUDADeviceOuter(cuda_hist_pool_, &cuda_hist_, 1, __FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1, __FILE__, __LINE__); } void CUDADataPartition::Split( @@ -211,10 +219,11 @@ void CUDADataPartition::UpdateTrainScore(const Tree* tree, double* scores) { const CUDATree* cuda_tree = reinterpret_cast(tree); const data_size_t num_data_in_root = root_num_data(); if (use_bagging_) { - CopyFromHostToCUDADeviceOuter(cuda_data_indices_, used_indices_, static_cast(num_used_indices_), __FILE__, __LINE__); + // we need restore the order of indices in cuda_data_indices_ + CopyFromHostToCUDADevice(cuda_data_indices_, used_indices_, static_cast(num_used_indices_), __FILE__, __LINE__); } LaunchAddPredictionToScoreKernel(cuda_tree->cuda_leaf_value(), cuda_add_train_score_); - CopyFromCUDADeviceToHostOuter(add_train_score_.data(), + CopyFromCUDADeviceToHost(add_train_score_.data(), cuda_add_train_score_, static_cast(num_data_in_root), __FILE__, __LINE__); if (!use_bagging_) { OMP_INIT_EX(); @@ -254,48 +263,70 @@ void CUDADataPartition::CalcBlockDim(const data_size_t num_data_in_leaf) { void CUDADataPartition::SetUsedDataIndices(const data_size_t* used_indices, const data_size_t num_used_indices) { use_bagging_ = true; - use_bagging_subset_ = false; num_used_indices_ = num_used_indices; used_indices_ = used_indices; - CopyFromHostToCUDADeviceOuter(cuda_data_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_data_indices_, used_indices, static_cast(num_used_indices), __FILE__, __LINE__); LaunchFillDataIndexToLeafIndex(); } -void CUDADataPartition::ResetTrainingData(const Dataset* train_data) { +void CUDADataPartition::ResetTrainingData(const Dataset* train_data, const int num_total_bin, hist_t* cuda_hist) { const data_size_t old_num_data = num_data_; num_data_ = train_data->num_data(); + num_features_ = train_data->num_features(); + num_total_bin_ = num_total_bin; + cuda_column_data_ = train_data->cuda_column_data(); + cuda_hist_ = cuda_hist; + CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1, __FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_num_data_, &num_data_, 1, __FILE__, __LINE__); if (num_data_ > old_num_data) { CalcBlockDim(num_data_); const int old_max_num_split_indices_blocks = max_num_split_indices_blocks_; max_num_split_indices_blocks_ = grid_dim_; if (max_num_split_indices_blocks_ > old_max_num_split_indices_blocks) { - DeallocateCUDAMemoryOuter(&cuda_block_data_to_left_offset_, __FILE__, __LINE__); - DeallocateCUDAMemoryOuter(&cuda_block_data_to_right_offset_, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_block_data_to_left_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_block_data_to_right_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_block_data_to_left_offset_, 0, 
static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_block_data_to_right_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_data_to_left_offset_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_data_to_right_offset_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_block_data_to_left_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_block_data_to_right_offset_, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + SetCUDAMemory(cuda_block_data_to_left_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); + SetCUDAMemory(cuda_block_data_to_right_offset_, 0, static_cast(max_num_split_indices_blocks_) + 1, __FILE__, __LINE__); } - DeallocateCUDAMemoryOuter(&cuda_data_indices_, __FILE__, __LINE__); - DeallocateCUDAMemoryOuter(&cuda_block_to_left_offset_, __FILE__, __LINE__); - DeallocateCUDAMemoryOuter(&cuda_data_index_to_leaf_index_, __FILE__, __LINE__); - DeallocateCUDAMemoryOuter(&cuda_out_data_indices_in_leaf_, __FILE__, __LINE__); - DeallocateCUDAMemoryOuter(&cuda_add_train_score_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_data_indices_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_block_to_left_offset_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_data_index_to_leaf_index_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_out_data_indices_in_leaf_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_add_train_score_, __FILE__, __LINE__); add_train_score_.resize(num_data_, 0.0f); - AllocateCUDAMemoryOuter(&cuda_data_indices_, static_cast(num_data_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_block_to_left_offset_, static_cast(num_data_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_data_index_to_leaf_index_, static_cast(num_data_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_out_data_indices_in_leaf_, static_cast(num_data_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_add_train_score_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_data_indices_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_block_to_left_offset_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_data_index_to_leaf_index_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_out_data_indices_in_leaf_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_add_train_score_, static_cast(num_data_), __FILE__, __LINE__); } + used_indices_ = nullptr; + use_bagging_ = false; + num_used_indices_ = 0; + cur_num_leaves_ = 1; +} + +void CUDADataPartition::ResetConfig(const Config* config) { + num_threads_ = OMP_NUM_THREADS(); + num_leaves_ = config->num_leaves; + DeallocateCUDAMemory(&cuda_leaf_data_start_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_data_end_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_num_data_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_hist_pool_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_output_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_data_start_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_data_end_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_num_data_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_output_, static_cast(num_leaves_), __FILE__, __LINE__); } void 
CUDADataPartition::SetBaggingSubset(const Dataset* subset) { num_used_indices_ = subset->num_data(); used_indices_ = nullptr; use_bagging_ = true; - use_bagging_subset_ = true; cuda_column_data_ = subset->cuda_column_data(); } diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index e109e183ffb9..705d4399cb4d 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -1111,7 +1111,7 @@ void CUDADataPartition::LaunchSplitInnerKernel( cuda_leaf_num_data_, cuda_data_indices_, grid_dim_); } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); global_timer.Stop("CUDADataPartition::AggregateBlockOffsetKernel"); global_timer.Start("CUDADataPartition::SplitInnerKernel"); SplitInnerKernel<<>>( @@ -1119,7 +1119,7 @@ void CUDADataPartition::LaunchSplitInnerKernel( cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, cuda_block_to_left_offset_, cuda_out_data_indices_in_leaf_); global_timer.Stop("CUDADataPartition::SplitInnerKernel"); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); global_timer.Start("CUDADataPartition::SplitTreeStructureKernel"); SplitTreeStructureKernel<<<4, 5, 0, cuda_streams_[0]>>>(left_leaf_index, right_leaf_index, @@ -1137,8 +1137,8 @@ void CUDADataPartition::LaunchSplitInnerKernel( std::vector cpu_split_info_buffer(12); const double* cpu_sum_hessians_info = reinterpret_cast(cpu_split_info_buffer.data() + 8); global_timer.Start("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); - CopyFromCUDADeviceToHostAsyncOuter(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 12, cuda_streams_[0], __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + CopyFromCUDADeviceToHostAsync(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 12, cuda_streams_[0], __FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); global_timer.Stop("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); const data_size_t left_leaf_num_data = cpu_split_info_buffer[1]; const data_size_t left_leaf_data_start = cpu_split_info_buffer[2]; @@ -1190,7 +1190,7 @@ void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double* leaf_valu AddPredictionToScoreKernel<<>>( cuda_data_indices_, leaf_value, cuda_scores, cuda_data_index_to_leaf_index_, num_data_in_root); } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); global_timer.Stop("CUDADataPartition::AddPredictionToScoreKernel"); } diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 62ec205706b9..cf634304d27c 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -33,6 +33,8 @@ class CUDADataPartition { const int num_threads, hist_t* cuda_hist); + ~CUDADataPartition(); + void Init(); void BeforeTrain(); @@ -64,7 +66,9 @@ class CUDADataPartition { void SetBaggingSubset(const Dataset* subset); - void ResetTrainingData(const Dataset* train_data); + void ResetTrainingData(const Dataset* train_data, const int num_total_bin, hist_t* cuda_hist); + + void ResetConfig(const Config* config); data_size_t root_num_data() const { if (use_bagging_) { @@ -82,8 +86,6 @@ class CUDADataPartition { bool use_bagging() const { return use_bagging_; } - bool use_bagging_subset() const { return use_bagging_subset_; } - private: void CalcBlockDim(const data_size_t num_data_in_leaf); @@ -218,13 
+220,9 @@ class CUDADataPartition { /*! \brief number of training data */ data_size_t num_data_; /*! \brief number of features in training data */ - const int num_features_; + int num_features_; /*! \brief number of total bins in training data */ - const int num_total_bin_; - /*! \brief upper bounds of feature histogram bins */ - std::vector> bin_upper_bounds_; - /*! \brief number of bins per feature */ - std::vector feature_num_bins_; + int num_total_bin_; /*! \brief bin data stored by column */ const CUDAColumnData* cuda_column_data_; /*! \brief grid dimension when splitting one leaf */ @@ -238,15 +236,13 @@ class CUDADataPartition { // config information /*! \brief maximum number of leaves in a tree */ - const int num_leaves_; + int num_leaves_; /*! \brief number of threads */ - const int num_threads_; + int num_threads_; // per iteration information /*! \brief whether bagging is used in this iteration */ bool use_bagging_; - /*! \brief whether use subset data for bagging in this iteration */ - bool use_bagging_subset_; /*! \brief number of used data indices in this iteration */ data_size_t num_used_indices_; @@ -295,10 +291,6 @@ class CUDADataPartition { int* cuda_split_info_buffer_; // dataset information - /*! \brief upper bounds of bin boundaries for feature histograms */ - double* cuda_bin_upper_bounds_; - /*! \brief the bin offsets of features, used to access cuda_bin_upper_bounds_ */ - int* cuda_feature_num_bin_offsets_; /*! \brief number of data in training set, for intialization of cuda_leaf_num_data_ and cuda_leaf_data_end_ */ data_size_t* cuda_num_data_; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index a6374797086e..7a7709b2ffc2 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -22,16 +22,34 @@ CUDAHistogramConstructor::CUDAHistogramConstructor( num_features_(train_data->num_features()), num_leaves_(num_leaves), num_threads_(num_threads), - num_feature_groups_(train_data->num_feature_groups()), min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf), gpu_device_id_(gpu_device_id) { - int offset = 0; - for (int group_id = 0; group_id < train_data->num_feature_groups(); ++group_id) { - offset += train_data->FeatureGroupNumBin(group_id); - } + InitFeatureMetaInfo(train_data, feature_hist_offsets); + cuda_row_data_.reset(nullptr); + cuda_feature_num_bins_ = nullptr; + cuda_feature_hist_offsets_ = nullptr; + cuda_feature_most_freq_bins_ = nullptr; + cuda_hist_ = nullptr; + cuda_need_fix_histogram_features_ = nullptr; + cuda_need_fix_histogram_features_num_bin_aligned_ = nullptr; +} + +CUDAHistogramConstructor::~CUDAHistogramConstructor() { + DeallocateCUDAMemory(&cuda_feature_num_bins_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_hist_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_most_freq_bins_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_hist_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_need_fix_histogram_features_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, __FILE__, __LINE__); + gpuAssert(cudaStreamDestroy(cuda_stream_), __FILE__, __LINE__); +} + +void CUDAHistogramConstructor::InitFeatureMetaInfo(const Dataset* train_data, const std::vector& feature_hist_offsets) { need_fix_histogram_features_.clear(); need_fix_histogram_features_num_bin_aligend_.clear(); + feature_num_bins_.clear(); + 
feature_most_freq_bins_.clear(); for (int feature_index = 0; feature_index < train_data->num_features(); ++feature_index) { const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); const uint32_t most_freq_bin = bin_mapper->GetMostFreqBin(); @@ -52,29 +70,26 @@ CUDAHistogramConstructor::CUDAHistogramConstructor( for (size_t i = 0; i < feature_hist_offsets.size(); ++i) { feature_hist_offsets_.emplace_back(feature_hist_offsets[i]); } - num_total_bin_ = offset; - cuda_row_data_.reset(nullptr); - cuda_row_data_subset_.reset(nullptr); - use_bagging_subset_ = false; + num_total_bin_ = static_cast(feature_hist_offsets.back()); } void CUDAHistogramConstructor::BeforeTrain(const score_t* gradients, const score_t* hessians) { cuda_gradients_ = gradients; cuda_hessians_ = hessians; - SetCUDAMemoryOuter(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); + SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); } void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStates* share_state) { - AllocateCUDAMemoryOuter(&cuda_hist_, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_hist_, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); + SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_num_bins_, + InitCUDAMemoryFromHostMemory(&cuda_feature_num_bins_, feature_num_bins_.data(), feature_num_bins_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_hist_offsets_, + InitCUDAMemoryFromHostMemory(&cuda_feature_hist_offsets_, feature_hist_offsets_.data(), feature_hist_offsets_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_feature_most_freq_bins_, + InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bins_, feature_most_freq_bins_.data(), feature_most_freq_bins_.size(), __FILE__, __LINE__); cuda_row_data_.reset(new CUDARowData(train_data, share_state, gpu_device_id_)); @@ -82,8 +97,8 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStat CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); - InitCUDAMemoryFromHostMemoryOuter(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemoryOuter(&cuda_need_fix_histogram_features_num_bin_aligned_, need_fix_histogram_features_num_bin_aligend_.data(), + InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, need_fix_histogram_features_num_bin_aligend_.data(), need_fix_histogram_features_num_bin_aligend_.size(), __FILE__, __LINE__); } @@ -99,7 +114,7 @@ void CUDAHistogramConstructor::ConstructHistogramForLeaf( return; } LaunchConstructHistogramKernel(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); global_timer.Start("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); LaunchSubtractHistogramKernel(cuda_smaller_leaf_splits, cuda_larger_leaf_splits); global_timer.Stop("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); @@ -111,24 +126,54 @@ void 
CUDAHistogramConstructor::CalcConstructHistogramKernelDim( int* block_dim_x, int* block_dim_y, const data_size_t num_data_in_smaller_leaf) { - const CUDARowData* cuda_row_data = use_bagging_subset_ ? cuda_row_data_subset_.get() : cuda_row_data_.get(); - *block_dim_x = cuda_row_data->max_num_column_per_partition(); - *block_dim_y = NUM_THRADS_PER_BLOCK / cuda_row_data->max_num_column_per_partition(); - *grid_dim_x = cuda_row_data->num_feature_partitions(); + *block_dim_x = cuda_row_data_->max_num_column_per_partition(); + *block_dim_y = NUM_THRADS_PER_BLOCK / cuda_row_data_->max_num_column_per_partition(); + *grid_dim_x = cuda_row_data_->num_feature_partitions(); *grid_dim_y = std::max(min_grid_dim_y_, ((num_data_in_smaller_leaf + NUM_DATA_PER_THREAD - 1) / NUM_DATA_PER_THREAD + (*block_dim_y) - 1) / (*block_dim_y)); } -void CUDAHistogramConstructor::ResetTrainingData(const Dataset* train_data) { +void CUDAHistogramConstructor::ResetTrainingData(const Dataset* train_data, TrainingShareStates* share_states) { num_data_ = train_data->num_data(); + num_features_ = train_data->num_features(); + InitFeatureMetaInfo(train_data, share_states->feature_hist_offsets()); + if (feature_num_bins_.size() > 0) { + DeallocateCUDAMemory(&cuda_feature_num_bins_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_hist_offsets_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_feature_most_freq_bins_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_need_fix_histogram_features_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_hist_, __FILE__, __LINE__); + } + + AllocateCUDAMemory(&cuda_hist_, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); + SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); + + InitCUDAMemoryFromHostMemory(&cuda_feature_num_bins_, + feature_num_bins_.data(), feature_num_bins_.size(), __FILE__, __LINE__); + + InitCUDAMemoryFromHostMemory(&cuda_feature_hist_offsets_, + feature_hist_offsets_.data(), feature_hist_offsets_.size(), __FILE__, __LINE__); + + InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bins_, + feature_most_freq_bins_.data(), feature_most_freq_bins_.size(), __FILE__, __LINE__); + + cuda_row_data_.reset(new CUDARowData(train_data, share_states, gpu_device_id_)); + cuda_row_data_->Init(train_data, share_states); + + InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, need_fix_histogram_features_num_bin_aligend_.data(), + need_fix_histogram_features_num_bin_aligend_.size(), __FILE__, __LINE__); } -void CUDAHistogramConstructor::SetBaggingSubset(const data_size_t* used_indices, const data_size_t num_data) { - if (cuda_row_data_subset_ == nullptr) { - cuda_row_data_subset_.reset(new CUDARowData()); - } - cuda_row_data_subset_->CopySubrow(cuda_row_data_.get(), used_indices, num_data); - use_bagging_subset_ = true; +void CUDAHistogramConstructor::ResetConfig(const Config* config) { + num_threads_ = OMP_NUM_THREADS(); + num_leaves_ = config->num_leaves; + min_data_in_leaf_ = config->min_data_in_leaf; + min_sum_hessian_in_leaf_ = config->min_sum_hessian_in_leaf; + DeallocateCUDAMemory(&cuda_hist_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_hist_, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); + SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 
2 * num_leaves_, __FILE__, __LINE__); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 206920ccd2da..79739ec035b3 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -138,124 +138,122 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_in_smaller_leaf); dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); - const CUDARowData* cuda_row_data = use_bagging_subset_ ? cuda_row_data_subset_.get() : cuda_row_data_.get(); - - if (cuda_row_data->is_sparse()) { - if (cuda_row_data->bit_type() == 8) { - if (cuda_row_data->row_ptr_bit_type() == 16) { + if (cuda_row_data_->is_sparse()) { + if (cuda_row_data_->bit_type() == 8) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data->cuda_data_uint8(), - cuda_row_data->cuda_row_ptr_uint16(), - cuda_row_data->cuda_partition_ptr_uint16(), - cuda_row_data->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); - } else if (cuda_row_data->row_ptr_bit_type() == 32) { + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data->cuda_data_uint8(), - cuda_row_data->cuda_row_ptr_uint32(), - cuda_row_data->cuda_partition_ptr_uint32(), - cuda_row_data->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); - } else if (cuda_row_data->row_ptr_bit_type() == 64) { + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data->cuda_data_uint8(), - cuda_row_data->cuda_row_ptr_uint64(), - cuda_row_data->cuda_partition_ptr_uint64(), - cuda_row_data->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); } - } else if (cuda_row_data->bit_type() == 16) { - if (cuda_row_data->row_ptr_bit_type() == 16) { + } else if (cuda_row_data_->bit_type() == 16) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data->cuda_data_uint16(), - cuda_row_data->cuda_row_ptr_uint16(), - cuda_row_data->cuda_partition_ptr_uint16(), - cuda_row_data->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); - } else if (cuda_row_data->row_ptr_bit_type() == 32) { + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data->cuda_data_uint16(), - cuda_row_data->cuda_row_ptr_uint32(), - 
cuda_row_data->cuda_partition_ptr_uint32(), - cuda_row_data->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); - } else if (cuda_row_data->row_ptr_bit_type() == 64) { + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data->cuda_data_uint16(), - cuda_row_data->cuda_row_ptr_uint64(), - cuda_row_data->cuda_partition_ptr_uint64(), - cuda_row_data->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); } - } else if (cuda_row_data->bit_type() == 32) { - if (cuda_row_data->row_ptr_bit_type() == 16) { + } else if (cuda_row_data_->bit_type() == 32) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data->cuda_data_uint32(), - cuda_row_data->cuda_row_ptr_uint16(), - cuda_row_data->cuda_partition_ptr_uint16(), - cuda_row_data->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); - } else if (cuda_row_data->row_ptr_bit_type() == 32) { + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data->cuda_data_uint32(), - cuda_row_data->cuda_row_ptr_uint32(), - cuda_row_data->cuda_partition_ptr_uint32(), - cuda_row_data->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); - } else if (cuda_row_data->row_ptr_bit_type() == 64) { + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data->cuda_data_uint32(), - cuda_row_data->cuda_row_ptr_uint64(), - cuda_row_data->cuda_partition_ptr_uint64(), - cuda_row_data->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), num_data_); } } } else { - if (cuda_row_data->bit_type() == 8) { + if (cuda_row_data_->bit_type() == 8) { CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data->cuda_data_uint8(), - cuda_row_data->cuda_column_hist_offsets(), - cuda_row_data->cuda_partition_hist_offsets(), - cuda_row_data->cuda_feature_partition_column_index_offsets(), + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_column_hist_offsets(), + cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), num_data_); - } else if (cuda_row_data->bit_type() == 16) { + } else if (cuda_row_data_->bit_type() == 16) { CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data->cuda_data_uint16(), - cuda_row_data->cuda_column_hist_offsets(), - 
cuda_row_data->cuda_partition_hist_offsets(), - cuda_row_data->cuda_feature_partition_column_index_offsets(), + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_column_hist_offsets(), + cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), num_data_); - } else if (cuda_row_data->bit_type() == 32) { + } else if (cuda_row_data_->bit_type() == 32) { CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data->cuda_data_uint32(), - cuda_row_data->cuda_column_hist_offsets(), - cuda_row_data->cuda_partition_hist_offsets(), - cuda_row_data->cuda_feature_partition_column_index_offsets(), + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_column_hist_offsets(), + cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), num_data_); } } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index aae853b28084..aa845f627d7e 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -41,6 +41,8 @@ class CUDAHistogramConstructor { const double min_sum_hessian_in_leaf, const int gpu_device_id); + ~CUDAHistogramConstructor(); + void Init(const Dataset* train_data, TrainingShareStates* share_state); void ConstructHistogramForLeaf( @@ -51,9 +53,9 @@ class CUDAHistogramConstructor { const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_larger_leaf); - void SetBaggingSubset(const data_size_t* used_indices, const data_size_t num_data); + void ResetTrainingData(const Dataset* train_data, TrainingShareStates* share_states); - void ResetTrainingData(const Dataset* train_data); + void ResetConfig(const Config* config); void BeforeTrain(const score_t* gradients, const score_t* hessians); @@ -64,6 +66,8 @@ class CUDAHistogramConstructor { hist_t* cuda_hist_pointer() { return cuda_hist_; } private: + void InitFeatureMetaInfo(const Dataset* train_data, const std::vector& feature_hist_offsets); + void CalcConstructHistogramKernelDim( int* grid_dim_x, int* grid_dim_y, @@ -84,15 +88,13 @@ class CUDAHistogramConstructor { /*! \brief size of training data */ data_size_t num_data_; /*! \brief number of features in training data */ - const int num_features_; + int num_features_; /*! \brief maximum number of leaves */ - const int num_leaves_; + int num_leaves_; /*! \brief number of threads */ - const int num_threads_; + int num_threads_; /*! \brief total number of bins in histogram */ int num_total_bin_; - /*! \brief number of feature groups */ - int num_feature_groups_; /*! \brief number of bins per feature */ std::vector feature_num_bins_; /*! \brief offsets in histogram of all features */ @@ -100,9 +102,9 @@ class CUDAHistogramConstructor { /*! \brief most frequent bins in each feature */ std::vector feature_most_freq_bins_; /*! \brief minimum number of data allowed per leaf */ - const int min_data_in_leaf_; + int min_data_in_leaf_; /*! \brief minimum sum value of hessians allowed per leaf */ - const double min_sum_hessian_in_leaf_; + double min_sum_hessian_in_leaf_; /*! \brief cuda stream for histogram construction */ cudaStream_t cuda_stream_; /*! \brief indices of feature whose histograms need to be fixed */ @@ -111,16 +113,12 @@ class CUDAHistogramConstructor { std::vector need_fix_histogram_features_num_bin_aligend_; /*! 
\brief minimum number of blocks allowed in the y dimension */ const int min_grid_dim_y_ = 160; - /*! \brief whether use bagging with subset */ - bool use_bagging_subset_; // CUDA memory, held by this object /*! \brief CUDA row wise data */ std::unique_ptr cuda_row_data_; - /*! \brief CUDA row wise data, used when bagging with subset */ - std::unique_ptr cuda_row_data_subset_; /*! \brief number of bins per feature */ uint32_t* cuda_feature_num_bins_; /*! \brief offsets in histogram of all features */ diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index 43ddb427d359..03bc02682d64 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -13,6 +13,14 @@ namespace LightGBM { CUDALeafSplits::CUDALeafSplits(const data_size_t num_data): num_data_(num_data) { cuda_struct_ = nullptr; + cuda_sum_of_gradients_buffer_ = nullptr; + cuda_sum_of_hessians_buffer_ = nullptr; +} + +CUDALeafSplits::~CUDALeafSplits() { + DeallocateCUDAMemory(&cuda_struct_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_sum_of_gradients_buffer_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_sum_of_hessians_buffer_, __FILE__, __LINE__); } void CUDALeafSplits::Init() { @@ -20,15 +28,15 @@ void CUDALeafSplits::Init() { // allocate more memory for sum reduction in CUDA // only the first element records the final sum - AllocateCUDAMemoryOuter(&cuda_sum_of_gradients_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_sum_of_hessians_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_sum_of_gradients_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_sum_of_hessians_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_struct_, 1, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_struct_, 1, __FILE__, __LINE__); } void CUDALeafSplits::InitValues() { LaunchInitValuesEmptyKernel(); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); } void CUDALeafSplits::InitValues( @@ -37,20 +45,20 @@ void CUDALeafSplits::InitValues( const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf, double* root_sum_hessians) { cuda_gradients_ = cuda_gradients; cuda_hessians_ = cuda_hessians; - SetCUDAMemoryOuter(cuda_sum_of_gradients_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); - SetCUDAMemoryOuter(cuda_sum_of_hessians_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); + SetCUDAMemory(cuda_sum_of_gradients_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); + SetCUDAMemory(cuda_sum_of_hessians_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); LaunchInitValuesKernal(cuda_bagging_data_indices, cuda_data_indices_in_leaf, num_used_indices, cuda_hist_in_leaf); - CopyFromCUDADeviceToHostOuter(root_sum_hessians, cuda_sum_of_hessians_buffer_, 1, __FILE__, __LINE__); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + CopyFromCUDADeviceToHost(root_sum_hessians, cuda_sum_of_hessians_buffer_, 1, __FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); } void CUDALeafSplits::Resize(const data_size_t num_data) { if (num_data > num_data_) { - DeallocateCUDAMemoryOuter(&cuda_sum_of_gradients_buffer_, __FILE__, __LINE__); - DeallocateCUDAMemoryOuter(&cuda_sum_of_hessians_buffer_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_sum_of_gradients_buffer_, __FILE__, __LINE__); + 
DeallocateCUDAMemory(&cuda_sum_of_hessians_buffer_, __FILE__, __LINE__); num_blocks_init_from_gradients_ = (num_data + NUM_THRADS_PER_BLOCK_LEAF_SPLITS - 1) / NUM_THRADS_PER_BLOCK_LEAF_SPLITS; - AllocateCUDAMemoryOuter(&cuda_sum_of_gradients_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_sum_of_hessians_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_sum_of_gradients_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_sum_of_hessians_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); } else { num_blocks_init_from_gradients_ = (num_data + NUM_THRADS_PER_BLOCK_LEAF_SPLITS - 1) / NUM_THRADS_PER_BLOCK_LEAF_SPLITS; } diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index d67357dc2c30..1cfd60f340cf 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -90,7 +90,7 @@ void CUDALeafSplits::LaunchInitValuesKernal( cuda_gradients_, cuda_hessians_, num_used_indices, cuda_bagging_data_indices, cuda_sum_of_gradients_buffer_, cuda_sum_of_hessians_buffer_); } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); CUDAInitValuesKernel2<<<1, NUM_THRADS_PER_BLOCK_LEAF_SPLITS>>>( num_blocks_init_from_gradients_, cuda_sum_of_gradients_buffer_, @@ -99,7 +99,7 @@ void CUDALeafSplits::LaunchInitValuesKernal( cuda_data_indices_in_leaf, cuda_hist_in_leaf, cuda_struct_); - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index 9dbd0404a679..27ea7a4dcf03 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -34,7 +34,7 @@ class CUDALeafSplits { public: CUDALeafSplits(const data_size_t num_data); - CUDALeafSplits(); + ~CUDALeafSplits(); void Init(); diff --git a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/new_cuda_tree_learner.cpp index f7d18bf4f5da..f19b1d3f5c90 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/new_cuda_tree_learner.cpp @@ -15,29 +15,38 @@ namespace LightGBM { -NewCUDATreeLearner::NewCUDATreeLearner(const Config* config): SerialTreeLearner(config) {} +NewCUDATreeLearner::NewCUDATreeLearner(const Config* config): SerialTreeLearner(config) { + cuda_gradients_ = nullptr; + cuda_hessians_ = nullptr; +} -NewCUDATreeLearner::~NewCUDATreeLearner() {} +NewCUDATreeLearner::~NewCUDATreeLearner() { + DeallocateCUDAMemory(&cuda_gradients_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_hessians_, __FILE__, __LINE__); +} void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { - // use the first gpu by now SerialTreeLearner::Init(train_data, is_constant_hessian); num_threads_ = OMP_NUM_THREADS(); - const int gpu_device_id = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); + // use the first gpu by default + gpu_device_id_ = config_->gpu_device_id >= 0 ? 
config_->gpu_device_id : 0; + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id_)); + cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_)); cuda_smaller_leaf_splits_->Init(); cuda_larger_leaf_splits_.reset(new CUDALeafSplits(num_data_)); cuda_larger_leaf_splits_->Init(); - cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, this->config_->num_leaves, num_threads_, + + cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, config_->num_leaves, num_threads_, share_state_->feature_hist_offsets(), - config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf, gpu_device_id)); + config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf, gpu_device_id_)); cuda_histogram_constructor_->Init(train_data_, share_state_.get()); cuda_data_partition_.reset(new CUDADataPartition( - train_data_, share_state_->feature_hist_offsets().back(), this->config_->num_leaves, num_threads_, + train_data_, share_state_->feature_hist_offsets().back(), config_->num_leaves, num_threads_, cuda_histogram_constructor_->cuda_hist_pointer())); cuda_data_partition_->Init(); + cuda_best_split_finder_.reset(new CUDABestSplitFinder(cuda_histogram_constructor_->cuda_hist(), train_data_, this->share_state_->feature_hist_offsets(), config_)); cuda_best_split_finder_->Init(); @@ -49,17 +58,16 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia leaf_data_start_.resize(config_->num_leaves, 0); leaf_sum_hessians_.resize(config_->num_leaves, 0.0f); - AllocateCUDAMemoryOuter(&cuda_gradients_, static_cast(num_data_), __FILE__, __LINE__); - AllocateCUDAMemoryOuter(&cuda_hessians_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_gradients_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_hessians_, static_cast(num_data_), __FILE__, __LINE__); } void NewCUDATreeLearner::BeforeTrain() { const data_size_t root_num_data = cuda_data_partition_->root_num_data(); - const size_t num_gradients_to_copy = cuda_data_partition_->use_bagging_subset() ? static_cast(root_num_data) : static_cast(num_data_); - CopyFromHostToCUDADeviceOuter(cuda_gradients_, gradients_, num_gradients_to_copy, __FILE__, __LINE__); - CopyFromHostToCUDADeviceOuter(cuda_hessians_, hessians_, num_gradients_to_copy, __FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_gradients_, gradients_, static_cast(num_data_), __FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_hessians_, hessians_, static_cast(num_data_), __FILE__, __LINE__); const data_size_t* leaf_splits_init_indices = - (cuda_data_partition_->use_bagging_subset() || !cuda_data_partition_->use_bagging()) ? nullptr : cuda_data_partition_->cuda_data_indices(); + cuda_data_partition_->use_bagging() ? cuda_data_partition_->cuda_data_indices() : nullptr; cuda_data_partition_->BeforeTrain(); cuda_smaller_leaf_splits_->InitValues( cuda_gradients_, @@ -72,7 +80,8 @@ void NewCUDATreeLearner::BeforeTrain() { leaf_num_data_[0] = root_num_data; cuda_larger_leaf_splits_->InitValues(); cuda_histogram_constructor_->BeforeTrain(cuda_gradients_, cuda_hessians_); - cuda_best_split_finder_->BeforeTrain(); + col_sampler_.ResetByTree(); + cuda_best_split_finder_->BeforeTrain(col_sampler_.is_feature_used_bytree()); leaf_data_start_[0] = 0; smaller_leaf_index_ = 0; larger_leaf_index_ = -1; @@ -175,7 +184,7 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, larger_leaf_index_ = (smaller_leaf_index_ == best_leaf_index_ ? 
right_leaf_index : best_leaf_index_); global_timer.Stop("NewCUDATreeLearner::Split"); } - SynchronizeCUDADeviceOuter(__FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); tree->ToHost(); return tree.release(); } @@ -183,28 +192,48 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, void NewCUDATreeLearner::ResetTrainingData( const Dataset* train_data, bool is_constant_hessian) { - // TODO(shiyu1994): separte logic of reset training data and set bagging data - train_data_ = train_data; - num_data_ = train_data_->num_data(); + SerialTreeLearner::ResetTrainingData(train_data, is_constant_hessian); CHECK_EQ(num_features_, train_data_->num_features()); - //cuda_data_partition_->ResetTrainingData(train_data); - cuda_histogram_constructor_->ResetTrainingData(train_data); + cuda_histogram_constructor_->ResetTrainingData(train_data, share_state_.get()); + cuda_data_partition_->ResetTrainingData(train_data, + static_cast(share_state_->feature_hist_offsets().back()), + cuda_histogram_constructor_->cuda_hist_pointer()); + cuda_best_split_finder_->ResetTrainingData( + cuda_histogram_constructor_->cuda_hist(), + train_data, + share_state_->feature_hist_offsets()); cuda_smaller_leaf_splits_->Resize(num_data_); cuda_larger_leaf_splits_->Resize(num_data_); CHECK_EQ(is_constant_hessian, share_state_->is_constant_hessian); + DeallocateCUDAMemory(&cuda_gradients_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_hessians_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_gradients_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_hessians_, static_cast(num_data_), __FILE__, __LINE__); } -void NewCUDATreeLearner::SetBaggingData(const Dataset* subset, - const data_size_t* used_indices, data_size_t num_data) { - if (subset == nullptr) { - cuda_data_partition_->SetUsedDataIndices(used_indices, num_data); - } else { - cuda_histogram_constructor_->SetBaggingSubset(used_indices, num_data); - train_data_ = subset; - num_data_ = train_data_->num_data(); - CHECK_EQ(num_features_, train_data_->num_features()); - cuda_data_partition_->SetBaggingSubset(subset); +void NewCUDATreeLearner::ResetConfig(const Config* config) { + const int old_num_leaves = config_->num_leaves; + SerialTreeLearner::ResetConfig(config); + if (config_->gpu_device_id >= 0 && config_->gpu_device_id != gpu_device_id_) { + Log::Fatal("Changing gpu device ID by resetting configuration parameter is not allowed for CUDA tree learner."); } + num_threads_ = OMP_NUM_THREADS(); + if (config_->num_leaves != old_num_leaves) { + leaf_best_split_feature_.resize(config_->num_leaves, -1); + leaf_best_split_threshold_.resize(config_->num_leaves, 0); + leaf_best_split_default_left_.resize(config_->num_leaves, 0); + leaf_num_data_.resize(config_->num_leaves, 0); + leaf_data_start_.resize(config_->num_leaves, 0); + leaf_sum_hessians_.resize(config_->num_leaves, 0.0f); + } + cuda_histogram_constructor_->ResetConfig(config); + cuda_best_split_finder_->ResetConfig(config); + cuda_data_partition_->ResetConfig(config); +} + +void NewCUDATreeLearner::SetBaggingData(const Dataset* /*subset*/, + const data_size_t* used_indices, data_size_t num_data) { + cuda_data_partition_->SetUsedDataIndices(used_indices, num_data); } void NewCUDATreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/new_cuda_tree_learner.hpp index 652b9b9ee699..bfdf2d4a034f 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp 
+++ b/src/treelearner/cuda/new_cuda_tree_learner.hpp @@ -36,15 +36,18 @@ class NewCUDATreeLearner: public SerialTreeLearner { void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override; + void ResetConfig(const Config* config) override; + protected: void BeforeTrain() override; - // number of GPUs - int num_gpus_; + // GPU device ID + int gpu_device_id_; // number of threads on CPU int num_threads_; // CUDA components for tree training + // leaf splits information for smaller and larger leaves std::unique_ptr cuda_smaller_leaf_splits_; std::unique_ptr cuda_larger_leaf_splits_; From 1f6dd90a63887c05e13130450d6cfbb5d9584670 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sun, 26 Sep 2021 08:53:26 +0000 Subject: [PATCH 082/166] add USE_CUDA ifdef to cuda tree learner files --- include/LightGBM/cuda/cuda_column_data.hpp | 4 ++++ include/LightGBM/cuda/cuda_metadata.hpp | 4 ++++ include/LightGBM/cuda/cuda_row_data.hpp | 4 ++++ include/LightGBM/cuda/cuda_split_info.hpp | 9 +++++---- include/LightGBM/cuda/cuda_tree.hpp | 4 ++++ include/LightGBM/cuda/cuda_utils.h | 9 +++++---- src/io/cuda/cuda_column_data.cpp | 4 ++++ src/io/cuda/cuda_column_data.cu | 5 +++++ src/io/cuda/cuda_metadata.cpp | 4 ++++ src/io/cuda/cuda_row_data.cpp | 4 ++++ src/io/cuda/cuda_tree.cpp | 4 ++++ src/io/cuda/cuda_tree.cu | 5 +++++ src/treelearner/cuda/cuda_best_split_finder.cpp | 8 ++++---- src/treelearner/cuda/cuda_leaf_splits.cu | 5 +++++ 14 files changed, 61 insertions(+), 12 deletions(-) diff --git a/include/LightGBM/cuda/cuda_column_data.hpp b/include/LightGBM/cuda/cuda_column_data.hpp index d590b502a09b..fd3683586053 100644 --- a/include/LightGBM/cuda/cuda_column_data.hpp +++ b/include/LightGBM/cuda/cuda_column_data.hpp @@ -3,6 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#ifdef USE_CUDA + #ifndef LIGHTGBM_CUDA_COLUMN_DATA_HPP_ #define LIGHTGBM_CUDA_COLUMN_DATA_HPP_ @@ -134,3 +136,5 @@ class CUDAColumnData { } // namespace LightGBM #endif // LIGHTGBM_CUDA_COLUMN_DATA_HPP_ + +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_metadata.hpp b/include/LightGBM/cuda/cuda_metadata.hpp index 13767c96b5d5..3d888bee1ca0 100644 --- a/include/LightGBM/cuda/cuda_metadata.hpp +++ b/include/LightGBM/cuda/cuda_metadata.hpp @@ -3,6 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#ifdef USE_CUDA + #ifndef LIGHTGBM_CUDA_META_DATA_HPP_ #define LIGHTGBM_CUDA_META_DATA_HPP_ @@ -45,3 +47,5 @@ class CUDAMetadata { } // namespace LightGBM #endif // LIGHTGBM_CUDA_META_DATA_HPP_ + +#endif // USE_CUDA \ No newline at end of file diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index e4a67ea4b643..ca6ae29c5ecc 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -3,6 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ +#ifdef USE_CUDA + #ifndef LIGHTGBM_CUDA_ROW_DATA_HPP_ #define LIGHTGBM_CUDA_ROW_DATA_HPP_ @@ -162,3 +164,5 @@ class CUDARowData { } // namespace LightGBM #endif // LIGHTGBM_CUDA_COLUMN_DATA_HPP_ + +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_split_info.hpp b/include/LightGBM/cuda/cuda_split_info.hpp index 61b3438c063f..f09d2ecdd037 100644 --- a/include/LightGBM/cuda/cuda_split_info.hpp +++ b/include/LightGBM/cuda/cuda_split_info.hpp @@ -3,11 +3,12 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. */ -#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_SPLIT_INFO_HPP_ -#define LIGHTGBM_TREELEARNER_CUDA_CUDA_SPLIT_INFO_HPP_ #ifdef USE_CUDA +#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_SPLIT_INFO_HPP_ +#define LIGHTGBM_TREELEARNER_CUDA_CUDA_SPLIT_INFO_HPP_ + #include namespace LightGBM { @@ -36,6 +37,6 @@ struct CUDASplitInfo { } // namespace LightGBM -#endif // USE_CUDA - #endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_SPLIT_INFO_HPP_ + +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index 0f6374da6344..1c8166ce36d5 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -3,6 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#ifdef USE_CUDA + #ifndef LIGHTGBM_IO_CUDA_CUDA_TREE_HPP_ #define LIGHTGBM_IO_CUDA_CUDA_TREE_HPP_ @@ -109,3 +111,5 @@ class CUDATree : public Tree { } //namespace LightGBM #endif // LIGHTGBM_IO_CUDA_CUDA_TREE_HPP_ + +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 7809e7ac0498..5a95250f3ddd 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -2,11 +2,12 @@ * Copyright (c) 2020 IBM Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifndef LIGHTGBM_CUDA_CUDA_UTILS_H_ -#define LIGHTGBM_CUDA_CUDA_UTILS_H_ #ifdef USE_CUDA +#ifndef LIGHTGBM_CUDA_CUDA_UTILS_H_ +#define LIGHTGBM_CUDA_CUDA_UTILS_H_ + #include #include #include @@ -95,6 +96,6 @@ void DeallocateCUDAMemory(T** ptr, const char* file, const int line) { } -#endif // USE_CUDA - #endif // LIGHTGBM_CUDA_CUDA_UTILS_H_ + +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_column_data.cpp b/src/io/cuda/cuda_column_data.cpp index 2989074ac392..48e02702f5f7 100644 --- a/src/io/cuda/cuda_column_data.cpp +++ b/src/io/cuda/cuda_column_data.cpp @@ -3,6 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#ifdef USE_CUDA + #include namespace LightGBM { @@ -305,3 +307,5 @@ void CUDAColumnData::InitColumnMetaInfo() { } } // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_column_data.cu b/src/io/cuda/cuda_column_data.cu index 881cb91a9676..75ff6234e09e 100644 --- a/src/io/cuda/cuda_column_data.cu +++ b/src/io/cuda/cuda_column_data.cu @@ -3,6 +3,9 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ + +#ifdef USE_CUDA + #include #define COPY_SUBROW_BLOCK_SIZE_COLUMN_DATA (1024) @@ -54,3 +57,5 @@ void CUDAColumnData::LaunchCopySubrowKernel(void* const* in_cuda_data_by_column) } } // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_metadata.cpp b/src/io/cuda/cuda_metadata.cpp index 28aeade0b919..3f5168871a51 100644 --- a/src/io/cuda/cuda_metadata.cpp +++ b/src/io/cuda/cuda_metadata.cpp @@ -3,6 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#ifdef USE_CUDA + #include namespace LightGBM { @@ -70,3 +72,5 @@ void CUDAMetadata::Init(const std::vector& label, } } // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index 2019c939abd4..cad72b82416f 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -3,6 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#ifdef USE_CUDA + #include namespace LightGBM { @@ -365,3 +367,5 @@ void CUDARowData::InitSparseData(const BIN_TYPE* host_data, } } // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index beefdf90d7ee..0bc01f83c91e 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -3,6 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ +#ifdef USE_CUDA + #include namespace LightGBM { @@ -225,3 +227,5 @@ void CUDATree::SyncLeafOutputFromHostToCUDA() { } } // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index 143d734866bd..d6b87cf664f0 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -3,6 +3,9 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ + +#ifdef USE_CUDA + #include namespace LightGBM { @@ -241,3 +244,5 @@ void CUDATree::LaunchAddBiasKernel(const double val) { } } // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 52d7597931f5..1e9ebf1b6756 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -62,10 +62,10 @@ CUDABestSplitFinder::~CUDABestSplitFinder() { } void CUDABestSplitFinder::InitFeatureMetaInfo(const Dataset* train_data) { - feature_missing_type_.clear(); - feature_mfb_offsets_.clear(); - feature_default_bins_.clear(); - feature_num_bins_.clear(); + feature_missing_type_.resize(num_features_); + feature_mfb_offsets_.resize(num_features_); + feature_default_bins_.resize(num_features_); + feature_num_bins_.resize(num_features_); max_num_bin_in_feature_ = 0; for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { const BinMapper* bin_mapper = train_data->FeatureBinMapper(inner_feature_index); diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index 1cfd60f340cf..2c47196ba704 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -4,6 +4,9 @@ * license information. 
*/ + +#ifdef USE_CUDA + #include "cuda_leaf_splits.hpp" #include @@ -103,3 +106,5 @@ void CUDALeafSplits::LaunchInitValuesKernal( } } // namespace LightGBM + +#endif // USE_CUDA From 4ca758625816ebfe49bda5f24513e5080f962739 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sun, 26 Sep 2021 09:19:50 +0000 Subject: [PATCH 083/166] check that dataset doesn't contain CUDA tree learner --- src/treelearner/cuda/cuda_best_split_finder.cu | 16 ---------------- .../cuda/cuda_histogram_constructor.cpp | 3 +++ 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index d91326117e23..14e933cc5e9c 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -561,22 +561,6 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i CUDASplitInfo* cuda_split_info = cuda_leaf_best_split_info + buffer_write_pos; const CUDASplitInfo* best_split_info = cuda_best_split_info + best_read_index; if (best_split_info->is_valid) { - /*cuda_split_info->gain = best_split_info->gain; - cuda_split_info->inner_feature_index = is_smaller ? cuda_task_feature_index[best_read_index] : - cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; - cuda_split_info->default_left = best_split_info->default_left; - cuda_split_info->threshold = best_split_info->threshold; - cuda_split_info->left_sum_gradients = best_split_info->left_sum_gradients; - cuda_split_info->left_sum_hessians = best_split_info->left_sum_hessians; - cuda_split_info->left_count = best_split_info->left_count; - cuda_split_info->left_gain = best_split_info->left_gain; - cuda_split_info->left_value = best_split_info->left_value; - cuda_split_info->right_sum_gradients = best_split_info->right_sum_gradients; - cuda_split_info->right_sum_hessians = best_split_info->right_sum_hessians; - cuda_split_info->right_count = best_split_info->right_count; - cuda_split_info->right_gain = best_split_info->right_gain; - cuda_split_info->right_value = best_split_info->right_value; - cuda_split_info->is_valid = true;*/ *cuda_split_info = *best_split_info; cuda_split_info->inner_feature_index = is_smaller ? 
cuda_task_feature_index[best_read_index] : cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 7a7709b2ffc2..74b88431676b 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -52,6 +52,9 @@ void CUDAHistogramConstructor::InitFeatureMetaInfo(const Dataset* train_data, co feature_most_freq_bins_.clear(); for (int feature_index = 0; feature_index < train_data->num_features(); ++feature_index) { const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); + if (bin_mapper->bin_type() == BinType::CategoricalBin) { + Log::Fatal("CUDA tree learner doesn't support training categorical features."); + } const uint32_t most_freq_bin = bin_mapper->GetMostFreqBin(); if (most_freq_bin != 0) { need_fix_histogram_features_.emplace_back(feature_index); From 25f57e39c8d9fb535bdeabac4b6f764aabce4867 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sun, 26 Sep 2021 09:21:52 +0000 Subject: [PATCH 084/166] remove printf debug information --- src/treelearner/cuda/cuda_data_partition.cu | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 705d4399cb4d..b47e2c43b5cf 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -1045,17 +1045,9 @@ __global__ void SplitInnerKernel(const int left_leaf_index, const int right_leaf const uint32_t thread_to_left_offset = (threadIdx_x == 0 ? 0 : block_to_left_offset_ptr[threadIdx_x - 1]); const bool to_left = block_to_left_offset_ptr[threadIdx_x] > thread_to_left_offset; if (to_left) { - if (static_cast(thread_to_left_offset) >= block_to_left_offset_buffer[blockIdx.x + 1] - block_to_left_offset_buffer[blockIdx.x]) { - printf("error: thread_to_left_offset = %d, block_to_left_offset_buffer[%d] - block_to_left_offset_buffer[%d] = %d\n", - thread_to_left_offset, blockIdx.x + 1, blockIdx.x, block_to_left_offset_buffer[blockIdx.x + 1] - block_to_left_offset_buffer[blockIdx.x]); - } left_out_data_indices_in_leaf[thread_to_left_offset] = cuda_data_indices_in_leaf[global_thread_index]; } else { const uint32_t thread_to_right_offset = threadIdx.x - thread_to_left_offset; - if (static_cast(thread_to_right_offset) >= block_to_right_offset_buffer[blockIdx.x + 1] - block_to_right_offset_buffer[blockIdx.x]) { - printf("error: thread_to_right_offset = %d, block_to_right_offset_buffer[%d] - block_to_right_offset_buffer[%d] = %d\n", - thread_to_right_offset, blockIdx.x + 1, blockIdx.x, block_to_right_offset_buffer[blockIdx.x + 1] - block_to_right_offset_buffer[blockIdx.x]); - } right_out_data_indices_in_leaf[thread_to_right_offset] = cuda_data_indices_in_leaf[global_thread_index]; } } From 12794b0fa5de5a015fb154c37262a2cddd7d23ba Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sun, 26 Sep 2021 09:49:12 +0000 Subject: [PATCH 085/166] use full new cuda tree learner only when using single GPU --- ...r.cpp => cuda_single_gpu_tree_learner.cpp} | 42 +++++++++---------- ...r.hpp => cuda_single_gpu_tree_learner.hpp} | 10 ++--- src/treelearner/tree_learner.cpp | 8 +++- 3 files changed, 32 insertions(+), 28 deletions(-) rename src/treelearner/cuda/{new_cuda_tree_learner.cpp => cuda_single_gpu_tree_learner.cpp} (87%) rename src/treelearner/cuda/{new_cuda_tree_learner.hpp => cuda_single_gpu_tree_learner.hpp} (90%) diff --git 
a/src/treelearner/cuda/new_cuda_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp similarity index 87% rename from src/treelearner/cuda/new_cuda_tree_learner.cpp rename to src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index f19b1d3f5c90..03ccbd79052b 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -6,7 +6,7 @@ #ifdef USE_CUDA -#include "new_cuda_tree_learner.hpp" +#include "cuda_single_gpu_tree_learner.hpp" #include #include @@ -15,17 +15,17 @@ namespace LightGBM { -NewCUDATreeLearner::NewCUDATreeLearner(const Config* config): SerialTreeLearner(config) { +CUDASingleGPUTreeLearner::CUDASingleGPUTreeLearner(const Config* config): SerialTreeLearner(config) { cuda_gradients_ = nullptr; cuda_hessians_ = nullptr; } -NewCUDATreeLearner::~NewCUDATreeLearner() { +CUDASingleGPUTreeLearner::~CUDASingleGPUTreeLearner() { DeallocateCUDAMemory(&cuda_gradients_, __FILE__, __LINE__); DeallocateCUDAMemory(&cuda_hessians_, __FILE__, __LINE__); } -void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { +void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { SerialTreeLearner::Init(train_data, is_constant_hessian); num_threads_ = OMP_NUM_THREADS(); // use the first gpu by default @@ -62,7 +62,7 @@ void NewCUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessia AllocateCUDAMemory(&cuda_hessians_, static_cast(num_data_), __FILE__, __LINE__); } -void NewCUDATreeLearner::BeforeTrain() { +void CUDASingleGPUTreeLearner::BeforeTrain() { const data_size_t root_num_data = cuda_data_partition_->root_num_data(); CopyFromHostToCUDADevice(cuda_gradients_, gradients_, static_cast(num_data_), __FILE__, __LINE__); CopyFromHostToCUDADevice(cuda_hessians_, hessians_, static_cast(num_data_), __FILE__, __LINE__); @@ -87,21 +87,21 @@ void NewCUDATreeLearner::BeforeTrain() { larger_leaf_index_ = -1; } -void NewCUDATreeLearner::AddPredictionToScore(const Tree* tree, double* out_score) const { +void CUDASingleGPUTreeLearner::AddPredictionToScore(const Tree* tree, double* out_score) const { cuda_data_partition_->UpdateTrainScore(tree, out_score); } -Tree* NewCUDATreeLearner::Train(const score_t* gradients, +Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, const score_t* hessians, bool /*is_first_tree*/) { gradients_ = gradients; hessians_ = hessians; - global_timer.Start("NewCUDATreeLearner::BeforeTrain"); + global_timer.Start("CUDASingleGPUTreeLearner::BeforeTrain"); BeforeTrain(); - global_timer.Stop("NewCUDATreeLearner::BeforeTrain"); + global_timer.Stop("CUDASingleGPUTreeLearner::BeforeTrain"); const bool track_branch_features = !(config_->interaction_constraints_vector.empty()); std::unique_ptr tree(new CUDATree(config_->num_leaves, track_branch_features, config_->linear_tree, config_->gpu_device_id)); for (int i = 0; i < config_->num_leaves - 1; ++i) { - global_timer.Start("NewCUDATreeLearner::ConstructHistogramForLeaf"); + global_timer.Start("CUDASingleGPUTreeLearner::ConstructHistogramForLeaf"); const data_size_t num_data_in_smaller_leaf = leaf_num_data_[smaller_leaf_index_]; const data_size_t num_data_in_larger_leaf = larger_leaf_index_ < 0 ? 
0 : leaf_num_data_[larger_leaf_index_]; const double sum_hessians_in_smaller_leaf = leaf_sum_hessians_[smaller_leaf_index_]; @@ -113,16 +113,16 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, num_data_in_larger_leaf, sum_hessians_in_smaller_leaf, sum_hessians_in_larger_leaf); - global_timer.Stop("NewCUDATreeLearner::ConstructHistogramForLeaf"); - global_timer.Start("NewCUDATreeLearner::FindBestSplitsForLeaf"); + global_timer.Stop("CUDASingleGPUTreeLearner::ConstructHistogramForLeaf"); + global_timer.Start("CUDASingleGPUTreeLearner::FindBestSplitsForLeaf"); cuda_best_split_finder_->FindBestSplitsForLeaf( cuda_smaller_leaf_splits_->GetCUDAStruct(), cuda_larger_leaf_splits_->GetCUDAStruct(), smaller_leaf_index_, larger_leaf_index_, num_data_in_smaller_leaf, num_data_in_larger_leaf, sum_hessians_in_smaller_leaf, sum_hessians_in_larger_leaf); - global_timer.Stop("NewCUDATreeLearner::FindBestSplitsForLeaf"); - global_timer.Start("NewCUDATreeLearner::FindBestFromAllSplits"); + global_timer.Stop("CUDASingleGPUTreeLearner::FindBestSplitsForLeaf"); + global_timer.Start("CUDASingleGPUTreeLearner::FindBestFromAllSplits"); const CUDASplitInfo* best_split_info = nullptr; if (larger_leaf_index_ >= 0) { best_split_info = cuda_best_split_finder_->FindBestFromAllSplits( @@ -149,14 +149,14 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, nullptr, &best_leaf_index_); } - global_timer.Stop("NewCUDATreeLearner::FindBestFromAllSplits"); + global_timer.Stop("CUDASingleGPUTreeLearner::FindBestFromAllSplits"); if (best_leaf_index_ == -1) { Log::Warning("No further splits with positive gain, training stopped with %d leaves.", (i + 1)); break; } - global_timer.Start("NewCUDATreeLearner::Split"); + global_timer.Start("CUDASingleGPUTreeLearner::Split"); int right_leaf_index = tree->Split(best_leaf_index_, train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]), train_data_->RealThreshold(leaf_best_split_feature_[best_leaf_index_], @@ -182,14 +182,14 @@ Tree* NewCUDATreeLearner::Train(const score_t* gradients, &leaf_sum_hessians_[right_leaf_index]); smaller_leaf_index_ = (leaf_num_data_[best_leaf_index_] < leaf_num_data_[right_leaf_index] ? best_leaf_index_ : right_leaf_index); larger_leaf_index_ = (smaller_leaf_index_ == best_leaf_index_ ? 
right_leaf_index : best_leaf_index_); - global_timer.Stop("NewCUDATreeLearner::Split"); + global_timer.Stop("CUDASingleGPUTreeLearner::Split"); } SynchronizeCUDADevice(__FILE__, __LINE__); tree->ToHost(); return tree.release(); } -void NewCUDATreeLearner::ResetTrainingData( +void CUDASingleGPUTreeLearner::ResetTrainingData( const Dataset* train_data, bool is_constant_hessian) { SerialTreeLearner::ResetTrainingData(train_data, is_constant_hessian); @@ -211,7 +211,7 @@ void NewCUDATreeLearner::ResetTrainingData( AllocateCUDAMemory(&cuda_hessians_, static_cast(num_data_), __FILE__, __LINE__); } -void NewCUDATreeLearner::ResetConfig(const Config* config) { +void CUDASingleGPUTreeLearner::ResetConfig(const Config* config) { const int old_num_leaves = config_->num_leaves; SerialTreeLearner::ResetConfig(config); if (config_->gpu_device_id >= 0 && config_->gpu_device_id != gpu_device_id_) { @@ -231,12 +231,12 @@ void NewCUDATreeLearner::ResetConfig(const Config* config) { cuda_data_partition_->ResetConfig(config); } -void NewCUDATreeLearner::SetBaggingData(const Dataset* /*subset*/, +void CUDASingleGPUTreeLearner::SetBaggingData(const Dataset* /*subset*/, const data_size_t* used_indices, data_size_t num_data) { cuda_data_partition_->SetUsedDataIndices(used_indices, num_data); } -void NewCUDATreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, +void CUDASingleGPUTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { CHECK(tree->is_cuda_tree()); CUDATree* cuda_tree = reinterpret_cast(tree); diff --git a/src/treelearner/cuda/new_cuda_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp similarity index 90% rename from src/treelearner/cuda/new_cuda_tree_learner.hpp rename to src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index bfdf2d4a034f..4bfc1964923c 100644 --- a/src/treelearner/cuda/new_cuda_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -16,11 +16,11 @@ namespace LightGBM { -class NewCUDATreeLearner: public SerialTreeLearner { +class CUDASingleGPUTreeLearner: public SerialTreeLearner { public: - explicit NewCUDATreeLearner(const Config* config); + explicit CUDASingleGPUTreeLearner(const Config* config); - ~NewCUDATreeLearner(); + ~CUDASingleGPUTreeLearner(); void Init(const Dataset* train_data, bool is_constant_hessian) override; @@ -82,10 +82,10 @@ class NewCUDATreeLearner: public SerialTreeLearner { namespace LightGBM { -class NewCUDATreeLearner: public SerialTreeLearner { +class CUDASingleGPUTreeLearner: public SerialTreeLearner { public: #pragma warning(disable : 4702) - explicit NewCUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { + explicit CUDASingleGPUTreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" "Please recompile with CMake option -DUSE_CUDA=1"); } diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index 44bc6eeda36b..9d6f313e54c0 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -9,7 +9,7 @@ #include "linear_tree_learner.h" #include "parallel_tree_learner.h" #include "serial_tree_learner.h" -#include "cuda/new_cuda_tree_learner.hpp" +#include "cuda/cuda_single_gpu_tree_learner.hpp" namespace LightGBM { @@ -41,7 +41,11 @@ TreeLearner* 
TreeLearner::CreateTreeLearner(const std::string& learner_type, con } } else if (device_type == std::string("cuda")) { if (learner_type == std::string("serial")) { - return new NewCUDATreeLearner(config); + if (config->num_gpu == 1) { + return new CUDASingleGPUTreeLearner(config); + } else { + return new CUDATreeLearner(config); + } } else if (learner_type == std::string("feature")) { return new FeatureParallelTreeLearner(config); } else if (learner_type == std::string("data")) { From 7e18687669c21b9d9279263daee8d168dbe9be07 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 27 Sep 2021 02:22:16 +0000 Subject: [PATCH 086/166] disable all CUDA code when using CPU version --- include/LightGBM/dataset.h | 15 +++++++++++++++ src/cuda/cuda_algorithms.cu | 6 +++++- src/cuda/cuda_utils.cpp | 6 +++--- src/io/config.cpp | 5 +++-- src/io/dataset.cpp | 8 ++++++++ src/io/metadata.cpp | 4 ++++ src/objective/rank_objective.hpp | 2 +- src/treelearner/cuda/cuda_best_split_finder.cu | 6 +++--- 8 files changed, 42 insertions(+), 10 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 3f8bce635a65..188462a18585 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -212,10 +212,14 @@ class Metadata { /*! \brief Disable copy */ Metadata(const Metadata&) = delete; + #ifdef USE_CUDA + CUDAMetadata* cuda_metadata() const { return cuda_metadata_.get(); } void CreateCUDAMetadata(const int gpu_device_id); + #endif // USE_CUDA + private: /*! \brief Load initial scores from file */ void LoadInitialScore(); @@ -252,7 +256,9 @@ class Metadata { bool weight_load_from_file_; bool query_load_from_file_; bool init_score_load_from_file_; + #ifdef USE_CUDA std::unique_ptr cuda_metadata_; + #endif // USE_CUDA }; @@ -715,10 +721,14 @@ class Dataset { return feature_groups_[feature_group_index]->feature_min_bin(sub_feature_index); } + #ifdef USE_CUDA + const CUDAColumnData* cuda_column_data() const { return cuda_column_data_.get(); } + #endif // USE_CUDA + private: void CreateCUDAColumnData(); @@ -764,7 +774,12 @@ class Dataset { int num_numeric_features_; std::string device_type_; int gpu_device_id_; + + #ifdef USE_CUDA + std::unique_ptr cuda_column_data_; + + #endif // USE_CUDA }; } // namespace LightGBM diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index d1fa48c91fe4..ad1c8b2a2278 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -2,7 +2,9 @@ * Copyright (c) 2021 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ - + +#ifdef USE_CUDA + #include namespace LightGBM { @@ -76,3 +78,5 @@ void ShufflePrefixSumGlobal(uint64_t* values, size_t len, uint64_t* block_prefix } } // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp index 902bc6eae4c4..a1e6169e949a 100644 --- a/src/cuda/cuda_utils.cpp +++ b/src/cuda/cuda_utils.cpp @@ -5,14 +5,14 @@ #include -//#ifdef USE_CUDA +#ifdef USE_CUDA namespace LightGBM { void SynchronizeCUDADevice(const char* file, const int line) { - CUDASUCCESS_OR_FATAL_OUTER(cudaDeviceSynchronize()); + gpuAssert(cudaDeviceSynchronize(), file, line); } } // namespace LightGBM -//#endif // USE_CUDA +#endif // USE_CUDA diff --git a/src/io/config.cpp b/src/io/config.cpp index 0178cb5cb387..493bf1782362 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -334,8 +334,9 @@ void Config::CheckParamConflict() { num_leaves = static_cast(full_num_leaves); } } - // force col-wise for gpu - if (device_type == std::string("gpu")/* || device_type == std::string("cuda")*/) { + // force col-wise for gpu, and non-single GPU CUDA version + if (device_type == std::string("gpu") || + (device_type == std::string("cuda") && (num_gpu > 1 || tree_learner != std::string("serial")))) { force_col_wise = true; force_row_wise = false; if (deterministic) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 260ee17ac694..ce9d5c408897 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -438,12 +438,14 @@ void Dataset::FinishLoad() { feature_groups_[i]->FinishLoad(); } } + #ifdef USE_CUDA if (device_type_ == std::string("cuda")) { CreateCUDAColumnData(); metadata_.CreateCUDAMetadata(gpu_device_id_); } else { cuda_column_data_.reset(nullptr); } + #endif // USE_CUDA is_finish_load_ = true; } @@ -845,6 +847,8 @@ void Dataset::CopySubrow(const Dataset* fullset, // update CUDA storage for column data and metadata device_type_ = fullset->device_type_; gpu_device_id_ = fullset->gpu_device_id_; + + #ifdef USE_CUDA if (device_type_ == std::string("cuda")) { global_timer.Start("prepare subset cuda column data"); if (cuda_column_data_ == nullptr) { @@ -856,6 +860,7 @@ void Dataset::CopySubrow(const Dataset* fullset, global_timer.Stop("copy subset cuda column data"); global_timer.Stop("prepare subset cuda column data"); } + #endif // USE_CUDA } bool Dataset::SetFloatField(const char* field_name, const float* field_data, @@ -1514,6 +1519,7 @@ const void* Dataset::GetColWiseData( return feature_groups_[feature_group_index]->GetColWiseData(sub_feature_index, bit_type, is_sparse, bin_iterator); } +#ifdef USE_CUDA void Dataset::CreateCUDAColumnData() { cuda_column_data_.reset(new CUDAColumnData(num_data_, gpu_device_id_)); int num_columns = 0; @@ -1618,4 +1624,6 @@ void Dataset::CreateCUDAColumnData() { feature_to_column); } +#endif // USE_CUDA + } // namespace LightGBM diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index cb8fb3ad064c..087ced92a833 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -18,7 +18,9 @@ Metadata::Metadata() { weight_load_from_file_ = false; query_load_from_file_ = false; init_score_load_from_file_ = false; + #ifdef USE_CUDA cuda_metadata_ = nullptr; + #endif // USE_CUDA } void Metadata::Init(const char* data_filename) { @@ -473,10 +475,12 @@ void Metadata::LoadQueryWeights() { } } +#ifdef USE_CUDA void Metadata::CreateCUDAMetadata(const int gpu_device_id) { cuda_metadata_.reset(new CUDAMetadata(gpu_device_id)); cuda_metadata_->Init(label_, weights_, query_boundaries_, query_weights_, init_score_, queries_); } 
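The hunks above and below all follow one pattern: every CUDA-only source file, member, and accessor is fenced behind USE_CUDA so that a CPU-only build never sees a CUDA type and never links CUDA code. A minimal sketch of that pattern is given here; HostTable and CudaMirror are invented names for illustration (they are not LightGBM classes), and the only assumption is that the device-side class lives in a header that is likewise compiled out when USE_CUDA is undefined.

// host_table.hpp  (illustrative sketch, not part of LightGBM)
#include <memory>
#include <vector>

#ifdef USE_CUDA
#include "cuda_mirror.hpp"  // hypothetical device-side copy, visible only to CUDA builds
#endif  // USE_CUDA

class HostTable {
 public:
  void FinishLoad() {
#ifdef USE_CUDA
    // Only the CUDA build allocates the device-side mirror; the CPU build skips this block.
    cuda_mirror_.reset(new CudaMirror(values_));
#endif  // USE_CUDA
  }

#ifdef USE_CUDA
  const CudaMirror* cuda_mirror() const { return cuda_mirror_.get(); }
#endif  // USE_CUDA

 private:
  std::vector<float> values_;
#ifdef USE_CUDA
  std::unique_ptr<CudaMirror> cuda_mirror_;
#endif  // USE_CUDA
};

Built without -DUSE_CUDA the class collapses to a plain host container, which is the effect these hunks aim for in Metadata and Dataset.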
+#endif // USE_CUDA void Metadata::LoadFromMemory(const void* memory) { const char* mem_ptr = reinterpret_cast(memory); diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 9d68116fd9c4..239bb3651f53 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -256,7 +256,7 @@ class LambdarankNDCG : public RankingObjective { const char* GetName() const override { return "lambdarank"; } private: - /*! \brief Simgoid param */ + /*! \brief Sigmoid param */ double sigmoid_; /*! \brief Normalize the lambdas or not */ bool norm_; diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 14e933cc5e9c..c1ffebdc3f27 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -393,7 +393,7 @@ __global__ void FindBestSplitsForLeafKernel( const double num_data = is_larger ? larger_leaf_splits->num_data_in_leaf : smaller_leaf_splits->num_data_in_leaf; const unsigned int output_offset = is_larger ? (task_index + num_tasks) : task_index; CUDASplitInfo* out = cuda_best_split_info + output_offset; - //if (is_feature_used_bytree[inner_feature_index]) { + if (is_feature_used_bytree[inner_feature_index]) { const hist_t* hist_ptr = (is_larger ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + feature_hist_offsets[inner_feature_index] * 2; FindBestSplitsForLeafKernelInner( // input feature information @@ -420,9 +420,9 @@ __global__ void FindBestSplitsForLeafKernel( assume_out_default_left, // output parameters out); - /*} else { + } else { out->is_valid = false; - }*/ + } } void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( From 469e99269893c2f916c3dcf46dce52129dc2e8cf Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 27 Sep 2021 02:23:28 +0000 Subject: [PATCH 087/166] recover main.cpp --- src/main.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 776ffefd7e81..8034da826811 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -8,16 +8,10 @@ #include "network/linkers.h" -int main(int /*argc*/, char** /*argv*/) { +int main(int argc, char** argv) { bool success = false; - std::string config_str = std::string("config=train.conf"); - char* argv = new char[config_str.size() + 1]; - for (size_t i = 0; i < config_str.size(); ++i) { - argv[i] = config_str[i]; - } - argv[config_str.size()] = '\0'; try { - LightGBM::Application app(2, &argv - 1); + LightGBM::Application app(argc, argv); app.Run(); #ifdef USE_MPI From f2812c87b4a82defc825e6c201cfd3c9538ffd14 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 30 Sep 2021 06:30:54 +0000 Subject: [PATCH 088/166] add cpp files for multi value bins --- src/io/multi_val_dense_bin.cpp | 60 +++++++++++++ src/io/multi_val_dense_bin.hpp | 52 +---------- src/io/multi_val_sparse_bin.cpp | 154 ++++++++++++++++++++++++++++++++ src/io/multi_val_sparse_bin.hpp | 146 +----------------------------- 4 files changed, 218 insertions(+), 194 deletions(-) create mode 100644 src/io/multi_val_dense_bin.cpp create mode 100644 src/io/multi_val_sparse_bin.cpp diff --git a/src/io/multi_val_dense_bin.cpp b/src/io/multi_val_dense_bin.cpp new file mode 100644 index 000000000000..dbbdb973a347 --- /dev/null +++ b/src/io/multi_val_dense_bin.cpp @@ -0,0 +1,60 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#include "multi_val_dense_bin.hpp" + +namespace LightGBM { + +template <> +const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = data_.data(); + *bit_type = 8; + *total_size = static_cast(num_data_) * static_cast(num_feature_); + CHECK_EQ(*total_size, data_.size()); + *is_sparse = false; + *out_data_ptr = nullptr; + *data_ptr_bit_type = 0; + return to_return; +} + +template <> +const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint16_t* data_ptr = data_.data(); + const uint8_t* to_return = reinterpret_cast(data_ptr); + *bit_type = 16; + *total_size = static_cast(num_data_) * static_cast(num_feature_); + CHECK_EQ(*total_size, data_.size()); + *is_sparse = false; + *out_data_ptr = nullptr; + *data_ptr_bit_type = 0; + return to_return; +} + +template <> +const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint32_t* data_ptr = data_.data(); + const uint8_t* to_return = reinterpret_cast(data_ptr); + *bit_type = 32; + *total_size = static_cast(num_data_) * static_cast(num_feature_); + CHECK_EQ(*total_size, data_.size()); + *is_sparse = false; + *out_data_ptr = nullptr; + *data_ptr_bit_type = 0; + return to_return; +} + +} // namespace LightGBM diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index d70689f825d4..68ccbe904342 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -234,55 +235,6 @@ MultiValDenseBin* MultiValDenseBin::Clone() { return new MultiValDenseBin(*this); } -template <> -const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, - size_t* total_size, - bool* is_sparse, - const void** out_data_ptr, - uint8_t* data_ptr_bit_type) const { - const uint8_t* to_return = data_.data(); - *bit_type = 8; - *total_size = static_cast(num_data_) * static_cast(num_feature_); - CHECK_EQ(*total_size, data_.size()); - *is_sparse = false; - *out_data_ptr = nullptr; - *data_ptr_bit_type = 0; - return to_return; -} - -template <> -const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, - size_t* total_size, - bool* is_sparse, - const void** out_data_ptr, - uint8_t* data_ptr_bit_type) const { - const uint16_t* data_ptr = data_.data(); - const uint8_t* to_return = reinterpret_cast(data_ptr); - *bit_type = 16; - *total_size = static_cast(num_data_) * static_cast(num_feature_); - CHECK_EQ(*total_size, data_.size()); - *is_sparse = false; - *out_data_ptr = nullptr; - *data_ptr_bit_type = 0; - return to_return; -} - -template <> -const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, - size_t* total_size, - bool* is_sparse, - const void** out_data_ptr, - uint8_t* data_ptr_bit_type) const { - const uint32_t* data_ptr = data_.data(); - const uint8_t* to_return = reinterpret_cast(data_ptr); - *bit_type = 32; - *total_size = static_cast(num_data_) * static_cast(num_feature_); - CHECK_EQ(*total_size, data_.size()); - *is_sparse = false; - *out_data_ptr = nullptr; - *data_ptr_bit_type = 0; - return to_return; -} - } // namespace LightGBM + #endif // LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ diff --git a/src/io/multi_val_sparse_bin.cpp b/src/io/multi_val_sparse_bin.cpp 
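Relocating the GetRowWiseData specializations from the headers into these new .cpp files is consistent with the usual C++ rule for full specializations: once every template argument is fixed, the specialization is an ordinary function for linkage purposes, so defining it in a header included by many translation units risks multiple-definition link errors unless it is marked inline. A reduced sketch of the header/source split follows; ExampleBin and its members are invented for illustration and do not reproduce the real MultiValDenseBin or MultiValSparseBin interfaces.

// example_bin.hpp  (illustrative sketch)
#include <cstdint>

template <typename VAL_T>
class ExampleBin {
 public:
  // Generic declaration; only the explicit specializations below are ever defined.
  const void* GetRowWiseData(uint8_t* bit_type) const;
};

// Declare the full specialization in the header so any translation unit may call it.
template <>
const void* ExampleBin<uint8_t>::GetRowWiseData(uint8_t* bit_type) const;

// example_bin.cpp  (the single definition)
template <>
const void* ExampleBin<uint8_t>::GetRowWiseData(uint8_t* bit_type) const {
  *bit_type = 8;   // report the element width, mirroring the real specializations
  return nullptr;  // a real implementation would return the raw row-wise buffer
}

The new io/multi_val_dense_bin.o and io/multi_val_sparse_bin.o entries added to the R package Makevars later in this series give each of those definitions exactly one home.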
new file mode 100644 index 000000000000..d465fda4da09 --- /dev/null +++ b/src/io/multi_val_sparse_bin.cpp @@ -0,0 +1,154 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#include "multi_val_sparse_bin.hpp" + +namespace LightGBM { + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = data_.data(); + *bit_type = 8; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 16; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 16; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 16; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 32; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 16; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = data_.data(); + *bit_type = 8; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 32; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 16; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 32; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 32; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 32; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = data_.data(); + *bit_type = 8; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 64; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 16; + 
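  // The specializations in this file appear to share one output convention:
  //   *bit_type          : width in bits of each stored bin value (8, 16 or 32, per value type)
  //   *data_ptr_bit_type : width in bits of the row_ptr_ offsets (16, 32 or 64)
  //   *out_data_ptr      : the row_ptr_ offset array itself, so callers can walk the
  //                        sparse rows without knowing the concrete template arguments.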
*total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 64; + return to_return; +} + +template <> +const void* MultiValSparseBin::GetRowWiseData( + uint8_t* bit_type, + size_t* total_size, + bool* is_sparse, + const void** out_data_ptr, + uint8_t* data_ptr_bit_type) const { + const uint8_t* to_return = reinterpret_cast(data_.data()); + *bit_type = 32; + *total_size = data_.size(); + *is_sparse = true; + *out_data_ptr = reinterpret_cast(row_ptr_.data()); + *data_ptr_bit_type = 64; + return to_return; +} + +} // namespace LightGBM diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index 909ffade3634..ad2568d0affa 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -322,149 +323,6 @@ MultiValSparseBin* MultiValSparseBin::Clone() { return new MultiValSparseBin(*this); } -template <> -const void* MultiValSparseBin::GetRowWiseData( - uint8_t* bit_type, - size_t* total_size, - bool* is_sparse, - const void** out_data_ptr, - uint8_t* data_ptr_bit_type) const { - const uint8_t* to_return = data_.data(); - *bit_type = 8; - *total_size = data_.size(); - *is_sparse = true; - *out_data_ptr = reinterpret_cast(row_ptr_.data()); - *data_ptr_bit_type = 16; - return to_return; -} - -template <> -const void* MultiValSparseBin::GetRowWiseData( - uint8_t* bit_type, - size_t* total_size, - bool* is_sparse, - const void** out_data_ptr, - uint8_t* data_ptr_bit_type) const { - const uint8_t* to_return = reinterpret_cast(data_.data()); - *bit_type = 16; - *total_size = data_.size(); - *is_sparse = true; - *out_data_ptr = reinterpret_cast(row_ptr_.data()); - *data_ptr_bit_type = 16; - return to_return; -} - -template <> -const void* MultiValSparseBin::GetRowWiseData( - uint8_t* bit_type, - size_t* total_size, - bool* is_sparse, - const void** out_data_ptr, - uint8_t* data_ptr_bit_type) const { - const uint8_t* to_return = reinterpret_cast(data_.data()); - *bit_type = 32; - *total_size = data_.size(); - *is_sparse = true; - *out_data_ptr = reinterpret_cast(row_ptr_.data()); - *data_ptr_bit_type = 16; - return to_return; -} - -template <> -const void* MultiValSparseBin::GetRowWiseData( - uint8_t* bit_type, - size_t* total_size, - bool* is_sparse, - const void** out_data_ptr, - uint8_t* data_ptr_bit_type) const { - const uint8_t* to_return = data_.data(); - *bit_type = 8; - *total_size = data_.size(); - *is_sparse = true; - *out_data_ptr = reinterpret_cast(row_ptr_.data()); - *data_ptr_bit_type = 32; - return to_return; -} - -template <> -const void* MultiValSparseBin::GetRowWiseData( - uint8_t* bit_type, - size_t* total_size, - bool* is_sparse, - const void** out_data_ptr, - uint8_t* data_ptr_bit_type) const { - const uint8_t* to_return = reinterpret_cast(data_.data()); - *bit_type = 16; - *total_size = data_.size(); - *is_sparse = true; - *out_data_ptr = reinterpret_cast(row_ptr_.data()); - *data_ptr_bit_type = 32; - return to_return; -} - -template <> -const void* MultiValSparseBin::GetRowWiseData( - uint8_t* bit_type, - size_t* total_size, - bool* is_sparse, - const void** out_data_ptr, - uint8_t* data_ptr_bit_type) const { - const uint8_t* to_return = reinterpret_cast(data_.data()); - *bit_type = 32; - *total_size = data_.size(); - *is_sparse = true; - *out_data_ptr = reinterpret_cast(row_ptr_.data()); - *data_ptr_bit_type = 32; - return to_return; -} - -template <> -const void* MultiValSparseBin::GetRowWiseData( - uint8_t* 
bit_type, - size_t* total_size, - bool* is_sparse, - const void** out_data_ptr, - uint8_t* data_ptr_bit_type) const { - const uint8_t* to_return = data_.data(); - *bit_type = 8; - *total_size = data_.size(); - *is_sparse = true; - *out_data_ptr = reinterpret_cast(row_ptr_.data()); - *data_ptr_bit_type = 64; - return to_return; -} - -template <> -const void* MultiValSparseBin::GetRowWiseData( - uint8_t* bit_type, - size_t* total_size, - bool* is_sparse, - const void** out_data_ptr, - uint8_t* data_ptr_bit_type) const { - const uint8_t* to_return = reinterpret_cast(data_.data()); - *bit_type = 16; - *total_size = data_.size(); - *is_sparse = true; - *out_data_ptr = reinterpret_cast(row_ptr_.data()); - *data_ptr_bit_type = 64; - return to_return; -} - -template <> -const void* MultiValSparseBin::GetRowWiseData( - uint8_t* bit_type, - size_t* total_size, - bool* is_sparse, - const void** out_data_ptr, - uint8_t* data_ptr_bit_type) const { - const uint8_t* to_return = reinterpret_cast(data_.data()); - *bit_type = 32; - *total_size = data_.size(); - *is_sparse = true; - *out_data_ptr = reinterpret_cast(row_ptr_.data()); - *data_ptr_bit_type = 64; - return to_return; -} - } // namespace LightGBM + #endif // LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ From 8e884b20f47b1f02c66c07ede4969ab41d619731 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 30 Sep 2021 06:54:24 +0000 Subject: [PATCH 089/166] update LightGBM.vcxproj --- include/LightGBM/cuda/cuda_algorithms.hpp | 3 ++- include/LightGBM/cuda/cuda_metadata.hpp | 9 +++++---- include/LightGBM/cuda/cuda_row_data.hpp | 8 +++++--- include/LightGBM/cuda/cuda_tree.hpp | 2 +- include/LightGBM/cuda/cuda_utils.h | 2 +- windows/LightGBM.vcxproj | 2 ++ windows/LightGBM.vcxproj.filters | 6 ++++++ 7 files changed, 22 insertions(+), 10 deletions(-) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 2e5bc049edec..876c3bbf9602 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -8,6 +8,7 @@ #ifdef USE_CUDA +#include #include #include #include @@ -68,7 +69,7 @@ __device__ __forceinline__ T ShuffleReduceSumWarp(T value, const data_size_t len if (len > 0) { // TODO(shiyu1994): check how mask works const uint32_t mask = 0xffffffff; - for (int offset = warpSize / 2; offset > 0; offset >>= 1) { + for (int offset = warpSize / 2; offset > 0; offset >>= 1) { value += __shfl_down_sync(mask, value, offset); } } diff --git a/include/LightGBM/cuda/cuda_metadata.hpp b/include/LightGBM/cuda/cuda_metadata.hpp index 3d888bee1ca0..9abeb69f7468 100644 --- a/include/LightGBM/cuda/cuda_metadata.hpp +++ b/include/LightGBM/cuda/cuda_metadata.hpp @@ -8,15 +8,16 @@ #ifndef LIGHTGBM_CUDA_META_DATA_HPP_ #define LIGHTGBM_CUDA_META_DATA_HPP_ -#include "../meta.h" - #include +#include + +#include "../meta.h" namespace LightGBM { class CUDAMetadata { public: - CUDAMetadata(const int gpu_device_id); + explicit CUDAMetadata(const int gpu_device_id); ~CUDAMetadata(); @@ -48,4 +49,4 @@ class CUDAMetadata { #endif // LIGHTGBM_CUDA_META_DATA_HPP_ -#endif // USE_CUDA \ No newline at end of file +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index ca6ae29c5ecc..d99e85c0a1a5 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -8,10 +8,12 @@ #ifndef LIGHTGBM_CUDA_ROW_DATA_HPP_ #define LIGHTGBM_CUDA_ROW_DATA_HPP_ +#include + +#include #include #include #include -#include #include #include 
"../train_share_states.h" @@ -155,9 +157,9 @@ class CUDARowData { /*! \brief hisotgram offset of each partition */ uint32_t* cuda_partition_hist_offsets_; /*! \brief block buffer when calculating prefix sum */ - uint16_t* cuda_block_buffer_uint16_t_; + uint16_t* cuda_block_buffer_uint16_t_; /*! \brief block buffer when calculating prefix sum */ - uint32_t* cuda_block_buffer_uint32_t_; + uint32_t* cuda_block_buffer_uint32_t_; /*! \brief block buffer when calculating prefix sum */ uint64_t* cuda_block_buffer_uint64_t_; }; diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index 1c8166ce36d5..f8b303072375 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -108,7 +108,7 @@ class CUDATree : public Tree { const int num_threads_per_block_add_prediction_to_score_; }; -} //namespace LightGBM +} // namespace LightGBM #endif // LIGHTGBM_IO_CUDA_CUDA_TREE_HPP_ diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 5a95250f3ddd..8b8153bd439f 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -94,7 +94,7 @@ void DeallocateCUDAMemory(T** ptr, const char* file, const int line) { } } -} +} // namespace LightGBM #endif // LIGHTGBM_CUDA_CUDA_UTILS_H_ diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index 59b589a40d51..0030b28eb01f 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -320,6 +320,8 @@ + + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index 0f48c7564580..1e6f2cbabf21 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -326,5 +326,11 @@ src\treelearner + + src\io + + + src\io + \ No newline at end of file From 9b9a63ce9c83c5897aac512ed8922b6d1331c678 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 30 Sep 2021 07:08:50 +0000 Subject: [PATCH 090/166] update LightGBM.vcxproj fix lint errors --- src/io/cuda/cuda_metadata.cpp | 2 +- src/io/cuda/cuda_tree.cu | 2 +- src/treelearner/cuda/cuda_best_split_finder.cu | 9 +++------ src/treelearner/cuda/cuda_best_split_finder.hpp | 6 +++--- src/treelearner/cuda/cuda_data_partition.cpp | 2 ++ src/treelearner/cuda/cuda_data_partition.cu | 8 ++++++-- src/treelearner/cuda/cuda_histogram_constructor.cpp | 6 ++++-- windows/LightGBM.vcxproj | 2 ++ windows/LightGBM.vcxproj.filters | 6 ++++++ 9 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/io/cuda/cuda_metadata.cpp b/src/io/cuda/cuda_metadata.cpp index 3f5168871a51..e269fb0c7c2d 100644 --- a/src/io/cuda/cuda_metadata.cpp +++ b/src/io/cuda/cuda_metadata.cpp @@ -71,6 +71,6 @@ void CUDAMetadata::Init(const std::vector& label, SynchronizeCUDADevice(__FILE__, __LINE__); } -} // namespace LightGBM +} // namespace LightGBM #endif // USE_CUDA diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index d6b87cf664f0..03735f87730b 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -35,7 +35,7 @@ __device__ bool IsZeroCUDA(double fval) { return (fval >= -kZeroThreshold && fval <= kZeroThreshold); } -__global__ void SplitKernel(// split information +__global__ void SplitKernel( // split information const int leaf_index, const int real_feature_index, const double real_threshold, diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index c1ffebdc3f27..45d8d10bd166 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ 
b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -201,7 +201,6 @@ __device__ void FindBestSplitsForLeafKernelInner( const uint8_t assume_out_default_left, // output parameters CUDASplitInfo* cuda_best_split_info) { - const double cnt_factor = num_data / sum_hessians; const bool use_l1 = lambda_l1 > 0.0f; const double min_gain_shift = parent_gain + min_gain_to_split; @@ -379,7 +378,6 @@ __global__ void FindBestSplitsForLeafKernel( const double lambda_l2, // output CUDASplitInfo* cuda_best_split_info) { - const unsigned int task_index = blockIdx.x % num_tasks; const bool is_larger = static_cast(blockIdx.x >= num_tasks || larger_only); const int inner_feature_index = task_feature_index[task_index]; @@ -516,7 +514,7 @@ __device__ void ReduceBestSplit(bool* found, double* gain, uint32_t* shared_read } } __syncthreads(); - } + } } __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const int larger_leaf_index, @@ -642,7 +640,6 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( const int host_larger_leaf_index, const bool is_smaller_leaf_valid, const bool is_larger_leaf_valid) { - int num_tasks = num_tasks_; int num_tasks_aligned = 1; num_tasks -= 1; @@ -758,7 +755,7 @@ __global__ void PrepareLeafBestSplitInfo(const int smaller_leaf_index, const int } else if (threadIdx_x == 2) { cuda_best_split_info_buffer[2] = cuda_leaf_best_split_info[smaller_leaf_index].default_left; } - if (larger_leaf_index >= 0) { + if (larger_leaf_index >= 0) { if (threadIdx_x == 3) { cuda_best_split_info_buffer[3] = cuda_leaf_best_split_info[larger_leaf_index].inner_feature_index; } else if (threadIdx_x == 4) { @@ -770,7 +767,7 @@ __global__ void PrepareLeafBestSplitInfo(const int smaller_leaf_index, const int } void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int cur_num_leaves, - const int smaller_leaf_index, const int larger_leaf_index, + const int smaller_leaf_index, const int larger_leaf_index, int* smaller_leaf_best_split_feature, uint32_t* smaller_leaf_best_split_threshold, uint8_t* smaller_leaf_best_split_default_left, diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 3445e21d67bd..59f069774629 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -9,14 +9,14 @@ #ifdef USE_CUDA -#include "cuda_leaf_splits.hpp" - #include #include #include #include +#include "cuda_leaf_splits.hpp" + #define MAX_NUM_BIN_IN_FEATURE (256) #define NUM_THREADS_FIND_BEST_LEAF (256) #define NUM_TASKS_PER_SYNC_BLOCK (1024) @@ -138,7 +138,7 @@ class CUDABestSplitFinder { const hist_t* cuda_hist_; }; -} +} // namespace LightGBM #endif // USE_CUDA #endif // LIGHTGBM_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 416a4c0dc897..3017e6037e5f 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -6,6 +6,8 @@ #ifdef USE_CUDA +#include + #include "cuda_data_partition.hpp" namespace LightGBM { diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index b47e2c43b5cf..1d690d4a7c3b 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -6,10 +6,14 @@ #ifdef USE_CUDA -#include #include "cuda_data_partition.hpp" + +#include #include +#include +#include + namespace LightGBM { __global__ void 
FillDataIndicesBeforeTrainKernel(const data_size_t num_data, @@ -607,7 +611,7 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num uint32_t t_zero_bin = min_bin + default_bin; if (most_freq_bin == 0) { --th; - --t_zero_bin; + --t_zero_bin; } uint8_t split_default_to_left = 0; uint8_t split_missing_default_to_left = 0; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 74b88431676b..2545943c5337 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -8,6 +8,8 @@ #include "cuda_histogram_constructor.hpp" +#include + namespace LightGBM { CUDAHistogramConstructor::CUDAHistogramConstructor( @@ -107,7 +109,7 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStat void CUDAHistogramConstructor::ConstructHistogramForLeaf( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const CUDALeafSplitsStruct* cuda_larger_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, @@ -148,7 +150,7 @@ void CUDAHistogramConstructor::ResetTrainingData(const Dataset* train_data, Trai DeallocateCUDAMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, __FILE__, __LINE__); DeallocateCUDAMemory(&cuda_hist_, __FILE__, __LINE__); } - + AllocateCUDAMemory(&cuda_hist_, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index 0030b28eb01f..bd0ad783f59f 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -317,12 +317,14 @@ + + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index 1e6f2cbabf21..b0e3d7744f3e 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -332,5 +332,11 @@ src\io + + src\io + + + src\io + \ No newline at end of file From e0c9f6f98f47eab42b16f110e15b64a361450075 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 30 Sep 2021 07:17:34 +0000 Subject: [PATCH 091/166] fix lint errors --- src/treelearner/cuda/cuda_histogram_constructor.cu | 7 +++++-- src/treelearner/cuda/cuda_histogram_constructor.hpp | 8 +++----- src/treelearner/cuda/cuda_leaf_splits.cpp | 2 +- src/treelearner/cuda/cuda_leaf_splits.hpp | 2 +- src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp | 2 ++ src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp | 8 ++++++-- 6 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 79739ec035b3..b8d819b287db 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -6,9 +6,12 @@ #ifdef USE_CUDA -#include #include "cuda_histogram_constructor.hpp" +#include + +#include + namespace LightGBM { template @@ -265,7 +268,7 @@ __global__ void SubtractHistogramKernel( const CUDALeafSplitsStruct* cuda_larger_leaf_splits) { const unsigned int global_thread_index = threadIdx.x + blockIdx.x * blockDim.x; const int cuda_larger_leaf_index_ref = cuda_larger_leaf_splits->leaf_index; - if (cuda_larger_leaf_index_ref >= 0) { + if (cuda_larger_leaf_index_ref >= 0) { const hist_t* smaller_leaf_hist = cuda_smaller_leaf_splits->hist_in_leaf; hist_t* 
larger_leaf_hist = cuda_larger_leaf_splits->hist_in_leaf; if (global_thread_index < 2 * num_total_bin) { diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index aa845f627d7e..c4c77b3089f4 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -12,13 +12,11 @@ #include #include -#include - +#include +#include #include "cuda_leaf_splits.hpp" -#include - #define SHRAE_HIST_SIZE (6144 * 2) #define NUM_DATA_PER_THREAD (400) #define NUM_THRADS_PER_BLOCK (504) @@ -47,7 +45,7 @@ class CUDAHistogramConstructor { void ConstructHistogramForLeaf( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const CUDALeafSplitsStruct* cuda_larger_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index 03bc02682d64..c7dc55eefba6 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -41,7 +41,7 @@ void CUDALeafSplits::InitValues() { void CUDALeafSplits::InitValues( const score_t* cuda_gradients, const score_t* cuda_hessians, - const data_size_t* cuda_bagging_data_indices, const data_size_t* cuda_data_indices_in_leaf, + const data_size_t* cuda_bagging_data_indices, const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf, double* root_sum_hessians) { cuda_gradients_ = cuda_gradients; cuda_hessians_ = cuda_hessians; diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index 27ea7a4dcf03..3ad0284dc86d 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -32,7 +32,7 @@ struct CUDALeafSplitsStruct { class CUDALeafSplits { public: - CUDALeafSplits(const data_size_t num_data); + explicit CUDALeafSplits(const data_size_t num_data); ~CUDALeafSplits(); diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 03ccbd79052b..7ca15fa8d77c 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -13,6 +13,8 @@ #include #include +#include + namespace LightGBM { CUDASingleGPUTreeLearner::CUDASingleGPUTreeLearner(const Config* config): SerialTreeLearner(config) { diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index 4bfc1964923c..da4a10951be4 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -8,12 +8,16 @@ #ifdef USE_CUDA -#include "../serial_tree_learner.h" +#include +#include + #include "cuda_leaf_splits.hpp" #include "cuda_histogram_constructor.hpp" #include "cuda_data_partition.hpp" #include "cuda_best_split_finder.hpp" +#include "../serial_tree_learner.h" + namespace LightGBM { class CUDASingleGPUTreeLearner: public SerialTreeLearner { @@ -28,7 +32,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { bool is_constant_hessian) override; Tree* Train(const score_t* gradients, const score_t *hessians, bool is_first_tree) override; - + void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override; 
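  // Adds the newly trained tree's outputs to the per-row training scores. In this CUDA
  // learner the update presumably stays on the device via CUDADataPartition::UpdateTrainScore
  // (modified later in this series) rather than iterating over rows on the host.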
void AddPredictionToScore(const Tree* tree, double* out_score) const override; From 3bba6d774c143c7aca9cffdb062a22baf7deb1f8 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 30 Sep 2021 07:27:39 +0000 Subject: [PATCH 092/166] fix lint errors --- include/LightGBM/cuda/cuda_algorithms.hpp | 3 ++- include/LightGBM/cuda/cuda_row_data.hpp | 4 ++-- include/LightGBM/dataset.h | 5 +++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 876c3bbf9602..37e580fe888b 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -8,10 +8,11 @@ #ifdef USE_CUDA +#include + #include #include #include -#include #include #include diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index d99e85c0a1a5..e8cf64d51a99 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -8,14 +8,14 @@ #ifndef LIGHTGBM_CUDA_ROW_DATA_HPP_ #define LIGHTGBM_CUDA_ROW_DATA_HPP_ -#include - #include #include #include #include #include +#include + #include "../train_share_states.h" #define SHRAE_HIST_SIZE (6144 * 2) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 188462a18585..f70dda6eb222 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -6,8 +6,6 @@ #define LIGHTGBM_DATASET_H_ #include -#include -#include #include #include #include @@ -15,6 +13,9 @@ #include #include +#include +#include + #include #include #include From 8f9f03e7c98f0cb57f3b2a729fe416a51239ffc7 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 30 Sep 2021 09:03:28 +0000 Subject: [PATCH 093/166] update Makevars fix lint errors --- R-package/src/Makevars.in | 4 ++++ R-package/src/Makevars.win.in | 4 ++++ include/LightGBM/cuda/cuda_algorithms.hpp | 6 +++--- include/LightGBM/dataset.h | 6 +++--- src/io/sparse_bin.hpp | 1 + 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index 2490ba0757df..76b617e8d880 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -37,6 +37,10 @@ OBJECTS = \ io/parser.o \ io/train_share_states.o \ io/tree.o \ + io/dense_bin.o \ + io/sparse_bin.o \ + io/multi_val_dense_bin.o \ + io/multi_val_sparse_bin.o \ metric/dcg_calculator.o \ metric/metric.o \ objective/objective_function.o \ diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 0fb2de926905..a984c4befb53 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -38,6 +38,10 @@ OBJECTS = \ io/parser.o \ io/train_share_states.o \ io/tree.o \ + io/dense_bin.o \ + io/sparse_bin.o \ + io/multi_val_dense_bin.o \ + io/multi_val_sparse_bin.o \ metric/dcg_calculator.o \ metric/metric.o \ objective/objective_function.o \ diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 37e580fe888b..b8538ffb1d8f 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -8,16 +8,16 @@ #ifdef USE_CUDA -#include - -#include #include #include +#include #include #include #include +#include + #define NUM_BANKS_DATA_PARTITION (16) #define LOG_NUM_BANKS_DATA_PARTITION (4) #define GLOBAL_PREFIX_SUM_BLOCK_SIZE (1024) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index f70dda6eb222..c4344d0a0fa9 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -13,9 +13,6 
@@ #include #include -#include -#include - #include #include #include @@ -24,6 +21,9 @@ #include #include +#include +#include + namespace LightGBM { /*! \brief forward declaration */ diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index ad15092ac39b..40a4856934b5 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -669,4 +669,5 @@ BinIterator* SparseBin::GetIterator(uint32_t min_bin, uint32_t max_bin, } } // namespace LightGBM + #endif // LightGBM_IO_SPARSE_BIN_HPP_ From 01d772d18952099815b94701711520f82bfe5254 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 8 Oct 2021 08:44:38 +0000 Subject: [PATCH 094/166] fix the case with 0 feature and 0 bin fix split finding for invalid leaves create cuda column data when loaded from bin file --- CMakeLists.txt | 8 ---- include/LightGBM/bin.h | 2 - include/LightGBM/cuda/cuda_row_data.hpp | 2 +- src/c_api.cpp | 2 +- src/io/config.cpp | 11 +++-- src/io/cuda/cuda_row_data.cpp | 12 ++++- src/io/dataset.cpp | 7 +++ src/io/dataset_loader.cpp | 10 +++++ src/io/multi_val_sparse_bin.cpp | 6 +-- .../cuda/cuda_best_split_finder.cu | 29 ++++++++++-- .../cuda/cuda_best_split_finder.hpp | 3 +- src/treelearner/cuda/cuda_data_partition.hpp | 6 +-- .../cuda/cuda_histogram_constructor.cpp | 6 ++- .../cuda/cuda_single_gpu_tree_learner.cpp | 44 ++++++++++++++++++- 14 files changed, 118 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 269367c6d9a0..8706e87bf040 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -358,14 +358,6 @@ if(USE_CUDA) src/io/cuda/*.cpp src/cuda/*.cpp src/cuda/*.cu - src/objective/cuda/*.cpp - src/objective/cuda/*.cu - src/boosting/cuda/*.cpp - src/boosting/cuda/*.cu - src/application/cuda/*.cpp - src/application/cuda/*.cu - src/metric/cuda/*.cpp - src/metric/cuda/*.cu endif(USE_CUDA) ) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index ebaa32eff975..0c50eadf6f06 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -198,8 +198,6 @@ class BinMapper { } } - inline const std::vector& bin_upper_bound() const { return bin_upper_bound_; } - private: /*! 
\brief Number of bins */ int num_bin_; diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index e8cf64d51a99..d2b3eab1568d 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -28,7 +28,7 @@ class CUDARowData { CUDARowData(const Dataset* train_data, const TrainingShareStates* train_share_state, const int gpu_device_id); - CUDARowData(); + ~CUDARowData(); void Init(const Dataset* train_data, TrainingShareStates* train_share_state); diff --git a/src/c_api.cpp b/src/c_api.cpp index 3f11223c8bcb..bc3bfc3b2434 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -421,7 +421,7 @@ class Booster { } return Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, - config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); + config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); } void Predict(int start_iteration, int num_iteration, int predict_type, int nrow, int ncol, diff --git a/src/io/config.cpp b/src/io/config.cpp index 493bf1782362..3f62a408ff9b 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -342,9 +342,14 @@ void Config::CheckParamConflict() { if (deterministic) { Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); } - } - // force gpu_use_dp for CUDA - if (device_type == std::string("cuda") && !gpu_use_dp) { + } else if (device_type == std::string("cuda")) { + // force row-wise for single GPU CUDA version + force_col_wise = false; + force_row_wise = true; + } + // force gpu_use_dp for non-single GPU CUDA version + if (device_type == std::string("cuda") && + (num_gpu > 1 || tree_learner != std::string("serial")) && !gpu_use_dp) { Log::Warning("CUDA currently requires double precision calculations."); gpu_use_dp = true; } diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index cad72b82416f..844478085722 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -14,7 +14,12 @@ CUDARowData::CUDARowData(const Dataset* train_data, const int gpu_device_id): gpu_device_id_(gpu_device_id) { num_threads_ = OMP_NUM_THREADS(); num_data_ = train_data->num_data(); - num_total_bin_ = static_cast(train_share_state->feature_hist_offsets().back()); + const auto& feature_hist_offsets = train_share_state->feature_hist_offsets(); + if (feature_hist_offsets.empty()) { + num_total_bin_ = 0; + } else { + num_total_bin_ = static_cast(feature_hist_offsets.back()); + } num_feature_group_ = train_data->num_feature_groups(); num_feature_ = train_data->num_features(); if (gpu_device_id >= 0) { @@ -39,7 +44,7 @@ CUDARowData::CUDARowData(const Dataset* train_data, cuda_block_buffer_uint64_t_ = nullptr; } -CUDARowData::CUDARowData() { +CUDARowData::~CUDARowData() { DeallocateCUDAMemory(&cuda_data_uint8_t_, __FILE__, __LINE__); DeallocateCUDAMemory(&cuda_data_uint16_t_, __FILE__, __LINE__); DeallocateCUDAMemory(&cuda_data_uint32_t_, __FILE__, __LINE__); @@ -55,6 +60,9 @@ CUDARowData::CUDARowData() { } void CUDARowData::Init(const Dataset* train_data, TrainingShareStates* train_share_state) { + if (num_feature_ == 0) { + return; + } DivideCUDAFeatureGroups(train_data, train_share_state); bit_type_ = 0; size_t total_size = 0; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index ce9d5c408897..d16a974491b8 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1498,6 +1498,13 @@ void 
Dataset::AddFeaturesFrom(Dataset* other) { raw_data_.push_back(other->raw_data_[i]); } } + #ifdef USE_CUDA + if (device_type_ == std::string("cuda")) { + CreateCUDAColumnData(); + } else { + cuda_column_data_ = nullptr; + } + #endif // USE_CUDA } const void* Dataset::GetColWiseData( diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 2d2c4d622b1c..e81ea20551ec 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -248,6 +248,16 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac is_load_from_binary = true; Log::Info("Load from binary file %s", bin_filename.c_str()); dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), rank, num_machines, &num_global_data, &used_data_indices)); + #ifdef USE_CUDA + dataset->device_type_ = config_.device_type; + dataset->gpu_device_id_ = config_.gpu_device_id; + if (config_.device_type == std::string("cuda")) { + dataset->CreateCUDAColumnData(); + dataset->metadata_.CreateCUDAMetadata(dataset->gpu_device_id_); + } else { + dataset->cuda_column_data_ = nullptr; + } + #endif // USE_CUDA } // check meta data dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); diff --git a/src/io/multi_val_sparse_bin.cpp b/src/io/multi_val_sparse_bin.cpp index d465fda4da09..b03e8bb7c97e 100644 --- a/src/io/multi_val_sparse_bin.cpp +++ b/src/io/multi_val_sparse_bin.cpp @@ -104,7 +104,7 @@ const void* MultiValSparseBin::GetRowWiseData( } template <> -const void* MultiValSparseBin::GetRowWiseData( +const void* MultiValSparseBin::GetRowWiseData( uint8_t* bit_type, size_t* total_size, bool* is_sparse, @@ -120,7 +120,7 @@ const void* MultiValSparseBin::GetRowWiseData( } template <> -const void* MultiValSparseBin::GetRowWiseData( +const void* MultiValSparseBin::GetRowWiseData( uint8_t* bit_type, size_t* total_size, bool* is_sparse, @@ -136,7 +136,7 @@ const void* MultiValSparseBin::GetRowWiseData( } template <> -const void* MultiValSparseBin::GetRowWiseData( +const void* MultiValSparseBin::GetRowWiseData( uint8_t* bit_type, size_t* total_size, bool* is_sparse, diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 45d8d10bd166..70ba2ecdeca3 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -113,7 +113,7 @@ __device__ int ReduceBestGainForLeaves(double gain, int leaf_index, double* shar ReduceBestGainForLeavesWarp(gain, leaf_index, shared_gain_buffer + warpID, shared_leaf_index_buffer + warpID); __syncthreads(); if (warpID == 0) { - gain = warpLane < num_warp ? shared_gain_buffer[warpLane] : 0.0f; + gain = warpLane < num_warp ? shared_gain_buffer[warpLane] : kMinScore; leaf_index = warpLane < num_warp ? 
shared_leaf_index_buffer[warpLane] : -1; leaf_index = ReduceBestGainForLeavesBlock(gain, leaf_index); } @@ -579,9 +579,9 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( const bool larger_only) { if (!larger_only) { if (blockIdx.x == 0) { + CUDASplitInfo* smaller_leaf_split_info = cuda_leaf_best_split_info + smaller_leaf_index; for (unsigned int block_index = 1; block_index < num_blocks_per_leaf; ++block_index) { const unsigned int leaf_read_pos = static_cast(smaller_leaf_index) + block_index * static_cast(num_leaves); - CUDASplitInfo* smaller_leaf_split_info = cuda_leaf_best_split_info + smaller_leaf_index; const CUDASplitInfo* other_split_info = cuda_leaf_best_split_info + leaf_read_pos; if ((other_split_info->is_valid && smaller_leaf_split_info->is_valid && other_split_info->gain > smaller_leaf_split_info->gain) || @@ -607,9 +607,9 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( } if (larger_leaf_index >= 0) { if (blockIdx.x == 1 || larger_only) { + CUDASplitInfo* larger_leaf_split_info = cuda_leaf_best_split_info + larger_leaf_index; for (unsigned int block_index = 1; block_index < num_blocks_per_leaf; ++block_index) { const unsigned int leaf_read_pos = static_cast(larger_leaf_index) + block_index * static_cast(num_leaves); - CUDASplitInfo* larger_leaf_split_info = cuda_leaf_best_split_info + larger_leaf_index; const CUDASplitInfo* other_split_info = cuda_leaf_best_split_info + leaf_read_pos; if ((other_split_info->is_valid && larger_leaf_split_info->is_valid && other_split_info->gain > larger_leaf_split_info->gain) || @@ -635,11 +635,34 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( } } +__global__ void SetInvalidLeafSplitInfoKernel( + CUDASplitInfo* cuda_leaf_best_split_info, + const bool is_smaller_leaf_valid, + const bool is_larger_leaf_valid, + const int smaller_leaf_index, + const int larger_leaf_index) { + if (!is_smaller_leaf_valid) { + cuda_leaf_best_split_info[smaller_leaf_index].is_valid = false; + } + if (!is_larger_leaf_valid && larger_leaf_index >= 0) { + cuda_leaf_best_split_info[larger_leaf_index].is_valid = false; + } +} + void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( const int host_smaller_leaf_index, const int host_larger_leaf_index, const bool is_smaller_leaf_valid, const bool is_larger_leaf_valid) { + if (!is_smaller_leaf_valid || !is_larger_leaf_valid) { + SetInvalidLeafSplitInfoKernel<<<1, 1>>>( + cuda_leaf_best_split_info_, + is_smaller_leaf_valid, is_larger_leaf_valid, + host_smaller_leaf_index, host_larger_leaf_index); + } + if (!is_smaller_leaf_valid && !is_larger_leaf_valid) { + return; + } int num_tasks = num_tasks_; int num_tasks_aligned = 1; num_tasks -= 1; diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 59f069774629..6291470900e1 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -10,11 +10,12 @@ #ifdef USE_CUDA #include -#include #include #include +#include + #include "cuda_leaf_splits.hpp" #define MAX_NUM_BIN_IN_FEATURE (256) diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index cf634304d27c..9ba5118d9927 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -8,16 +8,16 @@ #ifdef USE_CUDA -#include #include #include #include -#include "cuda_leaf_splits.hpp" +#include #include #include -// TODO(shiyu1994): adjust these values according to different CUDA and GPU versions 
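// Fixed thread-block sizes for the data-partition kernels declared below; 1024 threads
// per block is the per-block hardware maximum on current NVIDIA GPUs.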
+#include "cuda_leaf_splits.hpp" + #define FILL_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) #define SPLIT_INDICES_BLOCK_SIZE_DATA_PARTITION (1024) #define AGGREGATE_BLOCK_SIZE_DATA_PARTITION (1024) diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 2545943c5337..cd853cd22be5 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -75,7 +75,11 @@ void CUDAHistogramConstructor::InitFeatureMetaInfo(const Dataset* train_data, co for (size_t i = 0; i < feature_hist_offsets.size(); ++i) { feature_hist_offsets_.emplace_back(feature_hist_offsets[i]); } - num_total_bin_ = static_cast(feature_hist_offsets.back()); + if (feature_hist_offsets.empty()) { + num_total_bin_ = 0; + } else { + num_total_bin_ = static_cast(feature_hist_offsets.back()); + } } void CUDAHistogramConstructor::BeforeTrain(const score_t* gradients, const score_t* hessians) { diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 7ca15fa8d77c..3b9714523351 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -44,8 +45,10 @@ void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_ config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf, gpu_device_id_)); cuda_histogram_constructor_->Init(train_data_, share_state_.get()); + const auto& feature_hist_offsets = share_state_->feature_hist_offsets(); + const int num_total_bin = feature_hist_offsets.empty() ? 0 : static_cast(feature_hist_offsets.back()); cuda_data_partition_.reset(new CUDADataPartition( - train_data_, share_state_->feature_hist_offsets().back(), config_->num_leaves, num_threads_, + train_data_, num_total_bin, config_->num_leaves, num_threads_, cuda_histogram_constructor_->cuda_hist_pointer())); cuda_data_partition_->Init(); @@ -242,7 +245,44 @@ void CUDASingleGPUTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFuncti data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { CHECK(tree->is_cuda_tree()); CUDATree* cuda_tree = reinterpret_cast(tree); - SerialTreeLearner::RenewTreeOutput(tree, obj, residual_getter, total_num_data, bag_indices, bag_cnt); + if (obj != nullptr && obj->IsRenewTreeOutput()) { + CHECK_LE(cuda_tree->num_leaves(), data_partition_->num_leaves()); + const data_size_t* bag_mapper = nullptr; + if (total_num_data != num_data_) { + CHECK_EQ(bag_cnt, num_data_); + bag_mapper = bag_indices; + } + std::vector n_nozeroworker_perleaf(tree->num_leaves(), 1); + int num_machines = Network::num_machines(); + #pragma omp parallel for schedule(static) + for (int i = 0; i < tree->num_leaves(); ++i) { + const double output = static_cast(tree->LeafOutput(i)); + data_size_t cnt_leaf_data = leaf_num_data_[i]; + std::vector index_mapper(cnt_leaf_data, -1); + CopyFromCUDADeviceToHost(index_mapper.data(), + cuda_data_partition_->cuda_data_indices() + leaf_data_start_[i], + static_cast(cnt_leaf_data), __FILE__, __LINE__); + if (cnt_leaf_data > 0) { + const double new_output = obj->RenewTreeOutput(output, residual_getter, index_mapper.data(), bag_mapper, cnt_leaf_data); + tree->SetLeafOutput(i, new_output); + } else { + CHECK_GT(num_machines, 1); + tree->SetLeafOutput(i, 0.0); + n_nozeroworker_perleaf[i] = 0; + } + } + if (num_machines > 1) { + 
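      // Distributed case: each worker has renewed only the leaves for which it holds data,
      // so the per-leaf outputs and the per-leaf non-zero-worker counters are summed across
      // machines with Network::GlobalSum, and each leaf output is then averaged over the
      // workers that actually contributed to it.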
std::vector outputs(tree->num_leaves()); + for (int i = 0; i < tree->num_leaves(); ++i) { + outputs[i] = static_cast(tree->LeafOutput(i)); + } + outputs = Network::GlobalSum(&outputs); + n_nozeroworker_perleaf = Network::GlobalSum(&n_nozeroworker_perleaf); + for (int i = 0; i < tree->num_leaves(); ++i) { + tree->SetLeafOutput(i, outputs[i] / n_nozeroworker_perleaf[i]); + } + } + } cuda_tree->SyncLeafOutputFromHostToCUDA(); } From e57dd15305c77b0a976dcb070bcdb6fe287f309a Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 8 Oct 2021 09:00:13 +0000 Subject: [PATCH 095/166] fix lint errors hide GetRowWiseData when cuda is not used --- include/LightGBM/bin.h | 2 ++ include/LightGBM/config.h | 2 +- include/LightGBM/train_share_states.h | 6 ++++++ src/io/multi_val_dense_bin.cpp | 4 ++++ src/io/multi_val_dense_bin.hpp | 2 ++ src/io/multi_val_sparse_bin.cpp | 4 ++++ src/io/multi_val_sparse_bin.hpp | 3 +++ src/treelearner/cuda/cuda_data_partition.hpp | 2 ++ src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp | 4 ++-- 9 files changed, 26 insertions(+), 3 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 0c50eadf6f06..0e63851bfee0 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -464,11 +464,13 @@ class MultiValBin { virtual MultiValBin* Clone() = 0; + #ifdef USE_CUDA virtual const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, const void** out_data_ptr, uint8_t* data_ptr_bit_type) const = 0; + #endif // USE_CUDA }; inline uint32_t BinMapper::ValueToBin(double value) const { diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 38ea18df60dd..76aa6e841288 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -209,7 +209,7 @@ struct Config { // desc = **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up // desc = **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. 
You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support - std::string device_type = "cpu"; + std::string device_type = "cuda"; // [doc-only] // alias = random_seed, random_state diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h index 466575b00fef..93cbf60ff5dd 100644 --- a/include/LightGBM/train_share_states.h +++ b/include/LightGBM/train_share_states.h @@ -125,6 +125,8 @@ class MultiValBinWrapper { is_subrow_copied_ = is_subrow_copied; } + + #ifdef USE_CUDA const void* GetRowWiseData( uint8_t* bit_type, size_t* total_size, @@ -140,6 +142,7 @@ class MultiValBinWrapper { return multi_val_bin_->GetRowWiseData(bit_type, total_size, is_sparse, out_data_ptr, data_ptr_bit_type); } } + #endif // USE_CUDA private: bool is_use_subcol_ = false; @@ -229,6 +232,8 @@ struct TrainingShareStates { } } + + #ifdef USE_CUDA const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, @@ -243,6 +248,7 @@ struct TrainingShareStates { return nullptr; } } + #endif // USE_CUDA private: std::vector feature_hist_offsets_; diff --git a/src/io/multi_val_dense_bin.cpp b/src/io/multi_val_dense_bin.cpp index dbbdb973a347..d1d70f8e3bed 100644 --- a/src/io/multi_val_dense_bin.cpp +++ b/src/io/multi_val_dense_bin.cpp @@ -7,6 +7,8 @@ namespace LightGBM { + +#ifdef USE_CUDA template <> const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, @@ -57,4 +59,6 @@ const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, return to_return; } +#endif // USE_CUDA + } // namespace LightGBM diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index 68ccbe904342..b4fbfbe673aa 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -211,11 +211,13 @@ class MultiValDenseBin : public MultiValBin { MultiValDenseBin* Clone() override; + #ifdef USE_CUDA const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, const void** out_data_ptr, uint8_t* data_ptr_bit_type) const override; + #endif // USE_CUDA private: data_size_t num_data_; diff --git a/src/io/multi_val_sparse_bin.cpp b/src/io/multi_val_sparse_bin.cpp index b03e8bb7c97e..55d8e82492ad 100644 --- a/src/io/multi_val_sparse_bin.cpp +++ b/src/io/multi_val_sparse_bin.cpp @@ -7,6 +7,8 @@ namespace LightGBM { +#ifdef USE_CUDA + template <> const void* MultiValSparseBin::GetRowWiseData( uint8_t* bit_type, @@ -151,4 +153,6 @@ const void* MultiValSparseBin::GetRowWiseData( return to_return; } +#endif // USE_CUDA + } // namespace LightGBM diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index ad2568d0affa..eaa30ef0a0cc 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -291,11 +291,14 @@ class MultiValSparseBin : public MultiValBin { MultiValSparseBin* Clone() override; + + #ifdef USE_CUDA const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, const void** out_data_ptr, uint8_t* data_ptr_bit_type) const override; + #endif // USE_CUDA private: data_size_t num_data_; diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 9ba5118d9927..73afeec48d15 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -12,6 +12,8 @@ #include #include +#include + #include 
#include #include diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index da4a10951be4..2c5dfcb46ad2 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -6,11 +6,11 @@ #ifndef LIGHTGBM_NEW_CUDA_TREE_LEARNER_HPP_ #define LIGHTGBM_NEW_CUDA_TREE_LEARNER_HPP_ -#ifdef USE_CUDA - #include #include +#ifdef USE_CUDA + #include "cuda_leaf_splits.hpp" #include "cuda_histogram_constructor.hpp" #include "cuda_data_partition.hpp" From a5b9f7a8bcf575b32c1dcf24e9a5c6d228bc8f6e Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 8 Oct 2021 09:04:51 +0000 Subject: [PATCH 096/166] recover default device type to cpu --- include/LightGBM/config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 76aa6e841288..38ea18df60dd 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -209,7 +209,7 @@ struct Config { // desc = **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up // desc = **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support - std::string device_type = "cuda"; + std::string device_type = "cpu"; // [doc-only] // alias = random_seed, random_state From 5f03d4517c8ddbd698aea2e86710bd996c07e9e1 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sat, 9 Oct 2021 08:10:38 +0000 Subject: [PATCH 097/166] fix na_as_missing case fix cuda feature meta information --- src/io/dataset.cpp | 24 +++++++++---------- .../cuda/cuda_best_split_finder.cu | 3 ++- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index d16a974491b8..e001fccbb3f1 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1568,14 +1568,14 @@ void Dataset::CreateCUDAColumnData() { feature_most_freq_bins[feature_index] = most_freq_bin; feature_default_bin[feature_index] = feature_bin_mapper->GetDefaultBin(); if (feature_bin_mapper->missing_type() == MissingType::Zero) { - feature_missing_is_zero.emplace_back(1); - feature_missing_is_na.emplace_back(0); + feature_missing_is_zero[feature_index] = 1; + feature_missing_is_na[feature_index] = 0; } else if (feature_bin_mapper->missing_type() == MissingType::NaN) { - feature_missing_is_zero.emplace_back(0); - feature_missing_is_na.emplace_back(1); + feature_missing_is_zero[feature_index] = 0; + feature_missing_is_na[feature_index] = 1; } else { - feature_missing_is_zero.emplace_back(0); - feature_missing_is_na.emplace_back(0); + feature_missing_is_zero[feature_index] = 0; + feature_missing_is_na[feature_index] = 0; } ++feature_index; } @@ -1601,14 +1601,14 @@ void Dataset::CreateCUDAColumnData() { feature_most_freq_bins[feature_index] = most_freq_bin; feature_default_bin[feature_index] = feature_bin_mapper->GetDefaultBin(); if (feature_bin_mapper->missing_type() == MissingType::Zero) { - feature_missing_is_zero.emplace_back(1); - feature_missing_is_na.emplace_back(0); + feature_missing_is_zero[feature_index] = 1; + feature_missing_is_na[feature_index] = 0; } else if (feature_bin_mapper->missing_type() == MissingType::NaN) { - feature_missing_is_zero.emplace_back(0); - 
feature_missing_is_na.emplace_back(1); + feature_missing_is_zero[feature_index] = 0; + feature_missing_is_na[feature_index] = 1; } else { - feature_missing_is_zero.emplace_back(0); - feature_missing_is_na.emplace_back(0); + feature_missing_is_zero[feature_index] = 0; + feature_missing_is_na[feature_index] = 0; } ++feature_index; } diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 70ba2ecdeca3..9419e542d953 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -229,7 +229,8 @@ __device__ void FindBestSplitsForLeafKernelInner( local_hess_hist = feature_hist_ptr[bin_offset + 1]; } } else { - if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { + if (threadIdx_x >= static_cast(na_as_missing) && + threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { const unsigned int read_index = feature_num_bin_minus_offset - 1 - threadIdx_x; const unsigned int bin_offset = read_index << 1; local_grad_hist = feature_hist_ptr[bin_offset]; From b2aaa9f2ec56d6212beefc3e6fd820b6a6b88c8e Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 15 Oct 2021 06:44:59 +0000 Subject: [PATCH 098/166] fix UpdateDataIndexToLeafIndexKernel --- src/treelearner/cuda/cuda_data_partition.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 1d690d4a7c3b..444750873e4a 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -70,7 +70,7 @@ __device__ __forceinline__ void PrepareOffset(const data_size_t num_data_in_leaf } } -template +template __global__ void UpdateDataIndexToLeafIndexKernel( const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, const uint32_t th, const BIN_TYPE* column_data, @@ -94,6 +94,8 @@ __global__ void UpdateDataIndexToLeafIndexKernel( } } else if (bin > th) { cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + } else { + cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } else { if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { @@ -110,6 +112,8 @@ __global__ void UpdateDataIndexToLeafIndexKernel( } else { if (!MAX_TO_LEFT) { cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + } else { + cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; } } } From 0726d87baf4a1f083d8dbf4c4b9f2ea9db6d17b1 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 15 Oct 2021 08:22:25 +0000 Subject: [PATCH 099/166] create CUDA trees when needed in CUDADataPartition::UpdateTrainScore --- src/io/cuda/cuda_tree.cpp | 11 +++++++++++ src/treelearner/cuda/cuda_data_partition.cpp | 11 +++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index 0bc01f83c91e..3a019fb862a4 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -161,6 +161,17 @@ void CUDATree::InitCUDA() { leaf_value_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_leaf_weight_, + leaf_weight_.data(), + leaf_weight_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_leaf_parent_, + leaf_parent_.data(), + leaf_parent_.size(), + __FILE__, + __LINE__); + CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); SynchronizeCUDADevice(__FILE__, __LINE__); } diff --git a/src/treelearner/cuda/cuda_data_partition.cpp 
b/src/treelearner/cuda/cuda_data_partition.cpp index 3017e6037e5f..31401fb348ed 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -217,8 +217,14 @@ void CUDADataPartition::SplitInner( } void CUDADataPartition::UpdateTrainScore(const Tree* tree, double* scores) { - CHECK(tree->is_cuda_tree()); - const CUDATree* cuda_tree = reinterpret_cast(tree); + const CUDATree* cuda_tree = nullptr; + std::unique_ptr cuda_tree_ptr; + if (tree->is_cuda_tree()) { + cuda_tree = reinterpret_cast(tree); + } else { + cuda_tree_ptr.reset(new CUDATree(tree)); + cuda_tree = cuda_tree_ptr.get(); + } const data_size_t num_data_in_root = root_num_data(); if (use_bagging_) { // we need restore the order of indices in cuda_data_indices_ @@ -322,6 +328,7 @@ void CUDADataPartition::ResetConfig(const Config* config) { AllocateCUDAMemory(&cuda_leaf_data_start_, static_cast(num_leaves_), __FILE__, __LINE__); AllocateCUDAMemory(&cuda_leaf_data_end_, static_cast(num_leaves_), __FILE__, __LINE__); AllocateCUDAMemory(&cuda_leaf_num_data_, static_cast(num_leaves_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_hist_pool_, static_cast(num_leaves_), __FILE__, __LINE__); AllocateCUDAMemory(&cuda_leaf_output_, static_cast(num_leaves_), __FILE__, __LINE__); } From 1dea6bc2ac30094d44d9ad01d4517f67c5410c44 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 21 Oct 2021 06:37:08 +0000 Subject: [PATCH 100/166] add refit by tree for cuda tree learner --- src/io/cuda/cuda_tree.cpp | 33 ++++++- src/treelearner/cuda/cuda_data_partition.cpp | 12 +++ src/treelearner/cuda/cuda_data_partition.hpp | 4 + .../cuda/cuda_single_gpu_tree_learner.cpp | 20 +++++ .../cuda/cuda_single_gpu_tree_learner.cu | 87 +++++++++++++++++++ .../cuda/cuda_single_gpu_tree_learner.hpp | 14 +++ 6 files changed, 166 insertions(+), 4 deletions(-) create mode 100644 src/treelearner/cuda/cuda_single_gpu_tree_learner.cu diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index 3a019fb862a4..b97594bc4ff2 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -151,11 +151,36 @@ void CUDATree::InitCUDA() { threshold_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_leaf_depth_, + leaf_depth_.data(), + leaf_depth_.size(), + __FILE__, + __LINE__); InitCUDAMemoryFromHostMemory(&cuda_decision_type_, - decision_type_.data(), - decision_type_.size(), - __FILE__, - __LINE__); + decision_type_.data(), + decision_type_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_internal_weight_, + internal_weight_.data(), + internal_weight_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_internal_value_, + internal_value_.data(), + internal_value_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_internal_count_, + internal_count_.data(), + internal_count_.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_split_gain_, + split_gain_.data(), + split_gain_.size(), + __FILE__, + __LINE__); InitCUDAMemoryFromHostMemory(&cuda_leaf_value_, leaf_value_.data(), leaf_value_.size(), diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 31401fb348ed..0b68968c66df 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -339,6 +339,18 @@ void CUDADataPartition::SetBaggingSubset(const Dataset* subset) { cuda_column_data_ = subset->cuda_column_data(); } +void CUDADataPartition::ResetByLeafPred(const std::vector& 
leaf_pred, int num_leaves) { + if (leaf_pred.size() != static_cast(num_data_)) { + DeallocateCUDAMemory(&cuda_data_index_to_leaf_index_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_data_index_to_leaf_index_, leaf_pred.data(), leaf_pred.size(), __FILE__, __LINE__); + num_data_ = static_cast(leaf_pred.size()); + } else { + CopyFromHostToCUDADevice(cuda_data_index_to_leaf_index_, leaf_pred.data(), leaf_pred.size(), __FILE__, __LINE__); + } + num_leaves_ = num_leaves; + cur_num_leaves_ = num_leaves; +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 73afeec48d15..d3a59c6467aa 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -72,6 +72,8 @@ class CUDADataPartition { void ResetConfig(const Config* config); + void ResetByLeafPred(const std::vector& leaf_pred, int num_leaves); + data_size_t root_num_data() const { if (use_bagging_) { return num_used_indices_; @@ -86,6 +88,8 @@ class CUDADataPartition { const data_size_t* cuda_leaf_data_start() const { return cuda_leaf_data_start_; } + const int* cuda_data_index_to_leaf_index() const { return cuda_data_index_to_leaf_index_; } + bool use_bagging() const { return use_bagging_; } private: diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 3b9714523351..030c4e51c03e 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -65,6 +65,10 @@ void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_ AllocateCUDAMemory(&cuda_gradients_, static_cast(num_data_), __FILE__, __LINE__); AllocateCUDAMemory(&cuda_hessians_, static_cast(num_data_), __FILE__, __LINE__); + + leaf_gradient_stat_buffer_ = nullptr; + leaf_hessian_stat_buffer_ = nullptr; + leaf_stat_buffer_size_ = 0; } void CUDASingleGPUTreeLearner::BeforeTrain() { @@ -286,6 +290,22 @@ void CUDASingleGPUTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFuncti cuda_tree->SyncLeafOutputFromHostToCUDA(); } +Tree* CUDASingleGPUTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const { + ReduceLeafStat(old_tree, gradients, hessians); +} + +Tree* CUDASingleGPUTreeLearner::FitByExistingTree(const Tree* old_tree, const std::vector& leaf_pred, + const score_t* gradients, const score_t* hessians) const { + cuda_data_partition_->ResetByLeafPred(leaf_pred, old_tree->num_leaves()); + refit_num_data_ = static_cast(leaf_pred.size()); + FitByExistingTree(old_tree, gradients, hessians); +} + +void CUDASingleGPUTreeLearner::ReduceLeafStat( + const Tree* old_tree, const score_t* gradients, const score_t* hessians) const { + LaunchReduceLeafStatKernel(gradients, hessians, old_tree->num_leaves(), refit_num_data_); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu new file mode 100644 index 000000000000..b9fcbbe3be60 --- /dev/null +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu @@ -0,0 +1,87 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#ifdef USE_CUDA + +#include "cuda_single_gpu_tree_learner.hpp" + +namespace LightGBM { + +#define CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE (1024) + +__global__ void ReduceLeafStatKernel_SharedMemory( + const score_t* gradients, + const score_t* hessians, + const int num_leaves, + const data_size_t num_data, + const int* data_index_to_leaf_index, + double* leaf_grad_stat_buffer, + double* leaf_hess_stat_buffer) { + extern __shared__ double shared_mem[]; + double* shared_grad_sum = shared_mem; + double* shared_hess_sum = shared_mem + num_leaves; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + for (int leaf_index = static_cast(threadIdx.x); leaf_index < num_leaves; leaf_index += static_cast(blockDim.x)) { + shared_grad_sum[leaf_index] = 0.0f; + shared_hess_sum[leaf_index] = 0.0f; + } + if (data_index < num_data) { + const int leaf_index = data_index_to_leaf_index[data_index]; + atomicAdd_block(shared_grad_sum + leaf_index, gradients[data_index]); + atomicAdd_block(shared_hess_sum + leaf_index, hessians[data_index]); + } + __syncthreads(); + for (int leaf_index = static_cast(threadIdx.x); leaf_index < num_leaves; leaf_index += static_cast(blockDim.x)) { + atomicAdd_system(leaf_grad_stat_buffer + leaf_index, shared_grad_sum[leaf_index]); + atomicAdd_system(leaf_hess_stat_buffer + leaf_index, shared_hess_sum[leaf_index]); + } +} + +__global__ void ReduceLeafStatKernel_GlobalMemory( + const score_t* gradients, + const score_t* hessians, + const int num_leaves, + const data_size_t num_data, + const int* data_index_to_leaf_index, + double* leaf_grad_stat_buffer, + double* leaf_hess_stat_buffer) { + const size_t offset = static_cast(num_leaves) * (blockIdx.x + 1); + double* grad_sum = leaf_grad_stat_buffer + offset; + double* hess_sum = leaf_hess_stat_buffer + offset; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + for (int leaf_index = static_cast(threadIdx.x); leaf_index < num_leaves; leaf_index += static_cast(blockDim.x)) { + grad_sum[leaf_index] = 0.0f; + hess_sum[leaf_index] = 0.0f; + } + if (data_index < num_data) { + const int leaf_index = data_index_to_leaf_index[data_index]; + atomicAdd_block(grad_sum + leaf_index, gradients[data_index]); + atomicAdd_block(hess_sum + leaf_index, hessians[data_index]); + } + __syncthreads(); + for (int leaf_index = static_cast(threadIdx.x); leaf_index < num_leaves; leaf_index += static_cast(blockDim.x)) { + atomicAdd_system(leaf_grad_stat_buffer + leaf_index, grad_sum[leaf_index]); + atomicAdd_system(leaf_hess_stat_buffer + leaf_index, hess_sum[leaf_index]); + } +} + +void CUDASingleGPUTreeLearner::LaunchReduceLeafStatKernel( + const score_t* gradients, const score_t* hessians, const int num_leaves, const data_size_t num_data) const { + const int num_block = (num_data + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + if (num_leaves <= 2048) { + ReduceLeafStatKernel_SharedMemory<<>>( + gradients, hessians, num_leaves, num_data, cuda_data_partition_->cuda_data_index_to_leaf_index(), + leaf_gradient_stat_buffer_, leaf_hessian_stat_buffer_); + } else { + ReduceLeafStatKernel_GlobalMemory<<>>( + gradients, hessians, num_leaves, num_data, cuda_data_partition_->cuda_data_index_to_leaf_index(), + leaf_gradient_stat_buffer_, leaf_hessian_stat_buffer_); + } +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index 
2c5dfcb46ad2..9f36792463e0 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -42,9 +42,18 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { void ResetConfig(const Config* config) override; + Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override; + + Tree* FitByExistingTree(const Tree* old_tree, const std::vector& leaf_pred, + const score_t* gradients, const score_t* hessians) const override; + protected: void BeforeTrain() override; + void ReduceLeafStat(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const; + + void LaunchReduceLeafStatKernel(const score_t* gradients, const score_t* hessians, const int num_leaves, const data_size_t num_data) const; + // GPU device ID int gpu_device_id_; // number of threads on CPU @@ -72,6 +81,11 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { int larger_leaf_index_; int best_leaf_index_; + double* leaf_gradient_stat_buffer_; + double* leaf_hessian_stat_buffer_; + data_size_t leaf_stat_buffer_size_; + mutable data_size_t refit_num_data_; + /*! \brief gradients on CUDA */ score_t* cuda_gradients_; /*! \brief hessians on CUDA */ From 14b9ce9ad1b33f27f2fe7bb945d6fb885b20661a Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 21 Oct 2021 09:34:43 +0000 Subject: [PATCH 101/166] fix test_refit in test_engine.py --- include/LightGBM/cuda/cuda_tree.hpp | 2 + .../cuda/cuda_best_split_finder.cu | 10 ++--- .../cuda/cuda_best_split_finder.hpp | 5 +++ src/treelearner/cuda/cuda_data_partition.cpp | 1 + .../cuda/cuda_single_gpu_tree_learner.cpp | 30 ++++++++++--- .../cuda/cuda_single_gpu_tree_learner.cu | 43 ++++++++++++++++--- .../cuda/cuda_single_gpu_tree_learner.hpp | 13 +++--- 7 files changed, 82 insertions(+), 22 deletions(-) diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index f8b303072375..6a398a612026 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -71,6 +71,8 @@ class CUDATree : public Tree { void SyncLeafOutputFromHostToCUDA(); + void SyncLeafOutputFromCUDAToHost(); + private: void InitCUDAMemory(); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 9419e542d953..59a0848565e0 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -129,7 +129,7 @@ __device__ double ThresholdL1(double s, double l1) { } } -__device__ double CalculateSplittedLeafOutput(double sum_gradients, +__device__ double CUDABestSplitFinder::CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, const bool use_l1, double l2) { double ret; @@ -308,9 +308,9 @@ __device__ void FindBestSplitsForLeafKernelInner( const double sum_left_gradient = sum_gradients - sum_right_gradient; const double sum_left_hessian = sum_hessians - sum_right_hessian - kEpsilon; const data_size_t left_count = num_data - right_count; - const double left_output = CalculateSplittedLeafOutput(sum_left_gradient, + const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian, lambda_l1, use_l1, lambda_l2); - const double right_output = CalculateSplittedLeafOutput(sum_right_gradient, + const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, sum_right_hessian, lambda_l1, use_l1, lambda_l2); 
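      // CalculateSplittedLeafOutput is qualified as a static member of CUDABestSplitFinder here
      // so that the refit kernel added later in this series can reuse it; it returns the usual
      // regularized GBDT leaf value, roughly -ThresholdL1(sum_gradients, lambda_l1) / (sum_hessians + lambda_l2),
      // where ThresholdL1 soft-thresholds the gradient sum and is skipped when lambda_l1 is zero.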
cuda_best_split_info->left_sum_gradients = sum_left_gradient; cuda_best_split_info->left_sum_hessians = sum_left_hessian; @@ -331,9 +331,9 @@ __device__ void FindBestSplitsForLeafKernelInner( const double sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian - kEpsilon; const data_size_t right_count = num_data - left_count; - const double left_output = CalculateSplittedLeafOutput(sum_left_gradient, + const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian, lambda_l1, use_l1, lambda_l2); - const double right_output = CalculateSplittedLeafOutput(sum_right_gradient, + const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, sum_right_hessian, lambda_l1, use_l1, lambda_l2); cuda_best_split_info->left_sum_gradients = sum_left_gradient; cuda_best_split_info->left_sum_hessians = sum_left_hessian; diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 6291470900e1..63de299de6c3 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -71,6 +71,11 @@ class CUDABestSplitFinder { void ResetConfig(const Config* config); + __device__ static double CalculateSplittedLeafOutput( + double sum_gradients, + double sum_hessians, double l1, const bool use_l1, + double l2); + private: void LaunchFindBestSplitsForLeafKernel(const CUDALeafSplitsStruct* smaller_leaf_splits, const CUDALeafSplitsStruct* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 0b68968c66df..1cb8d03f350a 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -98,6 +98,7 @@ void CUDADataPartition::Init() { InitCUDAMemoryFromHostMemory(&cuda_num_data_, &num_data_, 1, __FILE__, __LINE__); add_train_score_.resize(num_data_, 0.0f); + Log::Warning("cuda_add_train_score_ size = %d", num_data_); AllocateCUDAMemory(&cuda_add_train_score_, static_cast(num_data_), __FILE__, __LINE__); use_bagging_ = false; used_indices_ = nullptr; diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 030c4e51c03e..bf6953c1358f 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -66,8 +66,8 @@ void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_ AllocateCUDAMemory(&cuda_gradients_, static_cast(num_data_), __FILE__, __LINE__); AllocateCUDAMemory(&cuda_hessians_, static_cast(num_data_), __FILE__, __LINE__); - leaf_gradient_stat_buffer_ = nullptr; - leaf_hessian_stat_buffer_ = nullptr; + cuda_leaf_gradient_stat_buffer_ = nullptr; + cuda_leaf_hessian_stat_buffer_ = nullptr; leaf_stat_buffer_size_ = 0; } @@ -291,19 +291,37 @@ void CUDASingleGPUTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFuncti } Tree* CUDASingleGPUTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const { - ReduceLeafStat(old_tree, gradients, hessians); + std::unique_ptr cuda_tree(new CUDATree(old_tree)); + SetCUDAMemory(cuda_leaf_gradient_stat_buffer_, 0, static_cast(old_tree->num_leaves()), __FILE__, __LINE__); + SetCUDAMemory(cuda_leaf_hessian_stat_buffer_, 0, 
static_cast(old_tree->num_leaves()), __FILE__, __LINE__); + ReduceLeafStat(cuda_tree.get(), gradients, hessians); + cuda_tree->SyncLeafOutputFromCUDAToHost(); + return cuda_tree.release(); } Tree* CUDASingleGPUTreeLearner::FitByExistingTree(const Tree* old_tree, const std::vector& leaf_pred, const score_t* gradients, const score_t* hessians) const { cuda_data_partition_->ResetByLeafPred(leaf_pred, old_tree->num_leaves()); refit_num_data_ = static_cast(leaf_pred.size()); - FitByExistingTree(old_tree, gradients, hessians); + data_size_t buffer_size = static_cast(old_tree->num_leaves()); + if (old_tree->num_leaves() > 2048) { + const int num_block = (refit_num_data_ + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + buffer_size *= static_cast(num_block + 1); + } + if (buffer_size != leaf_stat_buffer_size_) { + if (leaf_stat_buffer_size_ != 0) { + DeallocateCUDAMemory(&cuda_leaf_gradient_stat_buffer_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_leaf_hessian_stat_buffer_, __FILE__, __LINE__); + } + AllocateCUDAMemory(&cuda_leaf_gradient_stat_buffer_, static_cast(buffer_size), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_leaf_hessian_stat_buffer_, static_cast(buffer_size), __FILE__, __LINE__); + } + return FitByExistingTree(old_tree, gradients, hessians); } void CUDASingleGPUTreeLearner::ReduceLeafStat( - const Tree* old_tree, const score_t* gradients, const score_t* hessians) const { - LaunchReduceLeafStatKernel(gradients, hessians, old_tree->num_leaves(), refit_num_data_); + CUDATree* old_tree, const score_t* gradients, const score_t* hessians) const { + LaunchReduceLeafStatKernel(gradients, hessians, old_tree->num_leaves(), refit_num_data_, old_tree->cuda_leaf_value_ref(), old_tree->shrinkage()); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu index b9fcbbe3be60..d8ec44aa8e17 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu @@ -10,8 +10,6 @@ namespace LightGBM { -#define CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE (1024) - __global__ void ReduceLeafStatKernel_SharedMemory( const score_t* gradients, const score_t* hessians, @@ -28,6 +26,7 @@ __global__ void ReduceLeafStatKernel_SharedMemory( shared_grad_sum[leaf_index] = 0.0f; shared_hess_sum[leaf_index] = 0.0f; } + __syncthreads(); if (data_index < num_data) { const int leaf_index = data_index_to_leaf_index[data_index]; atomicAdd_block(shared_grad_sum + leaf_index, gradients[data_index]); @@ -56,6 +55,7 @@ __global__ void ReduceLeafStatKernel_GlobalMemory( grad_sum[leaf_index] = 0.0f; hess_sum[leaf_index] = 0.0f; } + __syncthreads(); if (data_index < num_data) { const int leaf_index = data_index_to_leaf_index[data_index]; atomicAdd_block(grad_sum + leaf_index, gradients[data_index]); @@ -68,18 +68,49 @@ __global__ void ReduceLeafStatKernel_GlobalMemory( } } +__global__ void CalcRefitLeafOutputKernel( + const int num_leaves, + const double* leaf_grad_stat_buffer, + const double* leaf_hess_stat_buffer, + const double lambda_l1, + const bool use_l1, + const double lambda_l2, + const double shrinkage_rate, + const double refit_decay_rate, + double* leaf_value) { + const int leaf_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (leaf_index < num_leaves) { + const double sum_gradients = leaf_grad_stat_buffer[leaf_index]; + const double sum_hessians = leaf_hess_stat_buffer[leaf_index]; + const double old_leaf_value = 
leaf_value[leaf_index]; + double new_leaf_value = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_gradients, sum_hessians, lambda_l1, use_l1, lambda_l2); + if (isnan(new_leaf_value)) { + new_leaf_value = 0.0f; + } else { + new_leaf_value *= shrinkage_rate; + } + leaf_value[leaf_index] = refit_decay_rate * old_leaf_value + (1.0f - refit_decay_rate) * new_leaf_value; + } +} + void CUDASingleGPUTreeLearner::LaunchReduceLeafStatKernel( - const score_t* gradients, const score_t* hessians, const int num_leaves, const data_size_t num_data) const { - const int num_block = (num_data + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + const score_t* gradients, const score_t* hessians, const int num_leaves, + const data_size_t num_data, double* cuda_leaf_value, const double shrinkage_rate) const { + int num_block = (num_data + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; if (num_leaves <= 2048) { ReduceLeafStatKernel_SharedMemory<<>>( gradients, hessians, num_leaves, num_data, cuda_data_partition_->cuda_data_index_to_leaf_index(), - leaf_gradient_stat_buffer_, leaf_hessian_stat_buffer_); + cuda_leaf_gradient_stat_buffer_, cuda_leaf_hessian_stat_buffer_); } else { ReduceLeafStatKernel_GlobalMemory<<>>( gradients, hessians, num_leaves, num_data, cuda_data_partition_->cuda_data_index_to_leaf_index(), - leaf_gradient_stat_buffer_, leaf_hessian_stat_buffer_); + cuda_leaf_gradient_stat_buffer_, cuda_leaf_hessian_stat_buffer_); } + const bool use_l1 = config_->lambda_l1 > 0.0f; + num_block = (num_leaves + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + CalcRefitLeafOutputKernel<<>>( + num_leaves, cuda_leaf_gradient_stat_buffer_, cuda_leaf_hessian_stat_buffer_, + config_->lambda_l1, use_l1, config_->lambda_l2, shrinkage_rate, config_->refit_decay_rate, cuda_leaf_value); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index 9f36792463e0..becbbd608367 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -20,6 +20,8 @@ namespace LightGBM { +#define CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE (1024) + class CUDASingleGPUTreeLearner: public SerialTreeLearner { public: explicit CUDASingleGPUTreeLearner(const Config* config); @@ -50,9 +52,10 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { protected: void BeforeTrain() override; - void ReduceLeafStat(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const; + void ReduceLeafStat(CUDATree* old_tree, const score_t* gradients, const score_t* hessians) const; - void LaunchReduceLeafStatKernel(const score_t* gradients, const score_t* hessians, const int num_leaves, const data_size_t num_data) const; + void LaunchReduceLeafStatKernel(const score_t* gradients, const score_t* hessians, + const int num_leaves, const data_size_t num_data, double* cuda_leaf_value, const double shrinkage_rate) const; // GPU device ID int gpu_device_id_; @@ -81,9 +84,9 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { int larger_leaf_index_; int best_leaf_index_; - double* leaf_gradient_stat_buffer_; - double* leaf_hessian_stat_buffer_; - data_size_t leaf_stat_buffer_size_; + mutable double* cuda_leaf_gradient_stat_buffer_; + mutable double* cuda_leaf_hessian_stat_buffer_; + mutable data_size_t leaf_stat_buffer_size_; mutable data_size_t 
refit_num_data_; /*! \brief gradients on CUDA */ From 4b936de89729481808eb636f8ace71d3a4894fa9 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 21 Oct 2021 12:06:12 +0000 Subject: [PATCH 102/166] create set of large bin partitions in CUDARowData --- include/LightGBM/cuda/cuda_row_data.hpp | 9 +++- src/io/cuda/cuda_row_data.cpp | 61 ++++++++++++++++++++++++- src/io/cuda/cuda_tree.cpp | 5 ++ 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index d2b3eab1568d..cc0c39e40c2e 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -40,6 +40,8 @@ class CUDARowData { void CopySubrowAndSubcol(const CUDARowData* full_set, const data_size_t* used_indices, const data_size_t num_used_indices, const std::vector& is_feature_used, const Dataset* train_data); + int NumLargeBinPartition() const { return static_cast(large_bin_partitions_.size()); } + int num_feature_partitions() const { return num_feature_partitions_; } int max_num_column_per_partition() const { return max_num_column_per_partition_; } @@ -128,7 +130,8 @@ class CUDARowData { int cur_num_feature_partition_buffer_size_; /*! \brief CUDA device ID */ int gpu_device_id_; - + /*! \brief index of partitions with large bins that its histogram cannot fit into shared memory */ + std::vector large_bin_partitions_; // CUDA memory @@ -162,6 +165,10 @@ class CUDARowData { uint32_t* cuda_block_buffer_uint32_t_; /*! \brief block buffer when calculating prefix sum */ uint64_t* cuda_block_buffer_uint64_t_; + /*! \brief small bin partition index to global partition index */ + int* cuda_small_partition_index_to_global_partition_index_; + /*! \brief large bin partition index to global partition index */ + int* cuda_large_partition_index_to_global_partition_index_; }; } // namespace LightGBM diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index 844478085722..258d16c43746 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -193,14 +193,26 @@ void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingSha const int num_feature_groups = train_data->num_feature_groups(); int column_index = 0; num_feature_partitions_ = 0; + large_bin_partitions_.clear(); for (int feature_group_index = 0; feature_group_index < num_feature_groups; ++feature_group_index) { if (!train_data->IsMultiGroup(feature_group_index)) { const uint32_t column_feature_hist_start = column_hist_offsets[column_index]; const uint32_t column_feature_hist_end = column_hist_offsets[column_index + 1]; const uint32_t num_bin_in_dense_group = column_feature_hist_end - column_feature_hist_start; + + // if one column has too many bins, use a separate partition for that column if (num_bin_in_dense_group > max_num_bin_per_partition) { - Log::Fatal("Too many bins in a dense feature group."); + feature_partition_column_index_offsets_.emplace_back(column_index + 1); + start_hist_offset = column_feature_hist_end; + partition_hist_offsets_.emplace_back(start_hist_offset); + large_bin_partitions_.emplace_back(num_feature_partitions_); + ++num_feature_partitions_; + column_hist_offsets_.emplace_back(0); + ++column_index; + continue; } + + // try if adding this column exceed the maximum number per partition const uint32_t cur_hist_num_bin = column_feature_hist_end - start_hist_offset; if (cur_hist_num_bin > max_num_bin_per_partition) { feature_partition_column_index_offsets_.emplace_back(column_index); @@ 
-222,6 +234,21 @@ void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingSha const int feature_index = group_feature_index_start + sub_feature_index; const uint32_t column_feature_hist_start = column_hist_offsets[column_index]; const uint32_t column_feature_hist_end = column_hist_offsets[column_index + 1]; + const uint32_t num_bin_in_dense_group = column_feature_hist_end - column_feature_hist_start; + + // if one column has too many bins, use a separate partition for that column + if (num_bin_in_dense_group > max_num_bin_per_partition) { + feature_partition_column_index_offsets_.emplace_back(column_index + 1); + start_hist_offset = column_feature_hist_end; + partition_hist_offsets_.emplace_back(start_hist_offset); + large_bin_partitions_.emplace_back(num_feature_partitions_); + ++num_feature_partitions_; + column_hist_offsets_.emplace_back(0); + ++column_index; + continue; + } + + // try if adding this column exceed the maximum number per partition const uint32_t cur_hist_num_bin = column_feature_hist_end - start_hist_offset; if (cur_hist_num_bin > max_num_bin_per_partition) { feature_partition_column_index_offsets_.emplace_back(column_index); @@ -249,6 +276,38 @@ void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingSha } } + if (!large_bin_partitions_.empty()) { + std::vector small_partition_index_to_global_partition_index; + std::vector large_partition_index_to_global_partition_index; + int partition_index = 0; + int large_bin_partition_index = 0; + while (partition_index < num_feature_partitions_ && + large_bin_partition_index < static_cast(large_bin_partitions_.size())) { + while (partition_index != large_bin_partitions_[large_bin_partition_index]) { + small_partition_index_to_global_partition_index.emplace_back(partition_index); + ++partition_index; + } + large_partition_index_to_global_partition_index.emplace_back(partition_index); + ++partition_index; + ++large_bin_partition_index; + } + // push remaining partitions into small bin partitions + while (partition_index < num_feature_partitions_) { + small_partition_index_to_global_partition_index.emplace_back(partition_index); + ++partition_index; + } + InitCUDAMemoryFromHostMemory(&cuda_large_partition_index_to_global_partition_index_, + large_partition_index_to_global_partition_index.data(), + large_partition_index_to_global_partition_index.size(), + __FILE__, + __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_small_partition_index_to_global_partition_index_, + small_partition_index_to_global_partition_index.data(), + small_partition_index_to_global_partition_index.size(), + __FILE__, + __LINE__); + } + InitCUDAMemoryFromHostMemory(&cuda_feature_partition_column_index_offsets_, feature_partition_column_index_offsets_.data(), feature_partition_column_index_offsets_.size(), diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index b97594bc4ff2..6485da8b824c 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -262,6 +262,11 @@ void CUDATree::SyncLeafOutputFromHostToCUDA() { CopyFromHostToCUDADevice(cuda_leaf_value_, leaf_value_.data(), leaf_value_.size(), __FILE__, __LINE__); } +void CUDATree::SyncLeafOutputFromCUDAToHost() { + CopyFromCUDADeviceToHost(leaf_value_.data(), cuda_leaf_value_, leaf_value_.size(), __FILE__, __LINE__); +} + + } // namespace LightGBM #endif // USE_CUDA From 419376873988526400c475ce04e562f958bb238a Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sat, 23 Oct 2021 14:44:06 +0000 Subject: [PATCH 103/166] add histogram construction for 
columns with a large number of bins --- include/LightGBM/cuda/cuda_algorithms.hpp | 14 + include/LightGBM/cuda/cuda_row_data.hpp | 8 +- src/io/cuda/cuda_row_data.cpp | 37 +- .../cuda/cuda_best_split_finder.cpp | 9 +- .../cuda/cuda_best_split_finder.cu | 269 +++++++++++ .../cuda/cuda_best_split_finder.hpp | 9 +- .../cuda/cuda_histogram_constructor.cpp | 7 + .../cuda/cuda_histogram_constructor.cu | 437 ++++++++++++++---- .../cuda/cuda_histogram_constructor.hpp | 7 + 9 files changed, 667 insertions(+), 130 deletions(-) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index b8538ffb1d8f..b89e9e05633c 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -96,6 +96,20 @@ __device__ __forceinline__ T ShuffleReduceSum(T value, T* shared_mem_buffer, con return value; } +template +__device__ __forceinline__ T GlobalMemoryPrefixSum(T* array, const size_t len) { + const size_t num_values_per_thread = (len + blockDim.x - 1) / blockDim.x; + const size_t start = threadIdx.x * num_values_per_thread; + const size_t end = min(start + num_values_per_thread, len); + T thread_sum = 0; + for (size_t index = start; index < end; ++index) { + thread_sum += array[index]; + } + __shared__ T shared_mem[32]; + const T thread_base = ShuffleReduceSum(thread_sum, shared_mem); + +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index cc0c39e40c2e..a73cbb39f210 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -130,8 +130,10 @@ class CUDARowData { int cur_num_feature_partition_buffer_size_; /*! \brief CUDA device ID */ int gpu_device_id_; - /*! \brief index of partitions with large bins that its histogram cannot fit into shared memory */ + /*! \brief index of partitions with large bins that its histogram cannot fit into shared memory, each large bin partition contains a single column */ std::vector large_bin_partitions_; + /*! \brief index of partitions with small bins */ + std::vector small_bin_partitions_; // CUDA memory @@ -165,10 +167,6 @@ class CUDARowData { uint32_t* cuda_block_buffer_uint32_t_; /*! \brief block buffer when calculating prefix sum */ uint64_t* cuda_block_buffer_uint64_t_; - /*! \brief small bin partition index to global partition index */ - int* cuda_small_partition_index_to_global_partition_index_; - /*! 
\brief large bin partition index to global partition index */ - int* cuda_large_partition_index_to_global_partition_index_; }; } // namespace LightGBM diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index 258d16c43746..3ba3deff72b3 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -194,6 +194,7 @@ void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingSha int column_index = 0; num_feature_partitions_ = 0; large_bin_partitions_.clear(); + small_bin_partitions_.clear(); for (int feature_group_index = 0; feature_group_index < num_feature_groups; ++feature_group_index) { if (!train_data->IsMultiGroup(feature_group_index)) { const uint32_t column_feature_hist_start = column_hist_offsets[column_index]; @@ -218,12 +219,14 @@ void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingSha feature_partition_column_index_offsets_.emplace_back(column_index); start_hist_offset = column_feature_hist_start; partition_hist_offsets_.emplace_back(start_hist_offset); + small_bin_partitions_.emplace_back(num_feature_partitions_); ++num_feature_partitions_; } column_hist_offsets_.emplace_back(column_hist_offsets[column_index] - start_hist_offset); if (feature_group_index == num_feature_groups - 1) { feature_partition_column_index_offsets_.emplace_back(column_index + 1); partition_hist_offsets_.emplace_back(column_hist_offsets.back()); + small_bin_partitions_.emplace_back(num_feature_partitions_); ++num_feature_partitions_; } ++column_index; @@ -254,6 +257,7 @@ void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingSha feature_partition_column_index_offsets_.emplace_back(column_index); start_hist_offset = column_feature_hist_start; partition_hist_offsets_.emplace_back(start_hist_offset); + small_bin_partitions_.emplace_back(num_feature_partitions_); ++num_feature_partitions_; } column_hist_offsets_.emplace_back(column_hist_offsets[column_index] - start_hist_offset); @@ -261,6 +265,7 @@ void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingSha CHECK_EQ(feature_index, num_feature_ - 1); feature_partition_column_index_offsets_.emplace_back(column_index + 1); partition_hist_offsets_.emplace_back(column_hist_offsets.back()); + small_bin_partitions_.emplace_back(num_feature_partitions_); ++num_feature_partitions_; } ++column_index; @@ -276,38 +281,6 @@ void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingSha } } - if (!large_bin_partitions_.empty()) { - std::vector small_partition_index_to_global_partition_index; - std::vector large_partition_index_to_global_partition_index; - int partition_index = 0; - int large_bin_partition_index = 0; - while (partition_index < num_feature_partitions_ && - large_bin_partition_index < static_cast(large_bin_partitions_.size())) { - while (partition_index != large_bin_partitions_[large_bin_partition_index]) { - small_partition_index_to_global_partition_index.emplace_back(partition_index); - ++partition_index; - } - large_partition_index_to_global_partition_index.emplace_back(partition_index); - ++partition_index; - ++large_bin_partition_index; - } - // push remaining partitions into small bin partitions - while (partition_index < num_feature_partitions_) { - small_partition_index_to_global_partition_index.emplace_back(partition_index); - ++partition_index; - } - InitCUDAMemoryFromHostMemory(&cuda_large_partition_index_to_global_partition_index_, - large_partition_index_to_global_partition_index.data(), - 
large_partition_index_to_global_partition_index.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_small_partition_index_to_global_partition_index_, - small_partition_index_to_global_partition_index.data(), - small_partition_index_to_global_partition_index.size(), - __FILE__, - __LINE__); - } - InitCUDAMemoryFromHostMemory(&cuda_feature_partition_column_index_offsets_, feature_partition_column_index_offsets_.data(), feature_partition_column_index_offsets_.size(), diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 1e9ebf1b6756..51fc0169b313 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -24,6 +24,7 @@ CUDABestSplitFinder::CUDABestSplitFinder( min_data_in_leaf_(config->min_data_in_leaf), min_sum_hessian_in_leaf_(config->min_sum_hessian_in_leaf), min_gain_to_split_(config->min_gain_to_split), + num_total_bin_(feature_hist_offsets.back()), cuda_hist_(cuda_hist) { InitFeatureMetaInfo(train_data); cuda_leaf_best_split_info_ = nullptr; @@ -80,7 +81,9 @@ void CUDABestSplitFinder::InitFeatureMetaInfo(const Dataset* train_data) { } } if (max_num_bin_in_feature_ > MAX_NUM_BIN_IN_FEATURE) { - Log::Fatal("feature bin size %d exceeds limit %d", max_num_bin_in_feature_, MAX_NUM_BIN_IN_FEATURE); + use_global_memory_ = true; + } else { + use_global_memory_ = false; } } @@ -90,6 +93,10 @@ void CUDABestSplitFinder::Init() { CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); AllocateCUDAMemory(&cuda_best_split_info_buffer_, 7, __FILE__, __LINE__); + if (use_global_memory_) { + AllocateCUDAMemory(&cuda_feature_hist_grad_buffer_, static_cast(num_total_bin_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_feature_hist_hess_buffer_, static_cast(num_total_bin_), __FILE__, __LINE__); + } } void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 59a0848565e0..c8bffbc4c39e 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -424,6 +424,275 @@ __global__ void FindBestSplitsForLeafKernel( } } +__device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( + // input feature information + const hist_t* feature_hist_ptr, + const uint32_t feature_num_bin, + const uint8_t feature_mfb_offset, + const uint32_t feature_default_bin, + const int inner_feature_index, + // input config parameter values + const double lambda_l1, + const double lambda_l2, + const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const double min_gain_to_split, + // input parent node information + const double parent_gain, + const double sum_gradients, + const double sum_hessians, + const data_size_t num_data, + // input task information + const bool reverse, + const bool skip_default_bin, + const bool na_as_missing, + const uint8_t assume_out_default_left, + // buffer + hist_t* hist_grad_buffer_ptr, + hist_t* hist_hess_buffer_ptr, + // output parameters + CUDASplitInfo* cuda_best_split_info) { + const double cnt_factor = num_data / sum_hessians; + const bool use_l1 = lambda_l1 > 0.0f; + const double min_gain_shift = parent_gain + min_gain_to_split; + + cuda_best_split_info->is_valid = false; + + __shared__ hist_t shared_mem_buffer[32]; + hist_t local_grad_hist = 0.0f; + hist_t local_hess_hist = 0.0f; + double local_gain = 0.0f; + 
bool threshold_found = false; + uint32_t threshold_value = 0; + __shared__ uint32_t best_thread_index; + __shared__ double shared_gain_buffer[32]; + __shared__ bool shared_found_buffer[32]; + __shared__ uint32_t shared_thread_index_buffer[32]; + const unsigned int threadIdx_x = threadIdx.x; + const uint32_t feature_num_bin_minus_offset = feature_num_bin - feature_mfb_offset; + if (!reverse) { + for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; ++bin) { + const bool skip_sum = + (skip_default_bin && (bin + feature_mfb_offset) == static_cast(feature_default_bin)); + if (!skip_sum) { + const unsigned int bin_offset = bin << 1; + hist_grad_buffer_ptr[bin] = feature_hist_ptr[bin_offset]; + hist_hess_buffer_ptr[bin] = feature_hist_ptr[bin_offset + 1]; + } else { + hist_grad_buffer_ptr[bin] = 0.0f; + hist_hess_buffer_ptr[bin] = 0.0f; + } + } + } else { + for (unsigned int bin = 0; bin < feature_num_bin_minus_offset; ++bin) { + const bool skip_sum = bin >= static_cast(na_as_missing) && + (skip_default_bin && (feature_num_bin - 1 - bin) == static_cast(feature_default_bin)); + if (!skip_sum) { + const unsigned int read_index = feature_num_bin_minus_offset - 1 - bin; + const unsigned int bin_offset = read_index << 1; + hist_grad_buffer_ptr[bin] = feature_hist_ptr[bin_offset]; + hist_hess_buffer_ptr[bin] = feature_hist_ptr[bin_offset + 1]; + } else { + hist_grad_buffer_ptr[bin] = 0.0f; + hist_hess_buffer_ptr[bin] = 0.0f; + } + } + } + __syncthreads(); + if (threadIdx_x == 0) { + hist_hess_buffer_ptr[0] += kEpsilon; + } + local_gain = kMinScore; + local_grad_hist = ShufflePrefixSum(local_grad_hist, shared_mem_buffer); + __syncthreads(); + local_hess_hist = ShufflePrefixSum(local_hess_hist, shared_mem_buffer); + if (reverse) { + if (threadIdx_x >= static_cast(na_as_missing) && threadIdx_x <= feature_num_bin - 2 && !skip_sum) { + const double sum_right_gradient = local_grad_hist; + const double sum_right_hessian = local_hess_hist; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + const double sum_left_gradient = sum_gradients - sum_right_gradient; + const double sum_left_hessian = sum_hessians - sum_right_hessian; + const data_size_t left_count = num_data - right_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, + lambda_l2); + // gain with split is worse than without split + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_value = static_cast(feature_num_bin - 2 - threadIdx_x); + threshold_found = true; + } + } + } + } else { + if (threadIdx_x <= feature_num_bin_minus_offset - 2 && !skip_sum) { + const double sum_left_gradient = local_grad_hist; + const double sum_left_hessian = local_hess_hist; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = num_data - left_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, 
sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, + lambda_l2); + // gain with split is worse than without split + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_value = static_cast(threadIdx_x + feature_mfb_offset); + threshold_found = true; + } + } + } + } + __syncthreads(); + const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_gain_buffer, shared_found_buffer, shared_thread_index_buffer); + if (threadIdx_x == 0) { + best_thread_index = result; + } + __syncthreads(); + if (threshold_found && threadIdx_x == best_thread_index) { + cuda_best_split_info->is_valid = true; + cuda_best_split_info->threshold = threshold_value; + cuda_best_split_info->gain = local_gain; + cuda_best_split_info->default_left = assume_out_default_left; + if (reverse) { + const double sum_right_gradient = local_grad_hist; + const double sum_right_hessian = local_hess_hist - kEpsilon; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + const double sum_left_gradient = sum_gradients - sum_right_gradient; + const double sum_left_hessian = sum_hessians - sum_right_hessian - kEpsilon; + const data_size_t left_count = num_data - right_count; + const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2); + const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2); + cuda_best_split_info->left_sum_gradients = sum_left_gradient; + cuda_best_split_info->left_sum_hessians = sum_left_hessian; + cuda_best_split_info->left_count = left_count; + cuda_best_split_info->right_sum_gradients = sum_right_gradient; + cuda_best_split_info->right_sum_hessians = sum_right_hessian; + cuda_best_split_info->right_count = right_count; + cuda_best_split_info->left_value = left_output; + cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + cuda_best_split_info->right_value = right_output; + cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); + } else { + const double sum_left_gradient = local_grad_hist; + const double sum_left_hessian = local_hess_hist - kEpsilon; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian - kEpsilon; + const data_size_t right_count = num_data - left_count; + const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2); + const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2); + cuda_best_split_info->left_sum_gradients = sum_left_gradient; + cuda_best_split_info->left_sum_hessians = sum_left_hessian; + cuda_best_split_info->left_count = left_count; + cuda_best_split_info->right_sum_gradients = sum_right_gradient; + cuda_best_split_info->right_sum_hessians = sum_right_hessian; + cuda_best_split_info->right_count = right_count; + cuda_best_split_info->left_value = left_output; + cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, 
use_l1, lambda_l2, left_output); + cuda_best_split_info->right_value = right_output; + cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); + } + } +} + +__global__ void FindBestSplitsForLeafKernel_GlobalMemory( + // input feature information + const uint32_t* feature_hist_offsets, + const uint8_t* feature_mfb_offsets, + const uint32_t* feature_default_bins, + const uint32_t* feature_num_bins, + const int8_t* is_feature_used_bytree, + // input task information + const bool larger_only, + const int num_tasks, + const int* task_feature_index, + const uint8_t* task_reverse, + const uint8_t* task_skip_default_bin, + const uint8_t* task_na_as_missing, + const uint8_t* task_out_default_left, + // input leaf information + const int smaller_leaf_index, + const CUDALeafSplitsStruct* smaller_leaf_splits, + const int larger_leaf_index, + const CUDALeafSplitsStruct* larger_leaf_splits, + // input config parameter values + const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const double min_gain_to_split, + const double lambda_l1, + const double lambda_l2, + // buffer + hist_t* feature_hist_grad_buffer, + hist_t* feature_hist_hess_buffer, + // output + CUDASplitInfo* cuda_best_split_info) { + const unsigned int task_index = blockIdx.x % num_tasks; + const bool is_larger = static_cast(blockIdx.x >= num_tasks || larger_only); + const int inner_feature_index = task_feature_index[task_index]; + const bool reverse = static_cast(task_reverse[task_index]); + const bool skip_default_bin = static_cast(task_skip_default_bin[task_index]); + const bool na_as_missing = static_cast(task_na_as_missing[task_index]); + const bool assume_out_default_left = task_out_default_left[task_index]; + const double parent_gain = is_larger ? larger_leaf_splits->gain : smaller_leaf_splits->gain; + const double sum_gradients = is_larger ? larger_leaf_splits->sum_of_gradients : smaller_leaf_splits->sum_of_gradients; + const double sum_hessians = (is_larger ? larger_leaf_splits->sum_of_hessians : smaller_leaf_splits->sum_of_hessians) + 2 * kEpsilon; + const double num_data = is_larger ? larger_leaf_splits->num_data_in_leaf : smaller_leaf_splits->num_data_in_leaf; + const unsigned int output_offset = is_larger ? (task_index + num_tasks) : task_index; + CUDASplitInfo* out = cuda_best_split_info + output_offset; + if (is_feature_used_bytree[inner_feature_index]) { + const hist_t* hist_ptr = (is_larger ? 
larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + feature_hist_offsets[inner_feature_index] * 2; + hist_t* hist_grad_buffer_ptr = feature_hist_grad_buffer + feature_hist_offsets[inner_feature_index] * 2; + hist_t* hist_hess_buffer_ptr = feature_hist_hess_buffer + feature_hist_offsets[inner_feature_index] * 2; + FindBestSplitsForLeafKernelInner_GlobalMemory( + // input feature information + hist_ptr, + feature_num_bins[inner_feature_index], + feature_mfb_offsets[inner_feature_index], + feature_default_bins[inner_feature_index], + inner_feature_index, + // input config parameter values + lambda_l1, + lambda_l2, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients, + sum_hessians, + num_data, + // input task information + reverse, + skip_default_bin, + na_as_missing, + assume_out_default_left, + // buffer + hist_grad_buffer_ptr, + hist_hess_buffer_ptr, + // output parameters + out); + } else { + out->is_valid = false; + } +} + void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( const CUDALeafSplitsStruct* smaller_leaf_splits, const CUDALeafSplitsStruct* larger_leaf_splits, diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 63de299de6c3..0d528875ab02 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -18,7 +18,7 @@ #include "cuda_leaf_splits.hpp" -#define MAX_NUM_BIN_IN_FEATURE (256) +#define MAX_NUM_BIN_IN_FEATURE (1024) #define NUM_THREADS_FIND_BEST_LEAF (256) #define NUM_TASKS_PER_SYNC_BLOCK (1024) @@ -119,6 +119,10 @@ class CUDABestSplitFinder { std::vector host_task_na_as_missing_; std::vector host_task_out_default_left_; int num_tasks_; + // use global memory + bool use_global_memory_; + // number of total bins in the dataset + const int num_total_bin_; // CUDA memory, held by this object // for per leaf best split information @@ -139,6 +143,9 @@ class CUDABestSplitFinder { uint8_t* cuda_task_na_as_missing_; uint8_t* cuda_task_out_default_left_; int8_t* cuda_is_feature_used_bytree_; + // used when finding best split with global memory + hist_t* cuda_feature_hist_grad_buffer_; + hist_t* cuda_feature_hist_hess_buffer_; // CUDA memory, held by other object const hist_t* cuda_hist_; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index cd853cd22be5..eab85c690fb1 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -109,6 +109,13 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStat InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, need_fix_histogram_features_num_bin_aligend_.data(), need_fix_histogram_features_num_bin_aligend_.size(), __FILE__, __LINE__); + + if (cuda_row_data_->NumLargeBinPartition() > 0) { + int grid_dim_x = 0, grid_dim_y = 0, block_dim_x = 0, block_dim_y = 0; + CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_); + const size_t buffer_size = static_cast(grid_dim_y) * static_cast(num_total_bin_) * 2; + AllocateCUDAMemory(&cuda_hist_buffer_, buffer_size, __FILE__, __LINE__); + } } void 
CUDAHistogramConstructor::ConstructHistogramForLeaf( diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index b8d819b287db..f83649249f3b 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -131,6 +131,127 @@ __global__ void CUDAConstructHistogramSparseKernel( } } +template +__global__ void CUDAConstructHistogramDenseKernel_GlobalMemory( + const CUDALeafSplitsStruct* smaller_leaf_splits, + const score_t* cuda_gradients, + const score_t* cuda_hessians, + const BIN_TYPE* data, + const uint32_t* column_hist_offsets, + const uint32_t* column_hist_offsets_full, + const int* feature_partition_column_index_offsets, + const data_size_t num_data, + float* global_hist_buffer) { + const int dim_y = static_cast(gridDim.y * blockDim.y); + const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; + const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; + const unsigned int num_threads_per_block = blockDim.x * blockDim.y; + const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; + const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1]; + const BIN_TYPE* data_ptr = data + partition_column_start * num_data; + const int num_columns_in_partition = partition_column_end - partition_column_start; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; + const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; + const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + const int num_total_bin = column_hist_offsets_full[gridDim.x]; + float* shared_hist = global_hist_buffer + (blockIdx.y * num_total_bin + partition_hist_start) * 2; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + shared_hist[i] = 0.0f; + } + __syncthreads(); + const unsigned int threadIdx_y = threadIdx.y; + const unsigned int blockIdx_y = blockIdx.y; + const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; + const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * static_cast(blockDim.y))); + const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; + const data_size_t remainder = block_num_data % blockDim.y; + const data_size_t num_iteration_this = remainder == 0 ? 
num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); + data_size_t inner_data_index = static_cast(threadIdx_y); + const int column_index = static_cast(threadIdx.x) + partition_column_start; + if (threadIdx.x < static_cast(num_columns_in_partition)) { + float* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); + for (data_size_t i = 0; i < num_iteration_this; ++i) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; + const score_t grad = cuda_gradients[data_index]; + const score_t hess = cuda_hessians[data_index]; + const uint32_t bin = static_cast(data_ptr[data_index * num_columns_in_partition + threadIdx.x]); + const uint32_t pos = bin << 1; + float* pos_ptr = shared_hist_ptr + pos; + atomicAdd_block(pos_ptr, grad); + atomicAdd_block(pos_ptr + 1, hess); + inner_data_index += blockDim.y; + } + } + __syncthreads(); + hist_t* feature_histogram_ptr = smaller_leaf_splits->hist_in_leaf + (partition_hist_start << 1); + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); + } +} + +template +__global__ void CUDAConstructHistogramSparseKernel_GlobalMemory( + const CUDALeafSplitsStruct* smaller_leaf_splits, + const score_t* cuda_gradients, + const score_t* cuda_hessians, + const BIN_TYPE* data, + const DATA_PTR_TYPE* row_ptr, + const DATA_PTR_TYPE* partition_ptr, + const uint32_t* column_hist_offsets_full, + const data_size_t num_data, + float* global_hist_buffer) { + const int dim_y = static_cast(gridDim.y * blockDim.y); + const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; + const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; + const unsigned int num_threads_per_block = blockDim.x * blockDim.y; + const DATA_PTR_TYPE* block_row_ptr = row_ptr + blockIdx.x * (num_data + 1); + const BIN_TYPE* data_ptr = data + partition_ptr[blockIdx.x]; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; + const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; + const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + const int num_total_bin = column_hist_offsets_full[gridDim.x]; + float* shared_hist = global_hist_buffer + (blockIdx.y * num_total_bin + partition_hist_start) * 2; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + shared_hist[i] = 0.0f; + } + __syncthreads(); + const unsigned int threadIdx_y = threadIdx.y; + const unsigned int blockIdx_y = blockIdx.y; + const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; + const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * static_cast(blockDim.y))); + const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; + const data_size_t remainder = block_num_data % blockDim.y; + const data_size_t num_iteration_this = remainder == 0 ? 
num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); + data_size_t inner_data_index = static_cast(threadIdx_y); + for (data_size_t i = 0; i < num_iteration_this; ++i) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; + const DATA_PTR_TYPE row_start = block_row_ptr[data_index]; + const DATA_PTR_TYPE row_end = block_row_ptr[data_index + 1]; + const DATA_PTR_TYPE row_size = row_end - row_start; + if (threadIdx.x < row_size) { + const score_t grad = cuda_gradients[data_index]; + const score_t hess = cuda_hessians[data_index]; + const uint32_t bin = static_cast(data_ptr[row_start + threadIdx.x]); + const uint32_t pos = bin << 1; + float* pos_ptr = shared_hist + pos; + atomicAdd_block(pos_ptr, grad); + atomicAdd_block(pos_ptr + 1, hess); + } + inner_data_index += blockDim.y; + } + __syncthreads(); + hist_t* feature_histogram_ptr = smaller_leaf_splits->hist_in_leaf + (partition_hist_start << 1); + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]); + } +} + void CUDAHistogramConstructor::LaunchConstructHistogramKernel( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const data_size_t num_data_in_smaller_leaf) { @@ -141,124 +262,258 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_in_smaller_leaf); dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); - if (cuda_row_data_->is_sparse()) { - if (cuda_row_data_->bit_type() == 8) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); + if (cuda_row_data_->NumLargeBinPartition() == 0) { + if (cuda_row_data_->is_sparse()) { + if (cuda_row_data_->bit_type() == 8) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_); + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_); + } else if 
(cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_); + } + } else if (cuda_row_data_->bit_type() == 16) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_); + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_); + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_); + } + } else if (cuda_row_data_->bit_type() == 32) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_); + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_); + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_); + } } - } else if (cuda_row_data_->bit_type() == 16) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( + } else { + if (cuda_row_data_->bit_type() == 8) { + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_column_hist_offsets(), cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( + } else if (cuda_row_data_->bit_type() == 16) { + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), + 
cuda_row_data_->cuda_column_hist_offsets(), cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( + } else if (cuda_row_data_->bit_type() == 32) { + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_column_hist_offsets(), cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), num_data_); } - } else if (cuda_row_data_->bit_type() == 32) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( + } + } else { + if (cuda_row_data_->is_sparse()) { + if (cuda_row_data_->bit_type() == 8) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, + cuda_hist_buffer_); + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, + cuda_hist_buffer_); + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, + cuda_hist_buffer_); + } + } else if (cuda_row_data_->bit_type() == 16) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, + cuda_hist_buffer_); + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, + cuda_hist_buffer_); + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, + cuda_hist_buffer_); + } + } else if (cuda_row_data_->bit_type() == 32) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + 
cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint16(), + cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, + cuda_hist_buffer_); + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint32(), + cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, + cuda_hist_buffer_); + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->cuda_data_uint32(), + cuda_row_data_->cuda_row_ptr_uint64(), + cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, + cuda_hist_buffer_); + } + } + } else { + if (cuda_row_data_->bit_type() == 8) { + CUDAConstructHistogramDenseKernel_GlobalMemory<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), + cuda_row_data_->cuda_data_uint8(), + cuda_row_data_->cuda_column_hist_offsets(), cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_, + cuda_hist_buffer_); + } else if (cuda_row_data_->bit_type() == 16) { + CUDAConstructHistogramDenseKernel_GlobalMemory<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), + cuda_row_data_->cuda_data_uint16(), + cuda_row_data_->cuda_column_hist_offsets(), cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_, + cuda_hist_buffer_); + } else if (cuda_row_data_->bit_type() == 32) { + CUDAConstructHistogramDenseKernel_GlobalMemory<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), + cuda_row_data_->cuda_column_hist_offsets(), cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_, + cuda_hist_buffer_); } } - } else { - if (cuda_row_data_->bit_type() == 8) { - CUDAConstructHistogramDenseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_); - } else if (cuda_row_data_->bit_type() == 16) { - CUDAConstructHistogramDenseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_); - } else if (cuda_row_data_->bit_type() == 32) { - 
CUDAConstructHistogramDenseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_); - } } } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index c4c77b3089f4..bb1bf62a1db6 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -77,6 +77,11 @@ class CUDAHistogramConstructor { const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const data_size_t num_data_in_smaller_leaf); + void LaunchSparseConstructHistogramKernel( + const dim3 grid_dim, + const dim3 block_dim, + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits); + void LaunchSubtractHistogramKernel( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const CUDALeafSplitsStruct* cuda_larger_leaf_splits); @@ -125,6 +130,8 @@ class CUDAHistogramConstructor { uint32_t* cuda_feature_most_freq_bins_; /*! \brief CUDA histograms */ hist_t* cuda_hist_; + /*! \brief CUDA histograms buffer for each block */ + float* cuda_hist_buffer_; /*! \brief indices of feature whose histograms need to be fixed */ int* cuda_need_fix_histogram_features_; /*! \brief aligned number of bins of the features whose histograms need to be fixed */ From 0b6e79e8000b5fb8d23b9b98d6e23db0b4dc6474 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 26 Oct 2021 11:26:33 +0000 Subject: [PATCH 104/166] add find best split for categorical features on CUDA --- include/LightGBM/cuda/cuda_algorithms.hpp | 251 +++++- include/LightGBM/cuda/cuda_split_info.hpp | 9 + include/LightGBM/meta.h | 2 + .../cuda/cuda_best_split_finder.cpp | 2 +- .../cuda/cuda_best_split_finder.cu | 815 +++++++++++++++--- .../cuda/cuda_best_split_finder.hpp | 2 +- 6 files changed, 969 insertions(+), 112 deletions(-) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index b89e9e05633c..641169476c1b 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -62,6 +62,48 @@ __device__ __forceinline__ T ShufflePrefixSum(T value, T* shared_mem_buffer) { return warp_base + value; } +template +__device__ __forceinline__ T ShufflePrefixSumExclusive(T value, T* shared_mem_buffer) { + const uint32_t mask = 0xffffffff; + const uint32_t warpLane = threadIdx.x % warpSize; + const uint32_t warpID = threadIdx.x / warpSize; + const uint32_t num_warp = blockDim.x / warpSize; + for (uint32_t offset = 1; offset < warpSize; offset <<= 1) { + const T other_value = __shfl_up_sync(mask, value, offset); + if (warpLane >= offset) { + value += other_value; + } + } + if (warpLane == warpSize - 1) { + shared_mem_buffer[warpID] = value; + } + __syncthreads(); + if (warpID == 0) { + T warp_sum = (warpLane < num_warp ? shared_mem_buffer[warpLane] : 0); + for (uint32_t offset = 1; offset < warpSize; offset <<= 1) { + const T other_warp_sum = __shfl_up_sync(mask, warp_sum, offset); + if (warpLane >= offset) { + warp_sum += other_warp_sum; + } + } + shared_mem_buffer[warpLane] = warp_sum; + } + __syncthreads(); + const T warp_base = warpID == 0 ? 
0 : shared_mem_buffer[warpID - 1]; + const T inclusive_result = warp_base + value; + if (threadIdx.x % warpSize == warpSize - 1) { + shared_mem_buffer[warpLane] = inclusive_result; + } + __syncthreads(); + T exclusive_result = __shfl_up_sync(mask, inclusive_result, 1); + if (threadIdx.x == 0) { + exclusive_result = 0; + } else if (threadIdx.x % warpSize == 0) { + exclusive_result = shared_mem_buffer[warpLane - 1]; + } + return exclusive_result; +} + template void ShufflePrefixSumGlobal(T* values, size_t len, T* block_prefix_sum_buffer); @@ -96,8 +138,9 @@ __device__ __forceinline__ T ShuffleReduceSum(T value, T* shared_mem_buffer, con return value; } +// calculate prefix sum values within an 1-dimensional block in global memory, exclusively template -__device__ __forceinline__ T GlobalMemoryPrefixSum(T* array, const size_t len) { +__device__ __forceinline__ void GlobalMemoryPrefixSum(T* array, const size_t len) { const size_t num_values_per_thread = (len + blockDim.x - 1) / blockDim.x; const size_t start = threadIdx.x * num_values_per_thread; const size_t end = min(start + num_values_per_thread, len); @@ -106,8 +149,210 @@ __device__ __forceinline__ T GlobalMemoryPrefixSum(T* array, const size_t len) { thread_sum += array[index]; } __shared__ T shared_mem[32]; - const T thread_base = ShuffleReduceSum(thread_sum, shared_mem); - + const T thread_base = ShufflePrefixSumExclusive(thread_sum, shared_mem); + if (start < end) { + array[start] += thread_base; + } + for (size_t index = start + 1; index < end; ++index) { + array[index] += array[index - 1]; + } +} + +template +__device__ __forceinline__ void BitonicArgSort_1024(const VAL_T* scores, INDEX_T* indices, const INDEX_T num_items) { + INDEX_T depth = 1; + INDEX_T num_items_aligend = 1; + INDEX_T num_items_ref = num_items - 1; + while (num_items_ref > 0) { + num_items_ref >>= 1; + num_items_aligend <<= 1; + ++depth; + } + for (INDEX_T outer_depth = depth - 1; outer_depth >= 1; --outer_depth) { + const INDEX_T outer_segment_length = 1 << (depth - outer_depth); + const INDEX_T outer_segment_index = threadIdx.x / outer_segment_length; + const bool ascending = ASCENDING ? 
(outer_segment_index % 2 == 0) : (outer_segment_index % 2 > 0); + for (INDEX_T inner_depth = outer_depth; inner_depth < depth; ++inner_depth) { + const INDEX_T segment_length = 1 << (depth - inner_depth); + const INDEX_T half_segment_length = segment_length >> 1; + const INDEX_T half_segment_index = threadIdx.x / half_segment_length; + if (threadIdx.x < num_items_aligend) { + if (half_segment_index % 2 == 0) { + const INDEX_T index_to_compare = threadIdx.x + half_segment_length; + if ((scores[indices[threadIdx.x]] > scores[indices[index_to_compare]]) == ascending) { + const INDEX_T index = indices[threadIdx.x]; + indices[threadIdx.x] = indices[index_to_compare]; + indices[index_to_compare] = index; + } + } + } + __syncthreads(); + } + } +} + +template +__device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, const int len) { + __shared__ VAL_T shared_values[BLOCK_DIM]; + __shared__ INDEX_T shared_indices[BLOCK_DIM]; + int len_to_shift = len - 1; + int max_depth = 1; + while (len_to_shift > 0) { + len_to_shift >>= 1; + ++max_depth; + } + const int num_blocks = (len + static_cast(BLOCK_DIM) - 1) / static_cast(BLOCK_DIM); + for (int block_index = 0; block_index < num_blocks; ++block_index) { + const int this_index = block_index * static_cast(BLOCK_DIM) + static_cast(threadIdx.x); + if (this_index < len) { + shared_values[threadIdx.x] = values[this_index]; + shared_indices[threadIdx.x] = this_index; + } else { + shared_indices[threadIdx.x] = len; + } + __syncthreads(); + for (int depth = max_depth - 1; depth > max_depth - static_cast(MAX_DEPTH); --depth) { + const int segment_length = (1 << (max_depth - depth)); + const int segment_index = this_index / segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + { + const int half_segment_length = (segment_length >> 1); + const int half_segment_index = this_index / half_segment_length; + const int num_total_segment = (len + segment_length - 1) / segment_length; + const int offset = (segment_index == num_total_segment - 1 && ascending == ASCENDING) ? 
+ (num_total_segment * segment_length - len) : 0; + if (half_segment_index % 2 == 0) { + const int segment_start = segment_index * segment_length; + if (this_index >= offset + segment_start) { + const int other_index = static_cast(threadIdx.x) + half_segment_length - offset; + const INDEX_T this_data_index = shared_indices[threadIdx.x]; + const INDEX_T other_data_index = shared_indices[other_index]; + const VAL_T this_value = shared_values[threadIdx.x]; + const VAL_T other_value = shared_values[other_index]; + if (other_data_index < len && (this_value > other_value) == ascending) { + shared_indices[threadIdx.x] = other_data_index; + shared_indices[other_index] = this_data_index; + shared_values[threadIdx.x] = other_value; + shared_values[other_index] = this_value; + } + } + } + __syncthreads(); + } + for (int inner_depth = depth + 1; inner_depth < max_depth; ++inner_depth) { + const int half_segment_length = (1 << (max_depth - inner_depth - 1)); + const int half_segment_index = this_index / half_segment_length; + if (half_segment_index % 2 == 0) { + const int other_index = static_cast(threadIdx.x) + half_segment_length; + const INDEX_T this_data_index = shared_indices[threadIdx.x]; + const INDEX_T other_data_index = shared_indices[other_index]; + const VAL_T this_value = shared_values[threadIdx.x]; + const VAL_T other_value = shared_values[other_index]; + if (other_data_index < len && (this_value > other_value) == ascending) { + shared_indices[threadIdx.x] = other_data_index; + shared_indices[other_index] = this_data_index; + shared_values[threadIdx.x] = other_value; + shared_values[other_index] = this_value; + } + } + __syncthreads(); + } + } + if (this_index < len) { + indices[this_index] = shared_indices[threadIdx.x]; + } + __syncthreads(); + } + for (int depth = max_depth - static_cast(MAX_DEPTH); depth >= 1; --depth) { + const int segment_length = (1 << (max_depth - depth)); + { + const int num_total_segment = (len + segment_length - 1) / segment_length; + const int half_segment_length = (segment_length >> 1); + for (int block_index = 0; block_index < num_blocks; ++block_index) { + const int this_index = block_index * static_cast(BLOCK_DIM) + static_cast(threadIdx.x); + const int segment_index = this_index / segment_length; + const int half_segment_index = this_index / half_segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + const int offset = (segment_index == num_total_segment - 1 && ascending == ASCENDING) ? 
+ (num_total_segment * segment_length - len) : 0; + if (half_segment_index % 2 == 0) { + const int segment_start = segment_index * segment_length; + if (this_index >= offset + segment_start) { + const int other_index = this_index + half_segment_length - offset; + if (other_index < len) { + const INDEX_T this_data_index = indices[this_index]; + const INDEX_T other_data_index = indices[other_index]; + const VAL_T this_value = values[this_data_index]; + const VAL_T other_value = values[other_data_index]; + if ((this_value > other_value) == ascending) { + indices[this_index] = other_data_index; + indices[other_index] = this_data_index; + } + } + } + } + } + __syncthreads(); + } + for (int inner_depth = depth + 1; inner_depth <= max_depth - static_cast(MAX_DEPTH); ++inner_depth) { + const int half_segment_length = (1 << (max_depth - inner_depth - 1)); + for (int block_index = 0; block_index < num_blocks; ++block_index) { + const int this_index = block_index * static_cast(BLOCK_DIM) + static_cast(threadIdx.x); + const int segment_index = this_index / segment_length; + const int half_segment_index = this_index / half_segment_length; + const bool ascending = ASCENDING ? (segment_index % 2 == 0) : (segment_index % 2 == 1); + if (half_segment_index % 2 == 0) { + const int other_index = this_index + half_segment_length; + if (other_index < len) { + const INDEX_T this_data_index = indices[this_index]; + const INDEX_T other_data_index = indices[other_index]; + const VAL_T this_value = values[this_data_index]; + const VAL_T other_value = values[other_data_index]; + if ((this_value > other_value) == ascending) { + indices[this_index] = other_data_index; + indices[other_index] = this_data_index; + } + } + } + __syncthreads(); + } + } + for (int block_index = 0; block_index < num_blocks; ++block_index) { + const int this_index = block_index * static_cast(BLOCK_DIM) + static_cast(threadIdx.x); + const int segment_index = this_index / segment_length; + const bool ascending = ASCENDING ? 
(segment_index % 2 == 0) : (segment_index % 2 == 1); + if (this_index < len) { + const INDEX_T index = indices[this_index]; + shared_values[threadIdx.x] = values[index]; + shared_indices[threadIdx.x] = index; + } else { + shared_indices[threadIdx.x] = len; + } + __syncthreads(); + for (int inner_depth = max_depth - static_cast(MAX_DEPTH) + 1; inner_depth < max_depth; ++inner_depth) { + const int half_segment_length = (1 << (max_depth - inner_depth - 1)); + const int half_segment_index = this_index / half_segment_length; + if (half_segment_index % 2 == 0) { + const int other_index = static_cast(threadIdx.x) + half_segment_length; + const INDEX_T this_data_index = shared_indices[threadIdx.x]; + const INDEX_T other_data_index = shared_indices[other_index]; + const VAL_T this_value = shared_values[threadIdx.x]; + const VAL_T other_value = shared_values[other_index]; + if (other_data_index < len && (this_value > other_value) == ascending) { + shared_indices[threadIdx.x] = other_data_index; + shared_indices[other_index] = this_data_index; + shared_values[threadIdx.x] = other_value; + shared_values[other_index] = this_value; + } + } + __syncthreads(); + } + if (this_index < len) { + indices[this_index] = shared_indices[threadIdx.x]; + } + __syncthreads(); + } + } } } // namespace LightGBM diff --git a/include/LightGBM/cuda/cuda_split_info.hpp b/include/LightGBM/cuda/cuda_split_info.hpp index f09d2ecdd037..9cac9f32647a 100644 --- a/include/LightGBM/cuda/cuda_split_info.hpp +++ b/include/LightGBM/cuda/cuda_split_info.hpp @@ -33,6 +33,15 @@ struct CUDASplitInfo { data_size_t right_count; double right_gain; double right_value; + + int num_cat_threshold = 0; + uint32_t* cat_threshold = nullptr; + + __device__ ~CUDASplitInfo() { + if (num_cat_threshold > 0 && cat_threshold != nullptr) { + cudaFree(cat_threshold); + } + } }; } // namespace LightGBM diff --git a/include/LightGBM/meta.h b/include/LightGBM/meta.h index 3452f28d8ebc..ee97090cbe0a 100644 --- a/include/LightGBM/meta.h +++ b/include/LightGBM/meta.h @@ -49,6 +49,8 @@ typedef float label_t; const score_t kMinScore = -std::numeric_limits::infinity(); +const score_t kMaxScore = std::numeric_limits::infinity(); + const score_t kEpsilon = 1e-15f; const double kZeroThreshold = 1e-35f; diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 51fc0169b313..0f9fd3522ee8 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -80,7 +80,7 @@ void CUDABestSplitFinder::InitFeatureMetaInfo(const Dataset* train_data) { max_num_bin_in_feature_ = num_bin_hist; } } - if (max_num_bin_in_feature_ > MAX_NUM_BIN_IN_FEATURE) { + if (max_num_bin_in_feature_ > NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER) { use_global_memory_ = true; } else { use_global_memory_ = false; diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index c8bffbc4c39e..e5f5799e6599 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -351,6 +351,266 @@ __device__ void FindBestSplitsForLeafKernelInner( } } +__device__ void FindBestSplitsForLeafKernelCategoricalInner( + // input feature information + const hist_t* feature_hist_ptr, + const uint32_t feature_num_bin, + const uint8_t feature_mfb_offset, + const uint32_t feature_default_bin, + const int inner_feature_index, + // input config parameter values + const double lambda_l1, + const double lambda_l2, + const 
data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const double min_gain_to_split, + const double cat_smooth, + const int max_cat_threshold, + const int min_data_per_group, + // input parent node information + const double parent_gain, + const double sum_gradients, + const double sum_hessians, + const data_size_t num_data, + // task information + const bool is_one_hot, + // output parameters + CUDASplitInfo* cuda_best_split_info) { + __shared__ double shared_gain_buffer[32]; + __shared__ bool shared_found_buffer[32]; + __shared__ uint32_t shared_thread_index_buffer[32]; + __shared__ uint32_t best_thread_index; + const double cnt_factor = num_data / sum_hessians; + const bool use_l1 = lambda_l1 > 0.0f; + const double min_gain_shift = parent_gain + min_gain_to_split; + + double local_gain = kMinScore; + bool threshold_found = false; + + cuda_best_split_info->is_valid = false; + + const int bin_start = 1 - feature_mfb_offset; + const int bin_end = feature_num_bin - feature_mfb_offset; + const int threadIdx_x = static_cast(threadIdx.x); + if (is_one_hot) { + if (threadIdx_x >= bin_start && threadIdx_x < bin_end) { + const int bin_offset = (threadIdx_x << 1); + const hist_t grad = feature_hist_ptr[bin_offset]; + const hist_t hess = feature_hist_ptr[bin_offset + 1]; + data_size_t cnt = + static_cast(__double2int_rn(hess * cnt_factor)); + if (cnt >= min_data_in_leaf && hess >= min_sum_hessian_in_leaf) { + const data_size_t other_count = num_data - cnt; + if (other_count >= min_data_in_leaf) { + const double sum_other_hessian = sum_hessians - hess - kEpsilon; + if (sum_other_hessian >= min_sum_hessian_in_leaf) { + const double sum_other_gradient = sum_gradients - grad; + double current_gain = GetSplitGains( + sum_other_gradient, sum_other_hessian, grad, + hess + kEpsilon, lambda_l1, use_l1, + lambda_l2); + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_found = true; + } + } + } + } + } + __syncthreads(); + const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_gain_buffer, shared_found_buffer, shared_thread_index_buffer); + if (threadIdx_x == 0) { + best_thread_index = result; + } + __syncthreads(); + if (threshold_found && threadIdx_x == best_thread_index) { + cuda_best_split_info->is_valid = true; + cuda_best_split_info->num_cat_threshold = 1; + cuda_best_split_info->cat_threshold = new uint32_t[1]; + *(cuda_best_split_info->cat_threshold) = static_cast(threadIdx_x); + cuda_best_split_info->default_left = false; + const int bin_offset = (threadIdx_x << 1); + const hist_t sum_left_gradient = feature_hist_ptr[bin_offset]; + const hist_t sum_left_hessian = feature_hist_ptr[bin_offset + 1]; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2); + const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2); + cuda_best_split_info->left_sum_gradients = sum_left_gradient; + cuda_best_split_info->left_sum_hessians = sum_left_hessian; + cuda_best_split_info->left_count = left_count; + 
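+      // the remaining fields mirror a numerical split: right-child statistics are the
+      // parent sums minus the chosen bin's sums, and the leaf values/gains use the same
+      // closed forms. For reference, with lambda_l1 == 0 those closed forms reduce to
+      // (illustrative sketch only, helper names are hypothetical, not part of the API):
+      //   leaf_output(G, H) = -G / (H + lambda_l2)
+      //   leaf_gain(G, H)   =  G * G / (H + lambda_l2)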
cuda_best_split_info->right_sum_gradients = sum_right_gradient; + cuda_best_split_info->right_sum_hessians = sum_right_hessian; + cuda_best_split_info->right_count = right_count; + cuda_best_split_info->left_value = left_output; + cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + cuda_best_split_info->right_value = right_output; + cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); + } + } else { + __shared__ double shared_value_buffer[NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER]; + __shared__ int16_t shared_index_buffer[NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER]; + __shared__ uint16_t shared_mem_buffer_uint16[32]; + __shared__ double shared_mem_buffer_double[32]; + __shared__ int used_bin; + uint16_t is_valid_bin = 0; + int best_dir = 0; + double best_sum_left_gradient = 0.0f; + double best_sum_left_hessian = 0.0f; + if (threadIdx_x >= bin_start && threadIdx_x < bin_end) { + const int bin_offset = (threadIdx_x << 1); + const double hess = feature_hist_ptr[bin_offset + 1]; + if (__double2int_rn(hess * cnt_factor) >= cat_smooth) { + const double grad = feature_hist_ptr[bin_offset]; + shared_value_buffer[threadIdx_x] = grad / (hess + cat_smooth); + shared_index_buffer[threadIdx_x] = threadIdx_x; + is_valid_bin = 1; + } else { + shared_value_buffer[threadIdx_x] = kMaxScore; + shared_index_buffer[threadIdx_x] = -1; + } + } else { + shared_value_buffer[threadIdx_x] = kMaxScore; + shared_index_buffer[threadIdx_x] = -1; + } + __syncthreads(); + const int local_used_bin = ShuffleReduceSum(is_valid_bin, shared_mem_buffer_uint16, blockDim.x); + if (threadIdx_x == 0) { + used_bin = local_used_bin; + } + __syncthreads(); + BitonicArgSort_1024(shared_value_buffer, shared_index_buffer, blockDim.x); + const int max_num_cat = min(max_cat_threshold, (used_bin + 1) / 2); + + // left to right + double grad = 0.0f; + double hess = 0.0f; + if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { + const int bin_offset = (shared_index_buffer[threadIdx_x] << 1); + grad = feature_hist_ptr[bin_offset]; + hess = feature_hist_ptr[bin_offset]; + } + if (threadIdx_x == 0) { + hess += kEpsilon; + } + __syncthreads(); + double sum_left_gradient = ShufflePrefixSum(grad, shared_mem_buffer_double); + __syncthreads(); + double sum_left_hessian = ShufflePrefixSum(hess, shared_mem_buffer_double); + // TODO(shiyu1994): constrain the split with min_data_in_group + if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = num_data - left_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, + lambda_l2); + // gain with split is worse than without split + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_found = true; + best_dir = 1; + best_sum_left_gradient = sum_left_gradient; + best_sum_left_hessian = sum_left_hessian; + } + } + } + __syncthreads(); + + // right to left + grad = 0.0f; 
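+      // reverse pass: accumulators are re-zeroed and bins are now taken from the tail of
+      // the sorted order, so the candidate left child collects the categories with the
+      // largest grad / (hess + cat_smooth) values first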
+ hess = 0.0f; + if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { + const int bin_offset = (shared_index_buffer[used_bin - 1 - threadIdx_x] << 1); + grad = feature_hist_ptr[bin_offset]; + hess = feature_hist_ptr[bin_offset]; + } + if (threadIdx_x == 0) { + hess += kEpsilon; + } + __syncthreads(); + sum_left_gradient = ShufflePrefixSum(grad, shared_mem_buffer_double); + __syncthreads(); + sum_left_hessian = ShufflePrefixSum(hess, shared_mem_buffer_double); + // TODO(shiyu1994): constrain the split with min_data_in_group + if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = num_data - left_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, + lambda_l2); + // gain with split is worse than without split + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_found = true; + best_dir = -1; + best_sum_left_gradient = sum_left_gradient; + best_sum_left_hessian = sum_left_hessian; + } + } + } + __syncthreads(); + + const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_gain_buffer, shared_found_buffer, shared_thread_index_buffer); + if (threadIdx_x == 0) { + best_thread_index = result; + } + __syncthreads(); + if (threshold_found && threadIdx_x == best_thread_index) { + cuda_best_split_info->is_valid = true; + cuda_best_split_info->num_cat_threshold = threadIdx_x + 1; + cuda_best_split_info->cat_threshold = new uint32_t[threadIdx_x + 1]; + if (best_dir == 1) { + for (int i = 0; i < threadIdx_x + 1; ++i) { + (cuda_best_split_info->cat_threshold)[i] = shared_index_buffer[i] + feature_mfb_offset; + } + } else { + for (int i = 0; i < threadIdx_x + 1; ++i) { + (cuda_best_split_info->cat_threshold)[i] = shared_index_buffer[used_bin - 1 - i] + feature_mfb_offset; + } + } + cuda_best_split_info->default_left = false; + const hist_t sum_left_gradient = best_sum_left_gradient; + const hist_t sum_left_hessian = best_sum_left_hessian; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2); + const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2); + cuda_best_split_info->left_sum_gradients = sum_left_gradient; + cuda_best_split_info->left_sum_hessians = sum_left_hessian; + cuda_best_split_info->left_count = left_count; + cuda_best_split_info->right_sum_gradients = sum_right_gradient; + cuda_best_split_info->right_sum_hessians = sum_right_hessian; + cuda_best_split_info->right_count = right_count; + cuda_best_split_info->left_value = left_output; + cuda_best_split_info->left_gain = 
GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + cuda_best_split_info->right_value = right_output; + cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); + } + } +} + __global__ void FindBestSplitsForLeafKernel( // input feature information const uint32_t* feature_hist_offsets, @@ -457,10 +717,6 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( const double min_gain_shift = parent_gain + min_gain_to_split; cuda_best_split_info->is_valid = false; - - __shared__ hist_t shared_mem_buffer[32]; - hist_t local_grad_hist = 0.0f; - hist_t local_hess_hist = 0.0f; double local_gain = 0.0f; bool threshold_found = false; uint32_t threshold_value = 0; @@ -484,7 +740,7 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( } } } else { - for (unsigned int bin = 0; bin < feature_num_bin_minus_offset; ++bin) { + for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; ++bin) { const bool skip_sum = bin >= static_cast(na_as_missing) && (skip_default_bin && (feature_num_bin - 1 - bin) == static_cast(feature_default_bin)); if (!skip_sum) { @@ -503,50 +759,58 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( hist_hess_buffer_ptr[0] += kEpsilon; } local_gain = kMinScore; - local_grad_hist = ShufflePrefixSum(local_grad_hist, shared_mem_buffer); + GlobalMemoryPrefixSum(hist_grad_buffer_ptr, static_cast(feature_num_bin_minus_offset)); __syncthreads(); - local_hess_hist = ShufflePrefixSum(local_hess_hist, shared_mem_buffer); + GlobalMemoryPrefixSum(hist_hess_buffer_ptr, static_cast(feature_num_bin_minus_offset)); if (reverse) { - if (threadIdx_x >= static_cast(na_as_missing) && threadIdx_x <= feature_num_bin - 2 && !skip_sum) { - const double sum_right_gradient = local_grad_hist; - const double sum_right_hessian = local_hess_hist; - const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); - const double sum_left_gradient = sum_gradients - sum_right_gradient; - const double sum_left_hessian = sum_hessians - sum_right_hessian; - const data_size_t left_count = num_data - right_count; - if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && - sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { - double current_gain = GetSplitGains( - sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, - lambda_l2); - // gain with split is worse than without split - if (current_gain > min_gain_shift) { - local_gain = current_gain - min_gain_shift; - threshold_value = static_cast(feature_num_bin - 2 - threadIdx_x); - threshold_found = true; + for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; ++bin) { + const bool skip_sum = (bin >= static_cast(na_as_missing) && + (skip_default_bin && (feature_num_bin - 1 - bin) == static_cast(feature_default_bin))); + if (!skip_sum) { + const double sum_right_gradient = hist_grad_buffer_ptr[bin]; + const double sum_right_hessian = hist_hess_buffer_ptr[bin]; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + const double sum_left_gradient = sum_gradients - sum_right_gradient; + const double sum_left_hessian = sum_hessians - sum_right_hessian; + const data_size_t left_count = num_data - right_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + 
sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, + lambda_l2); + // gain with split is worse than without split + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_value = static_cast(feature_num_bin - 2 - bin); + threshold_found = true; + } } } } } else { - if (threadIdx_x <= feature_num_bin_minus_offset - 2 && !skip_sum) { - const double sum_left_gradient = local_grad_hist; - const double sum_left_hessian = local_hess_hist; - const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); - const double sum_right_gradient = sum_gradients - sum_left_gradient; - const double sum_right_hessian = sum_hessians - sum_left_hessian; - const data_size_t right_count = num_data - left_count; - if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && - sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { - double current_gain = GetSplitGains( - sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, - lambda_l2); - // gain with split is worse than without split - if (current_gain > min_gain_shift) { - local_gain = current_gain - min_gain_shift; - threshold_value = static_cast(threadIdx_x + feature_mfb_offset); - threshold_found = true; + for (unsigned int bin = threadIdx_x; bin <= feature_num_bin_minus_offset - 2; ++bin) { + const bool skip_sum = + (skip_default_bin && (bin + feature_mfb_offset) == static_cast(feature_default_bin)); + if (!skip_sum) { + const double sum_left_gradient = hist_grad_buffer_ptr[bin]; + const double sum_left_hessian = hist_hess_buffer_ptr[bin]; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = num_data - left_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, + lambda_l2); + // gain with split is worse than without split + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_value = static_cast(bin + feature_mfb_offset); + threshold_found = true; + } } } } @@ -563,8 +827,9 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( cuda_best_split_info->gain = local_gain; cuda_best_split_info->default_left = assume_out_default_left; if (reverse) { - const double sum_right_gradient = local_grad_hist; - const double sum_right_hessian = local_hess_hist - kEpsilon; + const unsigned int best_bin = static_cast(feature_num_bin - 2 - threshold_value); + const double sum_right_gradient = hist_grad_buffer_ptr[best_bin]; + const double sum_right_hessian = hist_hess_buffer_ptr[best_bin] - kEpsilon; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); const double sum_left_gradient = sum_gradients - sum_right_gradient; const double sum_left_hessian = sum_hessians - sum_right_hessian - kEpsilon; @@ -586,8 +851,9 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( 
cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); } else { - const double sum_left_gradient = local_grad_hist; - const double sum_left_hessian = local_hess_hist - kEpsilon; + const unsigned int best_bin = static_cast(threshold_value - feature_mfb_offset); + const double sum_left_gradient = hist_grad_buffer_ptr[best_bin]; + const double sum_left_hessian = hist_hess_buffer_ptr[best_bin] - kEpsilon; const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); const double sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian - kEpsilon; @@ -612,6 +878,271 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( } } +__device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( + // input feature information + const hist_t* feature_hist_ptr, + const uint32_t feature_num_bin, + const uint8_t feature_mfb_offset, + const uint32_t feature_default_bin, + const int inner_feature_index, + // input config parameter values + const double lambda_l1, + const double lambda_l2, + const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const double min_gain_to_split, + const double cat_smooth, + const int max_cat_threshold, + const int min_data_per_group, + // input parent node information + const double parent_gain, + const double sum_gradients, + const double sum_hessians, + const data_size_t num_data, + // task information + const bool is_one_hot, + // buffer + hist_t* hist_grad_buffer_ptr, + hist_t* hist_hess_buffer_ptr, + hist_t* hist_stat_buffer_ptr, + data_size_t* hist_index_buffer_ptr, + // output parameters + CUDASplitInfo* cuda_best_split_info) { + __shared__ double shared_gain_buffer[32]; + __shared__ bool shared_found_buffer[32]; + __shared__ uint32_t shared_thread_index_buffer[32]; + __shared__ uint32_t best_thread_index; + const double cnt_factor = num_data / sum_hessians; + const bool use_l1 = lambda_l1 > 0.0f; + const double min_gain_shift = parent_gain + min_gain_to_split; + + double local_gain = kMinScore; + bool threshold_found = false; + + cuda_best_split_info->is_valid = false; + + const int bin_start = 1 - feature_mfb_offset; + const int bin_end = feature_num_bin - feature_mfb_offset; + int best_threshold = -1; + const int threadIdx_x = static_cast(threadIdx.x); + if (is_one_hot) { + for (int bin = bin_start + threadIdx_x; bin < bin_end; bin += static_cast(blockDim.x)) { + const int bin_offset = (bin << 1); + const hist_t grad = feature_hist_ptr[bin_offset]; + const hist_t hess = feature_hist_ptr[bin_offset + 1]; + data_size_t cnt = + static_cast(__double2int_rn(hess * cnt_factor)); + if (cnt >= min_data_in_leaf && hess >= min_sum_hessian_in_leaf) { + const data_size_t other_count = num_data - cnt; + if (other_count >= min_data_in_leaf) { + const double sum_other_hessian = sum_hessians - hess - kEpsilon; + if (sum_other_hessian >= min_sum_hessian_in_leaf) { + const double sum_other_gradient = sum_gradients - grad; + double current_gain = GetSplitGains( + sum_other_gradient, sum_other_hessian, grad, + hess + kEpsilon, lambda_l1, use_l1, + lambda_l2); + if (current_gain > min_gain_shift) { + best_threshold = bin; + local_gain = current_gain - min_gain_shift; + threshold_found = true; + } + } + } + } + } + __syncthreads(); + const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_gain_buffer, shared_found_buffer, 
shared_thread_index_buffer); + if (threadIdx_x == 0) { + best_thread_index = result; + } + __syncthreads(); + if (threshold_found && threadIdx_x == best_thread_index) { + cuda_best_split_info->is_valid = true; + cuda_best_split_info->num_cat_threshold = 1; + cuda_best_split_info->cat_threshold = new uint32_t[1]; + *(cuda_best_split_info->cat_threshold) = static_cast(best_threshold); + cuda_best_split_info->default_left = false; + const int bin_offset = (best_threshold << 1); + const hist_t sum_left_gradient = feature_hist_ptr[bin_offset]; + const hist_t sum_left_hessian = feature_hist_ptr[bin_offset + 1]; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2); + const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2); + cuda_best_split_info->left_sum_gradients = sum_left_gradient; + cuda_best_split_info->left_sum_hessians = sum_left_hessian; + cuda_best_split_info->left_count = left_count; + cuda_best_split_info->right_sum_gradients = sum_right_gradient; + cuda_best_split_info->right_sum_hessians = sum_right_hessian; + cuda_best_split_info->right_count = right_count; + cuda_best_split_info->left_value = left_output; + cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + cuda_best_split_info->right_value = right_output; + cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); + } + } else { + __shared__ uint16_t shared_mem_buffer_uint16[32]; + __shared__ int used_bin; + uint16_t is_valid_bin = 0; + int best_dir = 0; + double best_sum_left_gradient = 0.0f; + double best_sum_left_hessian = 0.0f; + for (int bin = 0; bin < bin_end; bin += static_cast(blockDim.x)) { + if (bin >= bin_start) { + const int bin_offset = (bin << 1); + const double hess = feature_hist_ptr[bin_offset + 1]; + if (__double2int_rn(hess * cnt_factor) >= cat_smooth) { + const double grad = feature_hist_ptr[bin_offset]; + hist_stat_buffer_ptr[bin] = grad / (hess + cat_smooth); + hist_index_buffer_ptr[bin] = threadIdx_x; + is_valid_bin = 1; + } else { + hist_stat_buffer_ptr[bin] = kMaxScore; + hist_index_buffer_ptr[bin] = -1; + } + } + } + __syncthreads(); + const int local_used_bin = ShuffleReduceSum(is_valid_bin, shared_mem_buffer_uint16, blockDim.x); + if (threadIdx_x == 0) { + used_bin = local_used_bin; + } + __syncthreads(); + BitonicArgSortDevice( + hist_stat_buffer_ptr, hist_index_buffer_ptr, feature_num_bin - feature_mfb_offset); + const int max_num_cat = min(max_cat_threshold, (used_bin + 1) / 2); + __syncthreads(); + + // left to right + if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { + const int bin_offset = (hist_index_buffer_ptr[threadIdx_x] << 1); + hist_grad_buffer_ptr[threadIdx_x] = feature_hist_ptr[bin_offset]; + hist_hess_buffer_ptr[threadIdx_x] = feature_hist_ptr[bin_offset + 1]; + } + if (threadIdx_x == 0) { + hist_hess_buffer_ptr[0] += kEpsilon; + } + __syncthreads(); + GlobalMemoryPrefixSum(hist_grad_buffer_ptr, 
static_cast(bin_end)); + __syncthreads(); + GlobalMemoryPrefixSum(hist_hess_buffer_ptr, static_cast(bin_end)); + // TODO(shiyu1994): constrain the split with min_data_in_group + if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { + const double sum_left_gradient = hist_grad_buffer_ptr[threadIdx_x]; + const double sum_left_hessian = hist_hess_buffer_ptr[threadIdx_x]; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = num_data - left_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, + lambda_l2); + // gain with split is worse than without split + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_found = true; + best_dir = 1; + best_sum_left_gradient = sum_left_gradient; + best_sum_left_hessian = sum_left_hessian; + } + } + } + __syncthreads(); + + // right to left + if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { + const int bin_offset = (hist_index_buffer_ptr[used_bin - 1 - threadIdx_x] << 1); + hist_grad_buffer_ptr[threadIdx_x] = feature_hist_ptr[bin_offset]; + hist_hess_buffer_ptr[threadIdx_x] = feature_hist_ptr[bin_offset + 1]; + } + if (threadIdx_x == 0) { + hist_hess_buffer_ptr[0] += kEpsilon; + } + __syncthreads(); + GlobalMemoryPrefixSum(hist_grad_buffer_ptr, static_cast(bin_end)); + __syncthreads(); + GlobalMemoryPrefixSum(hist_hess_buffer_ptr, static_cast(bin_end)); + // TODO(shiyu1994): constrain the split with min_data_in_group + if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { + const double sum_left_gradient = hist_grad_buffer_ptr[threadIdx_x]; + const double sum_left_hessian = hist_hess_buffer_ptr[threadIdx_x]; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = num_data - left_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, + lambda_l2); + // gain with split is worse than without split + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_found = true; + best_dir = -1; + best_sum_left_gradient = sum_left_gradient; + best_sum_left_hessian = sum_left_hessian; + } + } + } + __syncthreads(); + + const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_gain_buffer, shared_found_buffer, shared_thread_index_buffer); + if (threadIdx_x == 0) { + best_thread_index = result; + } + __syncthreads(); + if (threshold_found && threadIdx_x == best_thread_index) { + cuda_best_split_info->is_valid = true; + cuda_best_split_info->num_cat_threshold = threadIdx_x + 1; + cuda_best_split_info->cat_threshold = new uint32_t[threadIdx_x + 1]; + if (best_dir == 1) { + for (int i = 0; i < threadIdx_x + 1; ++i) { 
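+          // record the chosen prefix of the sorted bin order as the split's category set,
+          // adding feature_mfb_offset back to map to the original bin indices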
+ (cuda_best_split_info->cat_threshold)[i] = hist_index_buffer_ptr[i] + feature_mfb_offset; + } + } else { + for (int i = 0; i < threadIdx_x + 1; ++i) { + (cuda_best_split_info->cat_threshold)[i] = hist_index_buffer_ptr[used_bin - 1 - i] + feature_mfb_offset; + } + } + cuda_best_split_info->default_left = false; + const hist_t sum_left_gradient = best_sum_left_gradient; + const hist_t sum_left_hessian = best_sum_left_hessian; + const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + const double sum_right_gradient = sum_gradients - sum_left_gradient; + const double sum_right_hessian = sum_hessians - sum_left_hessian; + const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2); + const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2); + cuda_best_split_info->left_sum_gradients = sum_left_gradient; + cuda_best_split_info->left_sum_hessians = sum_left_hessian; + cuda_best_split_info->left_count = left_count; + cuda_best_split_info->right_sum_gradients = sum_right_gradient; + cuda_best_split_info->right_sum_hessians = sum_right_hessian; + cuda_best_split_info->right_count = right_count; + cuda_best_split_info->left_value = left_output; + cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + cuda_best_split_info->right_value = right_output; + cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); + } + } +} + __global__ void FindBestSplitsForLeafKernel_GlobalMemory( // input feature information const uint32_t* feature_hist_offsets, @@ -707,66 +1238,136 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( if (!is_smaller_leaf_valid) { larger_only = true; } - if (!larger_only) { - FindBestSplitsForLeafKernel<<>>( - // input feature information - cuda_feature_hist_offsets_, - cuda_feature_mfb_offsets_, - cuda_feature_default_bins_, - cuda_feature_num_bins_, - cuda_is_feature_used_bytree_, - // input task information - larger_only, - num_tasks_, - cuda_task_feature_index_, - cuda_task_reverse_, - cuda_task_skip_default_bin_, - cuda_task_na_as_missing_, - cuda_task_out_default_left_, - // input leaf information - smaller_leaf_index, - smaller_leaf_splits, - larger_leaf_index, - larger_leaf_splits, - // configuration parameter values - min_data_in_leaf_, - min_sum_hessian_in_leaf_, - min_gain_to_split_, - lambda_l1_, - lambda_l2_, - // output parameters - cuda_best_split_info_); - } - SynchronizeCUDADevice(__FILE__, __LINE__); - if (larger_leaf_index >= 0) { - FindBestSplitsForLeafKernel<<>>( - // input feature information - cuda_feature_hist_offsets_, - cuda_feature_mfb_offsets_, - cuda_feature_default_bins_, - cuda_feature_num_bins_, - cuda_is_feature_used_bytree_, - // input task information - true, - num_tasks_, - cuda_task_feature_index_, - cuda_task_reverse_, - cuda_task_skip_default_bin_, - cuda_task_na_as_missing_, - cuda_task_out_default_left_, - // input leaf information - smaller_leaf_index, - smaller_leaf_splits, - larger_leaf_index, - larger_leaf_splits, - // configuration parameter values - min_data_in_leaf_, - min_sum_hessian_in_leaf_, - min_gain_to_split_, - lambda_l1_, - lambda_l2_, - // 
output parameters - cuda_best_split_info_); + if (!use_global_memory_) { + if (!larger_only) { + FindBestSplitsForLeafKernel<<>>( + // input feature information + cuda_feature_hist_offsets_, + cuda_feature_mfb_offsets_, + cuda_feature_default_bins_, + cuda_feature_num_bins_, + cuda_is_feature_used_bytree_, + // input task information + larger_only, + num_tasks_, + cuda_task_feature_index_, + cuda_task_reverse_, + cuda_task_skip_default_bin_, + cuda_task_na_as_missing_, + cuda_task_out_default_left_, + // input leaf information + smaller_leaf_index, + smaller_leaf_splits, + larger_leaf_index, + larger_leaf_splits, + // configuration parameter values + min_data_in_leaf_, + min_sum_hessian_in_leaf_, + min_gain_to_split_, + lambda_l1_, + lambda_l2_, + // output parameters + cuda_best_split_info_); + } + SynchronizeCUDADevice(__FILE__, __LINE__); + if (larger_leaf_index >= 0) { + FindBestSplitsForLeafKernel<<>>( + // input feature information + cuda_feature_hist_offsets_, + cuda_feature_mfb_offsets_, + cuda_feature_default_bins_, + cuda_feature_num_bins_, + cuda_is_feature_used_bytree_, + // input task information + true, + num_tasks_, + cuda_task_feature_index_, + cuda_task_reverse_, + cuda_task_skip_default_bin_, + cuda_task_na_as_missing_, + cuda_task_out_default_left_, + // input leaf information + smaller_leaf_index, + smaller_leaf_splits, + larger_leaf_index, + larger_leaf_splits, + // configuration parameter values + min_data_in_leaf_, + min_sum_hessian_in_leaf_, + min_gain_to_split_, + lambda_l1_, + lambda_l2_, + // output parameters + cuda_best_split_info_); + } + } else { + if (!larger_only) { + FindBestSplitsForLeafKernel_GlobalMemory<<>>( + // input feature information + cuda_feature_hist_offsets_, + cuda_feature_mfb_offsets_, + cuda_feature_default_bins_, + cuda_feature_num_bins_, + cuda_is_feature_used_bytree_, + // input task information + larger_only, + num_tasks_, + cuda_task_feature_index_, + cuda_task_reverse_, + cuda_task_skip_default_bin_, + cuda_task_na_as_missing_, + cuda_task_out_default_left_, + // input leaf information + smaller_leaf_index, + smaller_leaf_splits, + larger_leaf_index, + larger_leaf_splits, + // configuration parameter values + min_data_in_leaf_, + min_sum_hessian_in_leaf_, + min_gain_to_split_, + lambda_l1_, + lambda_l2_, + // buffer + cuda_feature_hist_grad_buffer_, + cuda_feature_hist_hess_buffer_, + // output parameters + cuda_best_split_info_); + } + SynchronizeCUDADevice(__FILE__, __LINE__); + if (larger_leaf_index >= 0) { + FindBestSplitsForLeafKernel_GlobalMemory<<>>( + // input feature information + cuda_feature_hist_offsets_, + cuda_feature_mfb_offsets_, + cuda_feature_default_bins_, + cuda_feature_num_bins_, + cuda_is_feature_used_bytree_, + // input task information + true, + num_tasks_, + cuda_task_feature_index_, + cuda_task_reverse_, + cuda_task_skip_default_bin_, + cuda_task_na_as_missing_, + cuda_task_out_default_left_, + // input leaf information + smaller_leaf_index, + smaller_leaf_splits, + larger_leaf_index, + larger_leaf_splits, + // configuration parameter values + min_data_in_leaf_, + min_sum_hessian_in_leaf_, + min_gain_to_split_, + lambda_l1_, + lambda_l2_, + // buffer + cuda_feature_hist_grad_buffer_, + cuda_feature_hist_hess_buffer_, + // output parameters + cuda_best_split_info_); + } } } diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 0d528875ab02..74ce370ce910 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ 
b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -18,7 +18,7 @@ #include "cuda_leaf_splits.hpp" -#define MAX_NUM_BIN_IN_FEATURE (1024) +#define NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER (1024) #define NUM_THREADS_FIND_BEST_LEAF (256) #define NUM_TASKS_PER_SYNC_BLOCK (1024) From 25f20a700a2cdd01a2640cc5dd711895240a92e2 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 27 Oct 2021 13:59:36 +0000 Subject: [PATCH 105/166] add bitvectors for categorical split --- include/LightGBM/bin.h | 17 +++++ include/LightGBM/cuda/cuda_algorithms.hpp | 31 ++++++++ include/LightGBM/cuda/cuda_common.hpp | 24 +++++++ include/LightGBM/cuda/cuda_split_info.hpp | 10 ++- include/LightGBM/dataset.h | 6 ++ src/cuda/cuda_common.cu | 68 ++++++++++++++++++ .../cuda/cuda_best_split_finder.cpp | 72 ++++++++++++------- .../cuda/cuda_best_split_finder.cu | 18 ++++- .../cuda/cuda_best_split_finder.hpp | 13 +++- .../cuda/cuda_single_gpu_tree_learner.cpp | 7 +- .../cuda/cuda_single_gpu_tree_learner.cu | 7 ++ .../cuda/cuda_single_gpu_tree_learner.hpp | 4 ++ 12 files changed, 246 insertions(+), 31 deletions(-) create mode 100644 include/LightGBM/cuda/cuda_common.hpp create mode 100644 src/cuda/cuda_common.cu diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 0e63851bfee0..7c9add123c26 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -119,6 +119,23 @@ class BinMapper { } } + /*! + * \brief Maximum categorical value + * \return Maximum categorical value for categorical features, 0 for numerical features + */ + inline int MaxCatValue() const { + if (bin_2_categorical_.size() == 0) { + return 0; + } + int max_cat_value = bin_2_categorical_[0]; + for (size_t i = 1; i < bin_2_categorical_.size(); ++i) { + if (bin_2_categorical_[i] > max_cat_value) { + max_cat_value = bin_2_categorical_[i]; + } + } + return max_cat_value; + } + /*! * \brief Get sizes in byte of this object */ diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 641169476c1b..a26f703083e1 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -138,6 +138,37 @@ __device__ __forceinline__ T ShuffleReduceSum(T value, T* shared_mem_buffer, con return value; } +template +__device__ __forceinline__ T ShuffleReduceMaxWarp(T value, const data_size_t len) { + if (len > 0) { + // TODO(shiyu1994): check how mask works + const uint32_t mask = 0xffffffff; + for (int offset = warpSize / 2; offset > 0; offset >>= 1) { + value = max(value, __shfl_down_sync(mask, value, offset)); + } + } + return value; +} + +// reduce values from an 1-dimensional block (block size must be no greather than 1024) +template +__device__ __forceinline__ T ShuffleReduceMax(T value, T* shared_mem_buffer, const size_t len) { + const uint32_t warpLane = threadIdx.x % warpSize; + const uint32_t warpID = threadIdx.x / warpSize; + const data_size_t warp_len = min(static_cast(warpSize), static_cast(len) - static_cast(warpID * warpSize)); + value = ShuffleReduceMaxWarp(value, warp_len); + if (warpLane == 0) { + shared_mem_buffer[warpID] = value; + } + __syncthreads(); + const data_size_t num_warp = static_cast((len + warpSize - 1) / warpSize); + if (warpID == 0) { + value = (warpLane < num_warp ? 
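+    // only the first num_warp lanes carry a partial maximum into this final warp-level
+    // reduction; the remaining lanes contribute 0, which assumes the reduced values are
+    // non-negative (true for the bitset-length use in this patch)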
shared_mem_buffer[warpLane] : 0); + value = ShuffleReduceMaxWarp(value, num_warp); + } + return value; +} + // calculate prefix sum values within an 1-dimensional block in global memory, exclusively template __device__ __forceinline__ void GlobalMemoryPrefixSum(T* array, const size_t len) { diff --git a/include/LightGBM/cuda/cuda_common.hpp b/include/LightGBM/cuda/cuda_common.hpp new file mode 100644 index 000000000000..9e7332b50641 --- /dev/null +++ b/include/LightGBM/cuda/cuda_common.hpp @@ -0,0 +1,24 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#ifndef LIGHTGBM_CUDA_CUDA_COMMON_HPP_ +#define LIGHTGBM_CUDA_CUDA_COMMON_HPP_ + +#ifdef USE_CUDA + +#define NUM_THREADS_PER_BLOCK_CUDA_COMMON (1024) + +#include + +namespace LightGBM { + +size_t CUDABitsetLen(const CUDASplitInfo* split_info, size_t* out_len_buffer); + +void CUDAConstructBitset(const CUDASplitInfo* split_info, uint32_t* out); + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_CUDA_CUDA_COMMON_HPP_ diff --git a/include/LightGBM/cuda/cuda_split_info.hpp b/include/LightGBM/cuda/cuda_split_info.hpp index 9cac9f32647a..8cdf31a8f93b 100644 --- a/include/LightGBM/cuda/cuda_split_info.hpp +++ b/include/LightGBM/cuda/cuda_split_info.hpp @@ -36,10 +36,16 @@ struct CUDASplitInfo { int num_cat_threshold = 0; uint32_t* cat_threshold = nullptr; + int* cat_threshold_real = nullptr; __device__ ~CUDASplitInfo() { - if (num_cat_threshold > 0 && cat_threshold != nullptr) { - cudaFree(cat_threshold); + if (num_cat_threshold > 0) { + if (cat_threshold != nullptr) { + cudaFree(cat_threshold); + } + if (cat_threshold_real != nullptr) { + cudaFree(cat_threshold_real); + } } } }; diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index c4344d0a0fa9..1d49f1a9407a 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -610,6 +610,12 @@ class Dataset { return feature_groups_[group]->bin_mappers_[sub_feature]->ValueToBin(threshold_double); } + inline int MaxRealCatValue(int i) const { + const int group = feature2group_[i]; + const int sub_feature = feature2subfeature_[i]; + return feature_groups_[group]->bin_mappers_[sub_feature]->MaxCatValue(); + } + /*! * \brief Get meta data pointer * \return Pointer of meta data diff --git a/src/cuda/cuda_common.cu b/src/cuda/cuda_common.cu new file mode 100644 index 000000000000..a18839c2015b --- /dev/null +++ b/src/cuda/cuda_common.cu @@ -0,0 +1,68 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ + +#ifdef USE_CUDA + +#include +#include +#include + +namespace LightGBM { + +template +__global__ void CalcBitsetLenKernel(const T* vals, int n, size_t* out_len_buffer) { + __shared__ size_t shared_mem_buffer[32]; + const int i = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + size_t len = 0; + if (i < n) { + const T val = vals[i]; + len = (val / 32) + 1; + } + const size_t block_max_len = ShuffleReduceMax(len, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + out_len_buffer[blockIdx.x] = block_max_len; + } +} + +__global__ void ReduceBlockMaxLen(const size_t* out_len_buffer, const int num_blocks) { + __shared__ size_t shared_mem_buffer[32]; + size_t max_len = 0; + for (int i = static_cast(threadIdx.x); i < num_blocks; i += static_cast(blockDim.x)) { + max_len = max(out_len_buffer[i]); + } + const all_max_len = ShuffleReduceMax(max_len, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + out_len_buffer[0] = max_len; + } +} + +template +__global__ void CUDAConstructBitsetKernel(const T* vals, int n, uint32_t* out) { + const int i = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (i < n) { + const T val = vals[i]; + out[val / 32] |= (0x1 << (val % 32)); + } +} + +template +void CUDAConstructBitsetInner(const T* vals, int n, uint32_t* out) { + const int num_blocks = (n + NUM_THREADS_PER_BLOCK_CUDA_COMMON - 1) / NUM_THREADS_PER_BLOCK_CUDA_COMMON; + CUDAConstructBitsetKernel<<>>(vals, n, out); +} + +template +size_t CUDABitsetLenInner(const T* vals, int n, size_t* out_len_buffer) { + const int num_blocks = (n + NUM_THREADS_PER_BLOCK_CUDA_COMMON - 1) / NUM_THREADS_PER_BLOCK_CUDA_COMMON; + CalcBitsetLenKernel<<>>(vals, n, out_len_buffer); + ReduceBlockMaxLen<<<1, NUM_THREADS_PER_BLOCK_CUDA_COMMON>>>(out_len_buffer, num_blocks); + size_t host_max_len = 0; + CopyFromCUDADeviceToHost(&host_max_len, out_len_buffer, 1, __FILE__, __LINE__); + return host_max_len; +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 0f9fd3522ee8..3f78d44cf84b 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -24,6 +24,7 @@ CUDABestSplitFinder::CUDABestSplitFinder( min_data_in_leaf_(config->min_data_in_leaf), min_sum_hessian_in_leaf_(config->min_sum_hessian_in_leaf), min_gain_to_split_(config->min_gain_to_split), + max_cat_threshold_(config->max_cat_threshold), num_total_bin_(feature_hist_offsets.back()), cuda_hist_(cuda_hist) { InitFeatureMetaInfo(train_data); @@ -68,8 +69,16 @@ void CUDABestSplitFinder::InitFeatureMetaInfo(const Dataset* train_data) { feature_default_bins_.resize(num_features_); feature_num_bins_.resize(num_features_); max_num_bin_in_feature_ = 0; + has_categorical_feature_ = false; + max_num_categorical_bin_ = 0; for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { const BinMapper* bin_mapper = train_data->FeatureBinMapper(inner_feature_index); + if (bin_mapper->bin_type() == BinType::CategoricalBin) { + has_categorical_feature_ = true; + if (bin_mapper->num_bin() > max_num_categorical_bin_) { + max_num_categorical_bin_ = bin_mapper->num_bin(); + } + } const MissingType missing_type = bin_mapper->missing_type(); feature_missing_type_[inner_feature_index] = missing_type; feature_mfb_offsets_[inner_feature_index] = static_cast(bin_mapper->GetMostFreqBin() == 0); @@ -169,37 +178,41 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { const 
size_t cuda_best_leaf_split_info_buffer_size = static_cast(num_task_blocks) * static_cast(num_leaves_); AllocateCUDAMemory(&cuda_leaf_best_split_info_, - cuda_best_leaf_split_info_buffer_size, - __FILE__, - __LINE__); + cuda_best_leaf_split_info_buffer_size, + __FILE__, + __LINE__); InitCUDAMemoryFromHostMemory(&cuda_task_feature_index_, - host_task_feature_index_.data(), - host_task_feature_index_.size(), - __FILE__, - __LINE__); + host_task_feature_index_.data(), + host_task_feature_index_.size(), + __FILE__, + __LINE__); InitCUDAMemoryFromHostMemory(&cuda_task_reverse_, - host_task_reverse_.data(), - host_task_reverse_.size(), - __FILE__, - __LINE__); + host_task_reverse_.data(), + host_task_reverse_.size(), + __FILE__, + __LINE__); InitCUDAMemoryFromHostMemory(&cuda_task_skip_default_bin_, - host_task_skip_default_bin_.data(), - host_task_skip_default_bin_.size(), - __FILE__, - __LINE__); + host_task_skip_default_bin_.data(), + host_task_skip_default_bin_.size(), + __FILE__, + __LINE__); InitCUDAMemoryFromHostMemory(&cuda_task_na_as_missing_, - host_task_na_as_missing_.data(), - host_task_na_as_missing_.size(), - __FILE__, - __LINE__); + host_task_na_as_missing_.data(), + host_task_na_as_missing_.size(), + __FILE__, + __LINE__); InitCUDAMemoryFromHostMemory(&cuda_task_out_default_left_, - host_task_out_default_left_.data(), - host_task_out_default_left_.size(), - __FILE__, - __LINE__); + host_task_out_default_left_.data(), + host_task_out_default_left_.size(), + __FILE__, + __LINE__); const size_t output_buffer_size = 2 * static_cast(num_tasks_); AllocateCUDAMemory(&cuda_best_split_info_, output_buffer_size, __FILE__, __LINE__); + if (has_categorical_feature_) { + AllocateCatVectors(cuda_leaf_best_split_info_, cuda_best_leaf_split_info_buffer_size); + AllocateCatVectors(cuda_best_split_info_, output_buffer_size); + } } void CUDABestSplitFinder::ResetTrainingData( @@ -236,9 +249,12 @@ void CUDABestSplitFinder::ResetConfig(const Config* config) { const size_t cuda_best_leaf_split_info_buffer_size = static_cast(num_task_blocks) * static_cast(num_leaves_); DeallocateCUDAMemory(&cuda_leaf_best_split_info_, __FILE__, __LINE__); AllocateCUDAMemory(&cuda_leaf_best_split_info_, - cuda_best_leaf_split_info_buffer_size, - __FILE__, - __LINE__); + cuda_best_leaf_split_info_buffer_size, + __FILE__, + __LINE__); + if (has_categorical_feature_) { + AllocateCatVectors(cuda_leaf_best_split_info_, cuda_best_leaf_split_info_buffer_size); + } } void CUDABestSplitFinder::BeforeTrain(const std::vector& is_feature_used_bytree) { @@ -294,6 +310,10 @@ const CUDASplitInfo* CUDABestSplitFinder::FindBestFromAllSplits( return cuda_leaf_best_split_info_ + (*best_leaf_index); } +void CUDABestSplitFinder::AllocateCatVectors(CUDASplitInfo* cuda_split_infos, size_t len) const { + LaunchAllocateCatVectorsKernel(cuda_split_infos, len); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index e5f5799e6599..f421d1c4fd32 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -1660,7 +1660,8 @@ __global__ void PrepareLeafBestSplitInfo(const int smaller_leaf_index, const int } } -void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int cur_num_leaves, +void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel( + const int cur_num_leaves, const int smaller_leaf_index, const int larger_leaf_index, int* smaller_leaf_best_split_feature, uint32_t* 
smaller_leaf_best_split_threshold, @@ -1689,6 +1690,21 @@ void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel(const int cur_num_le *best_leaf_index = host_leaf_best_split_info_buffer[6]; } +__global__ void AllocateCatVectorsKernel( + CUDASplitInfo* cuda_split_infos, size_t len, + const int max_num_categories_in_split) { + const size_t i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < len) { + cuda_split_infos[i]->cat_threshold = new uint32_t[max_num_categories_in_split]; + cuda_split_infos[i]->cat_threshold_real = new int[max_num_categories_in_split]; + } +} + +void CUDABestSplitFinder::LaunchAllocateCatVectorsKernel(CUDASplitInfo* cuda_split_infos, size_t len) const { + const int max_num_categories_in_split = min(max_cat_threshold_ / 2, max_num_categorical_bin_); + AllocateCatVectorsKernel(cuda_split_infos, len, max_num_categories_in_split); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 74ce370ce910..394cf286db7c 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -87,7 +87,9 @@ class CUDABestSplitFinder { const bool is_smaller_leaf_valid, const bool is_larger_leaf_valid); - void LaunchFindBestFromAllSplitsKernel(const int cur_num_leaves, const int smaller_leaf_index, + void LaunchFindBestFromAllSplitsKernel( + const int cur_num_leaves, + const int smaller_leaf_index, const int larger_leaf_index, int* smaller_leaf_best_split_feature, uint32_t* smaller_leaf_best_split_threshold, @@ -97,6 +99,10 @@ class CUDABestSplitFinder { uint8_t* larger_leaf_best_split_default_left, int* best_leaf_index); + void AllocateCatVectors(CUDASplitInfo* cuda_split_infos, size_t len) const; + + void LaunchAllocateCatVectorsKernel(CUDASplitInfo* cuda_split_infos, size_t len) const; + // Host memory int num_features_; int num_leaves_; @@ -111,6 +117,7 @@ class CUDABestSplitFinder { data_size_t min_data_in_leaf_; double min_sum_hessian_in_leaf_; double min_gain_to_split_; + int max_cat_threshold_; std::vector cuda_streams_; // for best split find tasks std::vector host_task_feature_index_; @@ -123,6 +130,10 @@ class CUDABestSplitFinder { bool use_global_memory_; // number of total bins in the dataset const int num_total_bin_; + // has categorical feature + bool has_categorical_feature_; + // maximum number of bins of categorical features + int max_num_categorical_bin_; // CUDA memory, held by this object // for per leaf best split information diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index bf6953c1358f..8e38d936fc10 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -169,7 +169,7 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, int right_leaf_index = tree->Split(best_leaf_index_, train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]), train_data_->RealThreshold(leaf_best_split_feature_[best_leaf_index_], - leaf_best_split_threshold_[best_leaf_index_]), + leaf_best_split_threshold_[best_leaf_index_]), train_data_->FeatureBinMapper(leaf_best_split_feature_[best_leaf_index_])->missing_type(), best_split_info); @@ -324,6 +324,11 @@ void CUDASingleGPUTreeLearner::ReduceLeafStat( LaunchReduceLeafStatKernel(gradients, hessians, old_tree->num_leaves(), refit_num_data_, old_tree->cuda_leaf_value_ref(), old_tree->shrinkage()); } +void 
CUDASingleGPUTreeLearner::ConstructBitsetForCategoricalSplit( + const CUDASplitInfo* best_split_info) const { + LaunchConstructBitsetForCategoricalSplitKernel(best_split_info); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu index d8ec44aa8e17..3734b720d1d4 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu @@ -113,6 +113,13 @@ void CUDASingleGPUTreeLearner::LaunchReduceLeafStatKernel( config_->lambda_l1, use_l1, config_->lambda_l2, shrinkage_rate, config_->refit_decay_rate, cuda_leaf_value); } + + +void CUDASingleGPUTreeLearner::LaunchConstructBitsetForCategoricalSplitKernel( + const CUDASplitInfo* best_split_info) const { + +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index becbbd608367..7bc0ad48f025 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -57,6 +57,10 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { void LaunchReduceLeafStatKernel(const score_t* gradients, const score_t* hessians, const int num_leaves, const data_size_t num_data, double* cuda_leaf_value, const double shrinkage_rate) const; + void ConstructBitsetForCategoricalSplit(const CUDASplitInfo* best_split_info) const; + + void LaunchConstructBitsetForCategoricalSplitKernel(const CUDASplitInfo* best_split_info) const; + // GPU device ID int gpu_device_id_; // number of threads on CPU From 82c33e46cce1d45f883942e6269b1d0e9f52a399 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 28 Oct 2021 09:39:19 +0000 Subject: [PATCH 106/166] cuda data partition split for categorical features --- include/LightGBM/cuda/cuda_common.hpp | 24 -- include/LightGBM/cuda/cuda_tree.hpp | 35 ++- include/LightGBM/cuda/cuda_utils.h | 57 +++++ src/cuda/cuda_common.cu | 68 ----- src/io/cuda/cuda_tree.cpp | 36 ++- src/io/cuda/cuda_tree.cu | 133 ++++++++++ .../cuda/cuda_best_split_finder.cpp | 8 +- .../cuda/cuda_best_split_finder.cu | 16 +- .../cuda/cuda_best_split_finder.hpp | 6 +- src/treelearner/cuda/cuda_data_partition.cpp | 15 ++ src/treelearner/cuda/cuda_data_partition.cu | 239 ++++++++++++++---- src/treelearner/cuda/cuda_data_partition.hpp | 14 + .../cuda/cuda_single_gpu_tree_learner.cpp | 44 +++- .../cuda/cuda_single_gpu_tree_learner.cu | 71 +++++- .../cuda/cuda_single_gpu_tree_learner.hpp | 13 +- 15 files changed, 613 insertions(+), 166 deletions(-) delete mode 100644 include/LightGBM/cuda/cuda_common.hpp delete mode 100644 src/cuda/cuda_common.cu diff --git a/include/LightGBM/cuda/cuda_common.hpp b/include/LightGBM/cuda/cuda_common.hpp deleted file mode 100644 index 9e7332b50641..000000000000 --- a/include/LightGBM/cuda/cuda_common.hpp +++ /dev/null @@ -1,24 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
- */ - -#ifndef LIGHTGBM_CUDA_CUDA_COMMON_HPP_ -#define LIGHTGBM_CUDA_CUDA_COMMON_HPP_ - -#ifdef USE_CUDA - -#define NUM_THREADS_PER_BLOCK_CUDA_COMMON (1024) - -#include - -namespace LightGBM { - -size_t CUDABitsetLen(const CUDASplitInfo* split_info, size_t* out_len_buffer); - -void CUDAConstructBitset(const CUDASplitInfo* split_info, uint32_t* out); - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_CUDA_CUDA_COMMON_HPP_ diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index 6a398a612026..59242854bb0f 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -33,17 +33,29 @@ class CUDATree : public Tree { * \param track_branch_features Whether to keep track of ancestors of leaf nodes * \param is_linear Whether the tree has linear models at each leaf */ - explicit CUDATree(int max_leaves, bool track_branch_features, bool is_linear, const int gpu_device_id); + explicit CUDATree(int max_leaves, bool track_branch_features, bool is_linear, + const int gpu_device_id, const bool has_categorical_feature); explicit CUDATree(const Tree* host_tree); ~CUDATree() noexcept; int Split(const int leaf_index, - const int real_feature_index, - const double real_threshold, - const MissingType missing_type, - const CUDASplitInfo* cuda_split_info); + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info); + + int SplitCategorical( + const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info, + uint32_t* cuda_bitset, + size_t cuda_bitset_len, + uint32_t* cuda_bitset_inner, + size_t cuda_bitset_inner_len); const int* cuda_left_child() const { return cuda_left_child_; } @@ -84,6 +96,15 @@ class CUDATree : public Tree { const MissingType missing_type, const CUDASplitInfo* cuda_split_info); + void LaunchSplitCategoricalKernel( + const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info, + size_t cuda_bitset_len, + size_t cuda_bitset_inner_len); + void LaunchShrinkageKernel(const double rate); void LaunchAddBiasKernel(const double val); @@ -104,6 +125,10 @@ class CUDATree : public Tree { double* cuda_leaf_weight_; data_size_t* cuda_internal_count_; float* cuda_split_gain_; + CUDAVector cuda_bitset_; + CUDAVector cuda_bitset_inner_; + CUDAVector cuda_cat_boundaries_; + CUDAVector cuda_cat_boundaries_inner_; cudaStream_t cuda_stream_; diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 8b8153bd439f..e6c74de1a181 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -12,6 +12,8 @@ #include #include +#include + #include namespace LightGBM { @@ -94,6 +96,61 @@ void DeallocateCUDAMemory(T** ptr, const char* file, const int line) { } } +template +class CUDAVector { + public: + CUDAVector() { + size_ = 0; + data_ = nullptr; + } + + CUDAVector(size_t size) { + size_ = size; + AllocateCUDAMemory(&data_, size_, __FILE__, __LINE__); + } + + void Resize(size_t size) { + CHECK_GT(size, 0); + T* new_data = nullptr; + AllocateCUDAMemory(&new_data, size, __FILE__, __LINE__); + CopyFromCUDADeviceToCUDADevice(new_data, data_, size, __FILE__, __LINE__); + DeallocateCUDAMemory(&data_, __FILE__, __LINE__); + data_ = new_data; + } + + void PushBack(const T* values, size_t len) { + T* new_data = 
nullptr; + AllocateCUDAMemory(&new_data, size_ + len, __FILE__, __LINE__); + CopyFromCUDADeviceToCUDADevice(new_data, data_, size_, __FILE__, __LINE__); + CopyFromCUDADeviceToCUDADevice(new_data + size_, values, len, __FILE__, __LINE__); + DeallocateCUDAMemory(&data_, __FILE__, __LINE__); + size_ += len; + data_ = new_data; + } + + size_t Size() { + return size_; + } + + ~CUDAVector() { + DeallocateCUDAMemory(&data_, __FILE__, __LINE__); + } + + std::vector ToHost() { + std::vector host_vector(size_); + CopyFromCUDADeviceToCUDADevice(host_vector.data(), data_, size_, __FILE__, __LINE__); + return host_vector; + } + + T* RawData() { + return data_; + } + + private: + T* data_; + size_t size_; +}; + } // namespace LightGBM #endif // LIGHTGBM_CUDA_CUDA_UTILS_H_ diff --git a/src/cuda/cuda_common.cu b/src/cuda/cuda_common.cu deleted file mode 100644 index a18839c2015b..000000000000 --- a/src/cuda/cuda_common.cu +++ /dev/null @@ -1,68 +0,0 @@ -/*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - -#ifdef USE_CUDA - -#include -#include -#include - -namespace LightGBM { - -template -__global__ void CalcBitsetLenKernel(const T* vals, int n, size_t* out_len_buffer) { - __shared__ size_t shared_mem_buffer[32]; - const int i = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - size_t len = 0; - if (i < n) { - const T val = vals[i]; - len = (val / 32) + 1; - } - const size_t block_max_len = ShuffleReduceMax(len, shared_mem_buffer, blockDim.x); - if (threadIdx.x == 0) { - out_len_buffer[blockIdx.x] = block_max_len; - } -} - -__global__ void ReduceBlockMaxLen(const size_t* out_len_buffer, const int num_blocks) { - __shared__ size_t shared_mem_buffer[32]; - size_t max_len = 0; - for (int i = static_cast(threadIdx.x); i < num_blocks; i += static_cast(blockDim.x)) { - max_len = max(out_len_buffer[i]); - } - const all_max_len = ShuffleReduceMax(max_len, shared_mem_buffer, blockDim.x); - if (threadIdx.x == 0) { - out_len_buffer[0] = max_len; - } -} - -template -__global__ void CUDAConstructBitsetKernel(const T* vals, int n, uint32_t* out) { - const int i = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (i < n) { - const T val = vals[i]; - out[val / 32] |= (0x1 << (val % 32)); - } -} - -template -void CUDAConstructBitsetInner(const T* vals, int n, uint32_t* out) { - const int num_blocks = (n + NUM_THREADS_PER_BLOCK_CUDA_COMMON - 1) / NUM_THREADS_PER_BLOCK_CUDA_COMMON; - CUDAConstructBitsetKernel<<>>(vals, n, out); -} - -template -size_t CUDABitsetLenInner(const T* vals, int n, size_t* out_len_buffer) { - const int num_blocks = (n + NUM_THREADS_PER_BLOCK_CUDA_COMMON - 1) / NUM_THREADS_PER_BLOCK_CUDA_COMMON; - CalcBitsetLenKernel<<>>(vals, n, out_len_buffer); - ReduceBlockMaxLen<<<1, NUM_THREADS_PER_BLOCK_CUDA_COMMON>>>(out_len_buffer, num_blocks); - size_t host_max_len = 0; - CopyFromCUDADeviceToHost(&host_max_len, out_len_buffer, 1, __FILE__, __LINE__); - return host_max_len; -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index 6485da8b824c..4974b6c8e414 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -9,7 +9,8 @@ namespace LightGBM { -CUDATree::CUDATree(int max_leaves, bool track_branch_features, bool is_linear, const int gpu_device_id): +CUDATree::CUDATree(int max_leaves, bool track_branch_features, bool is_linear, + const int gpu_device_id, const bool has_categorical_feature): 
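// Sketch: minimal usage of the CUDAVector<T> helper declared above, assuming a
// device buffer `device_words` produced elsewhere (e.g. a freshly built
// categorical bitset); the function name is illustrative. PushBack grows the
// vector by allocating a new device buffer and copying the old contents plus
// the appended range, so each append costs one device-to-device copy of the
// current contents; the tree code appends once per categorical split.
#include <cstddef>
#include <cstdint>
#include <vector>
#include <LightGBM/cuda/cuda_utils.h>

void AppendSplitBitsetSketch(LightGBM::CUDAVector<uint32_t>* all_bitsets,
                             const uint32_t* device_words, size_t num_words) {
  all_bitsets->PushBack(device_words, num_words);           // device-side append
  const size_t total_words = all_bitsets->Size();           // host bookkeeping
  std::vector<uint32_t> host_copy = all_bitsets->ToHost();  // copy back to host
  (void)total_words;
  (void)host_copy;
}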
Tree(max_leaves, track_branch_features, is_linear), num_threads_per_block_add_prediction_to_score_(1024) { is_cuda_tree_ = true; @@ -18,6 +19,10 @@ num_threads_per_block_add_prediction_to_score_(1024) { } else { CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); } + if (has_categorical_feature) { + cuda_cat_boundaries_.Resize(max_leaves); + cuda_cat_boundaries_inner_.Resize(max_leaves); + } InitCUDAMemory(); } @@ -210,6 +215,25 @@ int CUDATree::Split(const int leaf_index, return num_leaves_ - 1; } +int CUDATree::SplitCategorical(const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info, + uint32_t* cuda_bitset, + size_t cuda_bitset_len, + uint32_t* cuda_bitset_inner, + size_t cuda_bitset_inner_len) { + LaunchSplitCategoricalKernel(leaf_index, real_feature_index, + real_threshold, missing_type, cuda_split_info, + cuda_bitset_len, cuda_bitset_inner_len); + cuda_bitset_.PushBack(cuda_bitset, cuda_bitset_len); + cuda_bitset_inner_.PushBack(cuda_bitset_inner, cuda_bitset_inner_len); + ++num_leaves_; + ++num_cat_; + return num_leaves_ - 1; +} + inline void CUDATree::Shrinkage(double rate) { Tree::Shrinkage(rate); LaunchShrinkageKernel(rate); @@ -255,6 +279,16 @@ void CUDATree::ToHost() { CopyFromCUDADeviceToHost(internal_weight_.data(), cuda_internal_weight_, num_leaves_size - 1, __FILE__, __LINE__); CopyFromCUDADeviceToHost(internal_count_.data(), cuda_internal_count_, num_leaves_size - 1, __FILE__, __LINE__); CopyFromCUDADeviceToHost(leaf_depth_.data(), cuda_leaf_depth_, num_leaves_size, __FILE__, __LINE__); + + if (num_cat_ > 0) { + cuda_cat_boundaries_inner_.Resize(num_cat_); + cuda_cat_boundaries_.Resize(num_cat_); + cat_boundaries_ = cuda_cat_boundaries_.ToHost(); + cat_boundaries_inner_ = cuda_cat_boundaries_inner_.ToHost(); + cat_threshold_ = cuda_bitset_.ToHost(); + cat_threshold_inner_ = cuda_bitset_inner_.ToHost(); + } + SynchronizeCUDADevice(__FILE__, __LINE__); } diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index 03735f87730b..70e0f0f640af 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -147,6 +147,139 @@ void CUDATree::LaunchSplitKernel(const int leaf_index, cuda_threshold_); } +__global__ void SplitCategoricalKernel( // split information + const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info, + // tree structure + const int num_leaves, + int* leaf_parent, + int* leaf_depth, + int* left_child, + int* right_child, + int* split_feature_inner, + int* split_feature, + float* split_gain, + double* internal_weight, + double* internal_value, + data_size_t* internal_count, + double* leaf_weight, + double* leaf_value, + data_size_t* leaf_count, + int8_t* decision_type, + uint32_t* threshold_in_bin, + double* threshold, + size_t cuda_bitset_len, + size_t cuda_bitset_inner_len, + int num_cat, + int* cuda_cat_boundaries, + int* cuda_cat_boundaries_inner) { + const int new_node_index = num_leaves - 1; + const int thread_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const int parent_index = leaf_parent[leaf_index]; + if (thread_index == 0) { + if (parent_index >= 0) { + // if cur node is left child + if (left_child[parent_index] == ~leaf_index) { + left_child[parent_index] = new_node_index; + } else { + right_child[parent_index] = new_node_index; + } + } + left_child[new_node_index] = ~leaf_index; + right_child[new_node_index] = ~num_leaves; + 
leaf_parent[leaf_index] = new_node_index; + leaf_parent[num_leaves] = new_node_index; + } else if (thread_index == 1) { + // add new node + split_feature_inner[new_node_index] = cuda_split_info->inner_feature_index; + } else if (thread_index == 2) { + split_feature[new_node_index] = real_feature_index; + } else if (thread_index == 3) { + split_gain[new_node_index] = static_cast(cuda_split_info->gain); + } else if (thread_index == 4) { + // save current leaf value to internal node before change + internal_weight[new_node_index] = leaf_weight[leaf_index]; + leaf_weight[leaf_index] = cuda_split_info->left_sum_hessians; + } else if (thread_index == 5) { + internal_value[new_node_index] = leaf_value[leaf_index]; + leaf_value[leaf_index] = isnan(cuda_split_info->left_value) ? 0.0f : cuda_split_info->left_value; + } else if (thread_index == 6) { + internal_count[new_node_index] = cuda_split_info->left_count + cuda_split_info->right_count; + } else if (thread_index == 7) { + leaf_count[leaf_index] = cuda_split_info->left_count; + } else if (thread_index == 8) { + leaf_value[num_leaves] = isnan(cuda_split_info->right_value) ? 0.0f : cuda_split_info->right_value; + } else if (thread_index == 9) { + leaf_weight[num_leaves] = cuda_split_info->right_sum_hessians; + } else if (thread_index == 10) { + leaf_count[num_leaves] = cuda_split_info->right_count; + } else if (thread_index == 11) { + // update leaf depth + leaf_depth[num_leaves] = leaf_depth[leaf_index] + 1; + leaf_depth[leaf_index]++; + } else if (thread_index == 12) { + decision_type[new_node_index] = 0; + SetDecisionTypeCUDA(&decision_type[new_node_index], true, kCategoricalMask); + SetMissingTypeCUDA(&decision_type[new_node_index], static_cast(missing_type)); + } else if (thread_index == 13) { + threshold_in_bin[new_node_index] = num_cat; + } else if (thread_index == 14) { + threshold[new_node_index] = num_cat; + } else if (thread_index == 15) { + if (num_cat == 0) { + cuda_cat_boundaries[num_cat] = 0; + } + cuda_cat_boundaries[num_cat + 1] = cuda_cat_boundaries[num_cat] + cuda_bitset_len; + } else if (thread_index == 16) { + if (num_cat == 0) { + cuda_cat_boundaries_inner[num_cat] = 0; + } + cuda_cat_boundaries_inner[num_cat + 1] = cuda_cat_boundaries_inner[num_cat] + cuda_bitset_inner_len; + } +} + +void CUDATree::LaunchSplitCategoricalKernel(const int leaf_index, + const int real_feature_index, + const double real_threshold, + const MissingType missing_type, + const CUDASplitInfo* cuda_split_info, + size_t cuda_bitset_len, + size_t cuda_bitset_inner_len) { + SplitCategoricalKernel<<<3, 6, 0, cuda_stream_>>>( + // split information + leaf_index, + real_feature_index, + real_threshold, + missing_type, + cuda_split_info, + // tree structure + num_leaves_, + cuda_leaf_parent_, + cuda_leaf_depth_, + cuda_left_child_, + cuda_right_child_, + cuda_split_feature_inner_, + cuda_split_feature_, + cuda_split_gain_, + cuda_internal_weight_, + cuda_internal_value_, + cuda_internal_count_, + cuda_leaf_weight_, + cuda_leaf_value_, + cuda_leaf_count_, + cuda_decision_type_, + cuda_threshold_in_bin_, + cuda_threshold_, + cuda_bitset_len, + cuda_bitset_inner_len, + num_cat_, + cuda_cat_boundaries_.RawData(), + cuda_cat_boundaries_inner_.RawData()); +} + template __global__ void AddPredictionToScoreKernel( // dataset information diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 3f78d44cf84b..937833a442df 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ 
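// Sketch of the node layout recorded by SplitCategoricalKernel above, restated
// on the host (struct and function names here are illustrative). For the c-th
// categorical split the tree stores threshold == c, and cat_boundaries[c] ..
// cat_boundaries[c + 1] delimit that split's 32-bit words inside the
// concatenated cat_threshold array, which is why the kernel extends the
// boundary arrays by the new bitset lengths.
#include <cstdint>
#include <vector>

struct CatBitsetViewSketch {
  const uint32_t* words;
  int num_words;
};

inline CatBitsetViewSketch GetCatBitsetSketch(
    int cat_index,
    const std::vector<int>& cat_boundaries,
    const std::vector<uint32_t>& cat_threshold_words) {
  CatBitsetViewSketch view;
  view.words = cat_threshold_words.data() + cat_boundaries[cat_index];
  view.num_words = cat_boundaries[cat_index + 1] - cat_boundaries[cat_index];
  return view;
}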
b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -101,7 +101,7 @@ void CUDABestSplitFinder::Init() { cuda_streams_.resize(2); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[0])); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_streams_[1])); - AllocateCUDAMemory(&cuda_best_split_info_buffer_, 7, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_best_split_info_buffer_, 8, __FILE__, __LINE__); if (use_global_memory_) { AllocateCUDAMemory(&cuda_feature_hist_grad_buffer_, static_cast(num_total_bin_), __FILE__, __LINE__); AllocateCUDAMemory(&cuda_feature_hist_hess_buffer_, static_cast(num_total_bin_), __FILE__, __LINE__); @@ -294,7 +294,8 @@ const CUDASplitInfo* CUDABestSplitFinder::FindBestFromAllSplits( int* larger_leaf_best_split_feature, uint32_t* larger_leaf_best_split_threshold, uint8_t* larger_leaf_best_split_default_left, - int* best_leaf_index) { + int* best_leaf_index, + int* num_cat_threshold) { LaunchFindBestFromAllSplitsKernel( cur_num_leaves, smaller_leaf_index, @@ -305,7 +306,8 @@ const CUDASplitInfo* CUDABestSplitFinder::FindBestFromAllSplits( larger_leaf_best_split_feature, larger_leaf_best_split_threshold, larger_leaf_best_split_default_left, - best_leaf_index); + best_leaf_index, + num_cat_threshold); SynchronizeCUDADevice(__FILE__, __LINE__); return cuda_leaf_best_split_info_ + (*best_leaf_index); } diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index f421d1c4fd32..b28842765ea8 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -1634,6 +1634,7 @@ __global__ void FindBestFromAllSplitsKernel(const int cur_num_leaves, if (best_leaf_index != -1) { cuda_leaf_best_split_info[best_leaf_index].is_valid = false; cuda_leaf_best_split_info[cur_num_leaves].is_valid = false; + cuda_best_split_info_buffer[7] = cuda_leaf_best_split_info[best_leaf_index].num_cat_threshold; } } } @@ -1669,16 +1670,17 @@ void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel( int* larger_leaf_best_split_feature, uint32_t* larger_leaf_best_split_threshold, uint8_t* larger_leaf_best_split_default_left, - int* best_leaf_index) { + int* best_leaf_index, + int* num_cat_threshold) { FindBestFromAllSplitsKernel<<<1, NUM_THREADS_FIND_BEST_LEAF, 0, cuda_streams_[1]>>>(cur_num_leaves, cuda_leaf_best_split_info_, cuda_best_split_info_buffer_); PrepareLeafBestSplitInfo<<<6, 1, 0, cuda_streams_[0]>>>(smaller_leaf_index, larger_leaf_index, cuda_best_split_info_buffer_, cuda_leaf_best_split_info_); - std::vector host_leaf_best_split_info_buffer(7); + std::vector host_leaf_best_split_info_buffer(8); SynchronizeCUDADevice(__FILE__, __LINE__); - CopyFromCUDADeviceToHost(host_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 7, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(host_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 8, __FILE__, __LINE__); *smaller_leaf_best_split_feature = host_leaf_best_split_info_buffer[0]; *smaller_leaf_best_split_threshold = static_cast(host_leaf_best_split_info_buffer[1]); *smaller_leaf_best_split_default_left = static_cast(host_leaf_best_split_info_buffer[2]); @@ -1688,6 +1690,7 @@ void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel( *larger_leaf_best_split_default_left = static_cast(host_leaf_best_split_info_buffer[5]); } *best_leaf_index = host_leaf_best_split_info_buffer[6]; + *num_cat_threshold = host_leaf_best_split_info_buffer[7]; } __global__ void AllocateCatVectorsKernel( @@ -1695,14 +1698,15 @@ __global__ 
void AllocateCatVectorsKernel( const int max_num_categories_in_split) { const size_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i < len) { - cuda_split_infos[i]->cat_threshold = new uint32_t[max_num_categories_in_split]; - cuda_split_infos[i]->cat_threshold_real = new int[max_num_categories_in_split]; + cuda_split_infos[i].cat_threshold = new uint32_t[max_num_categories_in_split]; + cuda_split_infos[i].cat_threshold_real = new int[max_num_categories_in_split]; } } void CUDABestSplitFinder::LaunchAllocateCatVectorsKernel(CUDASplitInfo* cuda_split_infos, size_t len) const { const int max_num_categories_in_split = min(max_cat_threshold_ / 2, max_num_categorical_bin_); - AllocateCatVectorsKernel(cuda_split_infos, len, max_num_categories_in_split); + const int num_blocks = (static_cast(len) + NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER - 1) / NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER; + AllocateCatVectorsKernel<<>>(cuda_split_infos, len, max_num_categories_in_split); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 394cf286db7c..1a0edd8c0249 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -62,7 +62,8 @@ class CUDABestSplitFinder { int* larger_leaf_best_split_feature, uint32_t* larger_leaf_best_split_threshold, uint8_t* larger_leaf_best_split_default_left, - int* best_leaf_index); + int* best_leaf_index, + int* num_cat_threshold); void ResetTrainingData( const hist_t* cuda_hist, @@ -97,7 +98,8 @@ class CUDABestSplitFinder { int* larger_leaf_best_split_feature, uint32_t* larger_leaf_best_split_threshold, uint8_t* larger_leaf_best_split_default_left, - int* best_leaf_index); + int* best_leaf_index, + data_size_t* num_cat_threshold); void AllocateCatVectors(CUDASplitInfo* cuda_split_infos, size_t len) const; diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 1cb8d03f350a..e4ac9131d817 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -30,6 +30,21 @@ CUDADataPartition::CUDADataPartition( cur_num_leaves_ = 1; cuda_column_data_ = train_data->cuda_column_data(); + is_categorical_feature_.resize(train_data->num_features(), false); + is_single_feature_in_group_.resize(train_data->num_features(), false); + int feature_group_index = train_data->Feature2Group(0); + for (int feature_index = 0; feature_index < train_data->num_features(); ++feature_index) { + if (train_data->FeatureBinMapper(feature_index)->bin_type() == BinType::CategoricalBin) { + is_categorical_feature_[feature_index] = true; + } + const int feature_group_index = train_data->Feature2Group(feature_index); + if (!train_data->IsMultiGroup(feature_group_index) && + (feature_index == 0 || train_data->Feature2Group(feature_index - 1) != feature_group_index) && + (feature_index == train_data->num_features() - 1 || train_data->Feature2Group(feature_index + 1) != feature_group_index)) { + is_single_feature_in_group_[feature_index] = true; + } + } + cuda_data_indices_ = nullptr; cuda_leaf_data_start_ = nullptr; cuda_leaf_data_end_ = nullptr; diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 444750873e4a..cc48d2dcc4c1 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -272,16 +272,14 @@ void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel( // 
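// Sketch restating, in standalone form, the predicate that fills
// is_single_feature_in_group_ in the constructor above (the Dataset calls are
// the same ones used there; the function name is illustrative). A feature is
// "single in its group" when the group is not a multi-group and neither
// neighbouring feature index maps to the same group, i.e. the group contains
// exactly that one feature. The categorical kernel launches later in this
// patch branch on this flag to choose between the min-bin-offset and
// no-offset kernel variants.
#include <LightGBM/dataset.h>

bool IsSingleFeatureInGroupSketch(const LightGBM::Dataset* train_data,
                                  int feature_index) {
  const int group = train_data->Feature2Group(feature_index);
  if (train_data->IsMultiGroup(group)) {
    return false;
  }
  const bool first_in_group =
      feature_index == 0 ||
      train_data->Feature2Group(feature_index - 1) != group;
  const bool last_in_group =
      feature_index == train_data->num_features() - 1 ||
      train_data->Feature2Group(feature_index + 1) != group;
  return first_in_group && last_in_group;
}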
min_bin_ref < max_bin_ref template -__global__ void GenDataToLeftBitVectorKernel0(const int best_split_feature_ref, +__global__ void GenDataToLeftBitVectorKernel0( const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, - const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, + const uint32_t th, const BIN_TYPE* column_data, // values from feature - const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint32_t t_zero_bin, const uint32_t max_bin_ref, const uint32_t min_bin_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, uint16_t* block_to_left_offset, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer) { __shared__ uint16_t shared_mem_buffer[32]; uint16_t thread_to_left_offset_cnt = 0; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; @@ -308,16 +306,14 @@ __global__ void GenDataToLeftBitVectorKernel0(const int best_split_feature_ref, // min_bin_ref == max_bin_ref template -__global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, +__global__ void GenDataToLeftBitVectorKernel16( const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, - const uint32_t th, const int num_features_ref, const BIN_TYPE* column_data, + const BIN_TYPE* column_data, // values from feature - const uint32_t t_zero_bin, const uint32_t most_freq_bin_ref, const uint32_t max_bin_ref, const uint32_t min_bin_ref, + const uint32_t t_zero_bin, const uint32_t max_bin_ref, const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, uint16_t* block_to_left_offset, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer) { __shared__ uint16_t shared_mem_buffer[32]; uint16_t thread_to_left_offset_cnt = 0; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; @@ -345,12 +341,55 @@ __global__ void GenDataToLeftBitVectorKernel16(const int best_split_feature_ref, thread_to_left_offset_cnt, shared_mem_buffer); } +template +__device__ bool CUDAFindInBitset(const uint32_t* bits, int n, T pos) { + int i1 = pos / 32; + if (i1 >= n) { + return false; + } + int i2 = pos % 32; + return (bits[i1] >> i2) & 1; +} + +// for categorical features +template +__global__ void GenDataToLeftBitVectorKernel_Categorical( + const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, + const uint32_t* bitset, int bitset_len, const BIN_TYPE* column_data, + // values from feature + const uint32_t max_bin, const uint32_t min_bin, const int8_t mfb_offset, + const uint8_t split_default_to_left, + uint16_t* block_to_left_offset, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer) { + __shared__ uint16_t shared_mem_buffer[32]; + uint16_t thread_to_left_offset_cnt = 0; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + 
const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(column_data[global_data_index]); + if (USE_MIN_BIN && (bin < min_bin || bin > max_bin)) { + thread_to_left_offset_cnt = split_default_to_left; + } else if (!USE_MIN_BIN && bin == 0) { + thread_to_left_offset_cnt = split_default_to_left; + } else if (CUDAFindInBitset(bitset, bitset_len, bin - min_bin + mfb_offset)) { + thread_to_left_offset_cnt = 1; + } + } + __syncthreads(); + PrepareOffset(num_data_in_leaf, block_to_left_offset + blockIdx.x * blockDim.x, block_to_left_offset_buffer, block_to_right_offset_buffer, + thread_to_left_offset_cnt, shared_mem_buffer); +} + #define GenBitVector_ARGS \ - split_feature_index, num_data_in_leaf, data_indices_in_leaf, \ - th, num_features_, \ - column_data, t_zero_bin, most_freq_bin, max_bin, min_bin, split_default_to_left, \ - split_missing_default_to_left, cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_, \ - cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index + num_data_in_leaf, data_indices_in_leaf, \ + th, \ + column_data, t_zero_bin, max_bin, min_bin, split_default_to_left, \ + split_missing_default_to_left, cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_ + +#define GenBitVector_MaxIsMin_ARGS \ + num_data_in_leaf, data_indices_in_leaf, \ + column_data, t_zero_bin, max_bin, split_default_to_left, \ + split_missing_default_to_left, cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_ template void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( @@ -378,100 +417,100 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( const data_size_t* data_indices_in_leaf = cuda_data_indices_ + leaf_data_start; if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - 
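// Host-side mirror of the per-row test in GenDataToLeftBitVectorKernel_Categorical
// above (sketch; names are illustrative). `bitset` is the split's bin-space
// bitset, with bit `pos` stored at word pos / 32, offset pos % 32, exactly as
// CUDAFindInBitset reads it; `use_min_bin` corresponds to the USE_MIN_BIN
// template flag and `mfb_offset` is 1 when the most frequent bin is bin 0.
#include <cstdint>

enum class CatDirectionSketch { kDefault, kLeft, kRight };

inline bool FindInBitsetSketch(const uint32_t* bits, int n_words, uint32_t pos) {
  const uint32_t word = pos / 32;
  if (static_cast<int>(word) >= n_words) {
    return false;
  }
  return ((bits[word] >> (pos % 32)) & 1u) != 0;
}

inline CatDirectionSketch RowDirectionSketch(uint32_t bin, bool use_min_bin,
                                             uint32_t min_bin, uint32_t max_bin,
                                             uint8_t mfb_offset,
                                             const uint32_t* bitset,
                                             int bitset_len) {
  if (use_min_bin && (bin < min_bin || bin > max_bin)) {
    return CatDirectionSketch::kDefault;  // out-of-range rows take the default branch
  }
  if (!use_min_bin && bin == 0) {
    return CatDirectionSketch::kDefault;  // stored bin 0 also takes the default branch
  }
  return FindInBitsetSketch(bitset, bitset_len, bin - min_bin + mfb_offset)
             ? CatDirectionSketch::kLeft   // category in the chosen set is counted to the left
             : CatDirectionSketch::kRight;
}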
GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + 
GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && 
missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_ARGS); + GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); } } @@ -598,6 +637,108 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( } #undef GenBitVector_ARGS +#undef GenBitVector_MaxIsMin_ARGS + + +template +__global__ void UpdateDataIndexToLeafIndexKernel_Categorical( + const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, + const uint32_t* bitset, const int bitset_len, const BIN_TYPE* column_data, + // values from feature + const uint32_t max_bin, const uint32_t min_bin, const int8_t mfb_offset, + int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, + const int default_leaf_index) { + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(column_data[global_data_index]); + if (USE_MIN_BIN && (bin < min_bin || bin > max_bin)) { + cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; + } else if (!USE_MIN_BIN && bin == 0) { + cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; + } else if (CUDAFindInBitset(bitset, bitset_len, bin - min_bin + mfb_offset)) { + cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; + } else { + cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + } + } +} + +#define GenBitVector_Categorical_ARGS \ + num_data_in_leaf, data_indices_in_leaf, \ + bitset, bitset_len, \ + column_data, max_bin, min_bin, mfb_offset, split_default_to_left, \ + cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_ + +#define UpdateDataIndexToLeafIndex_Categorical_ARGS \ + num_data_in_leaf, data_indices_in_leaf, \ + bitset, bitset_len, \ + column_data, max_bin, min_bin, mfb_offset, \ + cuda_data_index_to_leaf_index_, left_leaf_index, right_leaf_index, default_leaf_index + +void CUDADataPartition::LaunchGenDataToLeftBitVectorCategoricalKernel( + const data_size_t num_data_in_leaf, + const int split_feature_index, + const uint32_t* bitset, + const int bitset_len, + const uint8_t split_default_left, + const data_size_t leaf_data_start, + const int left_leaf_index, + const int right_leaf_index) { + const data_size_t* data_indices_in_leaf = cuda_data_indices_ + leaf_data_start; + const int column_index = cuda_column_data_->feature_to_column(split_feature_index); + const uint8_t bit_type = cuda_column_data_->column_bit_type(column_index); + const uint32_t min_bin = cuda_column_data_->feature_min_bin(split_feature_index); + const uint32_t max_bin = cuda_column_data_->feature_max_bin(split_feature_index); + const uint32_t most_freq_bin = cuda_column_data_->feature_most_freq_bin(split_feature_index); + const uint32_t default_bin = cuda_column_data_->feature_default_bin(split_feature_index); + const void* column_data_pointer = cuda_column_data_->GetColumnData(column_index); + const int8_t mfb_offset = static_cast(most_freq_bin == 0); + std::vector 
host_bitset(bitset_len, 0); + CopyFromCUDADeviceToHost(host_bitset.data(), bitset, bitset_len, __FILE__, __LINE__); + uint32_t t_zero_bin = min_bin + default_bin; + if (most_freq_bin == 0) { + --t_zero_bin; + } + uint8_t split_default_to_left = 0; + int default_leaf_index = right_leaf_index; + const int is_single_feature_in_group = is_single_feature_in_group_[split_feature_index]; + if (Common::FindInBitset(host_bitset.data(), bitset_len, most_freq_bin)) { + split_default_to_left = 1; + default_leaf_index = left_leaf_index; + } + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(column_data_pointer); + if (is_single_feature_in_group) { + GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); + UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); + } else { + GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); + UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); + } + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(column_data_pointer); + if (is_single_feature_in_group) { + GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); + UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); + } else { + GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); + UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); + } + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(column_data_pointer); + if (is_single_feature_in_group) { + GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); + UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); + } else { + GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); + UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); + } + } +} + +#undef GenBitVector_Categorical_ARGS +#undef UpdateDataIndexToLeafIndex_Categorical_ARGS void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index d3a59c6467aa..82c5a15d610f 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -150,6 +150,16 @@ class CUDADataPartition { const int left_leaf_index, const int right_leaf_index); + void LaunchGenDataToLeftBitVectorCategoricalKernel( + const data_size_t num_data_in_leaf, + const int split_feature_index, + const uint32_t* bitset, + const int bitset_len, + const uint8_t split_default_left, + const data_size_t leaf_data_start, + const int left_leaf_index, + const int right_leaf_index); + template void LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( const bool missing_is_zero, @@ -239,6 +249,10 @@ class CUDADataPartition { mutable std::vector add_train_score_; /*! \brief data indices used in this iteration */ const data_size_t* used_indices_; + /*! \brief marks whether a feature is a categorical feature */ + std::vector is_categorical_feature_; + /*! \brief marks whether a feature is the only feature in its group */ + std::vector is_single_feature_in_group_; // config information /*! 
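// Sketch of the host-side default-direction rule used in the launcher above:
// the split's bin-space bitset is copied back to the host, and the default
// branch (taken by rows whose bin falls outside the feature's bin range) goes
// to the left child exactly when the feature's most frequent bin is found in
// that bitset. Struct and function names here are illustrative.
#include <cstdint>
#include <vector>

struct CatDefaultSketch {
  uint8_t split_default_to_left;
  int default_leaf_index;
};

inline CatDefaultSketch DecideCatDefaultSketch(
    const std::vector<uint32_t>& host_bitset, uint32_t most_freq_bin,
    int left_leaf_index, int right_leaf_index) {
  CatDefaultSketch out{0, right_leaf_index};
  const uint32_t word = most_freq_bin / 32;
  const bool in_set = word < host_bitset.size() &&
                      ((host_bitset[word] >> (most_freq_bin % 32)) & 1u) != 0;
  if (in_set) {
    out.split_default_to_left = 1;
    out.default_leaf_index = left_leaf_index;
  }
  return out;
}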
\brief maximum number of leaves in a tree */ diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 8e38d936fc10..eb2144b24663 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -65,6 +65,7 @@ void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_ AllocateCUDAMemory(&cuda_gradients_, static_cast(num_data_), __FILE__, __LINE__); AllocateCUDAMemory(&cuda_hessians_, static_cast(num_data_), __FILE__, __LINE__); + AllocateBitset(); cuda_leaf_gradient_stat_buffer_ = nullptr; cuda_leaf_hessian_stat_buffer_ = nullptr; @@ -108,7 +109,8 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, BeforeTrain(); global_timer.Stop("CUDASingleGPUTreeLearner::BeforeTrain"); const bool track_branch_features = !(config_->interaction_constraints_vector.empty()); - std::unique_ptr tree(new CUDATree(config_->num_leaves, track_branch_features, config_->linear_tree, config_->gpu_device_id)); + std::unique_ptr tree(new CUDATree(config_->num_leaves, track_branch_features, + config_->linear_tree, config_->gpu_device_id, has_categorical_feature_)); for (int i = 0; i < config_->num_leaves - 1; ++i) { global_timer.Start("CUDASingleGPUTreeLearner::ConstructHistogramForLeaf"); const data_size_t num_data_in_smaller_leaf = leaf_num_data_[smaller_leaf_index_]; @@ -144,7 +146,8 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, &leaf_best_split_feature_[larger_leaf_index_], &leaf_best_split_threshold_[larger_leaf_index_], &leaf_best_split_default_left_[larger_leaf_index_], - &best_leaf_index_); + &best_leaf_index_, + &num_cat_threshold_); } else { best_split_info = cuda_best_split_finder_->FindBestFromAllSplits( tree->num_leaves(), @@ -156,7 +159,8 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, nullptr, nullptr, nullptr, - &best_leaf_index_); + &best_leaf_index_, + &num_cat_threshold_); } global_timer.Stop("CUDASingleGPUTreeLearner::FindBestFromAllSplits"); @@ -166,6 +170,10 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, } global_timer.Start("CUDASingleGPUTreeLearner::Split"); + if (num_cat_threshold_ > 0) { + ConstructBitsetForCategoricalSplit(best_split_info); + } + int right_leaf_index = tree->Split(best_leaf_index_, train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]), train_data_->RealThreshold(leaf_best_split_feature_[best_leaf_index_], @@ -325,10 +333,38 @@ void CUDASingleGPUTreeLearner::ReduceLeafStat( } void CUDASingleGPUTreeLearner::ConstructBitsetForCategoricalSplit( - const CUDASplitInfo* best_split_info) const { + const CUDASplitInfo* best_split_info) { LaunchConstructBitsetForCategoricalSplitKernel(best_split_info); } +void CUDASingleGPUTreeLearner::AllocateBitset() { + has_categorical_feature_ = false; + for (int i = 0; i < train_data_->num_features(); ++i) { + if (train_data_->FeatureBinMapper(i)->bin_type() == BinType::CategoricalBin) { + has_categorical_feature_ = true; + break; + } + } + if (has_categorical_feature_) { + int max_cat_value = 0; + int max_cat_num_bin = 0; + for (int i = 0; i < train_data_->num_features(); ++i) { + max_cat_value = std::max(train_data_->FeatureBinMapper(i)->MaxCatValue(), max_cat_value); + max_cat_num_bin = std::max(train_data_->FeatureBinMapper(i)->num_bin(), max_cat_num_bin); + } + AllocateCUDAMemory(&cuda_bitset_, static_cast(max_cat_value / 32), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_bitset_inner_, 
static_cast(max_cat_num_bin / 32), __FILE__, __LINE__); + const int max_cat_in_split = std::min(config_->max_cat_threshold, max_cat_num_bin / 2); + const int num_blocks = (max_cat_in_split + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + AllocateCUDAMemory(&cuda_block_bitset_len_buffer_, num_blocks, __FILE__, __LINE__); + } else { + cuda_bitset_ = nullptr; + cuda_bitset_inner_ = nullptr; + } + cuda_bitset_len_ = 0; + cuda_bitset_inner_len_ = 0; +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu index 3734b720d1d4..d3ef260bbbe8 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu @@ -6,6 +6,8 @@ #ifdef USE_CUDA +#include + #include "cuda_single_gpu_tree_learner.hpp" namespace LightGBM { @@ -113,11 +115,76 @@ void CUDASingleGPUTreeLearner::LaunchReduceLeafStatKernel( config_->lambda_l1, use_l1, config_->lambda_l2, shrinkage_rate, config_->refit_decay_rate, cuda_leaf_value); } +template +__global__ void CalcBitsetLenKernel(const CUDASplitInfo* best_split_info, size_t* out_len_buffer) { + __shared__ size_t shared_mem_buffer[32]; + const T* vals = nullptr; + if (IS_INNER) { + vals = reinterpret_cast(best_split_info->cat_threshold); + } else { + vals = reinterpret_cast(best_split_info->cat_threshold_real); + } + const int i = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + size_t len = 0; + if (i < best_split_info->num_cat_threshold) { + const T val = vals[i]; + len = (val / 32) + 1; + } + const size_t block_max_len = ShuffleReduceMax(len, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + out_len_buffer[blockIdx.x] = block_max_len; + } +} + +__global__ void ReduceBlockMaxLen(size_t* out_len_buffer, const int num_blocks) { + __shared__ size_t shared_mem_buffer[32]; + size_t max_len = 0; + for (int i = static_cast(threadIdx.x); i < num_blocks; i += static_cast(blockDim.x)) { + max_len = max(out_len_buffer[i], max_len); + } + const size_t all_max_len = ShuffleReduceMax(max_len, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + out_len_buffer[0] = max_len; + } +} + +template +__global__ void CUDAConstructBitsetKernel(const CUDASplitInfo* best_split_info, uint32_t* out) { + const T* vals = nullptr; + if (IS_INNER) { + vals = reinterpret_cast(best_split_info->cat_threshold); + } else { + vals = reinterpret_cast(best_split_info->cat_threshold_real); + } + const int i = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (i < best_split_info->num_cat_threshold) { + const T val = vals[i]; + out[val / 32] |= (0x1 << (val % 32)); + } +} + +template +void CUDAConstructBitset(const CUDASplitInfo* best_split_info, const int num_cat_threshold, uint32_t* out) { + const int num_blocks = (num_cat_threshold + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + CUDAConstructBitsetKernel<<>>(best_split_info, out); +} +template +size_t CUDABitsetLen(const CUDASplitInfo* best_split_info, const int num_cat_threshold, size_t* out_len_buffer) { + const int num_blocks = (num_cat_threshold + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + CalcBitsetLenKernel<<>>(best_split_info, out_len_buffer); + ReduceBlockMaxLen<<<1, CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE>>>(out_len_buffer, num_blocks); + size_t host_max_len = 0; + CopyFromCUDADeviceToHost(&host_max_len, out_len_buffer, 1, 
__FILE__, __LINE__); + return host_max_len; +} void CUDASingleGPUTreeLearner::LaunchConstructBitsetForCategoricalSplitKernel( - const CUDASplitInfo* best_split_info) const { - + const CUDASplitInfo* best_split_info) { + cuda_bitset_inner_len_ = CUDABitsetLen(best_split_info, num_cat_threshold_, cuda_block_bitset_len_buffer_); + CUDAConstructBitset(best_split_info, num_cat_threshold_, cuda_bitset_inner_); + cuda_bitset_len_ = CUDABitsetLen(best_split_info, num_cat_threshold_, cuda_block_bitset_len_buffer_); + CUDAConstructBitset(best_split_info, num_cat_threshold_, cuda_bitset_); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index 7bc0ad48f025..eadb4c542289 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -57,9 +57,11 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { void LaunchReduceLeafStatKernel(const score_t* gradients, const score_t* hessians, const int num_leaves, const data_size_t num_data, double* cuda_leaf_value, const double shrinkage_rate) const; - void ConstructBitsetForCategoricalSplit(const CUDASplitInfo* best_split_info) const; + void ConstructBitsetForCategoricalSplit(const CUDASplitInfo* best_split_info); - void LaunchConstructBitsetForCategoricalSplitKernel(const CUDASplitInfo* best_split_info) const; + void LaunchConstructBitsetForCategoricalSplitKernel(const CUDASplitInfo* best_split_info); + + void AllocateBitset(); // GPU device ID int gpu_device_id_; @@ -87,11 +89,18 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { int smaller_leaf_index_; int larger_leaf_index_; int best_leaf_index_; + int num_cat_threshold_; + bool has_categorical_feature_; mutable double* cuda_leaf_gradient_stat_buffer_; mutable double* cuda_leaf_hessian_stat_buffer_; mutable data_size_t leaf_stat_buffer_size_; mutable data_size_t refit_num_data_; + uint32_t* cuda_bitset_; + size_t cuda_bitset_len_; + uint32_t* cuda_bitset_inner_; + size_t cuda_bitset_inner_len_; + size_t* cuda_block_bitset_len_buffer_; /*! 
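// Host-side reference for the two kernels above (sketch, plain C++; the
// function name is illustrative): the bitset length in 32-bit words is
// max(value) / 32 + 1 (CalcBitsetLenKernel plus the block-max reduction), and
// construction sets bit value % 32 of word value / 32 for every category kept
// by the split (CUDAConstructBitsetKernel).
#include <algorithm>
#include <cstdint>
#include <vector>

inline std::vector<uint32_t> BuildBitsetSketch(const std::vector<uint32_t>& values) {
  if (values.empty()) {
    return {};
  }
  const uint32_t max_value = *std::max_element(values.begin(), values.end());
  std::vector<uint32_t> bitset(max_value / 32 + 1, 0u);  // word count = max / 32 + 1
  for (const uint32_t value : values) {
    bitset[value / 32] |= (1u << (value % 32));          // set the category's bit
  }
  return bitset;
}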
\brief gradients on CUDA */ score_t* cuda_gradients_; From ca160704573ab9c198dad375f895628803d2e308 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 29 Oct 2021 02:52:02 +0000 Subject: [PATCH 107/166] fix split tree with categorical feature --- include/LightGBM/cuda/cuda_tree.hpp | 2 -- include/LightGBM/cuda/cuda_utils.h | 9 ++++++-- src/boosting/gbdt.cpp | 2 ++ src/io/cuda/cuda_tree.cpp | 7 +++---- src/io/cuda/cuda_tree.cu | 3 --- src/main.cpp | 10 +++++++-- src/treelearner/cuda/cuda_data_partition.cpp | 21 +++++++++++++++++-- src/treelearner/cuda/cuda_data_partition.hpp | 4 ++++ .../cuda/cuda_histogram_constructor.cpp | 3 --- .../cuda/cuda_single_gpu_tree_learner.cpp | 16 +++++++++++++- 10 files changed, 58 insertions(+), 19 deletions(-) diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index 59242854bb0f..9b1d271faf92 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -49,7 +49,6 @@ class CUDATree : public Tree { int SplitCategorical( const int leaf_index, const int real_feature_index, - const double real_threshold, const MissingType missing_type, const CUDASplitInfo* cuda_split_info, uint32_t* cuda_bitset, @@ -99,7 +98,6 @@ class CUDATree : public Tree { void LaunchSplitCategoricalKernel( const int leaf_index, const int real_feature_index, - const double real_threshold, const MissingType missing_type, const CUDASplitInfo* cuda_split_info, size_t cuda_bitset_len, diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index e6c74de1a181..a95c26f60834 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -113,9 +113,12 @@ class CUDAVector { CHECK_GT(size, 0); T* new_data = nullptr; AllocateCUDAMemory(&new_data, size, __FILE__, __LINE__); - CopyFromCUDADeviceToCUDADevice(new_data, data_, size, __FILE__, __LINE__); + if (size_ > 0 && data_ != nullptr) { + CopyFromCUDADeviceToCUDADevice(new_data, data_, size, __FILE__, __LINE__); + } DeallocateCUDAMemory(&data_, __FILE__, __LINE__); data_ = new_data; + size_ = size; } void PushBack(const T* values, size_t len) { @@ -138,7 +141,9 @@ class CUDAVector { std::vector ToHost() { std::vector host_vector(size_); - CopyFromCUDADeviceToCUDADevice(host_vector.data(), data_, size_, __FILE__, __LINE__); + if (size_ > 0 && data_ != nullptr) { + CopyFromCUDADeviceToHost(host_vector.data(), data_, size_, __FILE__, __LINE__); + } return host_vector; } diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 8f64a87d5714..63facba79795 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -510,7 +510,9 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { // update validation score for (auto& score_updater : valid_score_updater_) { + Log::Warning("before add prediciton to score for valid"); score_updater->AddScore(tree, cur_tree_id); + Log::Warning("before add prediciton to score for valid"); } } diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index 4974b6c8e414..252e1847766b 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -217,7 +217,6 @@ int CUDATree::Split(const int leaf_index, int CUDATree::SplitCategorical(const int leaf_index, const int real_feature_index, - const double real_threshold, const MissingType missing_type, const CUDASplitInfo* cuda_split_info, uint32_t* cuda_bitset, @@ -225,7 +224,7 @@ int CUDATree::SplitCategorical(const int leaf_index, uint32_t* cuda_bitset_inner, size_t cuda_bitset_inner_len) { 
LaunchSplitCategoricalKernel(leaf_index, real_feature_index, - real_threshold, missing_type, cuda_split_info, + missing_type, cuda_split_info, cuda_bitset_len, cuda_bitset_inner_len); cuda_bitset_.PushBack(cuda_bitset, cuda_bitset_len); cuda_bitset_inner_.PushBack(cuda_bitset_inner, cuda_bitset_inner_len); @@ -281,8 +280,8 @@ void CUDATree::ToHost() { CopyFromCUDADeviceToHost(leaf_depth_.data(), cuda_leaf_depth_, num_leaves_size, __FILE__, __LINE__); if (num_cat_ > 0) { - cuda_cat_boundaries_inner_.Resize(num_cat_); - cuda_cat_boundaries_.Resize(num_cat_); + cuda_cat_boundaries_inner_.Resize(num_cat_ + 1); + cuda_cat_boundaries_.Resize(num_cat_ + 1); cat_boundaries_ = cuda_cat_boundaries_.ToHost(); cat_boundaries_inner_ = cuda_cat_boundaries_inner_.ToHost(); cat_threshold_ = cuda_bitset_.ToHost(); diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index 70e0f0f640af..4b245bddab29 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -150,7 +150,6 @@ void CUDATree::LaunchSplitKernel(const int leaf_index, __global__ void SplitCategoricalKernel( // split information const int leaf_index, const int real_feature_index, - const double real_threshold, const MissingType missing_type, const CUDASplitInfo* cuda_split_info, // tree structure @@ -243,7 +242,6 @@ __global__ void SplitCategoricalKernel( // split information void CUDATree::LaunchSplitCategoricalKernel(const int leaf_index, const int real_feature_index, - const double real_threshold, const MissingType missing_type, const CUDASplitInfo* cuda_split_info, size_t cuda_bitset_len, @@ -252,7 +250,6 @@ void CUDATree::LaunchSplitCategoricalKernel(const int leaf_index, // split information leaf_index, real_feature_index, - real_threshold, missing_type, cuda_split_info, // tree structure diff --git a/src/main.cpp b/src/main.cpp index 8034da826811..4d69c53a1aec 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -8,10 +8,16 @@ #include "network/linkers.h" -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { bool success = false; try { - LightGBM::Application app(argc, argv); + const std::string config_str = std::string("config=train.conf"); + char* argv = new char[config_str.size() + 1]; + for (size_t i = 0; i < config_str.size(); ++i) { + argv[i] = config_str[i]; + } + argv[config_str.size()] = '\0'; + LightGBM::Application app(2, &argv - 1); app.Run(); #ifdef USE_MPI diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index e4ac9131d817..f4287c0e3137 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -32,7 +32,6 @@ CUDADataPartition::CUDADataPartition( is_categorical_feature_.resize(train_data->num_features(), false); is_single_feature_in_group_.resize(train_data->num_features(), false); - int feature_group_index = train_data->Feature2Group(0); for (int feature_index = 0; feature_index < train_data->num_features(); ++feature_index) { if (train_data->FeatureBinMapper(feature_index)->bin_type() == BinType::CategoricalBin) { is_categorical_feature_[feature_index] = true; @@ -145,6 +144,8 @@ void CUDADataPartition::Split( const int right_leaf_index, const int leaf_best_split_feature, const uint32_t leaf_best_split_threshold, + const uint32_t* categorical_bitset, + const int categorical_bitset_len, const uint8_t leaf_best_split_default_left, const data_size_t num_data_in_leaf, const data_size_t leaf_data_start, @@ -163,6 +164,8 @@ void CUDADataPartition::Split( 
GenDataToLeftBitVector(num_data_in_leaf, leaf_best_split_feature, leaf_best_split_threshold, + categorical_bitset, + categorical_bitset_len, leaf_best_split_default_left, leaf_data_start, left_leaf_index, @@ -189,17 +192,31 @@ void CUDADataPartition::GenDataToLeftBitVector( const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, + const uint32_t* categorical_bitset, + const int categorical_bitset_len, const uint8_t split_default_left, const data_size_t leaf_data_start, const int left_leaf_index, const int right_leaf_index) { - LaunchGenDataToLeftBitVectorKernel(num_data_in_leaf, + if (is_categorical_feature_[split_feature_index]) { + LaunchGenDataToLeftBitVectorCategoricalKernel( + num_data_in_leaf, + split_feature_index, + categorical_bitset, + categorical_bitset_len, + split_default_left, + leaf_data_start, + left_leaf_index, + right_leaf_index); + } else { + LaunchGenDataToLeftBitVectorKernel(num_data_in_leaf, split_feature_index, split_threshold, split_default_left, leaf_data_start, left_leaf_index, right_leaf_index); + } } void CUDADataPartition::SplitInner( diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 82c5a15d610f..cf8d15e00c70 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -48,6 +48,8 @@ class CUDADataPartition { const int right_leaf_index, const int leaf_best_split_feature, const uint32_t leaf_best_split_threshold, + const uint32_t* categorical_bitset, + const int categorical_bitset_len, const uint8_t leaf_best_split_default_left, const data_size_t num_data_in_leaf, const data_size_t leaf_data_start, @@ -99,6 +101,8 @@ class CUDADataPartition { const data_size_t num_data_in_leaf, const int split_feature_index, const uint32_t split_threshold, + const uint32_t* categorical_bitset, + const int categorical_bitset_len, const uint8_t split_default_left, const data_size_t leaf_data_start, const int left_leaf_index, diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index eab85c690fb1..92053d898728 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -54,9 +54,6 @@ void CUDAHistogramConstructor::InitFeatureMetaInfo(const Dataset* train_data, co feature_most_freq_bins_.clear(); for (int feature_index = 0; feature_index < train_data->num_features(); ++feature_index) { const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); - if (bin_mapper->bin_type() == BinType::CategoricalBin) { - Log::Fatal("CUDA tree learner doesn't support training categorical features."); - } const uint32_t most_freq_bin = bin_mapper->GetMostFreqBin(); if (most_freq_bin != 0) { need_fix_histogram_features_.emplace_back(feature_index); diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index eb2144b24663..4c382f63e734 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -174,18 +174,32 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, ConstructBitsetForCategoricalSplit(best_split_info); } - int right_leaf_index = tree->Split(best_leaf_index_, + int right_leaf_index = 0; + if (train_data_->FeatureBinMapper(leaf_best_split_feature_[best_leaf_index_])->bin_type() == BinType::CategoricalBin) { + right_leaf_index = 
tree->SplitCategorical(best_leaf_index_, + train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]), + train_data_->FeatureBinMapper(leaf_best_split_feature_[best_leaf_index_])->missing_type(), + best_split_info, + cuda_bitset_, + cuda_bitset_len_, + cuda_bitset_inner_, + cuda_bitset_inner_len_); + } else { + right_leaf_index = tree->Split(best_leaf_index_, train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]), train_data_->RealThreshold(leaf_best_split_feature_[best_leaf_index_], leaf_best_split_threshold_[best_leaf_index_]), train_data_->FeatureBinMapper(leaf_best_split_feature_[best_leaf_index_])->missing_type(), best_split_info); + } cuda_data_partition_->Split(best_split_info, best_leaf_index_, right_leaf_index, leaf_best_split_feature_[best_leaf_index_], leaf_best_split_threshold_[best_leaf_index_], + cuda_bitset_inner_, + static_cast(cuda_bitset_inner_len_), leaf_best_split_default_left_[best_leaf_index_], leaf_num_data_[best_leaf_index_], leaf_data_start_[best_leaf_index_], From c8716f1c571a707e9263d0c2ac100e0a7efc40b3 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 4 Nov 2021 09:36:49 +0000 Subject: [PATCH 108/166] fix categorical feature splits --- include/LightGBM/cuda/cuda_row_data.hpp | 2 +- include/LightGBM/cuda/cuda_split_info.hpp | 53 ++- include/LightGBM/cuda/cuda_utils.h | 6 +- src/boosting/gbdt.cpp | 2 - src/cuda/cuda_utils.cpp | 5 + src/io/cuda/cuda_tree.cu | 70 ---- .../cuda/cuda_best_split_finder.cpp | 31 +- .../cuda/cuda_best_split_finder.cu | 329 ++++++++++++------ .../cuda/cuda_best_split_finder.hpp | 12 +- src/treelearner/cuda/cuda_data_partition.cpp | 19 +- src/treelearner/cuda/cuda_data_partition.cu | 20 +- src/treelearner/cuda/cuda_data_partition.hpp | 12 +- .../cuda/cuda_histogram_constructor.cu | 26 +- .../cuda/cuda_histogram_constructor.hpp | 2 +- .../cuda/cuda_single_gpu_tree_learner.cpp | 87 ++++- .../cuda/cuda_single_gpu_tree_learner.cu | 31 +- .../cuda/cuda_single_gpu_tree_learner.hpp | 7 + 17 files changed, 482 insertions(+), 232 deletions(-) diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index a73cbb39f210..5ff97b0defd8 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -18,7 +18,7 @@ #include "../train_share_states.h" -#define SHRAE_HIST_SIZE (6144 * 2) +#define SHRAE_HIST_SIZE (6144) #define COPY_SUBROW_BLOCK_SIZE_ROW_DATA (1024) namespace LightGBM { diff --git a/include/LightGBM/cuda/cuda_split_info.hpp b/include/LightGBM/cuda/cuda_split_info.hpp index 8cdf31a8f93b..748d25b3dbe5 100644 --- a/include/LightGBM/cuda/cuda_split_info.hpp +++ b/include/LightGBM/cuda/cuda_split_info.hpp @@ -13,7 +13,7 @@ namespace LightGBM { -struct CUDASplitInfo { +class CUDASplitInfo { public: bool is_valid; int leaf_index; @@ -38,7 +38,16 @@ struct CUDASplitInfo { uint32_t* cat_threshold = nullptr; int* cat_threshold_real = nullptr; + __device__ CUDASplitInfo() { + printf("default constructor is called\n"); + num_cat_threshold = 0; + cat_threshold = nullptr; + cat_threshold_real = nullptr; + printf("default constructor is called, num_cat_threshold = %d\n", num_cat_threshold); + } + __device__ ~CUDASplitInfo() { + printf("default destructor is called\n"); if (num_cat_threshold > 0) { if (cat_threshold != nullptr) { cudaFree(cat_threshold); @@ -48,6 +57,48 @@ struct CUDASplitInfo { } } } + + __device__ CUDASplitInfo& operator=(const CUDASplitInfo& other) { + is_valid = other.is_valid; + leaf_index = other.leaf_index; + gain = other.gain; 
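
The explicit copy-assignment being added here exists because CUDASplitInfo now owns per-split arrays (cat_threshold, cat_threshold_real): a member-wise copy would leave two objects aliasing the same buffer, and the destructor above would then free it twice. A simplified host-side analogue of the intended deep-copy semantics, using a hypothetical SplitThresholds struct rather than the device code:

#include <algorithm>
#include <cstdint>

struct SplitThresholds {
  int num_cat = 0;
  uint32_t* cat = nullptr;

  SplitThresholds() = default;
  SplitThresholds(const SplitThresholds&) = delete;  // copies only happen via operator=
  ~SplitThresholds() { delete[] cat; }

  SplitThresholds& operator=(const SplitThresholds& other) {
    if (this == &other) return *this;
    if (other.num_cat > 0 && cat == nullptr) {
      // allocate lazily, as the patch does (real buffers are pre-sized to a maximum elsewhere)
      cat = new uint32_t[other.num_cat];
    }
    num_cat = other.num_cat;
    if (num_cat > 0 && other.cat != nullptr) {
      std::copy(other.cat, other.cat + num_cat, cat);  // deep copy instead of pointer aliasing
    }
    return *this;
  }
};
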
+ inner_feature_index = other.inner_feature_index; + threshold = other.threshold; + default_left = other.default_left; + + left_sum_gradients = other.left_sum_gradients; + left_sum_hessians = other.left_sum_hessians; + left_count = other.left_count; + left_gain = other.left_gain; + left_value = other.left_value; + + right_sum_gradients = other.right_sum_gradients; + right_sum_hessians = other.right_sum_hessians; + right_count = other.right_count; + right_gain = other.right_gain; + right_value = other.right_value; + + num_cat_threshold = other.num_cat_threshold; + if (num_cat_threshold > 0 && cat_threshold == nullptr) { + cat_threshold = new uint32_t[num_cat_threshold]; + } + if (num_cat_threshold > 0 && cat_threshold_real == nullptr) { + cat_threshold_real = new int[num_cat_threshold]; + } + if (num_cat_threshold > 0) { + if (other.cat_threshold != nullptr) { + for (int i = 0; i < num_cat_threshold; ++i) { + cat_threshold[i] = other.cat_threshold[i]; + } + } + if (other.cat_threshold_real != nullptr) { + for (int i = 0; i < num_cat_threshold; ++i) { + cat_threshold_real[i] = other.cat_threshold_real[i]; + } + } + } + return *this; + } }; } // namespace LightGBM diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index a95c26f60834..20f8362d50e7 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -96,6 +96,8 @@ void DeallocateCUDAMemory(T** ptr, const char* file, const int line) { } } +void PrintLastCUDAError(); + template class CUDAVector { public: @@ -124,7 +126,9 @@ class CUDAVector { void PushBack(const T* values, size_t len) { T* new_data = nullptr; AllocateCUDAMemory(&new_data, size_ + len, __FILE__, __LINE__); - CopyFromCUDADeviceToCUDADevice(new_data, data_, size_, __FILE__, __LINE__); + if (size_ > 0 && data_ != nullptr) { + CopyFromCUDADeviceToCUDADevice(new_data, data_, size_, __FILE__, __LINE__); + } CopyFromCUDADeviceToCUDADevice(new_data + size_, values, len, __FILE__, __LINE__); DeallocateCUDAMemory(&data_, __FILE__, __LINE__); size_ += len; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 63facba79795..8f64a87d5714 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -510,9 +510,7 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { // update validation score for (auto& score_updater : valid_score_updater_) { - Log::Warning("before add prediciton to score for valid"); score_updater->AddScore(tree, cur_tree_id); - Log::Warning("before add prediciton to score for valid"); } } diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp index a1e6169e949a..e99bd1ec78f6 100644 --- a/src/cuda/cuda_utils.cpp +++ b/src/cuda/cuda_utils.cpp @@ -13,6 +13,11 @@ void SynchronizeCUDADevice(const char* file, const int line) { gpuAssert(cudaDeviceSynchronize(), file, line); } +void PrintLastCUDAError() { + const char* error_name = cudaGetErrorName(cudaGetLastError()); + Log::Warning(error_name); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index 4b245bddab29..67e60e6d7e51 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -277,76 +277,6 @@ void CUDATree::LaunchSplitCategoricalKernel(const int leaf_index, cuda_cat_boundaries_inner_.RawData()); } -template -__global__ void AddPredictionToScoreKernel( - // dataset information - const data_size_t num_data, - void* const* cuda_data_by_column, - const uint8_t* cuda_column_bit_type, - const uint32_t* cuda_feature_min_bin, - const uint32_t* 
cuda_feature_max_bin, - const uint32_t* cuda_feature_offset, - const uint32_t* cuda_feature_default_bin, - const uint32_t* cuda_feature_most_freq_bin, - const int* cuda_feature_to_column, - const data_size_t* cuda_used_indices, - // tree information - const uint32_t* cuda_threshold_in_bin, - const int8_t* cuda_decision_type, - const int* cuda_split_feature_inner, - const int* cuda_left_child, - const int* cuda_right_child, - const double* cuda_leaf_value, - // output - double* score) { - const data_size_t inner_data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - const data_size_t data_index = USE_INDICES ? cuda_used_indices[inner_data_index] : inner_data_index; - if (data_index < num_data) { - int node = 0; - while (node >= 0) { - const int split_feature_inner = cuda_split_feature_inner[node]; - const int column = cuda_feature_to_column[split_feature_inner]; - const uint32_t default_bin = cuda_feature_default_bin[split_feature_inner]; - const uint32_t most_freq_bin = cuda_feature_most_freq_bin[split_feature_inner]; - const uint32_t max_bin = cuda_feature_max_bin[split_feature_inner]; - const uint32_t min_bin = cuda_feature_min_bin[split_feature_inner]; - const uint32_t offset = cuda_feature_offset[split_feature_inner]; - const uint8_t column_bit_type = cuda_column_bit_type[column]; - uint32_t bin = 0; - if (column_bit_type == 8) { - bin = static_cast((reinterpret_cast(cuda_data_by_column[column]))[data_index]); - } else if (column_bit_type == 16) { - bin = static_cast((reinterpret_cast(cuda_data_by_column[column]))[data_index]); - } else if (column_bit_type == 32) { - bin = static_cast((reinterpret_cast(cuda_data_by_column[column]))[data_index]); - } - if (bin >= min_bin && bin <= max_bin) { - bin = bin - min_bin + offset; - } else { - bin = most_freq_bin; - } - const int8_t decision_type = cuda_decision_type[node]; - const uint32_t threshold_in_bin = cuda_threshold_in_bin[node]; - const int8_t missing_type = ((decision_type >> 2) & 3); - const bool default_left = ((decision_type & kDefaultLeftMask) > 0); - if ((missing_type == 1 && bin == default_bin) || (missing_type == 2 && bin == max_bin)) { - if (default_left) { - node = cuda_left_child[node]; - } else { - node = cuda_right_child[node]; - } - } else { - if (bin <= threshold_in_bin) { - node = cuda_left_child[node]; - } else { - node = cuda_right_child[node]; - } - } - } - score[data_index] += cuda_leaf_value[~node]; - } -} - __global__ void ShrinkageKernel(const double rate, double* cuda_leaf_value, const int num_leaves) { const int leaf_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); if (leaf_index < num_leaves) { diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 937833a442df..6d9dfd583226 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -24,8 +24,12 @@ CUDABestSplitFinder::CUDABestSplitFinder( min_data_in_leaf_(config->min_data_in_leaf), min_sum_hessian_in_leaf_(config->min_sum_hessian_in_leaf), min_gain_to_split_(config->min_gain_to_split), + cat_smooth_(config->cat_smooth), + cat_l2_(config->cat_l2), max_cat_threshold_(config->max_cat_threshold), - num_total_bin_(feature_hist_offsets.back()), + min_data_per_group_(config->min_data_per_group), + max_cat_to_onehot_(config->max_cat_to_onehot), + num_total_bin_(feature_hist_offsets.empty() ? 
0 : static_cast(feature_hist_offsets.back())), cuda_hist_(cuda_hist) { InitFeatureMetaInfo(train_data); cuda_leaf_best_split_info_ = nullptr; @@ -71,10 +75,12 @@ void CUDABestSplitFinder::InitFeatureMetaInfo(const Dataset* train_data) { max_num_bin_in_feature_ = 0; has_categorical_feature_ = false; max_num_categorical_bin_ = 0; + is_categorical_.resize(train_data->num_features(), 0); for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { const BinMapper* bin_mapper = train_data->FeatureBinMapper(inner_feature_index); if (bin_mapper->bin_type() == BinType::CategoricalBin) { has_categorical_feature_ = true; + is_categorical_[inner_feature_index] = 1; if (bin_mapper->num_bin() > max_num_categorical_bin_) { max_num_categorical_bin_ = bin_mapper->num_bin(); } @@ -105,7 +111,12 @@ void CUDABestSplitFinder::Init() { if (use_global_memory_) { AllocateCUDAMemory(&cuda_feature_hist_grad_buffer_, static_cast(num_total_bin_), __FILE__, __LINE__); AllocateCUDAMemory(&cuda_feature_hist_hess_buffer_, static_cast(num_total_bin_), __FILE__, __LINE__); + if (has_categorical_feature_) { + AllocateCUDAMemory(&cuda_feature_hist_stat_buffer_, static_cast(num_total_bin_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_feature_hist_index_buffer_, static_cast(num_total_bin_), __FILE__, __LINE__); + } } + InitCUDAMemoryFromHostMemory(&cuda_is_categorical_, is_categorical_.data(), is_categorical_.size(), __FILE__, __LINE__); } void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { @@ -134,7 +145,7 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { const uint32_t num_bin = feature_num_bins_[inner_feature_index]; const uint8_t missing_type = feature_missing_type_[inner_feature_index]; - if (num_bin > 2 && missing_type != MissingType::None) { + if (num_bin > 2 && missing_type != MissingType::None && !is_categorical_[inner_feature_index]) { if (missing_type == MissingType::Zero) { host_task_reverse_.emplace_back(0); host_task_reverse_.emplace_back(1); @@ -161,7 +172,11 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { num_tasks_ += 2; } } else { - host_task_reverse_.emplace_back(1); + if (is_categorical_[inner_feature_index]) { + host_task_reverse_.emplace_back(0); + } else { + host_task_reverse_.emplace_back(1); + } host_task_skip_default_bin_.emplace_back(0); host_task_na_as_missing_.emplace_back(0); host_task_feature_index_.emplace_back(inner_feature_index); @@ -209,10 +224,8 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { const size_t output_buffer_size = 2 * static_cast(num_tasks_); AllocateCUDAMemory(&cuda_best_split_info_, output_buffer_size, __FILE__, __LINE__); - if (has_categorical_feature_) { - AllocateCatVectors(cuda_leaf_best_split_info_, cuda_best_leaf_split_info_buffer_size); - AllocateCatVectors(cuda_best_split_info_, output_buffer_size); - } + AllocateCatVectors(cuda_leaf_best_split_info_, cuda_best_leaf_split_info_buffer_size); + AllocateCatVectors(cuda_best_split_info_, output_buffer_size); } void CUDABestSplitFinder::ResetTrainingData( @@ -252,9 +265,7 @@ void CUDABestSplitFinder::ResetConfig(const Config* config) { cuda_best_leaf_split_info_buffer_size, __FILE__, __LINE__); - if (has_categorical_feature_) { - AllocateCatVectors(cuda_leaf_best_split_info_, cuda_best_leaf_split_info_buffer_size); - } + AllocateCatVectors(cuda_leaf_best_split_info_, cuda_best_leaf_split_info_buffer_size); } void CUDABestSplitFinder::BeforeTrain(const std::vector& 
is_feature_used_bytree) { diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index b28842765ea8..2d7b5df68dd0 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -365,6 +365,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( const double min_sum_hessian_in_leaf, const double min_gain_to_split, const double cat_smooth, + const double cat_l2, const int max_cat_threshold, const int min_data_per_group, // input parent node information @@ -383,8 +384,9 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( const double cnt_factor = num_data / sum_hessians; const bool use_l1 = lambda_l1 > 0.0f; const double min_gain_shift = parent_gain + min_gain_to_split; + const double l2 = lambda_l2 + cat_l2; - double local_gain = kMinScore; + double local_gain = min_gain_shift; bool threshold_found = false; cuda_best_split_info->is_valid = false; @@ -408,9 +410,9 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( double current_gain = GetSplitGains( sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon, lambda_l1, use_l1, - lambda_l2); + l2); if (current_gain > min_gain_shift) { - local_gain = current_gain - min_gain_shift; + local_gain = current_gain; threshold_found = true; } } @@ -426,8 +428,8 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( if (threshold_found && threadIdx_x == best_thread_index) { cuda_best_split_info->is_valid = true; cuda_best_split_info->num_cat_threshold = 1; - cuda_best_split_info->cat_threshold = new uint32_t[1]; - *(cuda_best_split_info->cat_threshold) = static_cast(threadIdx_x); + cuda_best_split_info->gain = local_gain - min_gain_shift; + *(cuda_best_split_info->cat_threshold) = static_cast(threadIdx_x + feature_mfb_offset); cuda_best_split_info->default_left = false; const int bin_offset = (threadIdx_x << 1); const hist_t sum_left_gradient = feature_hist_ptr[bin_offset]; @@ -437,9 +439,9 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2); + sum_left_hessian, lambda_l1, use_l1, l2); const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2); + sum_right_hessian, lambda_l1, use_l1, l2); cuda_best_split_info->left_sum_gradients = sum_left_gradient; cuda_best_split_info->left_sum_hessians = sum_left_hessian; cuda_best_split_info->left_count = left_count; @@ -448,10 +450,10 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( cuda_best_split_info->right_count = right_count; cuda_best_split_info->left_value = left_output; cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + sum_left_hessian, lambda_l1, use_l1, l2, left_output); cuda_best_split_info->right_value = right_output; cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); + sum_right_hessian, lambda_l1, use_l1, l2, right_output); } } else { __shared__ double shared_value_buffer[NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER]; @@ -469,23 +471,23 @@ __device__ void 
FindBestSplitsForLeafKernelCategoricalInner( if (__double2int_rn(hess * cnt_factor) >= cat_smooth) { const double grad = feature_hist_ptr[bin_offset]; shared_value_buffer[threadIdx_x] = grad / (hess + cat_smooth); - shared_index_buffer[threadIdx_x] = threadIdx_x; is_valid_bin = 1; } else { shared_value_buffer[threadIdx_x] = kMaxScore; - shared_index_buffer[threadIdx_x] = -1; } } else { shared_value_buffer[threadIdx_x] = kMaxScore; - shared_index_buffer[threadIdx_x] = -1; } + shared_index_buffer[threadIdx_x] = threadIdx_x; __syncthreads(); const int local_used_bin = ShuffleReduceSum(is_valid_bin, shared_mem_buffer_uint16, blockDim.x); if (threadIdx_x == 0) { used_bin = local_used_bin; } __syncthreads(); - BitonicArgSort_1024(shared_value_buffer, shared_index_buffer, blockDim.x); + // TODO(shiyu1994): with more threads, this kernel may use out registers + BitonicArgSort_1024(shared_value_buffer, shared_index_buffer, bin_end); + __syncthreads(); const int max_num_cat = min(max_cat_threshold, (used_bin + 1) / 2); // left to right @@ -494,7 +496,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { const int bin_offset = (shared_index_buffer[threadIdx_x] << 1); grad = feature_hist_ptr[bin_offset]; - hess = feature_hist_ptr[bin_offset]; + hess = feature_hist_ptr[bin_offset + 1]; } if (threadIdx_x == 0) { hess += kEpsilon; @@ -514,10 +516,10 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, lambda_l1, use_l1, - lambda_l2); + l2); // gain with split is worse than without split - if (current_gain > min_gain_shift) { - local_gain = current_gain - min_gain_shift; + if (current_gain > local_gain) { + local_gain = current_gain; threshold_found = true; best_dir = 1; best_sum_left_gradient = sum_left_gradient; @@ -533,7 +535,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { const int bin_offset = (shared_index_buffer[used_bin - 1 - threadIdx_x] << 1); grad = feature_hist_ptr[bin_offset]; - hess = feature_hist_ptr[bin_offset]; + hess = feature_hist_ptr[bin_offset + 1]; } if (threadIdx_x == 0) { hess += kEpsilon; @@ -553,10 +555,10 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, lambda_l1, use_l1, - lambda_l2); + l2); // gain with split is worse than without split - if (current_gain > min_gain_shift) { - local_gain = current_gain - min_gain_shift; + if (current_gain > local_gain) { + local_gain = current_gain; threshold_found = true; best_dir = -1; best_sum_left_gradient = sum_left_gradient; @@ -574,7 +576,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( if (threshold_found && threadIdx_x == best_thread_index) { cuda_best_split_info->is_valid = true; cuda_best_split_info->num_cat_threshold = threadIdx_x + 1; - cuda_best_split_info->cat_threshold = new uint32_t[threadIdx_x + 1]; + cuda_best_split_info->gain = local_gain - min_gain_shift; if (best_dir == 1) { for (int i = 0; i < threadIdx_x + 1; ++i) { (cuda_best_split_info->cat_threshold)[i] = shared_index_buffer[i] + feature_mfb_offset; @@ -592,9 +594,9 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = 
static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2); + sum_left_hessian, lambda_l1, use_l1, l2); const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2); + sum_right_hessian, lambda_l1, use_l1, l2); cuda_best_split_info->left_sum_gradients = sum_left_gradient; cuda_best_split_info->left_sum_hessians = sum_left_hessian; cuda_best_split_info->left_count = left_count; @@ -603,10 +605,10 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( cuda_best_split_info->right_count = right_count; cuda_best_split_info->left_value = left_output; cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + sum_left_hessian, lambda_l1, use_l1, l2, left_output); cuda_best_split_info->right_value = right_output; cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); + sum_right_hessian, lambda_l1, use_l1, l2, right_output); } } } @@ -618,6 +620,7 @@ __global__ void FindBestSplitsForLeafKernel( const uint32_t* feature_default_bins, const uint32_t* feature_num_bins, const int8_t* is_feature_used_bytree, + const int8_t* is_categorical, // input task information const bool larger_only, const int num_tasks, @@ -637,15 +640,16 @@ __global__ void FindBestSplitsForLeafKernel( const double min_gain_to_split, const double lambda_l1, const double lambda_l2, + const double cat_smooth, + const double cat_l2, + const int max_cat_threshold, + const int min_data_per_group, + const int max_cat_to_onehot, // output CUDASplitInfo* cuda_best_split_info) { const unsigned int task_index = blockIdx.x % num_tasks; const bool is_larger = static_cast(blockIdx.x >= num_tasks || larger_only); const int inner_feature_index = task_feature_index[task_index]; - const bool reverse = static_cast(task_reverse[task_index]); - const bool skip_default_bin = static_cast(task_skip_default_bin[task_index]); - const bool na_as_missing = static_cast(task_na_as_missing[task_index]); - const bool assume_out_default_left = task_out_default_left[task_index]; const double parent_gain = is_larger ? larger_leaf_splits->gain : smaller_leaf_splits->gain; const double sum_gradients = is_larger ? larger_leaf_splits->sum_of_gradients : smaller_leaf_splits->sum_of_gradients; const double sum_hessians = (is_larger ? larger_leaf_splits->sum_of_hessians : smaller_leaf_splits->sum_of_hessians) + 2 * kEpsilon; @@ -654,31 +658,65 @@ __global__ void FindBestSplitsForLeafKernel( CUDASplitInfo* out = cuda_best_split_info + output_offset; if (is_feature_used_bytree[inner_feature_index]) { const hist_t* hist_ptr = (is_larger ? 
larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + feature_hist_offsets[inner_feature_index] * 2; - FindBestSplitsForLeafKernelInner( - // input feature information - hist_ptr, - feature_num_bins[inner_feature_index], - feature_mfb_offsets[inner_feature_index], - feature_default_bins[inner_feature_index], - inner_feature_index, - // input config parameter values - lambda_l1, - lambda_l2, - min_data_in_leaf, - min_sum_hessian_in_leaf, - min_gain_to_split, - // input parent node information - parent_gain, - sum_gradients, - sum_hessians, - num_data, - // input task information - reverse, - skip_default_bin, - na_as_missing, - assume_out_default_left, - // output parameters - out); + if (is_categorical[inner_feature_index]) { + const bool is_one_hot = feature_num_bins[inner_feature_index] <= max_cat_to_onehot; + FindBestSplitsForLeafKernelCategoricalInner( + // input feature information + hist_ptr, + feature_num_bins[inner_feature_index], + feature_mfb_offsets[inner_feature_index], + feature_default_bins[inner_feature_index], + inner_feature_index, + // input config parameter values + lambda_l1, + lambda_l2, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + cat_smooth, + cat_l2, + max_cat_threshold, + min_data_per_group, + // input parent node information + parent_gain, + sum_gradients, + sum_hessians, + num_data, + // input task information + is_one_hot, + // output parameters + out); + } else { + const bool reverse = static_cast(task_reverse[task_index]); + const bool skip_default_bin = static_cast(task_skip_default_bin[task_index]); + const bool na_as_missing = static_cast(task_na_as_missing[task_index]); + const bool assume_out_default_left = task_out_default_left[task_index]; + FindBestSplitsForLeafKernelInner( + // input feature information + hist_ptr, + feature_num_bins[inner_feature_index], + feature_mfb_offsets[inner_feature_index], + feature_default_bins[inner_feature_index], + inner_feature_index, + // input config parameter values + lambda_l1, + lambda_l2, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients, + sum_hessians, + num_data, + // input task information + reverse, + skip_default_bin, + na_as_missing, + assume_out_default_left, + // output parameters + out); + } } else { out->is_valid = false; } @@ -892,6 +930,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( const double min_sum_hessian_in_leaf, const double min_gain_to_split, const double cat_smooth, + const double cat_l2, const int max_cat_threshold, const int min_data_per_group, // input parent node information @@ -915,6 +954,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( const double cnt_factor = num_data / sum_hessians; const bool use_l1 = lambda_l1 > 0.0f; const double min_gain_shift = parent_gain + min_gain_to_split; + const double l2 = lambda_l2 + cat_l2; double local_gain = kMinScore; bool threshold_found = false; @@ -941,7 +981,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( double current_gain = GetSplitGains( sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon, lambda_l1, use_l1, - lambda_l2); + l2); if (current_gain > min_gain_shift) { best_threshold = bin; local_gain = current_gain - min_gain_shift; @@ -971,9 +1011,9 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count 
= static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2); + sum_left_hessian, lambda_l1, use_l1, l2); const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2); + sum_right_hessian, lambda_l1, use_l1, l2); cuda_best_split_info->left_sum_gradients = sum_left_gradient; cuda_best_split_info->left_sum_hessians = sum_left_hessian; cuda_best_split_info->left_count = left_count; @@ -982,10 +1022,10 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( cuda_best_split_info->right_count = right_count; cuda_best_split_info->left_value = left_output; cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + sum_left_hessian, lambda_l1, use_l1, l2, left_output); cuda_best_split_info->right_value = right_output; cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); + sum_right_hessian, lambda_l1, use_l1, l2, right_output); } } else { __shared__ uint16_t shared_mem_buffer_uint16[32]; @@ -1046,7 +1086,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, lambda_l1, use_l1, - lambda_l2); + l2); // gain with split is worse than without split if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; @@ -1085,7 +1125,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, lambda_l1, use_l1, - lambda_l2); + l2); // gain with split is worse than without split if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; @@ -1107,6 +1147,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( cuda_best_split_info->is_valid = true; cuda_best_split_info->num_cat_threshold = threadIdx_x + 1; cuda_best_split_info->cat_threshold = new uint32_t[threadIdx_x + 1]; + cuda_best_split_info->gain = local_gain; if (best_dir == 1) { for (int i = 0; i < threadIdx_x + 1; ++i) { (cuda_best_split_info->cat_threshold)[i] = hist_index_buffer_ptr[i] + feature_mfb_offset; @@ -1124,9 +1165,9 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2); + sum_left_hessian, lambda_l1, use_l1, l2); const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2); + sum_right_hessian, lambda_l1, use_l1, l2); cuda_best_split_info->left_sum_gradients = sum_left_gradient; cuda_best_split_info->left_sum_hessians = sum_left_hessian; cuda_best_split_info->left_count = left_count; @@ -1135,10 +1176,10 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( cuda_best_split_info->right_count = right_count; cuda_best_split_info->left_value = left_output; 
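
For reference, the gain arithmetic these categorical kernels share: leaf outputs and gains use the L1-thresholded gradient sum, and for categorical splits the effective L2 term is lambda_l2 + cat_l2. The following is a simplified host-side sketch of the standard GBDT formulas, not the exact GetSplitGains / GetLeafGainGivenOutput implementations, and it ignores refinements such as max_delta_step handled elsewhere.

#include <algorithm>
#include <cmath>

// L1-thresholded gradient sum.
double ThresholdL1(double g, double l1) {
  return std::copysign(std::max(0.0, std::fabs(g) - l1), g);
}

// Output value of a leaf holding gradient/hessian sums G and H.
double LeafOutput(double G, double H, double l1, double l2) {
  return -ThresholdL1(G, l1) / (H + l2);
}

// Gain contributed by that leaf.
double LeafGain(double G, double H, double l1, double l2) {
  const double t = ThresholdL1(G, l1);
  return t * t / (H + l2);
}

// A candidate split is kept only when this exceeds parent_gain + min_gain_to_split
// (the min_gain_shift threshold used in the kernels above); for categorical splits
// l2 stands for lambda_l2 + cat_l2.
double SplitGain(double gl, double hl, double gr, double hr, double l1, double l2) {
  return LeafGain(gl, hl, l1, l2) + LeafGain(gr, hr, l1, l2);
}
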
cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + sum_left_hessian, lambda_l1, use_l1, l2, left_output); cuda_best_split_info->right_value = right_output; cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); + sum_right_hessian, lambda_l1, use_l1, l2, right_output); } } } @@ -1150,6 +1191,7 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( const uint32_t* feature_default_bins, const uint32_t* feature_num_bins, const int8_t* is_feature_used_bytree, + const int8_t* is_categorical, // input task information const bool larger_only, const int num_tasks, @@ -1169,18 +1211,21 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( const double min_gain_to_split, const double lambda_l1, const double lambda_l2, + const double cat_smooth, + const double cat_l2, + const int max_cat_threshold, + const int min_data_per_group, + const int max_cat_to_onehot, // buffer hist_t* feature_hist_grad_buffer, hist_t* feature_hist_hess_buffer, + hist_t* feature_hist_stat_buffer, + data_size_t* feature_hist_index_buffer, // output CUDASplitInfo* cuda_best_split_info) { const unsigned int task_index = blockIdx.x % num_tasks; const bool is_larger = static_cast(blockIdx.x >= num_tasks || larger_only); const int inner_feature_index = task_feature_index[task_index]; - const bool reverse = static_cast(task_reverse[task_index]); - const bool skip_default_bin = static_cast(task_skip_default_bin[task_index]); - const bool na_as_missing = static_cast(task_na_as_missing[task_index]); - const bool assume_out_default_left = task_out_default_left[task_index]; const double parent_gain = is_larger ? larger_leaf_splits->gain : smaller_leaf_splits->gain; const double sum_gradients = is_larger ? larger_leaf_splits->sum_of_gradients : smaller_leaf_splits->sum_of_gradients; const double sum_hessians = (is_larger ? larger_leaf_splits->sum_of_hessians : smaller_leaf_splits->sum_of_hessians) + 2 * kEpsilon; @@ -1191,34 +1236,75 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( const hist_t* hist_ptr = (is_larger ? 
larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + feature_hist_offsets[inner_feature_index] * 2; hist_t* hist_grad_buffer_ptr = feature_hist_grad_buffer + feature_hist_offsets[inner_feature_index] * 2; hist_t* hist_hess_buffer_ptr = feature_hist_hess_buffer + feature_hist_offsets[inner_feature_index] * 2; - FindBestSplitsForLeafKernelInner_GlobalMemory( - // input feature information - hist_ptr, - feature_num_bins[inner_feature_index], - feature_mfb_offsets[inner_feature_index], - feature_default_bins[inner_feature_index], - inner_feature_index, - // input config parameter values - lambda_l1, - lambda_l2, - min_data_in_leaf, - min_sum_hessian_in_leaf, - min_gain_to_split, - // input parent node information - parent_gain, - sum_gradients, - sum_hessians, - num_data, - // input task information - reverse, - skip_default_bin, - na_as_missing, - assume_out_default_left, - // buffer - hist_grad_buffer_ptr, - hist_hess_buffer_ptr, - // output parameters - out); + hist_t* hist_stat_buffer_ptr = feature_hist_stat_buffer + feature_hist_offsets[inner_feature_index] * 2; + data_size_t* hist_index_buffer_ptr = feature_hist_index_buffer + feature_hist_offsets[inner_feature_index] * 2; + if (is_categorical[inner_feature_index]) { + const bool is_one_hot = feature_num_bins[inner_feature_index] <= max_cat_to_onehot; + FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( + // input feature information + hist_ptr, + feature_num_bins[inner_feature_index], + feature_mfb_offsets[inner_feature_index], + feature_default_bins[inner_feature_index], + inner_feature_index, + // input config parameter values + lambda_l1, + lambda_l2, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + cat_smooth, + cat_l2, + max_cat_threshold, + min_data_per_group, + // input parent node information + parent_gain, + sum_gradients, + sum_hessians, + num_data, + // input task information + is_one_hot, + // buffer + hist_grad_buffer_ptr, + hist_hess_buffer_ptr, + hist_stat_buffer_ptr, + hist_index_buffer_ptr, + // output parameters + out); + } else { + const bool reverse = static_cast(task_reverse[task_index]); + const bool skip_default_bin = static_cast(task_skip_default_bin[task_index]); + const bool na_as_missing = static_cast(task_na_as_missing[task_index]); + const bool assume_out_default_left = task_out_default_left[task_index]; + FindBestSplitsForLeafKernelInner_GlobalMemory( + // input feature information + hist_ptr, + feature_num_bins[inner_feature_index], + feature_mfb_offsets[inner_feature_index], + feature_default_bins[inner_feature_index], + inner_feature_index, + // input config parameter values + lambda_l1, + lambda_l2, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients, + sum_hessians, + num_data, + // input task information + reverse, + skip_default_bin, + na_as_missing, + assume_out_default_left, + // buffer + hist_grad_buffer_ptr, + hist_hess_buffer_ptr, + // output parameters + out); + } } else { out->is_valid = false; } @@ -1247,6 +1333,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_feature_default_bins_, cuda_feature_num_bins_, cuda_is_feature_used_bytree_, + cuda_is_categorical_, // input task information larger_only, num_tasks_, @@ -1266,6 +1353,11 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( min_gain_to_split_, lambda_l1_, lambda_l2_, + cat_smooth_, + cat_l2_, + max_cat_threshold_, + min_data_per_group_, + max_cat_to_onehot_, // output 
parameters cuda_best_split_info_); } @@ -1278,6 +1370,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_feature_default_bins_, cuda_feature_num_bins_, cuda_is_feature_used_bytree_, + cuda_is_categorical_, // input task information true, num_tasks_, @@ -1297,6 +1390,11 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( min_gain_to_split_, lambda_l1_, lambda_l2_, + cat_smooth_, + cat_l2_, + max_cat_threshold_, + min_data_per_group_, + max_cat_to_onehot_, // output parameters cuda_best_split_info_); } @@ -1309,6 +1407,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_feature_default_bins_, cuda_feature_num_bins_, cuda_is_feature_used_bytree_, + cuda_is_categorical_, // input task information larger_only, num_tasks_, @@ -1328,9 +1427,16 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( min_gain_to_split_, lambda_l1_, lambda_l2_, + cat_smooth_, + cat_l2_, + max_cat_threshold_, + min_data_per_group_, + max_cat_to_onehot_, // buffer cuda_feature_hist_grad_buffer_, cuda_feature_hist_hess_buffer_, + cuda_feature_hist_stat_buffer_, + cuda_feature_hist_index_buffer_, // output parameters cuda_best_split_info_); } @@ -1343,6 +1449,7 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( cuda_feature_default_bins_, cuda_feature_num_bins_, cuda_is_feature_used_bytree_, + cuda_is_categorical_, // input task information true, num_tasks_, @@ -1362,9 +1469,16 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( min_gain_to_split_, lambda_l1_, lambda_l2_, + cat_smooth_, + cat_l2_, + max_cat_threshold_, + min_data_per_group_, + max_cat_to_onehot_, // buffer cuda_feature_hist_grad_buffer_, cuda_feature_hist_hess_buffer_, + cuda_feature_hist_stat_buffer_, + cuda_feature_hist_index_buffer_, // output parameters cuda_best_split_info_); } @@ -1678,7 +1792,7 @@ void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel( PrepareLeafBestSplitInfo<<<6, 1, 0, cuda_streams_[0]>>>(smaller_leaf_index, larger_leaf_index, cuda_best_split_info_buffer_, cuda_leaf_best_split_info_); - std::vector host_leaf_best_split_info_buffer(8); + std::vector host_leaf_best_split_info_buffer(8, 0); SynchronizeCUDADevice(__FILE__, __LINE__); CopyFromCUDADeviceToHost(host_leaf_best_split_info_buffer.data(), cuda_best_split_info_buffer_, 8, __FILE__, __LINE__); *smaller_leaf_best_split_feature = host_leaf_best_split_info_buffer[0]; @@ -1695,18 +1809,27 @@ void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel( __global__ void AllocateCatVectorsKernel( CUDASplitInfo* cuda_split_infos, size_t len, - const int max_num_categories_in_split) { + const int max_num_categories_in_split, + const bool has_categorical_feature) { const size_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i < len) { - cuda_split_infos[i].cat_threshold = new uint32_t[max_num_categories_in_split]; - cuda_split_infos[i].cat_threshold_real = new int[max_num_categories_in_split]; + if (has_categorical_feature) { + cuda_split_infos[i].cat_threshold = new uint32_t[max_num_categories_in_split]; + cuda_split_infos[i].cat_threshold_real = new int[max_num_categories_in_split]; + cuda_split_infos[i].num_cat_threshold = 0; + } else { + cuda_split_infos[i].cat_threshold = nullptr; + cuda_split_infos[i].cat_threshold_real = nullptr; + cuda_split_infos[i].num_cat_threshold = 0; + } } } void CUDABestSplitFinder::LaunchAllocateCatVectorsKernel(CUDASplitInfo* cuda_split_infos, size_t len) const { const int max_num_categories_in_split = min(max_cat_threshold_ / 2, max_num_categorical_bin_); const 
int num_blocks = (static_cast(len) + NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER - 1) / NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER; - AllocateCatVectorsKernel<<>>(cuda_split_infos, len, max_num_categories_in_split); + AllocateCatVectorsKernel<<>>( + cuda_split_infos, len, max_num_categories_in_split, has_categorical_feature_); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 1a0edd8c0249..1040eaeb0c95 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -18,7 +18,7 @@ #include "cuda_leaf_splits.hpp" -#define NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER (1024) +#define NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER (256) #define NUM_THREADS_FIND_BEST_LEAF (256) #define NUM_TASKS_PER_SYNC_BLOCK (1024) @@ -119,7 +119,11 @@ class CUDABestSplitFinder { data_size_t min_data_in_leaf_; double min_sum_hessian_in_leaf_; double min_gain_to_split_; + double cat_smooth_; + double cat_l2_; int max_cat_threshold_; + int min_data_per_group_; + int max_cat_to_onehot_; std::vector cuda_streams_; // for best split find tasks std::vector host_task_feature_index_; @@ -127,6 +131,7 @@ class CUDABestSplitFinder { std::vector host_task_skip_default_bin_; std::vector host_task_na_as_missing_; std::vector host_task_out_default_left_; + std::vector host_task_one_hot_; int num_tasks_; // use global memory bool use_global_memory_; @@ -136,6 +141,8 @@ class CUDABestSplitFinder { bool has_categorical_feature_; // maximum number of bins of categorical features int max_num_categorical_bin_; + // marks whether a feature is categorical + std::vector is_categorical_; // CUDA memory, held by this object // for per leaf best split information @@ -159,6 +166,9 @@ class CUDABestSplitFinder { // used when finding best split with global memory hist_t* cuda_feature_hist_grad_buffer_; hist_t* cuda_feature_hist_hess_buffer_; + hist_t* cuda_feature_hist_stat_buffer_; + data_size_t* cuda_feature_hist_index_buffer_; + int8_t* cuda_is_categorical_; // CUDA memory, held by other object const hist_t* cuda_hist_; diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index f4287c0e3137..3732796b877e 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -100,7 +100,7 @@ void CUDADataPartition::Init() { AllocateCUDAMemory(&cuda_hist_pool_, static_cast(num_leaves_), __FILE__, __LINE__); CopyFromHostToCUDADevice(cuda_hist_pool_, &cuda_hist_, 1, __FILE__, __LINE__); - AllocateCUDAMemory(&cuda_split_info_buffer_, 12, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_split_info_buffer_, 16, __FILE__, __LINE__); AllocateCUDAMemory(&cuda_leaf_output_, static_cast(num_leaves_), __FILE__, __LINE__); @@ -112,7 +112,6 @@ void CUDADataPartition::Init() { InitCUDAMemoryFromHostMemory(&cuda_num_data_, &num_data_, 1, __FILE__, __LINE__); add_train_score_.resize(num_data_, 0.0f); - Log::Warning("cuda_add_train_score_ size = %d", num_data_); AllocateCUDAMemory(&cuda_add_train_score_, static_cast(num_data_), __FILE__, __LINE__); use_bagging_ = false; used_indices_ = nullptr; @@ -158,7 +157,9 @@ void CUDADataPartition::Split( data_size_t* left_leaf_start, data_size_t* right_leaf_start, double* left_leaf_sum_of_hessians, - double* right_leaf_sum_of_hessians) { + double* right_leaf_sum_of_hessians, + double* left_leaf_sum_of_gradients, + double* right_leaf_sum_of_gradients) { CalcBlockDim(num_data_in_leaf); 
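
The categorical branch added to Split routes each row with a bitset lookup instead of a threshold comparison. The per-row decision implemented by the categorical bit-vector and leaf-index kernels (fixed further below so that bins found in the bitset go to the left child) boils down to the sketch here; out-of-range bins fall back to the default (right) leaf, and the USE_MIN_BIN / zero-bin special cases of the real kernel are omitted.

#include <cstddef>
#include <cstdint>

// Does a row whose feature bin is `bin` go to the left child of a categorical split?
bool GoesLeftCategorical(uint32_t bin, uint32_t min_bin, uint32_t max_bin,
                         uint32_t mfb_offset, const uint32_t* bitset, size_t bitset_len) {
  if (bin < min_bin || bin > max_bin) {
    // out-of-range bins carry the most-frequent value and fall back to the default leaf,
    // which this patch keeps as the right child for categorical splits
    return false;
  }
  const uint32_t pos = bin - min_bin + mfb_offset;
  const size_t word = pos / 32;
  return word < bitset_len && ((bitset[word] >> (pos % 32)) & 1U) != 0;
}
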
global_timer.Start("GenDataToLeftBitVector"); GenDataToLeftBitVector(num_data_in_leaf, @@ -184,7 +185,9 @@ void CUDADataPartition::Split( left_leaf_start, right_leaf_start, left_leaf_sum_of_hessians, - right_leaf_sum_of_hessians); + right_leaf_sum_of_hessians, + left_leaf_sum_of_gradients, + right_leaf_sum_of_gradients); global_timer.Stop("SplitInner"); } @@ -232,7 +235,9 @@ void CUDADataPartition::SplitInner( data_size_t* left_leaf_start, data_size_t* right_leaf_start, double* left_leaf_sum_of_hessians, - double* right_leaf_sum_of_hessians) { + double* right_leaf_sum_of_hessians, + double* left_leaf_sum_of_gradients, + double* right_leaf_sum_of_gradients) { LaunchSplitInnerKernel( num_data_in_leaf, best_split_info, @@ -245,7 +250,9 @@ void CUDADataPartition::SplitInner( left_leaf_start, right_leaf_start, left_leaf_sum_of_hessians, - right_leaf_sum_of_hessians); + right_leaf_sum_of_hessians, + left_leaf_sum_of_gradients, + right_leaf_sum_of_gradients); ++cur_num_leaves_; } diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index cc48d2dcc4c1..638e60687668 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -657,9 +657,9 @@ __global__ void UpdateDataIndexToLeafIndexKernel_Categorical( } else if (!USE_MIN_BIN && bin == 0) { cuda_data_index_to_leaf_index[global_data_index] = default_leaf_index; } else if (CUDAFindInBitset(bitset, bitset_len, bin - min_bin + mfb_offset)) { - cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; - } else { cuda_data_index_to_leaf_index[global_data_index] = left_leaf_index; + } else { + cuda_data_index_to_leaf_index[global_data_index] = right_leaf_index; } } } @@ -696,10 +696,6 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorCategoricalKernel( const int8_t mfb_offset = static_cast(most_freq_bin == 0); std::vector host_bitset(bitset_len, 0); CopyFromCUDADeviceToHost(host_bitset.data(), bitset, bitset_len, __FILE__, __LINE__); - uint32_t t_zero_bin = min_bin + default_bin; - if (most_freq_bin == 0) { - --t_zero_bin; - } uint8_t split_default_to_left = 0; int default_leaf_index = right_leaf_index; const int is_single_feature_in_group = is_single_feature_in_group_[split_feature_index]; @@ -1088,8 +1084,10 @@ __global__ void SplitTreeStructureKernel(const int left_leaf_index, cuda_split_info_buffer[5] = cuda_leaf_data_start[right_leaf_index]; } else if (global_thread_index == 8) { cuda_split_info_buffer_for_hessians[0] = best_split_info->left_sum_hessians; + cuda_split_info_buffer_for_hessians[2] = best_split_info->left_sum_gradients; } else if (global_thread_index == 9) { cuda_split_info_buffer_for_hessians[1] = best_split_info->right_sum_hessians; + cuda_split_info_buffer_for_hessians[3] = best_split_info->right_sum_gradients; } if (cuda_leaf_num_data[left_leaf_index] < cuda_leaf_num_data[right_leaf_index]) { @@ -1226,7 +1224,9 @@ void CUDADataPartition::LaunchSplitInnerKernel( data_size_t* left_leaf_start_ref, data_size_t* right_leaf_start_ref, double* left_leaf_sum_of_hessians_ref, - double* right_leaf_sum_of_hessians_ref) { + double* right_leaf_sum_of_hessians_ref, + double* left_leaf_sum_of_gradients_ref, + double* right_leaf_sum_of_gradients_ref) { int num_blocks_final_ref = grid_dim_ - 1; int num_blocks_final_aligned = 1; while (num_blocks_final_ref > 0) { @@ -1275,10 +1275,10 @@ void CUDADataPartition::LaunchSplitInnerKernel( cuda_hist_pool_, cuda_leaf_output_, cuda_split_info_buffer_); 
global_timer.Stop("CUDADataPartition::SplitTreeStructureKernel"); - std::vector cpu_split_info_buffer(12); + std::vector cpu_split_info_buffer(16); const double* cpu_sum_hessians_info = reinterpret_cast(cpu_split_info_buffer.data() + 8); global_timer.Start("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); - CopyFromCUDADeviceToHostAsync(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 12, cuda_streams_[0], __FILE__, __LINE__); + CopyFromCUDADeviceToHostAsync(cpu_split_info_buffer.data(), cuda_split_info_buffer_, 16, cuda_streams_[0], __FILE__, __LINE__); SynchronizeCUDADevice(__FILE__, __LINE__); global_timer.Stop("CUDADataPartition::CopyFromCUDADeviceToHostAsync"); const data_size_t left_leaf_num_data = cpu_split_info_buffer[1]; @@ -1295,6 +1295,8 @@ void CUDADataPartition::LaunchSplitInnerKernel( *right_leaf_start_ref = right_leaf_data_start; *left_leaf_sum_of_hessians_ref = cpu_sum_hessians_info[0]; *right_leaf_sum_of_hessians_ref = cpu_sum_hessians_info[1]; + *left_leaf_sum_of_gradients_ref = cpu_sum_hessians_info[2]; + *right_leaf_sum_of_gradients_ref = cpu_sum_hessians_info[3]; } template diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index cf8d15e00c70..c740b0a1b22d 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -62,7 +62,9 @@ class CUDADataPartition { data_size_t* left_leaf_start, data_size_t* right_leaf_start, double* left_leaf_sum_of_hessians, - double* right_leaf_sum_of_hessians); + double* right_leaf_sum_of_hessians, + double* left_leaf_sum_of_gradients, + double* right_leaf_sum_of_gradients); void UpdateTrainScore(const Tree* tree, double* cuda_scores); @@ -123,7 +125,9 @@ class CUDADataPartition { data_size_t* left_leaf_start, data_size_t* right_leaf_start, double* left_leaf_sum_of_hessians, - double* right_leaf_sum_of_hessians); + double* right_leaf_sum_of_hessians, + double* left_leaf_sum_of_gradients, + double* right_leaf_sum_of_gradients); // kernel launch functions void LaunchFillDataIndicesBeforeTrain(); @@ -143,7 +147,9 @@ class CUDADataPartition { data_size_t* left_leaf_start, data_size_t* right_leaf_start, double* left_leaf_sum_of_hessians, - double* right_leaf_sum_of_hessians); + double* right_leaf_sum_of_hessians, + double* left_leaf_sum_of_gradients, + double* right_leaf_sum_of_gradients); void LaunchGenDataToLeftBitVectorKernel( const data_size_t num_data_in_leaf, diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index f83649249f3b..f4e0ba66a9f3 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -28,7 +28,7 @@ __global__ void CUDAConstructHistogramDenseKernel( const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; - __shared__ float shared_hist[SHRAE_HIST_SIZE]; + __shared__ double shared_hist[SHRAE_HIST_SIZE]; const unsigned int num_threads_per_block = blockDim.x * blockDim.y; const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1]; @@ -53,14 +53,14 @@ __global__ void CUDAConstructHistogramDenseKernel( data_size_t inner_data_index = static_cast(threadIdx_y); const int column_index = 
static_cast(threadIdx.x) + partition_column_start; if (threadIdx.x < static_cast(num_columns_in_partition)) { - float* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); + double* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); for (data_size_t i = 0; i < num_iteration_this; ++i) { const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; const score_t grad = cuda_gradients[data_index]; const score_t hess = cuda_hessians[data_index]; const uint32_t bin = static_cast(data_ptr[data_index * num_columns_in_partition + threadIdx.x]); const uint32_t pos = bin << 1; - float* pos_ptr = shared_hist_ptr + pos; + double* pos_ptr = shared_hist_ptr + pos; atomicAdd_block(pos_ptr, grad); atomicAdd_block(pos_ptr + 1, hess); inner_data_index += blockDim.y; @@ -87,7 +87,7 @@ __global__ void CUDAConstructHistogramSparseKernel( const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; - __shared__ float shared_hist[SHRAE_HIST_SIZE]; + __shared__ double shared_hist[SHRAE_HIST_SIZE]; const unsigned int num_threads_per_block = blockDim.x * blockDim.y; const DATA_PTR_TYPE* block_row_ptr = row_ptr + blockIdx.x * (num_data + 1); const BIN_TYPE* data_ptr = data + partition_ptr[blockIdx.x]; @@ -118,7 +118,7 @@ __global__ void CUDAConstructHistogramSparseKernel( const score_t hess = cuda_hessians[data_index]; const uint32_t bin = static_cast(data_ptr[row_start + threadIdx.x]); const uint32_t pos = bin << 1; - float* pos_ptr = shared_hist + pos; + double* pos_ptr = shared_hist + pos; atomicAdd_block(pos_ptr, grad); atomicAdd_block(pos_ptr + 1, hess); } @@ -567,13 +567,15 @@ void CUDAHistogramConstructor::LaunchSubtractHistogramKernel( const int num_subtract_threads = 2 * num_total_bin_; const int num_subtract_blocks = (num_subtract_threads + SUBTRACT_BLOCK_SIZE - 1) / SUBTRACT_BLOCK_SIZE; global_timer.Start("CUDAHistogramConstructor::FixHistogramKernel"); - FixHistogramKernel<<>>( - cuda_feature_num_bins_, - cuda_feature_hist_offsets_, - cuda_feature_most_freq_bins_, - cuda_need_fix_histogram_features_, - cuda_need_fix_histogram_features_num_bin_aligned_, - cuda_smaller_leaf_splits); + if (need_fix_histogram_features_.size() > 0) { + FixHistogramKernel<<>>( + cuda_feature_num_bins_, + cuda_feature_hist_offsets_, + cuda_feature_most_freq_bins_, + cuda_need_fix_histogram_features_, + cuda_need_fix_histogram_features_num_bin_aligned_, + cuda_smaller_leaf_splits); + } global_timer.Stop("CUDAHistogramConstructor::FixHistogramKernel"); global_timer.Start("CUDAHistogramConstructor::SubtractHistogramKernel"); SubtractHistogramKernel<<>>( diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index bb1bf62a1db6..86cbf64151a8 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -17,7 +17,7 @@ #include "cuda_leaf_splits.hpp" -#define SHRAE_HIST_SIZE (6144 * 2) +#define SHRAE_HIST_SIZE (6144) #define NUM_DATA_PER_THREAD (400) #define NUM_THRADS_PER_BLOCK (504) #define NUM_FEATURE_PER_THREAD_GROUP (28) diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 4c382f63e734..ef2cad29b35a 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ 
b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -70,6 +70,7 @@ void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_ cuda_leaf_gradient_stat_buffer_ = nullptr; cuda_leaf_hessian_stat_buffer_ = nullptr; leaf_stat_buffer_size_ = 0; + num_cat_threshold_ = 0; } void CUDASingleGPUTreeLearner::BeforeTrain() { @@ -193,6 +194,8 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, best_split_info); } + double sum_left_gradients = 0.0f; + double sum_right_gradients = 0.0f; cuda_data_partition_->Split(best_split_info, best_leaf_index_, right_leaf_index, @@ -210,7 +213,10 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, &leaf_data_start_[best_leaf_index_], &leaf_data_start_[right_leaf_index], &leaf_sum_hessians_[best_leaf_index_], - &leaf_sum_hessians_[right_leaf_index]); + &leaf_sum_hessians_[right_leaf_index], + &sum_left_gradients, + &sum_right_gradients); + //CheckSplitValid(best_leaf_index_, right_leaf_index, sum_left_gradients, sum_right_gradients); smaller_leaf_index_ = (leaf_num_data_[best_leaf_index_] < leaf_num_data_[right_leaf_index] ? best_leaf_index_ : right_leaf_index); larger_leaf_index_ = (smaller_leaf_index_ == best_leaf_index_ ? right_leaf_index : best_leaf_index_); global_timer.Stop("CUDASingleGPUTreeLearner::Split"); @@ -353,8 +359,12 @@ void CUDASingleGPUTreeLearner::ConstructBitsetForCategoricalSplit( void CUDASingleGPUTreeLearner::AllocateBitset() { has_categorical_feature_ = false; + categorical_bin_offsets_.clear(); + categorical_bin_offsets_.push_back(0); + categorical_bin_to_value_.clear(); for (int i = 0; i < train_data_->num_features(); ++i) { - if (train_data_->FeatureBinMapper(i)->bin_type() == BinType::CategoricalBin) { + const BinMapper* bin_mapper = train_data_->FeatureBinMapper(i); + if (bin_mapper->bin_type() == BinType::CategoricalBin) { has_categorical_feature_ = true; break; } @@ -363,14 +373,44 @@ void CUDASingleGPUTreeLearner::AllocateBitset() { int max_cat_value = 0; int max_cat_num_bin = 0; for (int i = 0; i < train_data_->num_features(); ++i) { - max_cat_value = std::max(train_data_->FeatureBinMapper(i)->MaxCatValue(), max_cat_value); - max_cat_num_bin = std::max(train_data_->FeatureBinMapper(i)->num_bin(), max_cat_num_bin); + const BinMapper* bin_mapper = train_data_->FeatureBinMapper(i); + if (bin_mapper->bin_type() == BinType::CategoricalBin) { + max_cat_value = std::max(bin_mapper->MaxCatValue(), max_cat_value); + max_cat_num_bin = std::max(bin_mapper->num_bin(), max_cat_num_bin); + } } - AllocateCUDAMemory(&cuda_bitset_, static_cast(max_cat_value / 32), __FILE__, __LINE__); - AllocateCUDAMemory(&cuda_bitset_inner_, static_cast(max_cat_num_bin / 32), __FILE__, __LINE__); + // std::max(..., 1UL) to avoid error in the case when there are NaN's in the categorical values + const size_t cuda_bitset_max_size = std::max(static_cast((max_cat_value + 31) / 32), 1UL); + const size_t cuda_bitset_inner_max_size = std::max(static_cast((max_cat_num_bin + 31) / 32), 1UL); + AllocateCUDAMemory(&cuda_bitset_, cuda_bitset_max_size, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_bitset_inner_, cuda_bitset_inner_max_size, __FILE__, __LINE__); const int max_cat_in_split = std::min(config_->max_cat_threshold, max_cat_num_bin / 2); const int num_blocks = (max_cat_in_split + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; AllocateCUDAMemory(&cuda_block_bitset_len_buffer_, num_blocks, __FILE__, __LINE__); + + for (int i = 0; i < train_data_->num_features(); ++i) { + 
const BinMapper* bin_mapper = train_data_->FeatureBinMapper(i); + if (bin_mapper->bin_type() == BinType::CategoricalBin) { + categorical_bin_offsets_.push_back(bin_mapper->num_bin()); + } else { + categorical_bin_offsets_.push_back(0); + } + } + for (size_t i = 1; i < categorical_bin_offsets_.size(); ++i) { + categorical_bin_offsets_[i] += categorical_bin_offsets_[i - 1]; + } + categorical_bin_to_value_.resize(categorical_bin_offsets_.back(), 0); + for (int i = 0; i < train_data_->num_features(); ++i) { + const BinMapper* bin_mapper = train_data_->FeatureBinMapper(i); + if (bin_mapper->bin_type() == BinType::CategoricalBin) { + const int offset = categorical_bin_offsets_[i]; + for (int bin = 0; bin < bin_mapper->num_bin(); ++bin) { + categorical_bin_to_value_[offset + bin] = bin_mapper->BinToValue(bin); + } + } + } + InitCUDAMemoryFromHostMemory(&cuda_categorical_bin_offsets_, categorical_bin_offsets_.data(), categorical_bin_offsets_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_categorical_bin_to_value_, categorical_bin_to_value_.data(), categorical_bin_to_value_.size(), __FILE__, __LINE__); } else { cuda_bitset_ = nullptr; cuda_bitset_inner_ = nullptr; @@ -379,6 +419,41 @@ void CUDASingleGPUTreeLearner::AllocateBitset() { cuda_bitset_inner_len_ = 0; } +void CUDASingleGPUTreeLearner::CheckSplitValid( + const int left_leaf, + const int right_leaf, + const double split_sum_left_gradients, + const double split_sum_right_gradients) { + std::vector left_data_indices(leaf_num_data_[left_leaf]); + std::vector right_data_indices(leaf_num_data_[right_leaf]); + CopyFromCUDADeviceToHost(left_data_indices.data(), + cuda_data_partition_->cuda_data_indices() + leaf_data_start_[left_leaf], + leaf_num_data_[left_leaf], __FILE__, __LINE__); + CopyFromCUDADeviceToHost(right_data_indices.data(), + cuda_data_partition_->cuda_data_indices() + leaf_data_start_[right_leaf], + leaf_num_data_[right_leaf], __FILE__, __LINE__); + double sum_left_gradients = 0.0f, sum_left_hessians = 0.0f; + double sum_right_gradients = 0.0f, sum_right_hessians = 0.0f; + for (size_t i = 0; i < left_data_indices.size(); ++i) { + const data_size_t index = left_data_indices[i]; + sum_left_gradients += gradients_[index]; + sum_left_hessians += hessians_[index]; + } + for (size_t i = 0; i < right_data_indices.size(); ++i) { + const data_size_t index = right_data_indices[i]; + sum_right_gradients += gradients_[index]; + sum_right_hessians += hessians_[index]; + } + Log::Warning("sum_left_gradients = %f, split_sum_left_gradients = %f", sum_left_gradients, split_sum_left_gradients); + Log::Warning("sum_left_hessians = %f, leaf_sum_hessians_[%d] = %f", sum_left_hessians, left_leaf, leaf_sum_hessians_[left_leaf]); + Log::Warning("sum_right_gradients = %f, split_sum_right_gradients = %f", sum_right_gradients, split_sum_right_gradients); + Log::Warning("sum_right_hessians = %f, leaf_sum_hessians_[%d] = %f", sum_right_hessians, right_leaf, leaf_sum_hessians_[right_leaf]); + CHECK_LE(std::fabs(sum_left_gradients - split_sum_left_gradients), 1e-6f); + CHECK_LE(std::fabs(sum_left_hessians - leaf_sum_hessians_[left_leaf]), 1e-6f); + CHECK_LE(std::fabs(sum_right_gradients - split_sum_right_gradients), 1e-6f); + CHECK_LE(std::fabs(sum_right_hessians - leaf_sum_hessians_[right_leaf]), 1e-6f); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu index d3ef260bbbe8..a4ae3fb65166 100644 --- 
a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu @@ -149,8 +149,12 @@ __global__ void ReduceBlockMaxLen(size_t* out_len_buffer, const int num_blocks) } template -__global__ void CUDAConstructBitsetKernel(const CUDASplitInfo* best_split_info, uint32_t* out) { +__global__ void CUDAConstructBitsetKernel(const CUDASplitInfo* best_split_info, uint32_t* out, size_t cuda_bitset_len) { const T* vals = nullptr; + for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < cuda_bitset_len; i += blockDim.x) { + out[i] = 0; + } + __syncthreads(); if (IS_INNER) { vals = reinterpret_cast(best_split_info->cat_threshold); } else { @@ -159,14 +163,28 @@ __global__ void CUDAConstructBitsetKernel(const CUDASplitInfo* best_split_info, const int i = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (i < best_split_info->num_cat_threshold) { const T val = vals[i]; - out[val / 32] |= (0x1 << (val % 32)); + // can use add instead of or here, because each bit will only be added once + atomicAdd_system(out + (val / 32), (0x1 << (val % 32))); + } +} + +__global__ void SetRealThresholdKernel( + const CUDASplitInfo* best_split_info, + const int* categorical_bin_to_value, + const int* categorical_bin_offsets) { + const int num_cat_threshold = best_split_info->num_cat_threshold; + const int* categorical_bin_to_value_ptr = categorical_bin_to_value + categorical_bin_offsets[best_split_info->inner_feature_index]; + int* cat_threshold_real = best_split_info->cat_threshold_real; + const uint32_t* cat_threshold = best_split_info->cat_threshold; + for (int i = 0; i < num_cat_threshold; ++i) { + cat_threshold_real[i] = categorical_bin_to_value_ptr[cat_threshold[i]]; } } template -void CUDAConstructBitset(const CUDASplitInfo* best_split_info, const int num_cat_threshold, uint32_t* out) { +void CUDAConstructBitset(const CUDASplitInfo* best_split_info, const int num_cat_threshold, uint32_t* out, size_t bitset_len) { const int num_blocks = (num_cat_threshold + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; - CUDAConstructBitsetKernel<<>>(best_split_info, out); + CUDAConstructBitsetKernel<<>>(best_split_info, out, bitset_len); } template @@ -181,10 +199,11 @@ size_t CUDABitsetLen(const CUDASplitInfo* best_split_info, const int num_cat_thr void CUDASingleGPUTreeLearner::LaunchConstructBitsetForCategoricalSplitKernel( const CUDASplitInfo* best_split_info) { + SetRealThresholdKernel<<<1, 1>>>(best_split_info, cuda_categorical_bin_to_value_, cuda_categorical_bin_offsets_); cuda_bitset_inner_len_ = CUDABitsetLen(best_split_info, num_cat_threshold_, cuda_block_bitset_len_buffer_); - CUDAConstructBitset(best_split_info, num_cat_threshold_, cuda_bitset_inner_); + CUDAConstructBitset(best_split_info, num_cat_threshold_, cuda_bitset_inner_, cuda_bitset_inner_len_); cuda_bitset_len_ = CUDABitsetLen(best_split_info, num_cat_threshold_, cuda_block_bitset_len_buffer_); - CUDAConstructBitset(best_split_info, num_cat_threshold_, cuda_bitset_); + CUDAConstructBitset(best_split_info, num_cat_threshold_, cuda_bitset_, cuda_bitset_len_); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index eadb4c542289..8784410c1516 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -63,6 +63,8 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { void AllocateBitset(); + 
void CheckSplitValid(const int left_leaf, const int right_leaf, const double sum_left_gradients, const double sum_right_gradients); + // GPU device ID int gpu_device_id_; // number of threads on CPU @@ -92,6 +94,9 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { int num_cat_threshold_; bool has_categorical_feature_; + std::vector categorical_bin_to_value_; + std::vector categorical_bin_offsets_; + mutable double* cuda_leaf_gradient_stat_buffer_; mutable double* cuda_leaf_hessian_stat_buffer_; mutable data_size_t leaf_stat_buffer_size_; @@ -101,6 +106,8 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { uint32_t* cuda_bitset_inner_; size_t cuda_bitset_inner_len_; size_t* cuda_block_bitset_len_buffer_; + int* cuda_categorical_bin_to_value_; + int* cuda_categorical_bin_offsets_; /*! \brief gradients on CUDA */ score_t* cuda_gradients_; From 4bcaa0334b7774762011f4cea4a00e094e3b4e4d Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 5 Nov 2021 07:20:49 +0000 Subject: [PATCH 109/166] refactor cuda_data_partition.cu with multi-level templates --- .../cuda/cuda_best_split_finder.cu | 86 +- src/treelearner/cuda/cuda_data_partition.cu | 1065 ++++++----------- src/treelearner/cuda/cuda_data_partition.hpp | 128 +- .../cuda/cuda_single_gpu_tree_learner.cpp | 6 + 4 files changed, 517 insertions(+), 768 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 2d7b5df68dd0..a0fba7ab8f12 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -176,6 +176,7 @@ __device__ double GetSplitGains(double sum_left_gradients, l1, use_l1, l2); } +template __device__ void FindBestSplitsForLeafKernelInner( // input feature information const hist_t* feature_hist_ptr, @@ -195,7 +196,6 @@ __device__ void FindBestSplitsForLeafKernelInner( const double sum_hessians, const data_size_t num_data, // input task information - const bool reverse, const bool skip_default_bin, const bool na_as_missing, const uint8_t assume_out_default_left, @@ -218,11 +218,11 @@ __device__ void FindBestSplitsForLeafKernelInner( __shared__ bool shared_found_buffer[32]; __shared__ uint32_t shared_thread_index_buffer[32]; const unsigned int threadIdx_x = threadIdx.x; - const bool skip_sum = reverse ? + const bool skip_sum = REVERSE ? 
(skip_default_bin && (feature_num_bin - 1 - threadIdx_x) == static_cast(feature_default_bin)) : (skip_default_bin && (threadIdx_x + feature_mfb_offset) == static_cast(feature_default_bin)); const uint32_t feature_num_bin_minus_offset = feature_num_bin - feature_mfb_offset; - if (!reverse) { + if (!REVERSE) { if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { const unsigned int bin_offset = threadIdx_x << 1; local_grad_hist = feature_hist_ptr[bin_offset]; @@ -245,7 +245,7 @@ __device__ void FindBestSplitsForLeafKernelInner( local_grad_hist = ShufflePrefixSum(local_grad_hist, shared_mem_buffer); __syncthreads(); local_hess_hist = ShufflePrefixSum(local_hess_hist, shared_mem_buffer); - if (reverse) { + if (REVERSE) { if (threadIdx_x >= static_cast(na_as_missing) && threadIdx_x <= feature_num_bin - 2 && !skip_sum) { const double sum_right_gradient = local_grad_hist; const double sum_right_hessian = local_hess_hist; @@ -301,7 +301,7 @@ __device__ void FindBestSplitsForLeafKernelInner( cuda_best_split_info->threshold = threshold_value; cuda_best_split_info->gain = local_gain; cuda_best_split_info->default_left = assume_out_default_left; - if (reverse) { + if (REVERSE) { const double sum_right_gradient = local_grad_hist; const double sum_right_hessian = local_hess_hist - kEpsilon; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); @@ -691,31 +691,57 @@ __global__ void FindBestSplitsForLeafKernel( const bool skip_default_bin = static_cast(task_skip_default_bin[task_index]); const bool na_as_missing = static_cast(task_na_as_missing[task_index]); const bool assume_out_default_left = task_out_default_left[task_index]; - FindBestSplitsForLeafKernelInner( - // input feature information - hist_ptr, - feature_num_bins[inner_feature_index], - feature_mfb_offsets[inner_feature_index], - feature_default_bins[inner_feature_index], - inner_feature_index, - // input config parameter values - lambda_l1, - lambda_l2, - min_data_in_leaf, - min_sum_hessian_in_leaf, - min_gain_to_split, - // input parent node information - parent_gain, - sum_gradients, - sum_hessians, - num_data, - // input task information - reverse, - skip_default_bin, - na_as_missing, - assume_out_default_left, - // output parameters - out); + if (reverse) { + FindBestSplitsForLeafKernelInner( + // input feature information + hist_ptr, + feature_num_bins[inner_feature_index], + feature_mfb_offsets[inner_feature_index], + feature_default_bins[inner_feature_index], + inner_feature_index, + // input config parameter values + lambda_l1, + lambda_l2, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients, + sum_hessians, + num_data, + // input task information + skip_default_bin, + na_as_missing, + assume_out_default_left, + // output parameters + out); + } else { + FindBestSplitsForLeafKernelInner( + // input feature information + hist_ptr, + feature_num_bins[inner_feature_index], + feature_mfb_offsets[inner_feature_index], + feature_default_bins[inner_feature_index], + inner_feature_index, + // input config parameter values + lambda_l1, + lambda_l2, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients, + sum_hessians, + num_data, + // input task information + skip_default_bin, + na_as_missing, + assume_out_default_left, + // output parameters + out); + } } } else { out->is_valid = false; diff --git 
a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 638e60687668..287c1d54b20d 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -47,14 +47,14 @@ void CUDADataPartition::LaunchFillDataIndexToLeafIndex() { FillDataIndexToLeafIndexKernel<<>>(num_data_in_root, cuda_data_indices_, cuda_data_index_to_leaf_index_); } -__device__ __forceinline__ void PrepareOffset(const data_size_t num_data_in_leaf_ref, uint16_t* block_to_left_offset, +__device__ __forceinline__ void PrepareOffset(const data_size_t num_data_in_leaf, uint16_t* block_to_left_offset, data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer, const uint16_t thread_to_left_offset_cnt, uint16_t* shared_mem_buffer) { const unsigned int threadIdx_x = threadIdx.x; const unsigned int blockDim_x = blockDim.x; const uint16_t thread_to_left_offset = ShufflePrefixSum(thread_to_left_offset_cnt, shared_mem_buffer); - const data_size_t num_data_in_block = (blockIdx.x + 1) * blockDim_x <= num_data_in_leaf_ref ? static_cast(blockDim_x) : - num_data_in_leaf_ref - static_cast(blockIdx.x * blockDim_x); + const data_size_t num_data_in_block = (blockIdx.x + 1) * blockDim_x <= num_data_in_leaf ? static_cast(blockDim_x) : + num_data_in_leaf - static_cast(blockIdx.x * blockDim_x); if (static_cast(threadIdx_x) < num_data_in_block) { block_to_left_offset[threadIdx_x] = thread_to_left_offset; } @@ -70,23 +70,56 @@ __device__ __forceinline__ void PrepareOffset(const data_size_t num_data_in_leaf } } +template +__device__ bool CUDAFindInBitset(const uint32_t* bits, int n, T pos) { + int i1 = pos / 32; + if (i1 >= n) { + return false; + } + int i2 = pos % 32; + return (bits[i1] >> i2) & 1; +} + + + +#define UpdateDataIndexToLeafIndexKernel_PARAMS \ + const BIN_TYPE* column_data, \ + const data_size_t num_data_in_leaf, \ + const data_size_t* data_indices_in_leaf, \ + const uint32_t th, \ + const uint32_t t_zero_bin, \ + const uint32_t max_bin, \ + const uint32_t min_bin, \ + const int left_leaf_index, \ + const int right_leaf_index, \ + const int default_leaf_index, \ + const int missing_default_leaf_index + +#define UpdateDataIndexToLeafIndex_ARGS \ + column_data, \ + num_data_in_leaf, \ + data_indices_in_leaf, th, \ + t_zero_bin, \ + max_bin, \ + min_bin, \ + left_leaf_index, \ + right_leaf_index, \ + default_leaf_index, \ + missing_default_leaf_index + template __global__ void UpdateDataIndexToLeafIndexKernel( - const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, - const uint32_t th, const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index) { + UpdateDataIndexToLeafIndexKernel_PARAMS, + int* cuda_data_index_to_leaf_index) { const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); if (!MIN_IS_MAX) { if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || - (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin_ref)) { + (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin)) { cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; - } else if (bin < 
min_bin_ref || bin > max_bin_ref) { + } else if (bin < min_bin || bin > max_bin) { if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else { @@ -100,7 +133,7 @@ __global__ void UpdateDataIndexToLeafIndexKernel( } else { if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; - } else if (bin != max_bin_ref) { + } else if (bin != max_bin) { if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else { @@ -121,258 +154,158 @@ __global__ void UpdateDataIndexToLeafIndexKernel( } } -#define UpdateDataIndexToLeafIndex_ARGS \ - num_data_in_leaf, data_indices_in_leaf, th, column_data, \ - t_zero_bin, max_bin_ref, min_bin_ref, cuda_data_index_to_leaf_index, left_leaf_index, right_leaf_index, \ - default_leaf_index, missing_default_leaf_index - template void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel( - const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, - const uint32_t th, const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - int* cuda_data_index_to_leaf_index, const int left_leaf_index, const int right_leaf_index, - const int default_leaf_index, const int missing_default_leaf_index, - const bool missing_is_zero, const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, const bool max_to_left) { - if (min_bin_ref < max_bin_ref) { - if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && 
missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool missing_is_zero, + const 
bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_to_left) { + if (min_bin < max_bin) { + if (!missing_is_zero) { + LaunchUpdateDataIndexToLeafIndexKernel_Inner0 + (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left); + } else { + LaunchUpdateDataIndexToLeafIndexKernel_Inner0 + (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left); } } else { - if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && 
!max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_to_left) { - UpdateDataIndexToLeafIndexKernel<<>>(UpdateDataIndexToLeafIndex_ARGS); + if (!missing_is_zero) { + LaunchUpdateDataIndexToLeafIndexKernel_Inner0 + (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left); + } else { + LaunchUpdateDataIndexToLeafIndexKernel_Inner0 + (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left); } } } -// min_bin_ref < max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel0( - const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, - const uint32_t th, const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t max_bin_ref, const uint32_t min_bin_ref, - const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint16_t* block_to_left_offset, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer) { - __shared__ uint16_t shared_mem_buffer[32]; - uint16_t thread_to_left_offset_cnt = 0; - const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || - (MISSING_IS_NA && 
!MFB_IS_NA && bin == max_bin_ref)) { - thread_to_left_offset_cnt = split_missing_default_to_left; - } else if ((bin < min_bin_ref || bin > max_bin_ref)) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { - thread_to_left_offset_cnt = split_missing_default_to_left; - } else { - thread_to_left_offset_cnt = split_default_to_left; - } - } else if (bin <= th) { - thread_to_left_offset_cnt = 1; - } +template +void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner0( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_to_left) { + if (!missing_is_na) { + LaunchUpdateDataIndexToLeafIndexKernel_Inner1 + (UpdateDataIndexToLeafIndex_ARGS, mfb_is_zero, mfb_is_na, max_to_left); + } else { + LaunchUpdateDataIndexToLeafIndexKernel_Inner1 + (UpdateDataIndexToLeafIndex_ARGS, mfb_is_zero, mfb_is_na, max_to_left); } - __syncthreads(); - PrepareOffset(num_data_in_leaf, block_to_left_offset + blockIdx.x * blockDim.x, block_to_left_offset_buffer, block_to_right_offset_buffer, - thread_to_left_offset_cnt, shared_mem_buffer); } -// min_bin_ref == max_bin_ref -template -__global__ void GenDataToLeftBitVectorKernel16( - const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, - const BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, const uint32_t max_bin_ref, - const uint8_t split_default_to_left, const uint8_t split_missing_default_to_left, - uint16_t* block_to_left_offset, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer) { - __shared__ uint16_t shared_mem_buffer[32]; - uint16_t thread_to_left_offset_cnt = 0; - const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; - if (local_data_index < num_data_in_leaf) { - const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; - const uint32_t bin = static_cast(column_data[global_data_index]); - if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { - thread_to_left_offset_cnt = split_missing_default_to_left; - } else if (bin != max_bin_ref) { - if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { - thread_to_left_offset_cnt = split_missing_default_to_left; - } else { - thread_to_left_offset_cnt = split_default_to_left; - } - } else { - if (MISSING_IS_NA && !MFB_IS_NA) { - thread_to_left_offset_cnt = split_missing_default_to_left; - } else if (MAX_TO_LEFT) { - thread_to_left_offset_cnt = 1; - } - } +template +void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner1( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_to_left) { + if (!mfb_is_zero) { + LaunchUpdateDataIndexToLeafIndexKernel_Inner2 + (UpdateDataIndexToLeafIndex_ARGS, mfb_is_na, max_to_left); + } else { + LaunchUpdateDataIndexToLeafIndexKernel_Inner2 + (UpdateDataIndexToLeafIndex_ARGS, mfb_is_na, max_to_left); } - __syncthreads(); - PrepareOffset(num_data_in_leaf, block_to_left_offset + blockIdx.x * blockDim.x, block_to_left_offset_buffer, block_to_right_offset_buffer, - thread_to_left_offset_cnt, shared_mem_buffer); } -template -__device__ bool CUDAFindInBitset(const uint32_t* bits, int n, T pos) { - int i1 = pos / 32; - if (i1 >= n) { - return false; +template +void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner2( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool mfb_is_na, + const bool max_to_left) { + if (!mfb_is_na) { + 
LaunchUpdateDataIndexToLeafIndexKernel_Inner3 + (UpdateDataIndexToLeafIndex_ARGS, max_to_left); + } else { + LaunchUpdateDataIndexToLeafIndexKernel_Inner3 + (UpdateDataIndexToLeafIndex_ARGS, max_to_left); } - int i2 = pos % 32; - return (bits[i1] >> i2) & 1; } -// for categorical features -template -__global__ void GenDataToLeftBitVectorKernel_Categorical( - const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, - const uint32_t* bitset, int bitset_len, const BIN_TYPE* column_data, - // values from feature - const uint32_t max_bin, const uint32_t min_bin, const int8_t mfb_offset, - const uint8_t split_default_to_left, +template +void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner3( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool max_to_left) { + if (!max_to_left) { + UpdateDataIndexToLeafIndexKernel + <<>>( + UpdateDataIndexToLeafIndex_ARGS, + cuda_data_index_to_leaf_index_); + } else { + UpdateDataIndexToLeafIndexKernel + <<>>( + UpdateDataIndexToLeafIndex_ARGS, + cuda_data_index_to_leaf_index_); + } +} + +#define GenDataToLeftBitVectorKernel_PARMS \ + const BIN_TYPE* column_data, \ + const data_size_t num_data_in_leaf, \ + const data_size_t* data_indices_in_leaf, \ + const uint32_t th, \ + const uint32_t t_zero_bin, \ + const uint32_t max_bin, \ + const uint32_t min_bin, \ + const uint8_t split_default_to_left, \ + const uint8_t split_missing_default_to_left + +#define GenBitVector_ARGS \ + column_data, \ + num_data_in_leaf, \ + data_indices_in_leaf, \ + th, \ + t_zero_bin, \ + max_bin, \ + min_bin, \ + split_default_to_left, \ + split_missing_default_to_left + +template +__global__ void GenDataToLeftBitVectorKernel( + GenDataToLeftBitVectorKernel_PARMS, uint16_t* block_to_left_offset, - data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer) { + data_size_t* block_to_left_offset_buffer, + data_size_t* block_to_right_offset_buffer) { __shared__ uint16_t shared_mem_buffer[32]; uint16_t thread_to_left_offset_cnt = 0; const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; if (local_data_index < num_data_in_leaf) { const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; const uint32_t bin = static_cast(column_data[global_data_index]); - if (USE_MIN_BIN && (bin < min_bin || bin > max_bin)) { - thread_to_left_offset_cnt = split_default_to_left; - } else if (!USE_MIN_BIN && bin == 0) { - thread_to_left_offset_cnt = split_default_to_left; - } else if (CUDAFindInBitset(bitset, bitset_len, bin - min_bin + mfb_offset)) { - thread_to_left_offset_cnt = 1; + if (!MIN_IS_MAX) { + if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || + (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin)) { + thread_to_left_offset_cnt = split_missing_default_to_left; + } else if ((bin < min_bin || bin > max_bin)) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { + thread_to_left_offset_cnt = split_missing_default_to_left; + } else { + thread_to_left_offset_cnt = split_default_to_left; + } + } else if (bin <= th) { + thread_to_left_offset_cnt = 1; + } + } else { + if (MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) { + thread_to_left_offset_cnt = split_missing_default_to_left; + } else if (bin != max_bin) { + if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { + thread_to_left_offset_cnt = split_missing_default_to_left; + } else { + thread_to_left_offset_cnt = split_default_to_left; + } + } else { + if (MISSING_IS_NA && !MFB_IS_NA) { + 
thread_to_left_offset_cnt = split_missing_default_to_left; + } else if (MAX_TO_LEFT) { + thread_to_left_offset_cnt = 1; + } + } } } __syncthreads(); @@ -380,265 +313,196 @@ __global__ void GenDataToLeftBitVectorKernel_Categorical( thread_to_left_offset_cnt, shared_mem_buffer); } -#define GenBitVector_ARGS \ - num_data_in_leaf, data_indices_in_leaf, \ - th, \ - column_data, t_zero_bin, max_bin, min_bin, split_default_to_left, \ - split_missing_default_to_left, cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_ - -#define GenBitVector_MaxIsMin_ARGS \ - num_data_in_leaf, data_indices_in_leaf, \ - column_data, t_zero_bin, max_bin, split_default_to_left, \ - split_missing_default_to_left, cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_ - template -void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner( + GenDataToLeftBitVectorKernel_PARMS, const bool missing_is_zero, const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_bin_to_left, - const int column_index, - const int split_feature_index, - const data_size_t leaf_data_start, - const data_size_t num_data_in_leaf, - const uint32_t th, - const uint32_t t_zero_bin, - const uint32_t most_freq_bin, - const uint32_t max_bin, - const uint32_t min_bin, - const uint8_t split_default_to_left, - const uint8_t split_missing_default_to_left, - const int left_leaf_index, - const int right_leaf_index, - const int default_leaf_index, - const int missing_default_leaf_index) { - const void* column_data_pointer = cuda_column_data_->GetColumnData(column_index); - const data_size_t* data_indices_in_leaf = cuda_data_indices_ + leaf_data_start; - if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - 
const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* 
column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && !max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na && max_bin_to_left) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel16<<>>(GenBitVector_MaxIsMin_ARGS); + const bool max_bin_to_left) { + if (min_bin < max_bin) { + if (!missing_is_zero) { + LaunchGenDataToLeftBitVectorKernelInner0 + (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left); + } else { + LaunchGenDataToLeftBitVectorKernelInner0 + (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left); + } + } else { + if (!missing_is_zero) { + LaunchGenDataToLeftBitVectorKernelInner0 + (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left); + } else { + LaunchGenDataToLeftBitVectorKernelInner0 + (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left); + } } } -template -void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( - const bool missing_is_zero, +template +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner0( + GenDataToLeftBitVectorKernel_PARMS, const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, - const int column_index, + const bool max_bin_to_left) { + if (!missing_is_na) { + LaunchGenDataToLeftBitVectorKernelInner1 + (GenBitVector_ARGS, mfb_is_zero, mfb_is_na, max_bin_to_left); + } else { + LaunchGenDataToLeftBitVectorKernelInner1 + (GenBitVector_ARGS, mfb_is_zero, mfb_is_na, max_bin_to_left); + } +} + +template +void 
CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner1( + GenDataToLeftBitVectorKernel_PARMS, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_bin_to_left) { + if (!mfb_is_zero) { + LaunchGenDataToLeftBitVectorKernelInner2 + (GenBitVector_ARGS, mfb_is_na, max_bin_to_left); + } else { + LaunchGenDataToLeftBitVectorKernelInner2 + (GenBitVector_ARGS, mfb_is_na, max_bin_to_left); + } +} + +template +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner2( + GenDataToLeftBitVectorKernel_PARMS, + const bool mfb_is_na, + const bool max_bin_to_left) { + if (!mfb_is_na) { + LaunchGenDataToLeftBitVectorKernelInner3 + + (GenBitVector_ARGS, max_bin_to_left); + } else { + LaunchGenDataToLeftBitVectorKernelInner3 + + (GenBitVector_ARGS, max_bin_to_left); + } +} + +template +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner3( + GenDataToLeftBitVectorKernel_PARMS, + const bool max_bin_to_left) { + if (!max_bin_to_left) { + GenDataToLeftBitVectorKernel + + <<>>(GenBitVector_ARGS, + cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_); + } else { + GenDataToLeftBitVectorKernel + + <<>>(GenBitVector_ARGS, + cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_); + } +} + +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel( + const data_size_t num_data_in_leaf, const int split_feature_index, + const uint32_t split_threshold, + const uint8_t split_default_left, const data_size_t leaf_data_start, - const data_size_t num_data_in_leaf, - const uint32_t th, - const uint32_t t_zero_bin, - const uint32_t most_freq_bin, - const uint32_t max_bin, - const uint32_t min_bin, - const uint8_t split_default_to_left, - const uint8_t split_missing_default_to_left, const int left_leaf_index, - const int right_leaf_index, - const int default_leaf_index, - const int missing_default_leaf_index) { - const void* column_data_pointer = cuda_column_data_->GetColumnData(column_index); + const int right_leaf_index) { + const bool missing_is_zero = static_cast(cuda_column_data_->feature_missing_is_zero(split_feature_index)); + const bool missing_is_na = static_cast(cuda_column_data_->feature_missing_is_na(split_feature_index)); + const bool mfb_is_zero = static_cast(cuda_column_data_->feature_mfb_is_zero(split_feature_index)); + const bool mfb_is_na = static_cast(cuda_column_data_->feature_mfb_is_na(split_feature_index)); + const uint32_t default_bin = cuda_column_data_->feature_default_bin(split_feature_index); + const uint32_t most_freq_bin = cuda_column_data_->feature_most_freq_bin(split_feature_index); + const uint32_t min_bin = cuda_column_data_->feature_min_bin(split_feature_index); + const uint32_t max_bin = cuda_column_data_->feature_max_bin(split_feature_index); + uint32_t th = split_threshold + min_bin; + uint32_t t_zero_bin = min_bin + default_bin; + if (most_freq_bin == 0) { + --th; + --t_zero_bin; + } + uint8_t split_default_to_left = 0; + uint8_t split_missing_default_to_left = 0; + int default_leaf_index = right_leaf_index; + int missing_default_leaf_index = right_leaf_index; + if (most_freq_bin <= split_threshold) { + split_default_to_left = 1; + default_leaf_index = left_leaf_index; + } + if (missing_is_zero || missing_is_na) { + if (split_default_left) { + split_missing_default_to_left = 1; + missing_default_leaf_index = left_leaf_index; + } + } + const int column_index = cuda_column_data_->feature_to_column(split_feature_index); + const uint8_t bit_type = 
cuda_column_data_->column_bit_type(column_index); + + const bool max_bin_to_left = (max_bin <= th); + const data_size_t* data_indices_in_leaf = cuda_data_indices_ + leaf_data_start; - if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (!missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && 
!mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && !missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && !mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && !mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); - } else if (missing_is_zero && missing_is_na && mfb_is_zero && mfb_is_na) { - const BIN_TYPE* column_data = reinterpret_cast(column_data_pointer); - GenDataToLeftBitVectorKernel0<<>>(GenBitVector_ARGS); + const void* column_data_pointer = cuda_column_data_->GetColumnData(column_index); + + if (bit_type == 8) { + const uint8_t* column_data = reinterpret_cast(column_data_pointer); + LaunchGenDataToLeftBitVectorKernelInner( + GenBitVector_ARGS, + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left); + 
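[Editor's note] The hunk above replaces the 32-way if/else ladder over (missing_is_zero, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left) with a chain of launchers (Inner0 through Inner3) that peel one runtime flag at a time into a compile-time template parameter, so the final <<<...>>> launch instantiates a fully specialized kernel. The standalone CUDA sketch below illustrates that dispatch pattern in isolation; it is an editorial illustration with hypothetical names (ExampleKernel, LaunchInner, Launch), not code from the patch.

// Editorial sketch, not part of the patch: boolean runtime flags are converted,
// one per launcher level, into compile-time template parameters.
template <bool FLAG_A, bool FLAG_B>
__global__ void ExampleKernel(const int* data, int n, int* out) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // these branches are folded away at compile time in each specialization
    out[i] = FLAG_A ? data[i] * 2 : data[i];
    if (FLAG_B) {
      out[i] += 1;
    }
  }
}

template <bool FLAG_A>
void LaunchInner(const int* data, int n, int* out, bool flag_b) {
  const int num_blocks = (n + 255) / 256;
  if (flag_b) {
    ExampleKernel<FLAG_A, true><<<num_blocks, 256>>>(data, n, out);
  } else {
    ExampleKernel<FLAG_A, false><<<num_blocks, 256>>>(data, n, out);
  }
}

void Launch(const int* data, int n, int* out, bool flag_a, bool flag_b) {
  if (flag_a) {
    LaunchInner<true>(data, n, out, flag_b);
  } else {
    LaunchInner<false>(data, n, out, flag_b);
  }
}

The benefit is that the per-element branches on the flags are resolved at compile time inside each kernel specialization, while the host-side dispatch code grows linearly with the number of flags instead of exponentially, which is exactly what the Inner0..Inner3 chain achieves for the bit-vector generation kernels.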
LaunchUpdateDataIndexToLeafIndexKernel( + UpdateDataIndexToLeafIndex_ARGS, + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left); + } else if (bit_type == 16) { + const uint16_t* column_data = reinterpret_cast(column_data_pointer); + LaunchGenDataToLeftBitVectorKernelInner( + GenBitVector_ARGS, + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left); + LaunchUpdateDataIndexToLeafIndexKernel( + UpdateDataIndexToLeafIndex_ARGS, + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left); + } else if (bit_type == 32) { + const uint32_t* column_data = reinterpret_cast(column_data_pointer); + LaunchGenDataToLeftBitVectorKernelInner( + GenBitVector_ARGS, + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left); + LaunchUpdateDataIndexToLeafIndexKernel( + UpdateDataIndexToLeafIndex_ARGS, + missing_is_zero, + missing_is_na, + mfb_is_zero, + mfb_is_na, + max_bin_to_left); } } +#undef UpdateDataIndexToLeafIndexKernel_PARAMS +#undef UpdateDataIndexToLeafIndex_ARGS +#undef GenDataToLeftBitVectorKernel_PARMS #undef GenBitVector_ARGS -#undef GenBitVector_MaxIsMin_ARGS - template __global__ void UpdateDataIndexToLeafIndexKernel_Categorical( @@ -664,6 +528,35 @@ __global__ void UpdateDataIndexToLeafIndexKernel_Categorical( } } +// for categorical features +template +__global__ void GenDataToLeftBitVectorKernel_Categorical( + const data_size_t num_data_in_leaf, const data_size_t* data_indices_in_leaf, + const uint32_t* bitset, int bitset_len, const BIN_TYPE* column_data, + // values from feature + const uint32_t max_bin, const uint32_t min_bin, const int8_t mfb_offset, + const uint8_t split_default_to_left, + uint16_t* block_to_left_offset, + data_size_t* block_to_left_offset_buffer, data_size_t* block_to_right_offset_buffer) { + __shared__ uint16_t shared_mem_buffer[32]; + uint16_t thread_to_left_offset_cnt = 0; + const unsigned int local_data_index = blockIdx.x * blockDim.x + threadIdx.x; + if (local_data_index < num_data_in_leaf) { + const unsigned int global_data_index = data_indices_in_leaf[local_data_index]; + const uint32_t bin = static_cast(column_data[global_data_index]); + if (USE_MIN_BIN && (bin < min_bin || bin > max_bin)) { + thread_to_left_offset_cnt = split_default_to_left; + } else if (!USE_MIN_BIN && bin == 0) { + thread_to_left_offset_cnt = split_default_to_left; + } else if (CUDAFindInBitset(bitset, bitset_len, bin - min_bin + mfb_offset)) { + thread_to_left_offset_cnt = 1; + } + } + __syncthreads(); + PrepareOffset(num_data_in_leaf, block_to_left_offset + blockIdx.x * blockDim.x, block_to_left_offset_buffer, block_to_right_offset_buffer, + thread_to_left_offset_cnt, shared_mem_buffer); +} + #define GenBitVector_Categorical_ARGS \ num_data_in_leaf, data_indices_in_leaf, \ bitset, bitset_len, \ @@ -736,214 +629,6 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorCategoricalKernel( #undef GenBitVector_Categorical_ARGS #undef UpdateDataIndexToLeafIndex_Categorical_ARGS -void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel(const data_size_t num_data_in_leaf, - const int split_feature_index, const uint32_t split_threshold, - const uint8_t split_default_left, const data_size_t leaf_data_start, - const int left_leaf_index, const int right_leaf_index) { - const uint8_t missing_is_zero = cuda_column_data_->feature_missing_is_zero(split_feature_index); - const uint8_t missing_is_na = cuda_column_data_->feature_missing_is_na(split_feature_index); - const uint8_t mfb_is_zero = 
cuda_column_data_->feature_mfb_is_zero(split_feature_index); - const uint8_t mfb_is_na = cuda_column_data_->feature_mfb_is_na(split_feature_index); - const uint32_t default_bin = cuda_column_data_->feature_default_bin(split_feature_index); - const uint32_t most_freq_bin = cuda_column_data_->feature_most_freq_bin(split_feature_index); - const uint32_t min_bin = cuda_column_data_->feature_min_bin(split_feature_index); - const uint32_t max_bin = cuda_column_data_->feature_max_bin(split_feature_index); - uint32_t th = split_threshold + min_bin; - uint32_t t_zero_bin = min_bin + default_bin; - if (most_freq_bin == 0) { - --th; - --t_zero_bin; - } - uint8_t split_default_to_left = 0; - uint8_t split_missing_default_to_left = 0; - int default_leaf_index = right_leaf_index; - int missing_default_leaf_index = right_leaf_index; - if (most_freq_bin <= split_threshold) { - split_default_to_left = 1; - default_leaf_index = left_leaf_index; - } - if (missing_is_zero || missing_is_na) { - if (split_default_left) { - split_missing_default_to_left = 1; - missing_default_leaf_index = left_leaf_index; - } - } - const int column_index = cuda_column_data_->feature_to_column(split_feature_index); - const uint8_t bit_type = cuda_column_data_->column_bit_type(column_index); - - const bool max_bin_to_left = (max_bin <= th); - - const data_size_t* data_indices_in_leaf = cuda_data_indices_ + leaf_data_start; - - if (min_bin < max_bin) { - if (bit_type == 8) { - LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( - missing_is_zero, - missing_is_na, - mfb_is_zero, - mfb_is_na, - column_index, - split_feature_index, - leaf_data_start, - num_data_in_leaf, - th, - t_zero_bin, - most_freq_bin, - max_bin, - min_bin, - split_default_to_left, - split_missing_default_to_left, - left_leaf_index, - right_leaf_index, - default_leaf_index, - missing_default_leaf_index); - } else if (bit_type == 16) { - LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( - missing_is_zero, - missing_is_na, - mfb_is_zero, - mfb_is_na, - column_index, - split_feature_index, - leaf_data_start, - num_data_in_leaf, - th, - t_zero_bin, - most_freq_bin, - max_bin, - min_bin, - split_default_to_left, - split_missing_default_to_left, - left_leaf_index, - right_leaf_index, - default_leaf_index, - missing_default_leaf_index); - } else if (bit_type == 32) { - LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( - missing_is_zero, - missing_is_na, - mfb_is_zero, - mfb_is_na, - column_index, - split_feature_index, - leaf_data_start, - num_data_in_leaf, - th, - t_zero_bin, - most_freq_bin, - max_bin, - min_bin, - split_default_to_left, - split_missing_default_to_left, - left_leaf_index, - right_leaf_index, - default_leaf_index, - missing_default_leaf_index); - } - } else { - if (bit_type == 8) { - LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( - missing_is_zero, - missing_is_na, - mfb_is_zero, - mfb_is_na, - max_bin_to_left, - column_index, - split_feature_index, - leaf_data_start, - num_data_in_leaf, - th, - t_zero_bin, - most_freq_bin, - max_bin, - min_bin, - split_default_to_left, - split_missing_default_to_left, - left_leaf_index, - right_leaf_index, - default_leaf_index, - missing_default_leaf_index); - } else if (bit_type == 16) { - LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( - missing_is_zero, - missing_is_na, - mfb_is_zero, - mfb_is_na, - max_bin_to_left, - column_index, - split_feature_index, - leaf_data_start, - num_data_in_leaf, - th, - t_zero_bin, - most_freq_bin, - max_bin, - min_bin, - split_default_to_left, - split_missing_default_to_left, - 
left_leaf_index, - right_leaf_index, - default_leaf_index, - missing_default_leaf_index); - } else if (bit_type == 32) { - LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( - missing_is_zero, - missing_is_na, - mfb_is_zero, - mfb_is_na, - max_bin_to_left, - column_index, - split_feature_index, - leaf_data_start, - num_data_in_leaf, - th, - t_zero_bin, - most_freq_bin, - max_bin, - min_bin, - split_default_to_left, - split_missing_default_to_left, - left_leaf_index, - right_leaf_index, - default_leaf_index, - missing_default_leaf_index); - } - } - - const void* column_data_pointer = cuda_column_data_->GetColumnData(column_index); - if (bit_type == 8) { - const uint8_t* column_data = reinterpret_cast(column_data_pointer); - LaunchUpdateDataIndexToLeafIndexKernel(num_data_in_leaf, - data_indices_in_leaf, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, - left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index, - static_cast(missing_is_zero), - static_cast(missing_is_na), - static_cast(mfb_is_zero), - static_cast(mfb_is_na), - max_bin_to_left); - } else if (bit_type == 16) { - const uint16_t* column_data = reinterpret_cast(column_data_pointer); - LaunchUpdateDataIndexToLeafIndexKernel(num_data_in_leaf, - data_indices_in_leaf, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, - left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index, - static_cast(missing_is_zero), - static_cast(missing_is_na), - static_cast(mfb_is_zero), - static_cast(mfb_is_na), - max_bin_to_left); - } else if (bit_type == 32) { - const uint32_t* column_data = reinterpret_cast(column_data_pointer); - LaunchUpdateDataIndexToLeafIndexKernel(num_data_in_leaf, - data_indices_in_leaf, th, column_data, t_zero_bin, max_bin, min_bin, cuda_data_index_to_leaf_index_, - left_leaf_index, right_leaf_index, default_leaf_index, missing_default_leaf_index, - static_cast(missing_is_zero), - static_cast(missing_is_na), - static_cast(mfb_is_zero), - static_cast(mfb_is_na), - max_bin_to_left); - } -} - __global__ void AggregateBlockOffsetKernel0( const int left_leaf_index, const int right_leaf_index, @@ -1178,7 +863,7 @@ __global__ void SplitInnerKernel(const int left_leaf_index, const int right_leaf const data_size_t* block_to_left_offset_buffer, const data_size_t* block_to_right_offset_buffer, const uint16_t* block_to_left_offset, data_size_t* out_data_indices_in_leaf) { const data_size_t leaf_num_data_offset = cuda_leaf_data_start[left_leaf_index]; - const data_size_t num_data_in_leaf_ref = cuda_leaf_num_data[left_leaf_index] + cuda_leaf_num_data[right_leaf_index]; + const data_size_t num_data_in_leaf = cuda_leaf_num_data[left_leaf_index] + cuda_leaf_num_data[right_leaf_index]; const unsigned int threadIdx_x = threadIdx.x; const unsigned int blockDim_x = blockDim.x; const unsigned int global_thread_index = blockIdx.x * blockDim_x + threadIdx_x; @@ -1188,7 +873,7 @@ __global__ void SplitInnerKernel(const int left_leaf_index, const int right_leaf const uint32_t to_left_block_offset = block_to_left_offset_buffer[blockIdx.x]; data_size_t* left_out_data_indices_in_leaf = out_data_indices_in_leaf + to_left_block_offset; data_size_t* right_out_data_indices_in_leaf = out_data_indices_in_leaf + to_right_block_offset; - if (static_cast(global_thread_index) < num_data_in_leaf_ref) { + if (static_cast(global_thread_index) < num_data_in_leaf) { const uint32_t thread_to_left_offset = (threadIdx_x == 0 ? 
0 : block_to_left_offset_ptr[threadIdx_x - 1]); const bool to_left = block_to_left_offset_ptr[threadIdx_x] > thread_to_left_offset; if (to_left) { diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index c740b0a1b22d..ab26493b6766 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -170,72 +170,104 @@ class CUDADataPartition { const int left_leaf_index, const int right_leaf_index); +#define GenDataToLeftBitVectorKernel_PARMS \ + const BIN_TYPE* column_data, \ + const data_size_t num_data_in_leaf, \ + const data_size_t* data_indices_in_leaf, \ + const uint32_t th, \ + const uint32_t t_zero_bin, \ + const uint32_t max_bin, \ + const uint32_t min_bin, \ + const uint8_t split_default_to_left, \ + const uint8_t split_missing_default_to_left + template - void LaunchGenDataToLeftBitVectorKernelMaxIsMinInner( + void LaunchGenDataToLeftBitVectorKernelInner( + GenDataToLeftBitVectorKernel_PARMS, const bool missing_is_zero, const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_bin_to_left, - const int column_index, - const int split_feature_index, - const data_size_t leaf_data_start, - const data_size_t num_data_in_leaf, - const uint32_t th, - const uint32_t t_zero_bin, - const uint32_t most_freq_bin, - const uint32_t max_bin, - const uint32_t min_bin, - const uint8_t split_default_to_left, - const uint8_t split_missing_default_to_left, - const int left_leaf_index, - const int right_leaf_index, - const int default_leaf_index, - const int missing_default_leaf_index); + const bool max_bin_to_left); - template - void LaunchGenDataToLeftBitVectorKernelMaxIsNotMinInner( - const bool missing_is_zero, + template + void LaunchGenDataToLeftBitVectorKernelInner0( + GenDataToLeftBitVectorKernel_PARMS, const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, - const int column_index, - const int split_feature_index, - const data_size_t leaf_data_start, - const data_size_t num_data_in_leaf, - const uint32_t th, - const uint32_t t_zero_bin, - const uint32_t most_freq_bin, - const uint32_t max_bin, - const uint32_t min_bin, - const uint8_t split_default_to_left, - const uint8_t split_missing_default_to_left, - const int left_leaf_index, - const int right_leaf_index, - const int default_leaf_index, - const int missing_default_leaf_index); + const bool max_bin_to_left); + + template + void LaunchGenDataToLeftBitVectorKernelInner1( + GenDataToLeftBitVectorKernel_PARMS, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_bin_to_left); + + template + void LaunchGenDataToLeftBitVectorKernelInner2( + GenDataToLeftBitVectorKernel_PARMS, + const bool mfb_is_na, + const bool max_bin_to_left); + + template + void LaunchGenDataToLeftBitVectorKernelInner3( + GenDataToLeftBitVectorKernel_PARMS, + const bool max_bin_to_left); + +#undef GenDataToLeftBitVectorKernel_PARMS + +#define UpdateDataIndexToLeafIndexKernel_PARAMS \ + const BIN_TYPE* column_data, \ + const data_size_t num_data_in_leaf, \ + const data_size_t* data_indices_in_leaf, \ + const uint32_t th, \ + const uint32_t t_zero_bin, \ + const uint32_t max_bin_ref, \ + const uint32_t min_bin_ref, \ + const int left_leaf_index, \ + const int right_leaf_index, \ + const int default_leaf_index, \ + const int missing_default_leaf_index template void LaunchUpdateDataIndexToLeafIndexKernel( - const data_size_t num_data_in_leaf, - const data_size_t* data_indices_in_leaf, - const uint32_t th, - const 
BIN_TYPE* column_data, - // values from feature - const uint32_t t_zero_bin, - const uint32_t max_bin_ref, - const uint32_t min_bin_ref, - int* cuda_data_index_to_leaf_index, - const int left_leaf_index, - const int right_leaf_index, - const int default_leaf_index, - const int missing_default_leaf_index, + UpdateDataIndexToLeafIndexKernel_PARAMS, const bool missing_is_zero, const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, const bool max_to_left); + template + void LaunchUpdateDataIndexToLeafIndexKernel_Inner0( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool missing_is_na, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_to_left); + + template + void LaunchUpdateDataIndexToLeafIndexKernel_Inner1( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool mfb_is_zero, + const bool mfb_is_na, + const bool max_to_left); + + template + void LaunchUpdateDataIndexToLeafIndexKernel_Inner2( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool mfb_is_na, + const bool max_to_left); + + template + void LaunchUpdateDataIndexToLeafIndexKernel_Inner3( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool max_to_left); + +#undef UpdateDataIndexToLeafIndexKernel_PARAMS + void LaunchAddPredictionToScoreKernel(const double* leaf_value, double* cuda_scores); void LaunchFillDataIndexToLeafIndex(); diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index ef2cad29b35a..d94acafa3f40 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -186,6 +186,12 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, cuda_bitset_inner_, cuda_bitset_inner_len_); } else { + if (train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]) == 7) { + Log::Warning("inner_feature_index = %d, leaf_best_split_threshold_[best_leaf_index_] = %d", + leaf_best_split_feature_[best_leaf_index_], leaf_best_split_threshold_[best_leaf_index_]); + Log::Warning("real threshold = %f", train_data_->RealThreshold(leaf_best_split_feature_[best_leaf_index_], + leaf_best_split_threshold_[best_leaf_index_])); + } right_leaf_index = tree->Split(best_leaf_index_, train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]), train_data_->RealThreshold(leaf_best_split_feature_[best_leaf_index_], From 536f603bd9f5d4fa1170db41c5c1b6d6d22f67d0 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 5 Nov 2021 11:12:39 +0000 Subject: [PATCH 110/166] refactor CUDABestSplitFinder by grouping task information into struct --- include/LightGBM/cuda/cuda_random.hpp | 74 +++ include/LightGBM/cuda/cuda_utils.h | 7 + .../cuda/cuda_best_split_finder.cpp | 206 +++--- .../cuda/cuda_best_split_finder.cu | 589 ++++++++---------- .../cuda/cuda_best_split_finder.hpp | 42 +- .../cuda/cuda_single_gpu_tree_learner.cpp | 6 - 6 files changed, 448 insertions(+), 476 deletions(-) create mode 100644 include/LightGBM/cuda/cuda_random.hpp diff --git a/include/LightGBM/cuda/cuda_random.hpp b/include/LightGBM/cuda/cuda_random.hpp new file mode 100644 index 000000000000..c9a194e2c93e --- /dev/null +++ b/include/LightGBM/cuda/cuda_random.hpp @@ -0,0 +1,74 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ +#ifndef LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ +#define LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ + +#ifdef USE_CUDA + +#include +#include + +namespace LightGBM { + +/*! +* \brief A wrapper for random generator +*/ +class CUDARandom { + public: + /*! + * \brief Set specific seed + */ + __device__ void SetSeed(int seed) { + x = seed; + } + /*! + * \brief Generate random integer, int16 range. [0, 65536] + * \param lower_bound lower bound + * \param upper_bound upper bound + * \return The random integer between [lower_bound, upper_bound) + */ + __device__ inline int NextShort(int lower_bound, int upper_bound) { + return (RandInt16()) % (upper_bound - lower_bound) + lower_bound; + } + + /*! + * \brief Generate random integer, int32 range + * \param lower_bound lower bound + * \param upper_bound upper bound + * \return The random integer between [lower_bound, upper_bound) + */ + __device__ inline int NextInt(int lower_bound, int upper_bound) { + return (RandInt32()) % (upper_bound - lower_bound) + lower_bound; + } + + /*! + * \brief Generate random float data + * \return The random float between [0.0, 1.0) + */ + __device__ inline float NextFloat() { + // get random float in [0,1) + return static_cast(RandInt16()) / (32768.0f); + } + + private: + __device__ inline int RandInt16() { + x = (214013 * x + 2531011); + return static_cast((x >> 16) & 0x7FFF); + } + + __device__ inline int RandInt32() { + x = (214013 * x + 2531011); + return static_cast(x & 0x7FFFFFFF); + } + + unsigned int x = 123456789; +}; + + +} // namespace LightGBM + +#endif // USE_CUDA + +#endif // LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 20f8362d50e7..8b3646206b6e 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -123,6 +123,13 @@ class CUDAVector { size_ = size; } + void Clear() { + if (size_ > 0 && data_ != nullptr) { + DeallocateCUDAMemory(&data_, __FILE__, __LINE__); + } + size_ = 0; + } + void PushBack(const T* values, size_t len) { T* new_data = nullptr; AllocateCUDAMemory(&new_data, size_ + len, __FILE__, __LINE__); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 6d9dfd583226..e77be2d42431 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -29,37 +29,22 @@ CUDABestSplitFinder::CUDABestSplitFinder( max_cat_threshold_(config->max_cat_threshold), min_data_per_group_(config->min_data_per_group), max_cat_to_onehot_(config->max_cat_to_onehot), + extra_trees_(config->extra_trees), + extra_seed_(config->extra_seed), num_total_bin_(feature_hist_offsets.empty() ? 
0 : static_cast(feature_hist_offsets.back())), cuda_hist_(cuda_hist) { InitFeatureMetaInfo(train_data); cuda_leaf_best_split_info_ = nullptr; cuda_best_split_info_ = nullptr; - cuda_feature_hist_offsets_ = nullptr; - cuda_feature_mfb_offsets_ = nullptr; - cuda_feature_default_bins_ = nullptr; - cuda_feature_num_bins_ = nullptr; cuda_best_split_info_buffer_ = nullptr; - cuda_task_feature_index_ = nullptr; - cuda_task_reverse_ = nullptr; - cuda_task_skip_default_bin_ = nullptr; - cuda_task_na_as_missing_ = nullptr; - cuda_task_out_default_left_ = nullptr; cuda_is_feature_used_bytree_ = nullptr; } CUDABestSplitFinder::~CUDABestSplitFinder() { DeallocateCUDAMemory(&cuda_leaf_best_split_info_, __FILE__, __LINE__); DeallocateCUDAMemory(&cuda_best_split_info_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_feature_hist_offsets_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_feature_mfb_offsets_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_feature_default_bins_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_feature_num_bins_, __FILE__, __LINE__); DeallocateCUDAMemory(&cuda_best_split_info_buffer_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_task_feature_index_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_task_reverse_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_task_skip_default_bin_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_task_na_as_missing_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_task_out_default_left_, __FILE__, __LINE__); + cuda_split_find_tasks_.Clear(); DeallocateCUDAMemory(&cuda_is_feature_used_bytree_, __FILE__, __LINE__); gpuAssert(cudaStreamDestroy(cuda_streams_[0]), __FILE__, __LINE__); gpuAssert(cudaStreamDestroy(cuda_streams_[1]), __FILE__, __LINE__); @@ -116,76 +101,111 @@ void CUDABestSplitFinder::Init() { AllocateCUDAMemory(&cuda_feature_hist_index_buffer_, static_cast(num_total_bin_), __FILE__, __LINE__); } } - InitCUDAMemoryFromHostMemory(&cuda_is_categorical_, is_categorical_.data(), is_categorical_.size(), __FILE__, __LINE__); } void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { - InitCUDAMemoryFromHostMemory(&cuda_feature_hist_offsets_, - feature_hist_offsets_.data(), - feature_hist_offsets_.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_feature_mfb_offsets_, - feature_mfb_offsets_.data(), - feature_mfb_offsets_.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_feature_default_bins_, - feature_default_bins_.data(), - feature_default_bins_.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_feature_num_bins_, - feature_num_bins_.data(), - static_cast(num_features_), - __FILE__, - __LINE__); AllocateCUDAMemory(&cuda_is_feature_used_bytree_, static_cast(num_features_), __FILE__, __LINE__); - num_tasks_ = 0; + + // intialize split find task information (a split find task is one pass through the histogram of a feature) + split_find_tasks_.clear(); for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { const uint32_t num_bin = feature_num_bins_[inner_feature_index]; - const uint8_t missing_type = feature_missing_type_[inner_feature_index]; + const MissingType missing_type = feature_missing_type_[inner_feature_index]; if (num_bin > 2 && missing_type != MissingType::None && !is_categorical_[inner_feature_index]) { if (missing_type == MissingType::Zero) { - host_task_reverse_.emplace_back(0); - host_task_reverse_.emplace_back(1); - host_task_skip_default_bin_.emplace_back(1); - host_task_skip_default_bin_.emplace_back(1); - 
host_task_na_as_missing_.emplace_back(0); - host_task_na_as_missing_.emplace_back(0); - host_task_feature_index_.emplace_back(inner_feature_index); - host_task_feature_index_.emplace_back(inner_feature_index); - host_task_out_default_left_.emplace_back(0); - host_task_out_default_left_.emplace_back(1); - num_tasks_ += 2; + split_find_tasks_.emplace_back(); + SplitFindTask& new_task = split_find_tasks_.back(); + new_task.reverse = false; + new_task.skip_default_bin = true; + new_task.na_as_missing = false; + new_task.inner_feature_index = inner_feature_index; + new_task.assume_out_default_left = false; + new_task.is_categorical = false; + uint32_t num_bin = feature_num_bins_[inner_feature_index]; + new_task.is_one_hot = false; + new_task.hist_offset = feature_hist_offsets_[inner_feature_index]; + new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; + new_task.default_bin = feature_default_bins_[inner_feature_index]; + new_task.num_bin = num_bin; + + split_find_tasks_.emplace_back(); + new_task.reverse = true; + new_task.skip_default_bin = true; + new_task.na_as_missing = false; + new_task.inner_feature_index = inner_feature_index; + new_task.assume_out_default_left = true; + new_task.is_categorical = false; + num_bin = feature_num_bins_[inner_feature_index]; + new_task.is_one_hot = false; + new_task.hist_offset = feature_hist_offsets_[inner_feature_index]; + new_task.default_bin = feature_default_bins_[inner_feature_index]; + new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; + new_task.num_bin = num_bin; } else { - host_task_reverse_.emplace_back(0); - host_task_reverse_.emplace_back(1); - host_task_skip_default_bin_.emplace_back(0); - host_task_skip_default_bin_.emplace_back(0); - host_task_na_as_missing_.emplace_back(1); - host_task_na_as_missing_.emplace_back(1); - host_task_feature_index_.emplace_back(inner_feature_index); - host_task_feature_index_.emplace_back(inner_feature_index); - host_task_out_default_left_.emplace_back(0); - host_task_out_default_left_.emplace_back(1); - num_tasks_ += 2; + split_find_tasks_.emplace_back(); + SplitFindTask& new_task = split_find_tasks_.back(); + new_task.reverse = false; + new_task.skip_default_bin = false; + new_task.na_as_missing = true; + new_task.inner_feature_index = inner_feature_index; + new_task.assume_out_default_left = false; + new_task.is_categorical = false; + uint32_t num_bin = feature_num_bins_[inner_feature_index]; + new_task.is_one_hot = false; + new_task.hist_offset = feature_hist_offsets_[inner_feature_index]; + new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; + new_task.default_bin = feature_default_bins_[inner_feature_index]; + new_task.num_bin = num_bin; + + split_find_tasks_.emplace_back(); + new_task.reverse = true; + new_task.skip_default_bin = false; + new_task.na_as_missing = true; + new_task.inner_feature_index = inner_feature_index; + new_task.assume_out_default_left = true; + new_task.is_categorical = false; + num_bin = feature_num_bins_[inner_feature_index]; + new_task.is_one_hot = false; + new_task.hist_offset = feature_hist_offsets_[inner_feature_index]; + new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; + new_task.default_bin = feature_default_bins_[inner_feature_index]; + new_task.num_bin = num_bin; } } else { + split_find_tasks_.emplace_back(); + SplitFindTask& new_task = split_find_tasks_.back(); + const uint32_t num_bin = feature_num_bins_[inner_feature_index]; if (is_categorical_[inner_feature_index]) { - host_task_reverse_.emplace_back(0); + 
new_task.reverse = false; + new_task.is_categorical = true; + new_task.is_one_hot = (static_cast(num_bin) < max_cat_to_onehot_); } else { - host_task_reverse_.emplace_back(1); + new_task.reverse = true; + new_task.is_categorical = false; + new_task.is_one_hot = false; } - host_task_skip_default_bin_.emplace_back(0); - host_task_na_as_missing_.emplace_back(0); - host_task_feature_index_.emplace_back(inner_feature_index); - if (missing_type != 2) { - host_task_out_default_left_.emplace_back(1); + new_task.skip_default_bin = false; + new_task.na_as_missing = false; + new_task.inner_feature_index = inner_feature_index; + if (missing_type != MissingType::NaN && is_categorical_[inner_feature_index]) { + new_task.assume_out_default_left = true; } else { - host_task_out_default_left_.emplace_back(0); + new_task.assume_out_default_left = false; } - ++num_tasks_; + new_task.hist_offset = feature_hist_offsets_[inner_feature_index]; + new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; + new_task.default_bin = feature_default_bins_[inner_feature_index]; + new_task.num_bin = num_bin; + } + } + num_tasks_ = static_cast(split_find_tasks_.size()); + + if (extra_trees_) { + cuda_randoms_.Resize(num_tasks_); + LaunchInitCUDARandomKernel(); + for (int task_index = 0; task_index < num_tasks_; ++task_index) { + split_find_tasks_[task_index].cuda_random = cuda_randoms_.RawData() + task_index; + split_find_tasks_[task_index].rand_threshold = 0; } } @@ -196,31 +216,13 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { cuda_best_leaf_split_info_buffer_size, __FILE__, __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_task_feature_index_, - host_task_feature_index_.data(), - host_task_feature_index_.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_task_reverse_, - host_task_reverse_.data(), - host_task_reverse_.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_task_skip_default_bin_, - host_task_skip_default_bin_.data(), - host_task_skip_default_bin_.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_task_na_as_missing_, - host_task_na_as_missing_.data(), - host_task_na_as_missing_.size(), - __FILE__, - __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_task_out_default_left_, - host_task_out_default_left_.data(), - host_task_out_default_left_.size(), - __FILE__, - __LINE__); + + cuda_split_find_tasks_.Resize(num_tasks_); + CopyFromHostToCUDADevice(cuda_split_find_tasks_.RawData(), + split_find_tasks_.data(), + split_find_tasks_.size(), + __FILE__, + __LINE__); const size_t output_buffer_size = 2 * static_cast(num_tasks_); AllocateCUDAMemory(&cuda_best_split_info_, output_buffer_size, __FILE__, __LINE__); @@ -236,18 +238,8 @@ void CUDABestSplitFinder::ResetTrainingData( num_features_ = train_data->num_features(); feature_hist_offsets_ = feature_hist_offsets; InitFeatureMetaInfo(train_data); - DeallocateCUDAMemory(&cuda_feature_hist_offsets_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_feature_hist_offsets_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_feature_mfb_offsets_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_feature_default_bins_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_feature_num_bins_, __FILE__, __LINE__); DeallocateCUDAMemory(&cuda_is_feature_used_bytree_, __FILE__, __LINE__); DeallocateCUDAMemory(&cuda_best_split_info_, __FILE__, __LINE__); - host_task_reverse_.clear(); - host_task_skip_default_bin_.clear(); - host_task_na_as_missing_.clear(); - host_task_feature_index_.clear(); - 
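[Editor's note] The core change of this commit is visible above: the per-task scalars that previously lived in parallel host vectors (host_task_reverse_, host_task_skip_default_bin_, host_task_na_as_missing_, host_task_feature_index_, host_task_out_default_left_) are grouped into a single SplitFindTask struct, collected in split_find_tasks_, and copied to the device once, so the kernels index one SplitFindTask* instead of several separate arrays. Below is a minimal editorial sketch of that array-of-structs upload; the names (TaskInfo, CopyTasksToDevice, ProcessTasks) are hypothetical, and plain cudaMalloc/cudaMemcpy stand in for the project's CUDAVector and CopyFromHostToCUDADevice helpers.

// Editorial sketch, not part of the patch: group per-task fields into one struct
// and copy the whole task list to the device in a single transfer.
#include <cuda_runtime.h>
#include <vector>

struct TaskInfo {
  int feature_index;
  bool reverse;
  bool skip_default_bin;
  bool na_as_missing;
  unsigned int hist_offset;
  unsigned int num_bin;
};

TaskInfo* CopyTasksToDevice(const std::vector<TaskInfo>& host_tasks) {
  TaskInfo* device_tasks = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&device_tasks),
             host_tasks.size() * sizeof(TaskInfo));
  cudaMemcpy(device_tasks, host_tasks.data(),
             host_tasks.size() * sizeof(TaskInfo), cudaMemcpyHostToDevice);
  return device_tasks;
}

// A kernel then reads one struct per task instead of indexing parallel arrays.
__global__ void ProcessTasks(const TaskInfo* tasks, int num_tasks, double* out) {
  const int task_index = blockIdx.x % num_tasks;
  const TaskInfo task = tasks[task_index];  // all task fields fetched together
  if (threadIdx.x == 0) {
    out[blockIdx.x] = task.reverse ? -static_cast<double>(task.num_bin)
                                   : static_cast<double>(task.num_bin);
  }
}

Keeping the task description in one struct also makes later additions (such as the cuda_random pointer and rand_threshold fields used for extra_trees) a single-field change instead of another parallel array.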
host_task_out_default_left_.clear(); InitCUDAFeatureMetaInfo(); } diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index a0fba7ab8f12..60e8d5062522 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -176,14 +176,12 @@ __device__ double GetSplitGains(double sum_left_gradients, l1, use_l1, l2); } -template +template __device__ void FindBestSplitsForLeafKernelInner( // input feature information const hist_t* feature_hist_ptr, - const uint32_t feature_num_bin, - const uint8_t feature_mfb_offset, - const uint32_t feature_default_bin, - const int inner_feature_index, + // input task information + const SplitFindTask* task, // input config parameter values const double lambda_l1, const double lambda_l2, @@ -195,10 +193,6 @@ __device__ void FindBestSplitsForLeafKernelInner( const double sum_gradients, const double sum_hessians, const data_size_t num_data, - // input task information - const bool skip_default_bin, - const bool na_as_missing, - const uint8_t assume_out_default_left, // output parameters CUDASplitInfo* cuda_best_split_info) { const double cnt_factor = num_data / sum_hessians; @@ -207,21 +201,20 @@ __device__ void FindBestSplitsForLeafKernelInner( cuda_best_split_info->is_valid = false; - __shared__ hist_t shared_mem_buffer[32]; hist_t local_grad_hist = 0.0f; hist_t local_hess_hist = 0.0f; double local_gain = 0.0f; bool threshold_found = false; uint32_t threshold_value = 0; __shared__ uint32_t best_thread_index; - __shared__ double shared_gain_buffer[32]; - __shared__ bool shared_found_buffer[32]; - __shared__ uint32_t shared_thread_index_buffer[32]; + __shared__ double shared_double_buffer[32]; + __shared__ bool shared_bool_buffer[32]; + __shared__ uint32_t shared_int_buffer[32]; const unsigned int threadIdx_x = threadIdx.x; const bool skip_sum = REVERSE ? 
- (skip_default_bin && (feature_num_bin - 1 - threadIdx_x) == static_cast(feature_default_bin)) : - (skip_default_bin && (threadIdx_x + feature_mfb_offset) == static_cast(feature_default_bin)); - const uint32_t feature_num_bin_minus_offset = feature_num_bin - feature_mfb_offset; + (task->skip_default_bin && (task->num_bin - 1 - threadIdx_x) == static_cast(task->default_bin)) : + (task->skip_default_bin && (threadIdx_x + task->mfb_offset) == static_cast(task->default_bin)); + const uint32_t feature_num_bin_minus_offset = task->num_bin - task->mfb_offset; if (!REVERSE) { if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { const unsigned int bin_offset = threadIdx_x << 1; @@ -229,7 +222,7 @@ __device__ void FindBestSplitsForLeafKernelInner( local_hess_hist = feature_hist_ptr[bin_offset + 1]; } } else { - if (threadIdx_x >= static_cast(na_as_missing) && + if (threadIdx_x >= static_cast(task->na_as_missing) && threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { const unsigned int read_index = feature_num_bin_minus_offset - 1 - threadIdx_x; const unsigned int bin_offset = read_index << 1; @@ -242,11 +235,11 @@ __device__ void FindBestSplitsForLeafKernelInner( local_hess_hist += kEpsilon; } local_gain = kMinScore; - local_grad_hist = ShufflePrefixSum(local_grad_hist, shared_mem_buffer); + local_grad_hist = ShufflePrefixSum(local_grad_hist, shared_double_buffer); __syncthreads(); - local_hess_hist = ShufflePrefixSum(local_hess_hist, shared_mem_buffer); + local_hess_hist = ShufflePrefixSum(local_hess_hist, shared_double_buffer); if (REVERSE) { - if (threadIdx_x >= static_cast(na_as_missing) && threadIdx_x <= feature_num_bin - 2 && !skip_sum) { + if (threadIdx_x >= static_cast(task->na_as_missing) && threadIdx_x <= task->num_bin - 2 && !skip_sum) { const double sum_right_gradient = local_grad_hist; const double sum_right_hessian = local_hess_hist; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); @@ -262,7 +255,7 @@ __device__ void FindBestSplitsForLeafKernelInner( // gain with split is worse than without split if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; - threshold_value = static_cast(feature_num_bin - 2 - threadIdx_x); + threshold_value = static_cast(task->num_bin - 2 - threadIdx_x); threshold_found = true; } } @@ -284,14 +277,14 @@ __device__ void FindBestSplitsForLeafKernelInner( // gain with split is worse than without split if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; - threshold_value = static_cast(threadIdx_x + feature_mfb_offset); + threshold_value = static_cast(threadIdx_x + task->mfb_offset); threshold_found = true; } } } } __syncthreads(); - const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_gain_buffer, shared_found_buffer, shared_thread_index_buffer); + const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_double_buffer, shared_bool_buffer, shared_int_buffer); if (threadIdx_x == 0) { best_thread_index = result; } @@ -300,7 +293,7 @@ __device__ void FindBestSplitsForLeafKernelInner( cuda_best_split_info->is_valid = true; cuda_best_split_info->threshold = threshold_value; cuda_best_split_info->gain = local_gain; - cuda_best_split_info->default_left = assume_out_default_left; + cuda_best_split_info->default_left = task->assume_out_default_left; if (REVERSE) { const double sum_right_gradient = local_grad_hist; const double sum_right_hessian = local_hess_hist - kEpsilon; @@ -351,13 
+344,12 @@ __device__ void FindBestSplitsForLeafKernelInner( } } +template __device__ void FindBestSplitsForLeafKernelCategoricalInner( // input feature information const hist_t* feature_hist_ptr, - const uint32_t feature_num_bin, - const uint8_t feature_mfb_offset, - const uint32_t feature_default_bin, - const int inner_feature_index, + // input task information + const SplitFindTask* task, // input config parameter values const double lambda_l1, const double lambda_l2, @@ -373,8 +365,6 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( const double sum_gradients, const double sum_hessians, const data_size_t num_data, - // task information - const bool is_one_hot, // output parameters CUDASplitInfo* cuda_best_split_info) { __shared__ double shared_gain_buffer[32]; @@ -391,10 +381,10 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( cuda_best_split_info->is_valid = false; - const int bin_start = 1 - feature_mfb_offset; - const int bin_end = feature_num_bin - feature_mfb_offset; + const int bin_start = 1 - task->mfb_offset; + const int bin_end = task->num_bin - task->mfb_offset; const int threadIdx_x = static_cast(threadIdx.x); - if (is_one_hot) { + if (task->is_one_hot) { if (threadIdx_x >= bin_start && threadIdx_x < bin_end) { const int bin_offset = (threadIdx_x << 1); const hist_t grad = feature_hist_ptr[bin_offset]; @@ -429,7 +419,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( cuda_best_split_info->is_valid = true; cuda_best_split_info->num_cat_threshold = 1; cuda_best_split_info->gain = local_gain - min_gain_shift; - *(cuda_best_split_info->cat_threshold) = static_cast(threadIdx_x + feature_mfb_offset); + *(cuda_best_split_info->cat_threshold) = static_cast(threadIdx_x + task->mfb_offset); cuda_best_split_info->default_left = false; const int bin_offset = (threadIdx_x << 1); const hist_t sum_left_gradient = feature_hist_ptr[bin_offset]; @@ -579,11 +569,11 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( cuda_best_split_info->gain = local_gain - min_gain_shift; if (best_dir == 1) { for (int i = 0; i < threadIdx_x + 1; ++i) { - (cuda_best_split_info->cat_threshold)[i] = shared_index_buffer[i] + feature_mfb_offset; + (cuda_best_split_info->cat_threshold)[i] = shared_index_buffer[i] + task->mfb_offset; } } else { for (int i = 0; i < threadIdx_x + 1; ++i) { - (cuda_best_split_info->cat_threshold)[i] = shared_index_buffer[used_bin - 1 - i] + feature_mfb_offset; + (cuda_best_split_info->cat_threshold)[i] = shared_index_buffer[used_bin - 1 - i] + task->mfb_offset; } } cuda_best_split_info->default_left = false; @@ -613,26 +603,15 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( } } +template __global__ void FindBestSplitsForLeafKernel( // input feature information - const uint32_t* feature_hist_offsets, - const uint8_t* feature_mfb_offsets, - const uint32_t* feature_default_bins, - const uint32_t* feature_num_bins, const int8_t* is_feature_used_bytree, - const int8_t* is_categorical, // input task information - const bool larger_only, const int num_tasks, - const int* task_feature_index, - const uint8_t* task_reverse, - const uint8_t* task_skip_default_bin, - const uint8_t* task_na_as_missing, - const uint8_t* task_out_default_left, + const SplitFindTask* tasks, // input leaf information - const int smaller_leaf_index, const CUDALeafSplitsStruct* smaller_leaf_splits, - const int larger_leaf_index, const CUDALeafSplitsStruct* larger_leaf_splits, // input config parameter values const data_size_t min_data_in_leaf, @@ 
-648,8 +627,9 @@ __global__ void FindBestSplitsForLeafKernel( // output CUDASplitInfo* cuda_best_split_info) { const unsigned int task_index = blockIdx.x % num_tasks; - const bool is_larger = static_cast(blockIdx.x >= num_tasks || larger_only); - const int inner_feature_index = task_feature_index[task_index]; + const SplitFindTask* task = tasks + task_index; + const bool is_larger = static_cast(blockIdx.x >= num_tasks || LARGER_ONLY); + const int inner_feature_index = task->inner_feature_index; const double parent_gain = is_larger ? larger_leaf_splits->gain : smaller_leaf_splits->gain; const double sum_gradients = is_larger ? larger_leaf_splits->sum_of_gradients : smaller_leaf_splits->sum_of_gradients; const double sum_hessians = (is_larger ? larger_leaf_splits->sum_of_hessians : smaller_leaf_splits->sum_of_hessians) + 2 * kEpsilon; @@ -657,16 +637,13 @@ __global__ void FindBestSplitsForLeafKernel( const unsigned int output_offset = is_larger ? (task_index + num_tasks) : task_index; CUDASplitInfo* out = cuda_best_split_info + output_offset; if (is_feature_used_bytree[inner_feature_index]) { - const hist_t* hist_ptr = (is_larger ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + feature_hist_offsets[inner_feature_index] * 2; - if (is_categorical[inner_feature_index]) { - const bool is_one_hot = feature_num_bins[inner_feature_index] <= max_cat_to_onehot; - FindBestSplitsForLeafKernelCategoricalInner( + const hist_t* hist_ptr = (is_larger ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + task->hist_offset * 2; + if (task->is_categorical) { + FindBestSplitsForLeafKernelCategoricalInner( // input feature information hist_ptr, - feature_num_bins[inner_feature_index], - feature_mfb_offsets[inner_feature_index], - feature_default_bins[inner_feature_index], - inner_feature_index, + // input task information + task, // input config parameter values lambda_l1, lambda_l2, @@ -682,23 +659,15 @@ __global__ void FindBestSplitsForLeafKernel( sum_gradients, sum_hessians, num_data, - // input task information - is_one_hot, // output parameters out); } else { - const bool reverse = static_cast(task_reverse[task_index]); - const bool skip_default_bin = static_cast(task_skip_default_bin[task_index]); - const bool na_as_missing = static_cast(task_na_as_missing[task_index]); - const bool assume_out_default_left = task_out_default_left[task_index]; - if (reverse) { - FindBestSplitsForLeafKernelInner( + if (!task->reverse) { + FindBestSplitsForLeafKernelInner( // input feature information hist_ptr, - feature_num_bins[inner_feature_index], - feature_mfb_offsets[inner_feature_index], - feature_default_bins[inner_feature_index], - inner_feature_index, + // input task information + task, // input config parameter values lambda_l1, lambda_l2, @@ -710,20 +679,14 @@ __global__ void FindBestSplitsForLeafKernel( sum_gradients, sum_hessians, num_data, - // input task information - skip_default_bin, - na_as_missing, - assume_out_default_left, // output parameters out); } else { - FindBestSplitsForLeafKernelInner( + FindBestSplitsForLeafKernelInner( // input feature information hist_ptr, - feature_num_bins[inner_feature_index], - feature_mfb_offsets[inner_feature_index], - feature_default_bins[inner_feature_index], - inner_feature_index, + // input task information + task, // input config parameter values lambda_l1, lambda_l2, @@ -735,10 +698,6 @@ __global__ void FindBestSplitsForLeafKernel( sum_gradients, sum_hessians, num_data, - // input task information - skip_default_bin, - 
na_as_missing, - assume_out_default_left, // output parameters out); } @@ -748,13 +707,12 @@ __global__ void FindBestSplitsForLeafKernel( } } +template __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( // input feature information const hist_t* feature_hist_ptr, - const uint32_t feature_num_bin, - const uint8_t feature_mfb_offset, - const uint32_t feature_default_bin, - const int inner_feature_index, + // input task information + const SplitFindTask* task, // input config parameter values const double lambda_l1, const double lambda_l2, @@ -766,16 +724,11 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( const double sum_gradients, const double sum_hessians, const data_size_t num_data, - // input task information - const bool reverse, - const bool skip_default_bin, - const bool na_as_missing, - const uint8_t assume_out_default_left, + // output parameters + CUDASplitInfo* cuda_best_split_info, // buffer hist_t* hist_grad_buffer_ptr, - hist_t* hist_hess_buffer_ptr, - // output parameters - CUDASplitInfo* cuda_best_split_info) { + hist_t* hist_hess_buffer_ptr) { const double cnt_factor = num_data / sum_hessians; const bool use_l1 = lambda_l1 > 0.0f; const double min_gain_shift = parent_gain + min_gain_to_split; @@ -789,11 +742,11 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( __shared__ bool shared_found_buffer[32]; __shared__ uint32_t shared_thread_index_buffer[32]; const unsigned int threadIdx_x = threadIdx.x; - const uint32_t feature_num_bin_minus_offset = feature_num_bin - feature_mfb_offset; - if (!reverse) { + const uint32_t feature_num_bin_minus_offset = task->num_bin - task->mfb_offset; + if (!REVERSE) { for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; ++bin) { const bool skip_sum = - (skip_default_bin && (bin + feature_mfb_offset) == static_cast(feature_default_bin)); + (task->skip_default_bin && (bin + task->mfb_offset) == static_cast(task->default_bin)); if (!skip_sum) { const unsigned int bin_offset = bin << 1; hist_grad_buffer_ptr[bin] = feature_hist_ptr[bin_offset]; @@ -805,8 +758,8 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( } } else { for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; ++bin) { - const bool skip_sum = bin >= static_cast(na_as_missing) && - (skip_default_bin && (feature_num_bin - 1 - bin) == static_cast(feature_default_bin)); + const bool skip_sum = bin >= static_cast(task->na_as_missing) && + (task->skip_default_bin && (task->num_bin - 1 - bin) == static_cast(task->default_bin)); if (!skip_sum) { const unsigned int read_index = feature_num_bin_minus_offset - 1 - bin; const unsigned int bin_offset = read_index << 1; @@ -826,10 +779,10 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( GlobalMemoryPrefixSum(hist_grad_buffer_ptr, static_cast(feature_num_bin_minus_offset)); __syncthreads(); GlobalMemoryPrefixSum(hist_hess_buffer_ptr, static_cast(feature_num_bin_minus_offset)); - if (reverse) { + if (REVERSE) { for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; ++bin) { - const bool skip_sum = (bin >= static_cast(na_as_missing) && - (skip_default_bin && (feature_num_bin - 1 - bin) == static_cast(feature_default_bin))); + const bool skip_sum = (bin >= static_cast(task->na_as_missing) && + (task->skip_default_bin && (task->num_bin - 1 - bin) == static_cast(task->default_bin))); if (!skip_sum) { const double sum_right_gradient = hist_grad_buffer_ptr[bin]; const double sum_right_hessian = hist_hess_buffer_ptr[bin]; @@ 
-846,7 +799,7 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( // gain with split is worse than without split if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; - threshold_value = static_cast(feature_num_bin - 2 - bin); + threshold_value = static_cast(task->num_bin - 2 - bin); threshold_found = true; } } @@ -855,7 +808,7 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( } else { for (unsigned int bin = threadIdx_x; bin <= feature_num_bin_minus_offset - 2; ++bin) { const bool skip_sum = - (skip_default_bin && (bin + feature_mfb_offset) == static_cast(feature_default_bin)); + (task->skip_default_bin && (bin + task->mfb_offset) == static_cast(task->default_bin)); if (!skip_sum) { const double sum_left_gradient = hist_grad_buffer_ptr[bin]; const double sum_left_hessian = hist_hess_buffer_ptr[bin]; @@ -872,7 +825,7 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( // gain with split is worse than without split if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; - threshold_value = static_cast(bin + feature_mfb_offset); + threshold_value = static_cast(bin + task->mfb_offset); threshold_found = true; } } @@ -889,9 +842,9 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( cuda_best_split_info->is_valid = true; cuda_best_split_info->threshold = threshold_value; cuda_best_split_info->gain = local_gain; - cuda_best_split_info->default_left = assume_out_default_left; - if (reverse) { - const unsigned int best_bin = static_cast(feature_num_bin - 2 - threshold_value); + cuda_best_split_info->default_left = task->assume_out_default_left; + if (REVERSE) { + const unsigned int best_bin = static_cast(task->num_bin - 2 - threshold_value); const double sum_right_gradient = hist_grad_buffer_ptr[best_bin]; const double sum_right_hessian = hist_hess_buffer_ptr[best_bin] - kEpsilon; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); @@ -915,7 +868,7 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); } else { - const unsigned int best_bin = static_cast(threshold_value - feature_mfb_offset); + const unsigned int best_bin = static_cast(threshold_value - task->mfb_offset); const double sum_left_gradient = hist_grad_buffer_ptr[best_bin]; const double sum_left_hessian = hist_hess_buffer_ptr[best_bin] - kEpsilon; const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); @@ -942,13 +895,12 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( } } +template __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( // input feature information const hist_t* feature_hist_ptr, - const uint32_t feature_num_bin, - const uint8_t feature_mfb_offset, - const uint32_t feature_default_bin, - const int inner_feature_index, + // input task information + const SplitFindTask* task, // input config parameter values const double lambda_l1, const double lambda_l2, @@ -964,8 +916,6 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( const double sum_gradients, const double sum_hessians, const data_size_t num_data, - // task information - const bool is_one_hot, // buffer hist_t* hist_grad_buffer_ptr, hist_t* hist_hess_buffer_ptr, @@ -987,11 +937,11 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( 
cuda_best_split_info->is_valid = false; - const int bin_start = 1 - feature_mfb_offset; - const int bin_end = feature_num_bin - feature_mfb_offset; + const int bin_start = 1 - task->mfb_offset; + const int bin_end = task->num_bin - task->mfb_offset; int best_threshold = -1; const int threadIdx_x = static_cast(threadIdx.x); - if (is_one_hot) { + if (task->is_one_hot) { for (int bin = bin_start + threadIdx_x; bin < bin_end; bin += static_cast(blockDim.x)) { const int bin_offset = (bin << 1); const hist_t grad = feature_hist_ptr[bin_offset]; @@ -1082,7 +1032,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( } __syncthreads(); BitonicArgSortDevice( - hist_stat_buffer_ptr, hist_index_buffer_ptr, feature_num_bin - feature_mfb_offset); + hist_stat_buffer_ptr, hist_index_buffer_ptr, task->num_bin - task->mfb_offset); const int max_num_cat = min(max_cat_threshold, (used_bin + 1) / 2); __syncthreads(); @@ -1176,11 +1126,11 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( cuda_best_split_info->gain = local_gain; if (best_dir == 1) { for (int i = 0; i < threadIdx_x + 1; ++i) { - (cuda_best_split_info->cat_threshold)[i] = hist_index_buffer_ptr[i] + feature_mfb_offset; + (cuda_best_split_info->cat_threshold)[i] = hist_index_buffer_ptr[i] + task->mfb_offset; } } else { for (int i = 0; i < threadIdx_x + 1; ++i) { - (cuda_best_split_info->cat_threshold)[i] = hist_index_buffer_ptr[used_bin - 1 - i] + feature_mfb_offset; + (cuda_best_split_info->cat_threshold)[i] = hist_index_buffer_ptr[used_bin - 1 - i] + task->mfb_offset; } } cuda_best_split_info->default_left = false; @@ -1210,26 +1160,15 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( } } +template __global__ void FindBestSplitsForLeafKernel_GlobalMemory( // input feature information - const uint32_t* feature_hist_offsets, - const uint8_t* feature_mfb_offsets, - const uint32_t* feature_default_bins, - const uint32_t* feature_num_bins, const int8_t* is_feature_used_bytree, - const int8_t* is_categorical, // input task information - const bool larger_only, const int num_tasks, - const int* task_feature_index, - const uint8_t* task_reverse, - const uint8_t* task_skip_default_bin, - const uint8_t* task_na_as_missing, - const uint8_t* task_out_default_left, + const SplitFindTask* tasks, // input leaf information - const int smaller_leaf_index, const CUDALeafSplitsStruct* smaller_leaf_splits, - const int larger_leaf_index, const CUDALeafSplitsStruct* larger_leaf_splits, // input config parameter values const data_size_t min_data_in_leaf, @@ -1242,37 +1181,35 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( const int max_cat_threshold, const int min_data_per_group, const int max_cat_to_onehot, + // output + CUDASplitInfo* cuda_best_split_info, // buffer hist_t* feature_hist_grad_buffer, hist_t* feature_hist_hess_buffer, hist_t* feature_hist_stat_buffer, - data_size_t* feature_hist_index_buffer, - // output - CUDASplitInfo* cuda_best_split_info) { + data_size_t* feature_hist_index_buffer) { const unsigned int task_index = blockIdx.x % num_tasks; - const bool is_larger = static_cast(blockIdx.x >= num_tasks || larger_only); - const int inner_feature_index = task_feature_index[task_index]; + const SplitFindTask* task = tasks + task_index; + const bool is_larger = static_cast(blockIdx.x >= num_tasks || LARGER_ONLY); const double parent_gain = is_larger ? larger_leaf_splits->gain : smaller_leaf_splits->gain; const double sum_gradients = is_larger ? 
larger_leaf_splits->sum_of_gradients : smaller_leaf_splits->sum_of_gradients; const double sum_hessians = (is_larger ? larger_leaf_splits->sum_of_hessians : smaller_leaf_splits->sum_of_hessians) + 2 * kEpsilon; const double num_data = is_larger ? larger_leaf_splits->num_data_in_leaf : smaller_leaf_splits->num_data_in_leaf; const unsigned int output_offset = is_larger ? (task_index + num_tasks) : task_index; CUDASplitInfo* out = cuda_best_split_info + output_offset; - if (is_feature_used_bytree[inner_feature_index]) { - const hist_t* hist_ptr = (is_larger ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + feature_hist_offsets[inner_feature_index] * 2; - hist_t* hist_grad_buffer_ptr = feature_hist_grad_buffer + feature_hist_offsets[inner_feature_index] * 2; - hist_t* hist_hess_buffer_ptr = feature_hist_hess_buffer + feature_hist_offsets[inner_feature_index] * 2; - hist_t* hist_stat_buffer_ptr = feature_hist_stat_buffer + feature_hist_offsets[inner_feature_index] * 2; - data_size_t* hist_index_buffer_ptr = feature_hist_index_buffer + feature_hist_offsets[inner_feature_index] * 2; - if (is_categorical[inner_feature_index]) { - const bool is_one_hot = feature_num_bins[inner_feature_index] <= max_cat_to_onehot; - FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( + if (is_feature_used_bytree[task->inner_feature_index]) { + const uint32_t hist_offset = task->hist_offset; + const hist_t* hist_ptr = (is_larger ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + hist_offset * 2; + hist_t* hist_grad_buffer_ptr = feature_hist_grad_buffer + hist_offset * 2; + hist_t* hist_hess_buffer_ptr = feature_hist_hess_buffer + hist_offset * 2; + hist_t* hist_stat_buffer_ptr = feature_hist_stat_buffer + hist_offset * 2; + data_size_t* hist_index_buffer_ptr = feature_hist_index_buffer + hist_offset * 2; + if (task->is_categorical) { + FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( // input feature information hist_ptr, - feature_num_bins[inner_feature_index], - feature_mfb_offsets[inner_feature_index], - feature_default_bins[inner_feature_index], - inner_feature_index, + // input task information + task, // input config parameter values lambda_l1, lambda_l2, @@ -1288,8 +1225,6 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( sum_gradients, sum_hessians, num_data, - // input task information - is_one_hot, // buffer hist_grad_buffer_ptr, hist_hess_buffer_ptr, @@ -1298,39 +1233,52 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( // output parameters out); } else { - const bool reverse = static_cast(task_reverse[task_index]); - const bool skip_default_bin = static_cast(task_skip_default_bin[task_index]); - const bool na_as_missing = static_cast(task_na_as_missing[task_index]); - const bool assume_out_default_left = task_out_default_left[task_index]; - FindBestSplitsForLeafKernelInner_GlobalMemory( - // input feature information - hist_ptr, - feature_num_bins[inner_feature_index], - feature_mfb_offsets[inner_feature_index], - feature_default_bins[inner_feature_index], - inner_feature_index, - // input config parameter values - lambda_l1, - lambda_l2, - min_data_in_leaf, - min_sum_hessian_in_leaf, - min_gain_to_split, - // input parent node information - parent_gain, - sum_gradients, - sum_hessians, - num_data, - // input task information - reverse, - skip_default_bin, - na_as_missing, - assume_out_default_left, - // buffer - hist_grad_buffer_ptr, - hist_hess_buffer_ptr, - // output parameters - out); + if (!task->reverse) { + 
FindBestSplitsForLeafKernelInner_GlobalMemory( + // input feature information + hist_ptr, + // input task information + task, + // input config parameter values + lambda_l1, + lambda_l2, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients, + sum_hessians, + num_data, + // output parameters + out, + // buffer + hist_grad_buffer_ptr, + hist_hess_buffer_ptr); + } else { + FindBestSplitsForLeafKernelInner_GlobalMemory( + // input feature information + hist_ptr, + // input task information + task, + // input config parameter values + lambda_l1, + lambda_l2, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients, + sum_hessians, + num_data, + // output parameters + out, + // buffer + hist_grad_buffer_ptr, + hist_hess_buffer_ptr); } + } } else { out->is_valid = false; } @@ -1350,165 +1298,103 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( if (!is_smaller_leaf_valid) { larger_only = true; } + + #define FindBestSplitsForLeafKernel_ARGS \ + cuda_is_feature_used_bytree_, \ + num_tasks_, \ + cuda_split_find_tasks_.RawData(), \ + smaller_leaf_splits, \ + larger_leaf_splits, \ + min_data_in_leaf_, \ + min_sum_hessian_in_leaf_, \ + min_gain_to_split_, \ + lambda_l1_, \ + lambda_l2_, \ + cat_smooth_, \ + cat_l2_, \ + max_cat_threshold_, \ + min_data_per_group_, \ + max_cat_to_onehot_, \ + cuda_best_split_info_ + + #define GlobalMemory_Buffer_ARGS \ + cuda_feature_hist_grad_buffer_, \ + cuda_feature_hist_hess_buffer_, \ + cuda_feature_hist_stat_buffer_, \ + cuda_feature_hist_index_buffer_ + if (!use_global_memory_) { - if (!larger_only) { - FindBestSplitsForLeafKernel<<>>( - // input feature information - cuda_feature_hist_offsets_, - cuda_feature_mfb_offsets_, - cuda_feature_default_bins_, - cuda_feature_num_bins_, - cuda_is_feature_used_bytree_, - cuda_is_categorical_, - // input task information - larger_only, - num_tasks_, - cuda_task_feature_index_, - cuda_task_reverse_, - cuda_task_skip_default_bin_, - cuda_task_na_as_missing_, - cuda_task_out_default_left_, - // input leaf information - smaller_leaf_index, - smaller_leaf_splits, - larger_leaf_index, - larger_leaf_splits, - // configuration parameter values - min_data_in_leaf_, - min_sum_hessian_in_leaf_, - min_gain_to_split_, - lambda_l1_, - lambda_l2_, - cat_smooth_, - cat_l2_, - max_cat_threshold_, - min_data_per_group_, - max_cat_to_onehot_, - // output parameters - cuda_best_split_info_); + if (!extra_trees_) { + if (!larger_only) { + FindBestSplitsForLeafKernel + <<>> + (FindBestSplitsForLeafKernel_ARGS); + } else { + FindBestSplitsForLeafKernel + <<>> + (FindBestSplitsForLeafKernel_ARGS); + } + } else { + if (!larger_only) { + FindBestSplitsForLeafKernel + <<>> + (FindBestSplitsForLeafKernel_ARGS); + } else { + FindBestSplitsForLeafKernel + <<>> + (FindBestSplitsForLeafKernel_ARGS); + } } SynchronizeCUDADevice(__FILE__, __LINE__); if (larger_leaf_index >= 0) { - FindBestSplitsForLeafKernel<<>>( - // input feature information - cuda_feature_hist_offsets_, - cuda_feature_mfb_offsets_, - cuda_feature_default_bins_, - cuda_feature_num_bins_, - cuda_is_feature_used_bytree_, - cuda_is_categorical_, - // input task information - true, - num_tasks_, - cuda_task_feature_index_, - cuda_task_reverse_, - cuda_task_skip_default_bin_, - cuda_task_na_as_missing_, - cuda_task_out_default_left_, - // input leaf information - smaller_leaf_index, - smaller_leaf_splits, - 
larger_leaf_index, - larger_leaf_splits, - // configuration parameter values - min_data_in_leaf_, - min_sum_hessian_in_leaf_, - min_gain_to_split_, - lambda_l1_, - lambda_l2_, - cat_smooth_, - cat_l2_, - max_cat_threshold_, - min_data_per_group_, - max_cat_to_onehot_, - // output parameters - cuda_best_split_info_); + if (!extra_trees_) { + FindBestSplitsForLeafKernel + <<>> + (FindBestSplitsForLeafKernel_ARGS); + } else { + FindBestSplitsForLeafKernel + <<>> + (FindBestSplitsForLeafKernel_ARGS); + } } } else { - if (!larger_only) { - FindBestSplitsForLeafKernel_GlobalMemory<<>>( - // input feature information - cuda_feature_hist_offsets_, - cuda_feature_mfb_offsets_, - cuda_feature_default_bins_, - cuda_feature_num_bins_, - cuda_is_feature_used_bytree_, - cuda_is_categorical_, - // input task information - larger_only, - num_tasks_, - cuda_task_feature_index_, - cuda_task_reverse_, - cuda_task_skip_default_bin_, - cuda_task_na_as_missing_, - cuda_task_out_default_left_, - // input leaf information - smaller_leaf_index, - smaller_leaf_splits, - larger_leaf_index, - larger_leaf_splits, - // configuration parameter values - min_data_in_leaf_, - min_sum_hessian_in_leaf_, - min_gain_to_split_, - lambda_l1_, - lambda_l2_, - cat_smooth_, - cat_l2_, - max_cat_threshold_, - min_data_per_group_, - max_cat_to_onehot_, - // buffer - cuda_feature_hist_grad_buffer_, - cuda_feature_hist_hess_buffer_, - cuda_feature_hist_stat_buffer_, - cuda_feature_hist_index_buffer_, - // output parameters - cuda_best_split_info_); + if (!extra_trees_) { + if (!larger_only) { + FindBestSplitsForLeafKernel_GlobalMemory + <<>> + (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS); + } else { + FindBestSplitsForLeafKernel_GlobalMemory + <<>> + (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS); + } + } else { + if (!larger_only) { + FindBestSplitsForLeafKernel_GlobalMemory + <<>> + (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS); + } else { + FindBestSplitsForLeafKernel_GlobalMemory + <<>> + (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS); + } } SynchronizeCUDADevice(__FILE__, __LINE__); if (larger_leaf_index >= 0) { - FindBestSplitsForLeafKernel_GlobalMemory<<>>( - // input feature information - cuda_feature_hist_offsets_, - cuda_feature_mfb_offsets_, - cuda_feature_default_bins_, - cuda_feature_num_bins_, - cuda_is_feature_used_bytree_, - cuda_is_categorical_, - // input task information - true, - num_tasks_, - cuda_task_feature_index_, - cuda_task_reverse_, - cuda_task_skip_default_bin_, - cuda_task_na_as_missing_, - cuda_task_out_default_left_, - // input leaf information - smaller_leaf_index, - smaller_leaf_splits, - larger_leaf_index, - larger_leaf_splits, - // configuration parameter values - min_data_in_leaf_, - min_sum_hessian_in_leaf_, - min_gain_to_split_, - lambda_l1_, - lambda_l2_, - cat_smooth_, - cat_l2_, - max_cat_threshold_, - min_data_per_group_, - max_cat_to_onehot_, - // buffer - cuda_feature_hist_grad_buffer_, - cuda_feature_hist_hess_buffer_, - cuda_feature_hist_stat_buffer_, - cuda_feature_hist_index_buffer_, - // output parameters - cuda_best_split_info_); + if (!extra_trees_) { + FindBestSplitsForLeafKernel_GlobalMemory + <<>> + (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS); + } else { + FindBestSplitsForLeafKernel_GlobalMemory + <<>> + (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS); + } } } + + #undef FindBestSplitsForLeafKernel_ARGS + #undef GlobalMemory_Buffer_ARGS } __device__ void ReduceBestSplit(bool* found, double* 
gain, uint32_t* shared_read_index, @@ -1531,9 +1417,8 @@ __device__ void ReduceBestSplit(bool* found, double* gain, uint32_t* shared_read __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const int larger_leaf_index, CUDASplitInfo* cuda_leaf_best_split_info, // input parameters - const int* cuda_task_feature_index, + const SplitFindTask* tasks, const CUDASplitInfo* cuda_best_split_info, - const uint32_t* cuda_feature_default_bins, const int num_tasks, const int num_tasks_aligned, const int num_blocks_per_leaf, @@ -1571,8 +1456,8 @@ __global__ void SyncBestSplitForLeafKernel(const int smaller_leaf_index, const i const CUDASplitInfo* best_split_info = cuda_best_split_info + best_read_index; if (best_split_info->is_valid) { *cuda_split_info = *best_split_info; - cuda_split_info->inner_feature_index = is_smaller ? cuda_task_feature_index[best_read_index] : - cuda_task_feature_index[static_cast(best_read_index) - num_tasks]; + cuda_split_info->inner_feature_index = is_smaller ? tasks[best_read_index].inner_feature_index : + tasks[static_cast(best_read_index) - num_tasks].inner_feature_index; cuda_split_info->is_valid = true; } else { cuda_split_info->gain = kMinScore; @@ -1687,9 +1572,8 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( host_smaller_leaf_index, host_larger_leaf_index, cuda_leaf_best_split_info_, - cuda_task_feature_index_, + cuda_split_find_tasks_.RawData(), cuda_best_split_info_, - cuda_feature_default_bins_, num_tasks_, num_tasks_aligned, num_blocks_per_leaf, @@ -1709,9 +1593,8 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( host_smaller_leaf_index, host_larger_leaf_index, cuda_leaf_best_split_info_, - cuda_task_feature_index_, + cuda_split_find_tasks_.RawData(), cuda_best_split_info_, - cuda_feature_default_bins_, num_tasks_, num_tasks_aligned, num_blocks_per_leaf, @@ -1732,9 +1615,8 @@ void CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel( host_smaller_leaf_index, host_larger_leaf_index, cuda_leaf_best_split_info_, - cuda_task_feature_index_, + cuda_split_find_tasks_.RawData(), cuda_best_split_info_, - cuda_feature_default_bins_, num_tasks_, num_tasks_aligned, num_blocks_per_leaf, @@ -1858,6 +1740,21 @@ void CUDABestSplitFinder::LaunchAllocateCatVectorsKernel(CUDASplitInfo* cuda_spl cuda_split_infos, len, max_num_categories_in_split, has_categorical_feature_); } +__global__ void InitCUDARandomKernel( + const int seed, + const int num_tasks, + CUDARandom* cuda_randoms) { + const int task_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (task_index < num_tasks) { + cuda_randoms[task_index].SetSeed(seed + task_index); + } +} + +void CUDABestSplitFinder::LaunchInitCUDARandomKernel() { + const int num_blocks = (num_tasks_ + NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER - 1) / NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER; + InitCUDARandomKernel<<>>(extra_seed_, num_tasks_, cuda_randoms_.RawData()); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 1040eaeb0c95..6af881435247 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -14,6 +14,7 @@ #include +#include #include #include "cuda_leaf_splits.hpp" @@ -24,6 +25,22 @@ namespace LightGBM { +struct SplitFindTask { + int inner_feature_index; + bool reverse; + bool skip_default_bin; + bool na_as_missing; + bool assume_out_default_left; + bool is_categorical; + bool is_one_hot; + uint32_t 
hist_offset; + uint8_t mfb_offset; + uint32_t num_bin; + uint32_t default_bin; + CUDARandom* cuda_random; + int rand_threshold; +}; + class CUDABestSplitFinder { public: CUDABestSplitFinder( @@ -105,6 +122,8 @@ class CUDABestSplitFinder { void LaunchAllocateCatVectorsKernel(CUDASplitInfo* cuda_split_infos, size_t len) const; + void LaunchInitCUDARandomKernel(); + // Host memory int num_features_; int num_leaves_; @@ -124,14 +143,11 @@ class CUDABestSplitFinder { int max_cat_threshold_; int min_data_per_group_; int max_cat_to_onehot_; + bool extra_trees_; + int extra_seed_; std::vector cuda_streams_; // for best split find tasks - std::vector host_task_feature_index_; - std::vector host_task_reverse_; - std::vector host_task_skip_default_bin_; - std::vector host_task_na_as_missing_; - std::vector host_task_out_default_left_; - std::vector host_task_one_hot_; + std::vector split_find_tasks_; int num_tasks_; // use global memory bool use_global_memory_; @@ -149,26 +165,18 @@ class CUDABestSplitFinder { CUDASplitInfo* cuda_leaf_best_split_info_; // for best split information when finding best split CUDASplitInfo* cuda_best_split_info_; - // feature information - uint32_t* cuda_feature_hist_offsets_; - uint8_t* cuda_feature_mfb_offsets_; - uint32_t* cuda_feature_default_bins_; - uint32_t* cuda_feature_num_bins_; // best split information buffer, to be copied to host int* cuda_best_split_info_buffer_; // find best split task information - int* cuda_task_feature_index_; - uint8_t* cuda_task_reverse_; - uint8_t* cuda_task_skip_default_bin_; - uint8_t* cuda_task_na_as_missing_; - uint8_t* cuda_task_out_default_left_; + CUDAVector cuda_split_find_tasks_; int8_t* cuda_is_feature_used_bytree_; // used when finding best split with global memory hist_t* cuda_feature_hist_grad_buffer_; hist_t* cuda_feature_hist_hess_buffer_; hist_t* cuda_feature_hist_stat_buffer_; data_size_t* cuda_feature_hist_index_buffer_; - int8_t* cuda_is_categorical_; + // used for extremely randomized trees + CUDAVector cuda_randoms_; // CUDA memory, held by other object const hist_t* cuda_hist_; diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index d94acafa3f40..ef2cad29b35a 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -186,12 +186,6 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, cuda_bitset_inner_, cuda_bitset_inner_len_); } else { - if (train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]) == 7) { - Log::Warning("inner_feature_index = %d, leaf_best_split_threshold_[best_leaf_index_] = %d", - leaf_best_split_feature_[best_leaf_index_], leaf_best_split_threshold_[best_leaf_index_]); - Log::Warning("real threshold = %f", train_data_->RealThreshold(leaf_best_split_feature_[best_leaf_index_], - leaf_best_split_threshold_[best_leaf_index_])); - } right_leaf_index = tree->Split(best_leaf_index_, train_data_->RealFeatureIndex(leaf_best_split_feature_[best_leaf_index_]), train_data_->RealThreshold(leaf_best_split_feature_[best_leaf_index_], From 015e099415c1342f6de4a8d70c3fde19bd81b09d Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 8 Nov 2021 10:18:17 +0000 Subject: [PATCH 111/166] pre-allocate space for vector split_find_tasks_ in CUDABestSplitFinder --- include/LightGBM/cuda/cuda_utils.h | 4 +- .../cuda/cuda_best_split_finder.cpp | 43 +++++++++++++------ 2 files changed, 34 insertions(+), 13 deletions(-) diff --git 
a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 8b3646206b6e..a4efad6ebaca 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -112,7 +112,9 @@ class CUDAVector { } void Resize(size_t size) { - CHECK_GT(size, 0); + if (size == 0) { + Clear(); + } T* new_data = nullptr; AllocateCUDAMemory(&new_data, size, __FILE__, __LINE__); if (size_ > 0 && data_ != nullptr) { diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index e77be2d42431..4edae1e3ffbc 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -107,14 +107,25 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { AllocateCUDAMemory(&cuda_is_feature_used_bytree_, static_cast(num_features_), __FILE__, __LINE__); // intialize split find task information (a split find task is one pass through the histogram of a feature) - split_find_tasks_.clear(); + num_tasks_ = 0; + for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { + const uint32_t num_bin = feature_num_bins_[inner_feature_index]; + const MissingType missing_type = feature_missing_type_[inner_feature_index]; + if (num_bin > 2 && missing_type != MissingType::None && !is_categorical_[inner_feature_index]) { + num_tasks_ += 2; + } else { + ++num_tasks_; + } + } + split_find_tasks_.resize(num_tasks_); + split_find_tasks_.shrink_to_fit(); + int cur_task_index = 0; for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { const uint32_t num_bin = feature_num_bins_[inner_feature_index]; const MissingType missing_type = feature_missing_type_[inner_feature_index]; if (num_bin > 2 && missing_type != MissingType::None && !is_categorical_[inner_feature_index]) { if (missing_type == MissingType::Zero) { - split_find_tasks_.emplace_back(); - SplitFindTask& new_task = split_find_tasks_.back(); + SplitFindTask& new_task = split_find_tasks_[cur_task_index]; new_task.reverse = false; new_task.skip_default_bin = true; new_task.na_as_missing = false; @@ -127,8 +138,10 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; new_task.default_bin = feature_default_bins_[inner_feature_index]; new_task.num_bin = num_bin; + new_task.cuda_random = nullptr; + ++cur_task_index; - split_find_tasks_.emplace_back(); + new_task = split_find_tasks_[cur_task_index]; new_task.reverse = true; new_task.skip_default_bin = true; new_task.na_as_missing = false; @@ -141,9 +154,10 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { new_task.default_bin = feature_default_bins_[inner_feature_index]; new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; new_task.num_bin = num_bin; + new_task.cuda_random = nullptr; + ++cur_task_index; } else { - split_find_tasks_.emplace_back(); - SplitFindTask& new_task = split_find_tasks_.back(); + SplitFindTask& new_task = split_find_tasks_[cur_task_index]; new_task.reverse = false; new_task.skip_default_bin = false; new_task.na_as_missing = true; @@ -156,8 +170,10 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; new_task.default_bin = feature_default_bins_[inner_feature_index]; new_task.num_bin = num_bin; + new_task.cuda_random = nullptr; + ++cur_task_index; - split_find_tasks_.emplace_back(); + new_task = split_find_tasks_[cur_task_index]; new_task.reverse = true; 
new_task.skip_default_bin = false; new_task.na_as_missing = true; @@ -170,15 +186,16 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; new_task.default_bin = feature_default_bins_[inner_feature_index]; new_task.num_bin = num_bin; + new_task.cuda_random = nullptr; + ++cur_task_index; } } else { - split_find_tasks_.emplace_back(); - SplitFindTask& new_task = split_find_tasks_.back(); + SplitFindTask& new_task = split_find_tasks_[cur_task_index]; const uint32_t num_bin = feature_num_bins_[inner_feature_index]; if (is_categorical_[inner_feature_index]) { new_task.reverse = false; new_task.is_categorical = true; - new_task.is_one_hot = (static_cast(num_bin) < max_cat_to_onehot_); + new_task.is_one_hot = (static_cast(num_bin) <= max_cat_to_onehot_); } else { new_task.reverse = true; new_task.is_categorical = false; @@ -187,7 +204,7 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { new_task.skip_default_bin = false; new_task.na_as_missing = false; new_task.inner_feature_index = inner_feature_index; - if (missing_type != MissingType::NaN && is_categorical_[inner_feature_index]) { + if (missing_type == MissingType::NaN && !is_categorical_[inner_feature_index]) { new_task.assume_out_default_left = true; } else { new_task.assume_out_default_left = false; @@ -196,9 +213,11 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; new_task.default_bin = feature_default_bins_[inner_feature_index]; new_task.num_bin = num_bin; + new_task.cuda_random = nullptr; + ++cur_task_index; } } - num_tasks_ = static_cast(split_find_tasks_.size()); + CHECK_EQ(cur_task_index, static_cast(split_find_tasks_.size())); if (extra_trees_) { cuda_randoms_.Resize(num_tasks_); From 4c260d2f529a9840291a4311baf94a4c1053e09f Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 8 Nov 2021 13:40:22 +0000 Subject: [PATCH 112/166] fix misuse of reference --- .../cuda/cuda_best_split_finder.cpp | 106 +++++++++--------- .../cuda/cuda_best_split_finder.cu | 8 ++ 2 files changed, 61 insertions(+), 53 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 4edae1e3ffbc..6742edcc6499 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -125,68 +125,68 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { const MissingType missing_type = feature_missing_type_[inner_feature_index]; if (num_bin > 2 && missing_type != MissingType::None && !is_categorical_[inner_feature_index]) { if (missing_type == MissingType::Zero) { - SplitFindTask& new_task = split_find_tasks_[cur_task_index]; - new_task.reverse = false; - new_task.skip_default_bin = true; - new_task.na_as_missing = false; - new_task.inner_feature_index = inner_feature_index; - new_task.assume_out_default_left = false; - new_task.is_categorical = false; + SplitFindTask* new_task = &split_find_tasks_[cur_task_index]; + new_task->reverse = false; + new_task->skip_default_bin = true; + new_task->na_as_missing = false; + new_task->inner_feature_index = inner_feature_index; + new_task->assume_out_default_left = false; + new_task->is_categorical = false; uint32_t num_bin = feature_num_bins_[inner_feature_index]; - new_task.is_one_hot = false; - new_task.hist_offset = feature_hist_offsets_[inner_feature_index]; - new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; - new_task.default_bin = 
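The "fix misuse of reference" change in this hunk addresses a classic C++ pitfall: a reference is bound exactly once, so after ++cur_task_index the assignment new_task = split_find_tasks_[cur_task_index] does not re-bind new_task to the next element. Instead it copies the next, still-empty element over the task that was just filled in, and the following field writes again target the old slot, leaving the second task of the feature uninitialized. Re-seating requires a pointer, which is what the surrounding hunk switches to. A minimal standalone illustration (hypothetical Task type, not LightGBM code):

#include <cassert>
#include <vector>

struct Task { int id = -1; };

int main() {
  std::vector<Task> tasks(2);
  int cur = 0;

  Task& t = tasks[cur];
  t.id = 7;
  ++cur;
  t = tasks[cur];   // BUG: copies tasks[1] (id == -1) into tasks[0]; t still aliases tasks[0]
  t.id = 8;         // writes tasks[0] again; tasks[1] is never initialized
  assert(tasks[1].id == -1);

  // The fix used in this patch: hold a pointer, which can be re-pointed to the next element.
  cur = 0;
  Task* p = &tasks[cur];
  p->id = 7;
  ++cur;
  p = &tasks[cur];  // re-seat the pointer to the next task
  p->id = 8;
  assert(tasks[0].id == 7 && tasks[1].id == 8);
  return 0;
}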
feature_default_bins_[inner_feature_index]; - new_task.num_bin = num_bin; - new_task.cuda_random = nullptr; + new_task->is_one_hot = false; + new_task->hist_offset = feature_hist_offsets_[inner_feature_index]; + new_task->mfb_offset = feature_mfb_offsets_[inner_feature_index]; + new_task->default_bin = feature_default_bins_[inner_feature_index]; + new_task->num_bin = num_bin; + new_task->cuda_random = nullptr; ++cur_task_index; - new_task = split_find_tasks_[cur_task_index]; - new_task.reverse = true; - new_task.skip_default_bin = true; - new_task.na_as_missing = false; - new_task.inner_feature_index = inner_feature_index; - new_task.assume_out_default_left = true; - new_task.is_categorical = false; + new_task = &split_find_tasks_[cur_task_index]; + new_task->reverse = true; + new_task->skip_default_bin = true; + new_task->na_as_missing = false; + new_task->inner_feature_index = inner_feature_index; + new_task->assume_out_default_left = true; + new_task->is_categorical = false; num_bin = feature_num_bins_[inner_feature_index]; - new_task.is_one_hot = false; - new_task.hist_offset = feature_hist_offsets_[inner_feature_index]; - new_task.default_bin = feature_default_bins_[inner_feature_index]; - new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; - new_task.num_bin = num_bin; - new_task.cuda_random = nullptr; + new_task->is_one_hot = false; + new_task->hist_offset = feature_hist_offsets_[inner_feature_index]; + new_task->default_bin = feature_default_bins_[inner_feature_index]; + new_task->mfb_offset = feature_mfb_offsets_[inner_feature_index]; + new_task->num_bin = num_bin; + new_task->cuda_random = nullptr; ++cur_task_index; } else { - SplitFindTask& new_task = split_find_tasks_[cur_task_index]; - new_task.reverse = false; - new_task.skip_default_bin = false; - new_task.na_as_missing = true; - new_task.inner_feature_index = inner_feature_index; - new_task.assume_out_default_left = false; - new_task.is_categorical = false; + SplitFindTask* new_task = &split_find_tasks_[cur_task_index]; + new_task->reverse = false; + new_task->skip_default_bin = false; + new_task->na_as_missing = true; + new_task->inner_feature_index = inner_feature_index; + new_task->assume_out_default_left = false; + new_task->is_categorical = false; uint32_t num_bin = feature_num_bins_[inner_feature_index]; - new_task.is_one_hot = false; - new_task.hist_offset = feature_hist_offsets_[inner_feature_index]; - new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; - new_task.default_bin = feature_default_bins_[inner_feature_index]; - new_task.num_bin = num_bin; - new_task.cuda_random = nullptr; + new_task->is_one_hot = false; + new_task->hist_offset = feature_hist_offsets_[inner_feature_index]; + new_task->mfb_offset = feature_mfb_offsets_[inner_feature_index]; + new_task->default_bin = feature_default_bins_[inner_feature_index]; + new_task->num_bin = num_bin; + new_task->cuda_random = nullptr; ++cur_task_index; - new_task = split_find_tasks_[cur_task_index]; - new_task.reverse = true; - new_task.skip_default_bin = false; - new_task.na_as_missing = true; - new_task.inner_feature_index = inner_feature_index; - new_task.assume_out_default_left = true; - new_task.is_categorical = false; + new_task = &split_find_tasks_[cur_task_index]; + new_task->reverse = true; + new_task->skip_default_bin = false; + new_task->na_as_missing = true; + new_task->inner_feature_index = inner_feature_index; + new_task->assume_out_default_left = true; + new_task->is_categorical = false; num_bin = 
feature_num_bins_[inner_feature_index]; - new_task.is_one_hot = false; - new_task.hist_offset = feature_hist_offsets_[inner_feature_index]; - new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; - new_task.default_bin = feature_default_bins_[inner_feature_index]; - new_task.num_bin = num_bin; - new_task.cuda_random = nullptr; + new_task->is_one_hot = false; + new_task->hist_offset = feature_hist_offsets_[inner_feature_index]; + new_task->mfb_offset = feature_mfb_offsets_[inner_feature_index]; + new_task->default_bin = feature_default_bins_[inner_feature_index]; + new_task->num_bin = num_bin; + new_task->cuda_random = nullptr; ++cur_task_index; } } else { @@ -204,7 +204,7 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { new_task.skip_default_bin = false; new_task.na_as_missing = false; new_task.inner_feature_index = inner_feature_index; - if (missing_type == MissingType::NaN && !is_categorical_[inner_feature_index]) { + if (missing_type != MissingType::NaN && !is_categorical_[inner_feature_index]) { new_task.assume_out_default_left = true; } else { new_task.assume_out_default_left = false; diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 60e8d5062522..965ff9453702 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -215,6 +215,9 @@ __device__ void FindBestSplitsForLeafKernelInner( (task->skip_default_bin && (task->num_bin - 1 - threadIdx_x) == static_cast(task->default_bin)) : (task->skip_default_bin && (threadIdx_x + task->mfb_offset) == static_cast(task->default_bin)); const uint32_t feature_num_bin_minus_offset = task->num_bin - task->mfb_offset; + if (threadIdx.x == 0) { + printf("task->na_as_missing = %d\n", static_cast(task->na_as_missing)); + } if (!REVERSE) { if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { const unsigned int bin_offset = threadIdx_x << 1; @@ -245,6 +248,8 @@ __device__ void FindBestSplitsForLeafKernelInner( const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); const double sum_left_gradient = sum_gradients - sum_right_gradient; const double sum_left_hessian = sum_hessians - sum_right_hessian; + printf("reverse = %d, threadIdx.x = %d, sum_left_gradient = %f, sum_left_hessian = %f, sum_right_gradient = %f, sum_right_hessian = %f\n", + static_cast(REVERSE), threadIdx.x, sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian); const data_size_t left_count = num_data - right_count; if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { @@ -628,6 +633,9 @@ __global__ void FindBestSplitsForLeafKernel( CUDASplitInfo* cuda_best_split_info) { const unsigned int task_index = blockIdx.x % num_tasks; const SplitFindTask* task = tasks + task_index; + if (threadIdx.x == 0) { + printf("task %d, task->na_as_missing = %d\n", task_index, static_cast(task->na_as_missing)); + } const bool is_larger = static_cast(blockIdx.x >= num_tasks || LARGER_ONLY); const int inner_feature_index = task->inner_feature_index; const double parent_gain = is_larger ? 
larger_leaf_splits->gain : smaller_leaf_splits->gain; From 89d8214f872052e044b8470997f98fe8b57cfc82 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 8 Nov 2021 13:57:01 +0000 Subject: [PATCH 113/166] remove useless changes --- src/boosting/gbdt.h | 1 + src/treelearner/cuda/cuda_best_split_finder.cu | 8 -------- src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp | 2 +- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 9ef33ca53ff7..472ea1707104 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -495,6 +495,7 @@ class GBDT : public GBDTBase { /*! \brief Second order derivative of training data */ std::vector> hessians_; #endif + /*! \brief Store the indices of in-bag data */ std::vector> bag_data_indices_; /*! \brief Number of in-bag data */ diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 965ff9453702..60e8d5062522 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -215,9 +215,6 @@ __device__ void FindBestSplitsForLeafKernelInner( (task->skip_default_bin && (task->num_bin - 1 - threadIdx_x) == static_cast(task->default_bin)) : (task->skip_default_bin && (threadIdx_x + task->mfb_offset) == static_cast(task->default_bin)); const uint32_t feature_num_bin_minus_offset = task->num_bin - task->mfb_offset; - if (threadIdx.x == 0) { - printf("task->na_as_missing = %d\n", static_cast(task->na_as_missing)); - } if (!REVERSE) { if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { const unsigned int bin_offset = threadIdx_x << 1; @@ -248,8 +245,6 @@ __device__ void FindBestSplitsForLeafKernelInner( const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); const double sum_left_gradient = sum_gradients - sum_right_gradient; const double sum_left_hessian = sum_hessians - sum_right_hessian; - printf("reverse = %d, threadIdx.x = %d, sum_left_gradient = %f, sum_left_hessian = %f, sum_right_gradient = %f, sum_right_hessian = %f\n", - static_cast(REVERSE), threadIdx.x, sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian); const data_size_t left_count = num_data - right_count; if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { @@ -633,9 +628,6 @@ __global__ void FindBestSplitsForLeafKernel( CUDASplitInfo* cuda_best_split_info) { const unsigned int task_index = blockIdx.x % num_tasks; const SplitFindTask* task = tasks + task_index; - if (threadIdx.x == 0) { - printf("task %d, task->na_as_missing = %d\n", task_index, static_cast(task->na_as_missing)); - } const bool is_larger = static_cast(blockIdx.x >= num_tasks || LARGER_ONLY); const int inner_feature_index = task->inner_feature_index; const double parent_gain = is_larger ? 
larger_leaf_splits->gain : smaller_leaf_splits->gain; diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index ef2cad29b35a..31065d780756 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -216,7 +216,7 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, &leaf_sum_hessians_[right_leaf_index], &sum_left_gradients, &sum_right_gradients); - //CheckSplitValid(best_leaf_index_, right_leaf_index, sum_left_gradients, sum_right_gradients); + CheckSplitValid(best_leaf_index_, right_leaf_index, sum_left_gradients, sum_right_gradients); smaller_leaf_index_ = (leaf_num_data_[best_leaf_index_] < leaf_num_data_[right_leaf_index] ? best_leaf_index_ : right_leaf_index); larger_leaf_index_ = (smaller_leaf_index_ == best_leaf_index_ ? right_leaf_index : best_leaf_index_); global_timer.Stop("CUDASingleGPUTreeLearner::Split"); From 54bc66a9c6f115cc2e61c3c20ac3648794f31615 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 9 Nov 2021 04:59:25 +0000 Subject: [PATCH 114/166] add support for path smoothing --- include/LightGBM/cuda/cuda_tree.hpp | 2 + .../cuda/cuda_best_split_finder.cpp | 17 +- .../cuda/cuda_best_split_finder.cu | 627 +++++++++--------- .../cuda/cuda_best_split_finder.hpp | 33 +- src/treelearner/cuda/cuda_leaf_splits.cpp | 3 +- src/treelearner/cuda/cuda_leaf_splits.cu | 23 +- src/treelearner/cuda/cuda_leaf_splits.hpp | 88 ++- .../cuda/cuda_single_gpu_tree_learner.cpp | 10 +- .../cuda/cuda_single_gpu_tree_learner.cu | 59 +- .../cuda/cuda_single_gpu_tree_learner.hpp | 5 +- 10 files changed, 527 insertions(+), 340 deletions(-) diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index 9b1d271faf92..b8fb47f38b8f 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -56,6 +56,8 @@ class CUDATree : public Tree { uint32_t* cuda_bitset_inner, size_t cuda_bitset_inner_len); + const int* cuda_leaf_parent() const { return cuda_leaf_parent_; } + const int* cuda_left_child() const { return cuda_left_child_; } const int* cuda_right_child() const { return cuda_right_child_; } diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 6742edcc6499..aec614c3c977 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -31,6 +31,8 @@ CUDABestSplitFinder::CUDABestSplitFinder( max_cat_to_onehot_(config->max_cat_to_onehot), extra_trees_(config->extra_trees), extra_seed_(config->extra_seed), + use_smoothing_(config->path_smooth > 0), + path_smooth_(config->path_smooth), num_total_bin_(feature_hist_offsets.empty() ? 
0 : static_cast(feature_hist_offsets.back())), cuda_hist_(cuda_hist) { InitFeatureMetaInfo(train_data); @@ -138,7 +140,6 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { new_task->mfb_offset = feature_mfb_offsets_[inner_feature_index]; new_task->default_bin = feature_default_bins_[inner_feature_index]; new_task->num_bin = num_bin; - new_task->cuda_random = nullptr; ++cur_task_index; new_task = &split_find_tasks_[cur_task_index]; @@ -154,7 +155,6 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { new_task->default_bin = feature_default_bins_[inner_feature_index]; new_task->mfb_offset = feature_mfb_offsets_[inner_feature_index]; new_task->num_bin = num_bin; - new_task->cuda_random = nullptr; ++cur_task_index; } else { SplitFindTask* new_task = &split_find_tasks_[cur_task_index]; @@ -170,7 +170,6 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { new_task->mfb_offset = feature_mfb_offsets_[inner_feature_index]; new_task->default_bin = feature_default_bins_[inner_feature_index]; new_task->num_bin = num_bin; - new_task->cuda_random = nullptr; ++cur_task_index; new_task = &split_find_tasks_[cur_task_index]; @@ -186,7 +185,6 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { new_task->mfb_offset = feature_mfb_offsets_[inner_feature_index]; new_task->default_bin = feature_default_bins_[inner_feature_index]; new_task->num_bin = num_bin; - new_task->cuda_random = nullptr; ++cur_task_index; } } else { @@ -213,19 +211,14 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { new_task.mfb_offset = feature_mfb_offsets_[inner_feature_index]; new_task.default_bin = feature_default_bins_[inner_feature_index]; new_task.num_bin = num_bin; - new_task.cuda_random = nullptr; ++cur_task_index; } } CHECK_EQ(cur_task_index, static_cast(split_find_tasks_.size())); if (extra_trees_) { - cuda_randoms_.Resize(num_tasks_); + cuda_randoms_.Resize(num_tasks_ * 2); LaunchInitCUDARandomKernel(); - for (int task_index = 0; task_index < num_tasks_; ++task_index) { - split_find_tasks_[task_index].cuda_random = cuda_randoms_.RawData() + task_index; - split_find_tasks_[task_index].rand_threshold = 0; - } } const int num_task_blocks = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; @@ -281,8 +274,8 @@ void CUDABestSplitFinder::ResetConfig(const Config* config) { void CUDABestSplitFinder::BeforeTrain(const std::vector& is_feature_used_bytree) { CopyFromHostToCUDADevice(cuda_is_feature_used_bytree_, - is_feature_used_bytree.data(), - is_feature_used_bytree.size(), __FILE__, __LINE__); + is_feature_used_bytree.data(), + is_feature_used_bytree.size(), __FILE__, __LINE__); } void CUDABestSplitFinder::FindBestSplitsForLeaf( diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 60e8d5062522..7cf984721461 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -120,71 +120,17 @@ __device__ int ReduceBestGainForLeaves(double gain, int leaf_index, double* shar return leaf_index; } -__device__ double ThresholdL1(double s, double l1) { - const double reg_s = fmax(0.0, fabs(s) - l1); - if (s >= 0.0f) { - return reg_s; - } else { - return -reg_s; - } -} - -__device__ double CUDABestSplitFinder::CalculateSplittedLeafOutput(double sum_gradients, - double sum_hessians, double l1, const bool use_l1, - double l2) { - double ret; - if (use_l1) { - ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2); - } else { - ret = -sum_gradients / (sum_hessians + l2); - } - return 
ret; -} - -__device__ double GetLeafGainGivenOutput(double sum_gradients, - double sum_hessians, double l1, const bool use_l1, - double l2, double output) { - if (use_l1) { - const double sg_l1 = ThresholdL1(sum_gradients, l1); - return -(2.0 * sg_l1 * output + (sum_hessians + l2) * output * output); - } else { - return -(2.0 * sum_gradients * output + - (sum_hessians + l2) * output * output); - } -} - -__device__ double GetLeafGain(double sum_gradients, double sum_hessians, - double l1, const bool use_l1, double l2) { - if (use_l1) { - const double sg_l1 = ThresholdL1(sum_gradients, l1); - return (sg_l1 * sg_l1) / (sum_hessians + l2); - } else { - return (sum_gradients * sum_gradients) / (sum_hessians + l2); - } -} - -__device__ double GetSplitGains(double sum_left_gradients, - double sum_left_hessians, - double sum_right_gradients, - double sum_right_hessians, - double l1, const bool use_l1, double l2) { - return GetLeafGain(sum_left_gradients, - sum_left_hessians, - l1, use_l1, l2) + - GetLeafGain(sum_right_gradients, - sum_right_hessians, - l1, use_l1, l2); -} - -template +template __device__ void FindBestSplitsForLeafKernelInner( // input feature information const hist_t* feature_hist_ptr, // input task information const SplitFindTask* task, + CUDARandom* cuda_random, // input config parameter values const double lambda_l1, const double lambda_l2, + const double path_smooth, const data_size_t min_data_in_leaf, const double min_sum_hessian_in_leaf, const double min_gain_to_split, @@ -193,10 +139,10 @@ __device__ void FindBestSplitsForLeafKernelInner( const double sum_gradients, const double sum_hessians, const data_size_t num_data, + const double parent_output, // output parameters CUDASplitInfo* cuda_best_split_info) { const double cnt_factor = num_data / sum_hessians; - const bool use_l1 = lambda_l1 > 0.0f; const double min_gain_shift = parent_gain + min_gain_to_split; cuda_best_split_info->is_valid = false; @@ -206,6 +152,12 @@ __device__ void FindBestSplitsForLeafKernelInner( double local_gain = 0.0f; bool threshold_found = false; uint32_t threshold_value = 0; + __shared__ int rand_threshold; + if (USE_RAND && threadIdx.x == 0) { + if (task->num_bin - 2 > 0) { + rand_threshold = cuda_random->NextInt(0, task->num_bin - 2); + } + } __shared__ uint32_t best_thread_index; __shared__ double shared_double_buffer[32]; __shared__ bool shared_bool_buffer[32]; @@ -247,11 +199,12 @@ __device__ void FindBestSplitsForLeafKernelInner( const double sum_left_hessian = sum_hessians - sum_right_hessian; const data_size_t left_count = num_data - right_count; if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && - sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { - double current_gain = GetSplitGains( + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf && + (!USE_RAND || static_cast(task->num_bin - 2 - threadIdx_x) == rand_threshold)) { + double current_gain = CUDALeafSplits::GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, - lambda_l2); + sum_right_hessian, lambda_l1, + lambda_l2, path_smooth, left_count, right_count, parent_output); // gain with split is worse than without split if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; @@ -269,11 +222,12 @@ __device__ void FindBestSplitsForLeafKernelInner( const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = num_data - 
left_count; if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && - sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { - double current_gain = GetSplitGains( + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf && + (!USE_RAND || static_cast(threadIdx_x + task->mfb_offset) == rand_threshold)) { + double current_gain = CUDALeafSplits::GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, - lambda_l2); + sum_right_hessian, lambda_l1, + lambda_l2, path_smooth, left_count, right_count, parent_output); // gain with split is worse than without split if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; @@ -301,10 +255,10 @@ __device__ void FindBestSplitsForLeafKernelInner( const double sum_left_gradient = sum_gradients - sum_right_gradient; const double sum_left_hessian = sum_hessians - sum_right_hessian - kEpsilon; const data_size_t left_count = num_data - right_count; - const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2); - const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2); + const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, lambda_l2, path_smooth, left_count, parent_output); + const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, path_smooth, right_count, parent_output); cuda_best_split_info->left_sum_gradients = sum_left_gradient; cuda_best_split_info->left_sum_hessians = sum_left_hessian; cuda_best_split_info->left_count = left_count; @@ -312,11 +266,11 @@ __device__ void FindBestSplitsForLeafKernelInner( cuda_best_split_info->right_sum_hessians = sum_right_hessian; cuda_best_split_info->right_count = right_count; cuda_best_split_info->left_value = left_output; - cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, lambda_l2, left_output); cuda_best_split_info->right_value = right_output; - cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); + cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, right_output); } else { const double sum_left_gradient = local_grad_hist; const double sum_left_hessian = local_hess_hist - kEpsilon; @@ -324,10 +278,10 @@ __device__ void FindBestSplitsForLeafKernelInner( const double sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian - kEpsilon; const data_size_t right_count = num_data - left_count; - const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2); - const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2); + const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, 
lambda_l1, lambda_l2, path_smooth, left_count, parent_output); + const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, path_smooth, right_count, parent_output); cuda_best_split_info->left_sum_gradients = sum_left_gradient; cuda_best_split_info->left_sum_hessians = sum_left_hessian; cuda_best_split_info->left_count = left_count; @@ -335,24 +289,26 @@ __device__ void FindBestSplitsForLeafKernelInner( cuda_best_split_info->right_sum_hessians = sum_right_hessian; cuda_best_split_info->right_count = right_count; cuda_best_split_info->left_value = left_output; - cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, lambda_l2, left_output); cuda_best_split_info->right_value = right_output; - cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); + cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, right_output); } } } -template +template __device__ void FindBestSplitsForLeafKernelCategoricalInner( // input feature information const hist_t* feature_hist_ptr, // input task information const SplitFindTask* task, + CUDARandom* cuda_random, // input config parameter values const double lambda_l1, const double lambda_l2, + const double path_smooth, const data_size_t min_data_in_leaf, const double min_sum_hessian_in_leaf, const double min_gain_to_split, @@ -365,6 +321,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( const double sum_gradients, const double sum_hessians, const data_size_t num_data, + const double parent_output, // output parameters CUDASplitInfo* cuda_best_split_info) { __shared__ double shared_gain_buffer[32]; @@ -372,7 +329,6 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( __shared__ uint32_t shared_thread_index_buffer[32]; __shared__ uint32_t best_thread_index; const double cnt_factor = num_data / sum_hessians; - const bool use_l1 = lambda_l1 > 0.0f; const double min_gain_shift = parent_gain + min_gain_to_split; const double l2 = lambda_l2 + cat_l2; @@ -384,7 +340,17 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( const int bin_start = 1 - task->mfb_offset; const int bin_end = task->num_bin - task->mfb_offset; const int threadIdx_x = static_cast(threadIdx.x); + + __shared__ int rand_threshold; + if (task->is_one_hot) { + if (USE_RAND && threadIdx.x == 0) { + rand_threshold = 0; + if (bin_end > bin_start) { + rand_threshold = cuda_random->NextInt(bin_start, bin_end); + } + } + __syncthreads(); if (threadIdx_x >= bin_start && threadIdx_x < bin_end) { const int bin_offset = (threadIdx_x << 1); const hist_t grad = feature_hist_ptr[bin_offset]; @@ -395,12 +361,12 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( const data_size_t other_count = num_data - cnt; if (other_count >= min_data_in_leaf) { const double sum_other_hessian = sum_hessians - hess - kEpsilon; - if (sum_other_hessian >= min_sum_hessian_in_leaf) { + if (sum_other_hessian >= min_sum_hessian_in_leaf && (!USE_RAND || static_cast(threadIdx_x) == rand_threshold)) { const double sum_other_gradient = sum_gradients - grad; - double current_gain = GetSplitGains( + double current_gain = 
CUDALeafSplits::GetSplitGains( sum_other_gradient, sum_other_hessian, grad, - hess + kEpsilon, lambda_l1, use_l1, - l2); + hess + kEpsilon, lambda_l1, + l2, path_smooth, other_count, cnt, parent_output); if (current_gain > min_gain_shift) { local_gain = current_gain; threshold_found = true; @@ -428,10 +394,10 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( const double sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); - const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, l2); - const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, l2); + const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, l2, path_smooth, left_count, parent_output); + const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, l2, path_smooth, right_count, parent_output); cuda_best_split_info->left_sum_gradients = sum_left_gradient; cuda_best_split_info->left_sum_hessians = sum_left_hessian; cuda_best_split_info->left_count = left_count; @@ -439,11 +405,11 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( cuda_best_split_info->right_sum_hessians = sum_right_hessian; cuda_best_split_info->right_count = right_count; cuda_best_split_info->left_value = left_output; - cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, l2, left_output); + cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, l2, left_output); cuda_best_split_info->right_value = right_output; - cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, l2, right_output); + cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, l2, right_output); } } else { __shared__ double shared_value_buffer[NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER]; @@ -480,6 +446,14 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( __syncthreads(); const int max_num_cat = min(max_cat_threshold, (used_bin + 1) / 2); + if (USE_RAND) { + rand_threshold = 0; + const int max_threshold = max(min(max_num_cat, used_bin) - 1, 0); + if (max_threshold > 0) { + rand_threshold = cuda_random->NextInt(0, max_threshold); + } + } + // left to right double grad = 0.0f; double hess = 0.0f; @@ -502,11 +476,12 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = num_data - left_count; if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && - sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { - double current_gain = GetSplitGains( + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf && + (!USE_RAND || threadIdx_x == static_cast(rand_threshold))) { + double current_gain = CUDALeafSplits::GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, - l2); + sum_right_hessian, lambda_l1, + l2, path_smooth, 
left_count, right_count, parent_output); // gain with split is worse than without split if (current_gain > local_gain) { local_gain = current_gain; @@ -541,11 +516,12 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = num_data - left_count; if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && - sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { - double current_gain = GetSplitGains( + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf && + (!USE_RAND || threadIdx_x == static_cast(rand_threshold))) { + double current_gain = CUDALeafSplits::GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, - l2); + sum_right_hessian, lambda_l1, + l2, path_smooth, left_count, right_count, parent_output); // gain with split is worse than without split if (current_gain > local_gain) { local_gain = current_gain; @@ -583,10 +559,10 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( const double sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); - const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, l2); - const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, l2); + const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, l2, path_smooth, left_count, parent_output); + const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, l2, path_smooth, right_count, parent_output); cuda_best_split_info->left_sum_gradients = sum_left_gradient; cuda_best_split_info->left_sum_hessians = sum_left_hessian; cuda_best_split_info->left_count = left_count; @@ -594,22 +570,23 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( cuda_best_split_info->right_sum_hessians = sum_right_hessian; cuda_best_split_info->right_count = right_count; cuda_best_split_info->left_value = left_output; - cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, l2, left_output); + cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, l2, left_output); cuda_best_split_info->right_value = right_output; - cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, l2, right_output); + cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, l2, right_output); } } } -template +template __global__ void FindBestSplitsForLeafKernel( // input feature information const int8_t* is_feature_used_bytree, // input task information const int num_tasks, const SplitFindTask* tasks, + CUDARandom* cuda_randoms, // input leaf information const CUDALeafSplitsStruct* smaller_leaf_splits, const CUDALeafSplitsStruct* larger_leaf_splits, @@ -619,6 +596,7 @@ __global__ void FindBestSplitsForLeafKernel( const double min_gain_to_split, const double lambda_l1, const double lambda_l2, + const double 
path_smooth, const double cat_smooth, const double cat_l2, const int max_cat_threshold, @@ -626,27 +604,31 @@ __global__ void FindBestSplitsForLeafKernel( const int max_cat_to_onehot, // output CUDASplitInfo* cuda_best_split_info) { - const unsigned int task_index = blockIdx.x % num_tasks; + const unsigned int task_index = blockIdx.x; const SplitFindTask* task = tasks + task_index; - const bool is_larger = static_cast(blockIdx.x >= num_tasks || LARGER_ONLY); const int inner_feature_index = task->inner_feature_index; - const double parent_gain = is_larger ? larger_leaf_splits->gain : smaller_leaf_splits->gain; - const double sum_gradients = is_larger ? larger_leaf_splits->sum_of_gradients : smaller_leaf_splits->sum_of_gradients; - const double sum_hessians = (is_larger ? larger_leaf_splits->sum_of_hessians : smaller_leaf_splits->sum_of_hessians) + 2 * kEpsilon; - const double num_data = is_larger ? larger_leaf_splits->num_data_in_leaf : smaller_leaf_splits->num_data_in_leaf; - const unsigned int output_offset = is_larger ? (task_index + num_tasks) : task_index; + const double parent_gain = IS_LARGER ? larger_leaf_splits->gain : smaller_leaf_splits->gain; + const double sum_gradients = IS_LARGER ? larger_leaf_splits->sum_of_gradients : smaller_leaf_splits->sum_of_gradients; + const double sum_hessians = (IS_LARGER ? larger_leaf_splits->sum_of_hessians : smaller_leaf_splits->sum_of_hessians) + 2 * kEpsilon; + const data_size_t num_data = IS_LARGER ? larger_leaf_splits->num_data_in_leaf : smaller_leaf_splits->num_data_in_leaf; + const double parent_output = IS_LARGER ? larger_leaf_splits->leaf_value : smaller_leaf_splits->leaf_value; + const unsigned int output_offset = IS_LARGER ? (task_index + num_tasks) : task_index; CUDASplitInfo* out = cuda_best_split_info + output_offset; + CUDARandom* cuda_random = USE_RAND ? + (IS_LARGER ? cuda_randoms + task_index * 2 + 1: cuda_randoms + task_index * 2) : nullptr; if (is_feature_used_bytree[inner_feature_index]) { - const hist_t* hist_ptr = (is_larger ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + task->hist_offset * 2; + const hist_t* hist_ptr = (IS_LARGER ? 
larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + task->hist_offset * 2; if (task->is_categorical) { - FindBestSplitsForLeafKernelCategoricalInner( + FindBestSplitsForLeafKernelCategoricalInner( // input feature information hist_ptr, // input task information task, + cuda_random, // input config parameter values lambda_l1, lambda_l2, + path_smooth, min_data_in_leaf, min_sum_hessian_in_leaf, min_gain_to_split, @@ -659,18 +641,21 @@ __global__ void FindBestSplitsForLeafKernel( sum_gradients, sum_hessians, num_data, + parent_output, // output parameters out); } else { if (!task->reverse) { - FindBestSplitsForLeafKernelInner( + FindBestSplitsForLeafKernelInner( // input feature information hist_ptr, // input task information task, + cuda_random, // input config parameter values lambda_l1, lambda_l2, + path_smooth, min_data_in_leaf, min_sum_hessian_in_leaf, min_gain_to_split, @@ -679,17 +664,20 @@ __global__ void FindBestSplitsForLeafKernel( sum_gradients, sum_hessians, num_data, + parent_output, // output parameters out); } else { - FindBestSplitsForLeafKernelInner( + FindBestSplitsForLeafKernelInner( // input feature information hist_ptr, // input task information task, + cuda_random, // input config parameter values lambda_l1, lambda_l2, + path_smooth, min_data_in_leaf, min_sum_hessian_in_leaf, min_gain_to_split, @@ -698,6 +686,7 @@ __global__ void FindBestSplitsForLeafKernel( sum_gradients, sum_hessians, num_data, + parent_output, // output parameters out); } @@ -707,15 +696,17 @@ __global__ void FindBestSplitsForLeafKernel( } } -template +template __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( // input feature information const hist_t* feature_hist_ptr, // input task information const SplitFindTask* task, + CUDARandom* cuda_random, // input config parameter values const double lambda_l1, const double lambda_l2, + const double path_smooth, const data_size_t min_data_in_leaf, const double min_sum_hessian_in_leaf, const double min_gain_to_split, @@ -724,19 +715,25 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( const double sum_gradients, const double sum_hessians, const data_size_t num_data, + const double parent_output, // output parameters CUDASplitInfo* cuda_best_split_info, // buffer hist_t* hist_grad_buffer_ptr, hist_t* hist_hess_buffer_ptr) { const double cnt_factor = num_data / sum_hessians; - const bool use_l1 = lambda_l1 > 0.0f; const double min_gain_shift = parent_gain + min_gain_to_split; cuda_best_split_info->is_valid = false; double local_gain = 0.0f; bool threshold_found = false; uint32_t threshold_value = 0; + __shared__ int rand_threshold; + if (USE_RAND && threadIdx.x == 0) { + if (task->num_bin - 2 > 0) { + rand_threshold = cuda_random->NextInt(0, task->num_bin - 2); + } + } __shared__ uint32_t best_thread_index; __shared__ double shared_gain_buffer[32]; __shared__ bool shared_found_buffer[32]; @@ -791,11 +788,12 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( const double sum_left_hessian = sum_hessians - sum_right_hessian; const data_size_t left_count = num_data - right_count; if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && - sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { - double current_gain = GetSplitGains( + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf && + (!USE_RAND || static_cast(task->num_bin - 2 - bin) == rand_threshold)) { + double current_gain = CUDALeafSplits::GetSplitGains( 
sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, - lambda_l2); + sum_right_hessian, lambda_l1, + lambda_l2, path_smooth, left_count, right_count, parent_output); // gain with split is worse than without split if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; @@ -817,11 +815,12 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = num_data - left_count; if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && - sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { - double current_gain = GetSplitGains( + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf && + (!USE_RAND || static_cast(bin + task->mfb_offset) == rand_threshold)) { + double current_gain = CUDALeafSplits::GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, - lambda_l2); + sum_right_hessian, lambda_l1, + lambda_l2, path_smooth, left_count, right_count, parent_output); // gain with split is worse than without split if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; @@ -851,10 +850,10 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( const double sum_left_gradient = sum_gradients - sum_right_gradient; const double sum_left_hessian = sum_hessians - sum_right_hessian - kEpsilon; const data_size_t left_count = num_data - right_count; - const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2); - const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2); + const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, lambda_l2, path_smooth, left_count, parent_output); + const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, path_smooth, right_count, parent_output); cuda_best_split_info->left_sum_gradients = sum_left_gradient; cuda_best_split_info->left_sum_hessians = sum_left_hessian; cuda_best_split_info->left_count = left_count; @@ -862,11 +861,11 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( cuda_best_split_info->right_sum_hessians = sum_right_hessian; cuda_best_split_info->right_count = right_count; cuda_best_split_info->left_value = left_output; - cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, lambda_l2, left_output); cuda_best_split_info->right_value = right_output; - cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); + cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, right_output); } else { const unsigned int best_bin = static_cast(threshold_value - task->mfb_offset); const double sum_left_gradient = hist_grad_buffer_ptr[best_bin]; @@ -875,10 +874,10 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( const double 
sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian - kEpsilon; const data_size_t right_count = num_data - left_count; - const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2); - const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2); + const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, lambda_l2, path_smooth, left_count, parent_output); + const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, path_smooth, right_count, parent_output); cuda_best_split_info->left_sum_gradients = sum_left_gradient; cuda_best_split_info->left_sum_hessians = sum_left_hessian; cuda_best_split_info->left_count = left_count; @@ -886,24 +885,26 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( cuda_best_split_info->right_sum_hessians = sum_right_hessian; cuda_best_split_info->right_count = right_count; cuda_best_split_info->left_value = left_output; - cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, lambda_l2, left_output); + cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, lambda_l2, left_output); cuda_best_split_info->right_value = right_output; - cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, lambda_l2, right_output); + cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, right_output); } } } -template +template __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( // input feature information const hist_t* feature_hist_ptr, // input task information const SplitFindTask* task, + CUDARandom* cuda_random, // input config parameter values const double lambda_l1, const double lambda_l2, + const double path_smooth, const data_size_t min_data_in_leaf, const double min_sum_hessian_in_leaf, const double min_gain_to_split, @@ -916,6 +917,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( const double sum_gradients, const double sum_hessians, const data_size_t num_data, + const double parent_output, // buffer hist_t* hist_grad_buffer_ptr, hist_t* hist_hess_buffer_ptr, @@ -928,7 +930,6 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( __shared__ uint32_t shared_thread_index_buffer[32]; __shared__ uint32_t best_thread_index; const double cnt_factor = num_data / sum_hessians; - const bool use_l1 = lambda_l1 > 0.0f; const double min_gain_shift = parent_gain + min_gain_to_split; const double l2 = lambda_l2 + cat_l2; @@ -937,11 +938,20 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( cuda_best_split_info->is_valid = false; + __shared__ int rand_threshold; + const int bin_start = 1 - task->mfb_offset; const int bin_end = task->num_bin - task->mfb_offset; int best_threshold = -1; const int threadIdx_x = static_cast(threadIdx.x); if (task->is_one_hot) { + if (USE_RAND && threadIdx.x == 0) { + rand_threshold = 0; + if (bin_end > bin_start) { + rand_threshold = cuda_random->NextInt(bin_start, bin_end); + } + } + __syncthreads(); for 
(int bin = bin_start + threadIdx_x; bin < bin_end; bin += static_cast(blockDim.x)) { const int bin_offset = (bin << 1); const hist_t grad = feature_hist_ptr[bin_offset]; @@ -952,12 +962,12 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( const data_size_t other_count = num_data - cnt; if (other_count >= min_data_in_leaf) { const double sum_other_hessian = sum_hessians - hess - kEpsilon; - if (sum_other_hessian >= min_sum_hessian_in_leaf) { + if (sum_other_hessian >= min_sum_hessian_in_leaf && (!USE_RAND || bin == rand_threshold)) { const double sum_other_gradient = sum_gradients - grad; - double current_gain = GetSplitGains( + double current_gain = CUDALeafSplits::GetSplitGains( sum_other_gradient, sum_other_hessian, grad, - hess + kEpsilon, lambda_l1, use_l1, - l2); + hess + kEpsilon, lambda_l1, + l2, path_smooth, other_count, cnt, parent_output); if (current_gain > min_gain_shift) { best_threshold = bin; local_gain = current_gain - min_gain_shift; @@ -986,10 +996,10 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( const double sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); - const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, l2); - const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, l2); + const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, l2, path_smooth, left_count, parent_output); + const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, l2, path_smooth, right_count, parent_output); cuda_best_split_info->left_sum_gradients = sum_left_gradient; cuda_best_split_info->left_sum_hessians = sum_left_hessian; cuda_best_split_info->left_count = left_count; @@ -997,11 +1007,11 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( cuda_best_split_info->right_sum_hessians = sum_right_hessian; cuda_best_split_info->right_count = right_count; cuda_best_split_info->left_value = left_output; - cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, l2, left_output); + cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, l2, left_output); cuda_best_split_info->right_value = right_output; - cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, l2, right_output); + cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, l2, right_output); } } else { __shared__ uint16_t shared_mem_buffer_uint16[32]; @@ -1034,13 +1044,20 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( BitonicArgSortDevice( hist_stat_buffer_ptr, hist_index_buffer_ptr, task->num_bin - task->mfb_offset); const int max_num_cat = min(max_cat_threshold, (used_bin + 1) / 2); + if (USE_RAND) { + rand_threshold = 0; + const int max_threshold = max(min(max_num_cat, used_bin) - 1, 0); + if (max_threshold > 0) { + rand_threshold = cuda_random->NextInt(0, max_threshold); + } + } __syncthreads(); // left to right - if 
(threadIdx_x < used_bin && threadIdx_x < max_num_cat) { - const int bin_offset = (hist_index_buffer_ptr[threadIdx_x] << 1); - hist_grad_buffer_ptr[threadIdx_x] = feature_hist_ptr[bin_offset]; - hist_hess_buffer_ptr[threadIdx_x] = feature_hist_ptr[bin_offset + 1]; + for (int bin = static_cast(threadIdx_x); bin < used_bin && bin < max_num_cat; bin += static_cast(blockDim.x)) { + const int bin_offset = (hist_index_buffer_ptr[bin] << 1); + hist_grad_buffer_ptr[bin] = feature_hist_ptr[bin_offset]; + hist_hess_buffer_ptr[bin] = feature_hist_ptr[bin_offset + 1]; } if (threadIdx_x == 0) { hist_hess_buffer_ptr[0] += kEpsilon; @@ -1050,19 +1067,19 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( __syncthreads(); GlobalMemoryPrefixSum(hist_hess_buffer_ptr, static_cast(bin_end)); // TODO(shiyu1994): constrain the split with min_data_in_group - if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { - const double sum_left_gradient = hist_grad_buffer_ptr[threadIdx_x]; - const double sum_left_hessian = hist_hess_buffer_ptr[threadIdx_x]; + for (int bin = static_cast(threadIdx_x); bin < used_bin && bin < max_num_cat; bin += static_cast(blockDim.x)) { + const double sum_left_gradient = hist_grad_buffer_ptr[bin]; + const double sum_left_hessian = hist_hess_buffer_ptr[bin]; const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); const double sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = num_data - left_count; if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { - double current_gain = GetSplitGains( + double current_gain = CUDALeafSplits::GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, - l2); + sum_right_hessian, lambda_l1, + l2, path_smooth, left_count, right_count, parent_output); // gain with split is worse than without split if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; @@ -1070,16 +1087,17 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( best_dir = 1; best_sum_left_gradient = sum_left_gradient; best_sum_left_hessian = sum_left_hessian; + best_threshold = bin; } } } __syncthreads(); // right to left - if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { - const int bin_offset = (hist_index_buffer_ptr[used_bin - 1 - threadIdx_x] << 1); - hist_grad_buffer_ptr[threadIdx_x] = feature_hist_ptr[bin_offset]; - hist_hess_buffer_ptr[threadIdx_x] = feature_hist_ptr[bin_offset + 1]; + for (int bin = static_cast(threadIdx_x); bin < used_bin && bin < max_num_cat; bin += static_cast(blockDim.x)) { + const int bin_offset = (hist_index_buffer_ptr[used_bin - 1 - bin] << 1); + hist_grad_buffer_ptr[bin] = feature_hist_ptr[bin_offset]; + hist_hess_buffer_ptr[bin] = feature_hist_ptr[bin_offset + 1]; } if (threadIdx_x == 0) { hist_hess_buffer_ptr[0] += kEpsilon; @@ -1089,19 +1107,19 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( __syncthreads(); GlobalMemoryPrefixSum(hist_hess_buffer_ptr, static_cast(bin_end)); // TODO(shiyu1994): constrain the split with min_data_in_group - if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { - const double sum_left_gradient = hist_grad_buffer_ptr[threadIdx_x]; - const double sum_left_hessian = hist_hess_buffer_ptr[threadIdx_x]; + for (int 
bin = static_cast(threadIdx_x); bin < used_bin && bin < max_num_cat; bin += static_cast(blockDim.x)) { + const double sum_left_gradient = hist_grad_buffer_ptr[bin]; + const double sum_left_hessian = hist_hess_buffer_ptr[bin]; const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); const double sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = num_data - left_count; if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf) { - double current_gain = GetSplitGains( + double current_gain = CUDALeafSplits::GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, - l2); + sum_right_hessian, lambda_l1, + l2, path_smooth, left_count, right_count, parent_output); // gain with split is worse than without split if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; @@ -1109,6 +1127,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( best_dir = -1; best_sum_left_gradient = sum_left_gradient; best_sum_left_hessian = sum_left_hessian; + best_threshold = bin; } } } @@ -1121,15 +1140,15 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( __syncthreads(); if (threshold_found && threadIdx_x == best_thread_index) { cuda_best_split_info->is_valid = true; - cuda_best_split_info->num_cat_threshold = threadIdx_x + 1; - cuda_best_split_info->cat_threshold = new uint32_t[threadIdx_x + 1]; + cuda_best_split_info->num_cat_threshold = best_threshold + 1; + cuda_best_split_info->cat_threshold = new uint32_t[best_threshold + 1]; cuda_best_split_info->gain = local_gain; if (best_dir == 1) { - for (int i = 0; i < threadIdx_x + 1; ++i) { + for (int i = 0; i < best_threshold + 1; ++i) { (cuda_best_split_info->cat_threshold)[i] = hist_index_buffer_ptr[i] + task->mfb_offset; } } else { - for (int i = 0; i < threadIdx_x + 1; ++i) { + for (int i = 0; i < best_threshold + 1; ++i) { (cuda_best_split_info->cat_threshold)[i] = hist_index_buffer_ptr[used_bin - 1 - i] + task->mfb_offset; } } @@ -1140,10 +1159,10 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( const double sum_right_gradient = sum_gradients - sum_left_gradient; const double sum_right_hessian = sum_hessians - sum_left_hessian; const data_size_t right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); - const double left_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, l2); - const double right_output = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, l2); + const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, l2, path_smooth, left_count, parent_output); + const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, l2, path_smooth, right_count, parent_output); cuda_best_split_info->left_sum_gradients = sum_left_gradient; cuda_best_split_info->left_sum_hessians = sum_left_hessian; cuda_best_split_info->left_count = left_count; @@ -1151,22 +1170,23 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( cuda_best_split_info->right_sum_hessians = sum_right_hessian; 
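The two passes above follow LightGBM's usual handling of high-cardinality categorical features: bins are argsorted by a per-category statistic, prefix sums of gradients and hessians are accumulated over that order, and each prefix (scanned once from the left and once from the right, up to max_cat_threshold categories) is evaluated as a candidate subset for one child. The following host-side sketch illustrates only that idea; it uses a plain G^2/(H + l2) gain, a single scan direction, and none of the L1 / path-smoothing / min-count constraints applied in the kernel, and the ordering statistic grad/(hess + cat_smooth) is an assumption rather than something taken from this hunk.

#include <algorithm>
#include <numeric>
#include <vector>

// One candidate categorical split: the categories routed to the left child
// and the (unshifted) gain of splitting them off.
struct CatSplit {
  double gain = 0.0;
  std::vector<int> left_categories;
};

// Host-side sketch of the sorted-prefix scan: order categories by their
// gradient/hessian ratio, then try every prefix of that order as the left
// child, reusing running sums instead of re-aggregating each subset.
CatSplit BestCategoricalSplit(const std::vector<double>& grad,
                              const std::vector<double>& hess,
                              double l2, double cat_smooth, int max_cat) {
  const int n = static_cast<int>(grad.size());
  std::vector<int> order(n);
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(), [&](int a, int b) {
    return grad[a] / (hess[a] + cat_smooth) < grad[b] / (hess[b] + cat_smooth);
  });
  const double total_grad = std::accumulate(grad.begin(), grad.end(), 0.0);
  const double total_hess = std::accumulate(hess.begin(), hess.end(), 0.0);
  const auto leaf_gain = [l2](double g, double h) { return g * g / (h + l2); };

  CatSplit best;
  double g_left = 0.0, h_left = 0.0;
  for (int k = 0; k < n && k < max_cat; ++k) {
    g_left += grad[order[k]];
    h_left += hess[order[k]];
    const double gain = leaf_gain(g_left, h_left) +
                        leaf_gain(total_grad - g_left, total_hess - h_left);
    if (gain > best.gain) {
      best.gain = gain;
      best.left_categories.assign(order.begin(), order.begin() + k + 1);
    }
  }
  return best;
}

Ordering by the gradient/hessian ratio is the standard argument for reducing the exponential subset search to linear scans over prefixes, which is what allows the kernel to cover both scan directions with two passes over the sorted bins.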
cuda_best_split_info->right_count = right_count; cuda_best_split_info->left_value = left_output; - cuda_best_split_info->left_gain = GetLeafGainGivenOutput(sum_left_gradient, - sum_left_hessian, lambda_l1, use_l1, l2, left_output); + cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, l2, left_output); cuda_best_split_info->right_value = right_output; - cuda_best_split_info->right_gain = GetLeafGainGivenOutput(sum_right_gradient, - sum_right_hessian, lambda_l1, use_l1, l2, right_output); + cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, l2, right_output); } } } -template +template __global__ void FindBestSplitsForLeafKernel_GlobalMemory( // input feature information const int8_t* is_feature_used_bytree, // input task information const int num_tasks, const SplitFindTask* tasks, + CUDARandom* cuda_randoms, // input leaf information const CUDALeafSplitsStruct* smaller_leaf_splits, const CUDALeafSplitsStruct* larger_leaf_splits, @@ -1176,6 +1196,7 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( const double min_gain_to_split, const double lambda_l1, const double lambda_l2, + const double path_smooth, const double cat_smooth, const double cat_l2, const int max_cat_threshold, @@ -1188,31 +1209,35 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( hist_t* feature_hist_hess_buffer, hist_t* feature_hist_stat_buffer, data_size_t* feature_hist_index_buffer) { - const unsigned int task_index = blockIdx.x % num_tasks; + const unsigned int task_index = blockIdx.x; const SplitFindTask* task = tasks + task_index; - const bool is_larger = static_cast(blockIdx.x >= num_tasks || LARGER_ONLY); - const double parent_gain = is_larger ? larger_leaf_splits->gain : smaller_leaf_splits->gain; - const double sum_gradients = is_larger ? larger_leaf_splits->sum_of_gradients : smaller_leaf_splits->sum_of_gradients; - const double sum_hessians = (is_larger ? larger_leaf_splits->sum_of_hessians : smaller_leaf_splits->sum_of_hessians) + 2 * kEpsilon; - const double num_data = is_larger ? larger_leaf_splits->num_data_in_leaf : smaller_leaf_splits->num_data_in_leaf; - const unsigned int output_offset = is_larger ? (task_index + num_tasks) : task_index; + const double parent_gain = IS_LARGER ? larger_leaf_splits->gain : smaller_leaf_splits->gain; + const double sum_gradients = IS_LARGER ? larger_leaf_splits->sum_of_gradients : smaller_leaf_splits->sum_of_gradients; + const double sum_hessians = (IS_LARGER ? larger_leaf_splits->sum_of_hessians : smaller_leaf_splits->sum_of_hessians) + 2 * kEpsilon; + const data_size_t num_data = IS_LARGER ? larger_leaf_splits->num_data_in_leaf : smaller_leaf_splits->num_data_in_leaf; + const double parent_output = IS_LARGER ? larger_leaf_splits->leaf_value : smaller_leaf_splits->leaf_value; + const unsigned int output_offset = IS_LARGER ? (task_index + num_tasks) : task_index; CUDASplitInfo* out = cuda_best_split_info + output_offset; + CUDARandom* cuda_random = USE_RAND ? + (IS_LARGER ? cuda_randoms + task_index * 2 + 1: cuda_randoms + task_index * 2) : nullptr; if (is_feature_used_bytree[task->inner_feature_index]) { const uint32_t hist_offset = task->hist_offset; - const hist_t* hist_ptr = (is_larger ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + hist_offset * 2; + const hist_t* hist_ptr = (IS_LARGER ? 
larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + hist_offset * 2; hist_t* hist_grad_buffer_ptr = feature_hist_grad_buffer + hist_offset * 2; hist_t* hist_hess_buffer_ptr = feature_hist_hess_buffer + hist_offset * 2; hist_t* hist_stat_buffer_ptr = feature_hist_stat_buffer + hist_offset * 2; data_size_t* hist_index_buffer_ptr = feature_hist_index_buffer + hist_offset * 2; if (task->is_categorical) { - FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( + FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( // input feature information hist_ptr, // input task information task, + cuda_random, // input config parameter values lambda_l1, lambda_l2, + path_smooth, min_data_in_leaf, min_sum_hessian_in_leaf, min_gain_to_split, @@ -1225,6 +1250,7 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( sum_gradients, sum_hessians, num_data, + parent_output, // buffer hist_grad_buffer_ptr, hist_hess_buffer_ptr, @@ -1234,14 +1260,16 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( out); } else { if (!task->reverse) { - FindBestSplitsForLeafKernelInner_GlobalMemory( + FindBestSplitsForLeafKernelInner_GlobalMemory( // input feature information hist_ptr, // input task information task, + cuda_random, // input config parameter values lambda_l1, lambda_l2, + path_smooth, min_data_in_leaf, min_sum_hessian_in_leaf, min_gain_to_split, @@ -1250,20 +1278,23 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( sum_gradients, sum_hessians, num_data, + parent_output, // output parameters out, // buffer hist_grad_buffer_ptr, hist_hess_buffer_ptr); } else { - FindBestSplitsForLeafKernelInner_GlobalMemory( + FindBestSplitsForLeafKernelInner_GlobalMemory( // input feature information hist_ptr, // input task information task, + cuda_random, // input config parameter values lambda_l1, lambda_l2, + path_smooth, min_data_in_leaf, min_sum_hessian_in_leaf, min_gain_to_split, @@ -1272,6 +1303,7 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( sum_gradients, sum_hessians, num_data, + parent_output, // output parameters out, // buffer @@ -1284,119 +1316,110 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( } } -void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel( - const CUDALeafSplitsStruct* smaller_leaf_splits, - const CUDALeafSplitsStruct* larger_leaf_splits, - const int smaller_leaf_index, - const int larger_leaf_index, - const bool is_smaller_leaf_valid, - const bool is_larger_leaf_valid) { +#define LaunchFindBestSplitsForLeafKernel_PARAMS \ + const CUDALeafSplitsStruct* smaller_leaf_splits, \ + const CUDALeafSplitsStruct* larger_leaf_splits, \ + const int smaller_leaf_index, \ + const int larger_leaf_index, \ + const bool is_smaller_leaf_valid, \ + const bool is_larger_leaf_valid + +#define LaunchFindBestSplitsForLeafKernel_ARGS \ + smaller_leaf_splits, \ + larger_leaf_splits, \ + smaller_leaf_index, \ + larger_leaf_index, \ + is_smaller_leaf_valid, \ + is_larger_leaf_valid + +#define FindBestSplitsForLeafKernel_ARGS \ + cuda_is_feature_used_bytree_, \ + num_tasks_, \ + cuda_split_find_tasks_.RawData(), \ + cuda_randoms_.RawData(), \ + smaller_leaf_splits, \ + larger_leaf_splits, \ + min_data_in_leaf_, \ + min_sum_hessian_in_leaf_, \ + min_gain_to_split_, \ + lambda_l1_, \ + lambda_l2_, \ + path_smooth_, \ + cat_smooth_, \ + cat_l2_, \ + max_cat_threshold_, \ + min_data_per_group_, \ + max_cat_to_onehot_, \ + cuda_best_split_info_ + +#define GlobalMemory_Buffer_ARGS \ + cuda_feature_hist_grad_buffer_, \ + 
cuda_feature_hist_hess_buffer_, \ + cuda_feature_hist_stat_buffer_, \ + cuda_feature_hist_index_buffer_ + +void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernel(LaunchFindBestSplitsForLeafKernel_PARAMS) { if (!is_smaller_leaf_valid && !is_larger_leaf_valid) { return; } - bool larger_only = false; - if (!is_smaller_leaf_valid) { - larger_only = true; + if (!extra_trees_) { + LaunchFindBestSplitsForLeafKernelInner0(LaunchFindBestSplitsForLeafKernel_ARGS); + } else { + LaunchFindBestSplitsForLeafKernelInner0(LaunchFindBestSplitsForLeafKernel_ARGS); } +} - #define FindBestSplitsForLeafKernel_ARGS \ - cuda_is_feature_used_bytree_, \ - num_tasks_, \ - cuda_split_find_tasks_.RawData(), \ - smaller_leaf_splits, \ - larger_leaf_splits, \ - min_data_in_leaf_, \ - min_sum_hessian_in_leaf_, \ - min_gain_to_split_, \ - lambda_l1_, \ - lambda_l2_, \ - cat_smooth_, \ - cat_l2_, \ - max_cat_threshold_, \ - min_data_per_group_, \ - max_cat_to_onehot_, \ - cuda_best_split_info_ +template +void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernelInner0(LaunchFindBestSplitsForLeafKernel_PARAMS) { + if (lambda_l1_ <= 0.0f) { + LaunchFindBestSplitsForLeafKernelInner1(LaunchFindBestSplitsForLeafKernel_ARGS); + } else { + LaunchFindBestSplitsForLeafKernelInner1(LaunchFindBestSplitsForLeafKernel_ARGS); + } +} - #define GlobalMemory_Buffer_ARGS \ - cuda_feature_hist_grad_buffer_, \ - cuda_feature_hist_hess_buffer_, \ - cuda_feature_hist_stat_buffer_, \ - cuda_feature_hist_index_buffer_ +template +void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernelInner1(LaunchFindBestSplitsForLeafKernel_PARAMS) { + if (!use_smoothing_) { + LaunchFindBestSplitsForLeafKernelInner2(LaunchFindBestSplitsForLeafKernel_ARGS); + } else { + LaunchFindBestSplitsForLeafKernelInner2(LaunchFindBestSplitsForLeafKernel_ARGS); + } +} +template +void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernelInner2(LaunchFindBestSplitsForLeafKernel_PARAMS) { if (!use_global_memory_) { - if (!extra_trees_) { - if (!larger_only) { - FindBestSplitsForLeafKernel + if (is_smaller_leaf_valid) { + FindBestSplitsForLeafKernel <<>> (FindBestSplitsForLeafKernel_ARGS); - } else { - FindBestSplitsForLeafKernel - <<>> - (FindBestSplitsForLeafKernel_ARGS); - } - } else { - if (!larger_only) { - FindBestSplitsForLeafKernel - <<>> - (FindBestSplitsForLeafKernel_ARGS); - } else { - FindBestSplitsForLeafKernel - <<>> - (FindBestSplitsForLeafKernel_ARGS); - } } SynchronizeCUDADevice(__FILE__, __LINE__); - if (larger_leaf_index >= 0) { - if (!extra_trees_) { - FindBestSplitsForLeafKernel - <<>> - (FindBestSplitsForLeafKernel_ARGS); - } else { - FindBestSplitsForLeafKernel - <<>> - (FindBestSplitsForLeafKernel_ARGS); - } + if (is_larger_leaf_valid) { + FindBestSplitsForLeafKernel + <<>> + (FindBestSplitsForLeafKernel_ARGS); } } else { - if (!extra_trees_) { - if (!larger_only) { - FindBestSplitsForLeafKernel_GlobalMemory + if (is_smaller_leaf_valid) { + FindBestSplitsForLeafKernel_GlobalMemory <<>> (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS); - } else { - FindBestSplitsForLeafKernel_GlobalMemory - <<>> - (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS); - } - } else { - if (!larger_only) { - FindBestSplitsForLeafKernel_GlobalMemory - <<>> - (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS); - } else { - FindBestSplitsForLeafKernel_GlobalMemory - <<>> - (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS); - } } SynchronizeCUDADevice(__FILE__, __LINE__); - if (larger_leaf_index >= 0) { - if (!extra_trees_) { - 
FindBestSplitsForLeafKernel_GlobalMemory - <<>> - (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS); - } else { - FindBestSplitsForLeafKernel_GlobalMemory - <<>> - (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS); - } + if (is_larger_leaf_valid) { + FindBestSplitsForLeafKernel_GlobalMemory + <<>> + (FindBestSplitsForLeafKernel_ARGS, GlobalMemory_Buffer_ARGS); } } - - #undef FindBestSplitsForLeafKernel_ARGS - #undef GlobalMemory_Buffer_ARGS } +#undef LaunchFindBestSplitsForLeafKernel_PARAMS +#undef FindBestSplitsForLeafKernel_ARGS +#undef GlobalMemory_Buffer_ARGS + __device__ void ReduceBestSplit(bool* found, double* gain, uint32_t* shared_read_index, uint32_t num_features_aligned) { const uint32_t threadIdx_x = threadIdx.x; @@ -1751,8 +1774,10 @@ __global__ void InitCUDARandomKernel( } void CUDABestSplitFinder::LaunchInitCUDARandomKernel() { - const int num_blocks = (num_tasks_ + NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER - 1) / NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER; - InitCUDARandomKernel<<>>(extra_seed_, num_tasks_, cuda_randoms_.RawData()); + const int num_blocks = (static_cast(cuda_randoms_.Size()) + + NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER - 1) / NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER; + InitCUDARandomKernel<<>>(extra_seed_, + static_cast(cuda_randoms_.Size()), cuda_randoms_.RawData()); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 6af881435247..ac800228b8c9 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -37,7 +37,7 @@ struct SplitFindTask { uint8_t mfb_offset; uint32_t num_bin; uint32_t default_bin; - CUDARandom* cuda_random; + //CUDARandom* cuda_random; int rand_threshold; }; @@ -89,15 +89,28 @@ class CUDABestSplitFinder { void ResetConfig(const Config* config); - __device__ static double CalculateSplittedLeafOutput( - double sum_gradients, - double sum_hessians, double l1, const bool use_l1, - double l2); - private: - void LaunchFindBestSplitsForLeafKernel(const CUDALeafSplitsStruct* smaller_leaf_splits, - const CUDALeafSplitsStruct* larger_leaf_splits, const int smaller_leaf_index, const int larger_leaf_index, - const bool is_smaller_leaf_valid, const bool is_larger_leaf_valid); + + #define LaunchFindBestSplitsForLeafKernel_PARAMS \ + const CUDALeafSplitsStruct* smaller_leaf_splits, \ + const CUDALeafSplitsStruct* larger_leaf_splits, \ + const int smaller_leaf_index, \ + const int larger_leaf_index, \ + const bool is_smaller_leaf_valid, \ + const bool is_larger_leaf_valid + + void LaunchFindBestSplitsForLeafKernel(LaunchFindBestSplitsForLeafKernel_PARAMS); + + template + void LaunchFindBestSplitsForLeafKernelInner0(LaunchFindBestSplitsForLeafKernel_PARAMS); + + template + void LaunchFindBestSplitsForLeafKernelInner1(LaunchFindBestSplitsForLeafKernel_PARAMS); + + template + void LaunchFindBestSplitsForLeafKernelInner2(LaunchFindBestSplitsForLeafKernel_PARAMS); + + #undef LaunchFindBestSplitsForLeafKernel_PARAMS void LaunchSyncBestSplitForLeafKernel( const int host_smaller_leaf_index, @@ -145,6 +158,8 @@ class CUDABestSplitFinder { int max_cat_to_onehot_; bool extra_trees_; int extra_seed_; + bool use_smoothing_; + double path_smooth_; std::vector cuda_streams_; // for best split find tasks std::vector split_find_tasks_; diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index c7dc55eefba6..6aa020d9ea0d 100644 --- 
a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -40,6 +40,7 @@ void CUDALeafSplits::InitValues() { } void CUDALeafSplits::InitValues( + const double lambda_l1, const double lambda_l2, const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t* cuda_bagging_data_indices, const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf, double* root_sum_hessians) { @@ -47,7 +48,7 @@ void CUDALeafSplits::InitValues( cuda_hessians_ = cuda_hessians; SetCUDAMemory(cuda_sum_of_gradients_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); SetCUDAMemory(cuda_sum_of_hessians_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); - LaunchInitValuesKernal(cuda_bagging_data_indices, cuda_data_indices_in_leaf, num_used_indices, cuda_hist_in_leaf); + LaunchInitValuesKernal(lambda_l1, lambda_l2, cuda_bagging_data_indices, cuda_data_indices_in_leaf, num_used_indices, cuda_hist_in_leaf); CopyFromCUDADeviceToHost(root_sum_hessians, cuda_sum_of_hessians_buffer_, 1, __FILE__, __LINE__); SynchronizeCUDADevice(__FILE__, __LINE__); } diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index 2c47196ba704..29e42f67ead9 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -34,6 +34,8 @@ __global__ void CUDAInitValuesKernel1(const score_t* cuda_gradients, const score } __global__ void CUDAInitValuesKernel2( + const double lambda_l1, + const double lambda_l2, const int num_blocks_to_reduce, double* cuda_sum_of_gradients, double* cuda_sum_of_hessians, @@ -57,8 +59,23 @@ __global__ void CUDAInitValuesKernel2( cuda_struct->sum_of_gradients = sum_of_gradients; cuda_struct->sum_of_hessians = sum_of_hessians; cuda_struct->num_data_in_leaf = num_data; - cuda_struct->gain = 0.0f; - cuda_struct->leaf_value = 0.0f; + const bool use_l1 = lambda_l1 > 0.0f; + if (!use_l1) { + // no smoothing on root node + cuda_struct->gain = CUDALeafSplits::GetLeafGain(sum_of_gradients, sum_of_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } else { + // no smoothing on root node + cuda_struct->gain = CUDALeafSplits::GetLeafGain(sum_of_gradients, sum_of_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } + if (!use_l1) { + // no smoothing on root node + cuda_struct->leaf_value = + CUDALeafSplits::CalculateSplittedLeafOutput(sum_of_gradients, sum_of_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } else { + // no smoothing on root node + cuda_struct->leaf_value = + CUDALeafSplits::CalculateSplittedLeafOutput(sum_of_gradients, sum_of_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } cuda_struct->data_indices_in_leaf = cuda_data_indices_in_leaf; cuda_struct->hist_in_leaf = cuda_hist_in_leaf; } @@ -80,6 +97,7 @@ void CUDALeafSplits::LaunchInitValuesEmptyKernel() { } void CUDALeafSplits::LaunchInitValuesKernal( + const double lambda_l1, const double lambda_l2, const data_size_t* cuda_bagging_data_indices, const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, @@ -95,6 +113,7 @@ void CUDALeafSplits::LaunchInitValuesKernal( } SynchronizeCUDADevice(__FILE__, __LINE__); CUDAInitValuesKernel2<<<1, NUM_THRADS_PER_BLOCK_LEAF_SPLITS>>>( + lambda_l1, lambda_l2, num_blocks_init_from_gradients_, cuda_sum_of_gradients_buffer_, cuda_sum_of_hessians_buffer_, diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index 3ad0284dc86d..1a54b1153c6b 100644 --- 
a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -39,6 +39,7 @@ class CUDALeafSplits { void Init(); void InitValues( + const double lambda_l1, const double lambda_l2, const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t* cuda_bagging_data_indices, const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, @@ -52,13 +53,92 @@ class CUDALeafSplits { void Resize(const data_size_t num_data); + __device__ static double ThresholdL1(double s, double l1) { + const double reg_s = fmax(0.0, fabs(s) - l1); + if (s >= 0.0f) { + return reg_s; + } else { + return -reg_s; + } + } + + template + __device__ static double CalculateSplittedLeafOutput(double sum_gradients, + double sum_hessians, double l1, double l2, + double path_smooth, data_size_t num_data, + double parent_output) { + double ret; + if (USE_L1) { + ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2); + } else { + ret = -sum_gradients / (sum_hessians + l2); + } + if (USE_SMOOTHING) { + ret = ret * (num_data / path_smooth) / (num_data / path_smooth + 1) \ + + parent_output / (num_data / path_smooth + 1); + } + return ret; + } + + template + __device__ static double GetLeafGainGivenOutput(double sum_gradients, + double sum_hessians, double l1, + double l2, double output) { + if (USE_L1) { + const double sg_l1 = ThresholdL1(sum_gradients, l1); + return -(2.0 * sg_l1 * output + (sum_hessians + l2) * output * output); + } else { + return -(2.0 * sum_gradients * output + + (sum_hessians + l2) * output * output); + } + } + + template + __device__ static double GetLeafGain(double sum_gradients, double sum_hessians, + double l1, double l2, + double path_smooth, data_size_t num_data, + double parent_output) { + if (!USE_SMOOTHING) { + if (USE_L1) { + const double sg_l1 = ThresholdL1(sum_gradients, l1); + return (sg_l1 * sg_l1) / (sum_hessians + l2); + } else { + return (sum_gradients * sum_gradients) / (sum_hessians + l2); + } + } else { + const double output = CalculateSplittedLeafOutput( + sum_gradients, sum_hessians, l1, l2, path_smooth, num_data, parent_output); + return GetLeafGainGivenOutput(sum_gradients, sum_hessians, l1, l2, output); + } + } + + template + __device__ static double GetSplitGains(double sum_left_gradients, + double sum_left_hessians, + double sum_right_gradients, + double sum_right_hessians, + double l1, double l2, + double path_smooth, + data_size_t left_count, + data_size_t right_count, + double parent_output) { + return GetLeafGain(sum_left_gradients, + sum_left_hessians, + l1, l2, path_smooth, left_count, parent_output) + + GetLeafGain(sum_right_gradients, + sum_right_hessians, + l1, l2, path_smooth, right_count, parent_output); + } + private: void LaunchInitValuesEmptyKernel(); - void LaunchInitValuesKernal(const data_size_t* cuda_bagging_data_indices, - const data_size_t* cuda_data_indices_in_leaf, - const data_size_t num_used_indices, - hist_t* cuda_hist_in_leaf); + void LaunchInitValuesKernal( + const double lambda_l1, const double lambda_l2, + const data_size_t* cuda_bagging_data_indices, + const data_size_t* cuda_data_indices_in_leaf, + const data_size_t num_used_indices, + hist_t* cuda_hist_in_leaf); // Host memory data_size_t num_data_; diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 31065d780756..62d04499545c 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp 
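In the notation of the device helpers above (ThresholdL1, CalculateSplittedLeafOutput, GetLeafGainGivenOutput, GetLeafGain, GetSplitGains), with leaf gradient sum G, hessian sum H, and T_{\lambda_1}(G) = \operatorname{sign}(G)\max(|G| - \lambda_1, 0) (the USE_L1 = false path simply drops the thresholding), the unsmoothed leaf output and gain are

w^{*} = -\frac{T_{\lambda_1}(G)}{H + \lambda_2}, \qquad \mathrm{gain}(G, H) = \frac{T_{\lambda_1}(G)^{2}}{H + \lambda_2},

while with path smoothing (\alpha = path_smooth, n = data count in the leaf, w_p = parent output)

w = \frac{n/\alpha}{n/\alpha + 1}\, w^{*} + \frac{1}{n/\alpha + 1}\, w_{p}, \qquad \mathrm{gain} = -\bigl(2\, T_{\lambda_1}(G)\, w + (H + \lambda_2)\, w^{2}\bigr).

GetSplitGains is then the sum of the two child gains, and a split is accepted in the kernels only when that sum exceeds parent_gain + min_gain_to_split; substituting w = w^{*} into the second gain formula recovers the unsmoothed closed form, which is why the two code paths agree when smoothing is disabled.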
@@ -81,6 +81,8 @@ void CUDASingleGPUTreeLearner::BeforeTrain() { cuda_data_partition_->use_bagging() ? cuda_data_partition_->cuda_data_indices() : nullptr; cuda_data_partition_->BeforeTrain(); cuda_smaller_leaf_splits_->InitValues( + config_->lambda_l1, + config_->lambda_l2, cuda_gradients_, cuda_hessians_, leaf_splits_init_indices, @@ -322,7 +324,7 @@ Tree* CUDASingleGPUTreeLearner::FitByExistingTree(const Tree* old_tree, const sc std::unique_ptr cuda_tree(new CUDATree(old_tree)); SetCUDAMemory(cuda_leaf_gradient_stat_buffer_, 0, static_cast(old_tree->num_leaves()), __FILE__, __LINE__); SetCUDAMemory(cuda_leaf_hessian_stat_buffer_, 0, static_cast(old_tree->num_leaves()), __FILE__, __LINE__); - ReduceLeafStat(cuda_tree.get(), gradients, hessians); + ReduceLeafStat(cuda_tree.get(), gradients, hessians, cuda_data_partition_->cuda_data_indices()); cuda_tree->SyncLeafOutputFromCUDAToHost(); return cuda_tree.release(); } @@ -348,8 +350,10 @@ Tree* CUDASingleGPUTreeLearner::FitByExistingTree(const Tree* old_tree, const st } void CUDASingleGPUTreeLearner::ReduceLeafStat( - CUDATree* old_tree, const score_t* gradients, const score_t* hessians) const { - LaunchReduceLeafStatKernel(gradients, hessians, old_tree->num_leaves(), refit_num_data_, old_tree->cuda_leaf_value_ref(), old_tree->shrinkage()); + CUDATree* old_tree, const score_t* gradients, const score_t* hessians, const data_size_t* num_data_in_leaf) const { + LaunchReduceLeafStatKernel(gradients, hessians, num_data_in_leaf, old_tree->cuda_leaf_parent(), + old_tree->cuda_left_child(), old_tree->cuda_right_child(), + old_tree->num_leaves(), refit_num_data_, old_tree->cuda_leaf_value_ref(), old_tree->shrinkage()); } void CUDASingleGPUTreeLearner::ConstructBitsetForCategoricalSplit( diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu index a4ae3fb65166..0df1965da09f 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu @@ -70,13 +70,18 @@ __global__ void ReduceLeafStatKernel_GlobalMemory( } } +template __global__ void CalcRefitLeafOutputKernel( const int num_leaves, const double* leaf_grad_stat_buffer, const double* leaf_hess_stat_buffer, + const data_size_t* num_data_in_leaf, + const int* leaf_parent, + const int* left_child, + const int* right_child, const double lambda_l1, - const bool use_l1, const double lambda_l2, + const double path_smooth, const double shrinkage_rate, const double refit_decay_rate, double* leaf_value) { @@ -84,8 +89,27 @@ __global__ void CalcRefitLeafOutputKernel( if (leaf_index < num_leaves) { const double sum_gradients = leaf_grad_stat_buffer[leaf_index]; const double sum_hessians = leaf_hess_stat_buffer[leaf_index]; + const data_size_t num_data = num_data_in_leaf[leaf_index]; const double old_leaf_value = leaf_value[leaf_index]; - double new_leaf_value = CUDABestSplitFinder::CalculateSplittedLeafOutput(sum_gradients, sum_hessians, lambda_l1, use_l1, lambda_l2); + double new_leaf_value = 0.0f; + if (!USE_SMOOTHING) { + new_leaf_value = CUDALeafSplits::CalculateSplittedLeafOutput(sum_gradients, sum_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } else { + const int parent = leaf_parent[leaf_index]; + if (parent >= 0) { + const int sibliing = left_child[parent] == leaf_index ? 
right_child[parent] : left_child[parent]; + const double sum_gradients_of_parent = sum_gradients + leaf_grad_stat_buffer[sibliing]; + const double sum_hessians_of_parent = sum_hessians + leaf_hess_stat_buffer[sibliing]; + const data_size_t num_data_in_parent = num_data + num_data_in_leaf[sibliing]; + const double parent_output = + CUDALeafSplits::CalculateSplittedLeafOutput( + sum_gradients_of_parent, sum_hessians_of_parent, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + new_leaf_value = CUDALeafSplits::CalculateSplittedLeafOutput( + sum_gradients, sum_hessians, lambda_l1, lambda_l2, path_smooth, num_data_in_parent, parent_output); + } else { + new_leaf_value = CUDALeafSplits::CalculateSplittedLeafOutput(sum_gradients, sum_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } + } if (isnan(new_leaf_value)) { new_leaf_value = 0.0f; } else { @@ -96,7 +120,8 @@ __global__ void CalcRefitLeafOutputKernel( } void CUDASingleGPUTreeLearner::LaunchReduceLeafStatKernel( - const score_t* gradients, const score_t* hessians, const int num_leaves, + const score_t* gradients, const score_t* hessians, const data_size_t* num_data_in_leaf, + const int* leaf_parent, const int* left_child, const int* right_child, const int num_leaves, const data_size_t num_data, double* cuda_leaf_value, const double shrinkage_rate) const { int num_block = (num_data + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; if (num_leaves <= 2048) { @@ -109,10 +134,32 @@ void CUDASingleGPUTreeLearner::LaunchReduceLeafStatKernel( cuda_leaf_gradient_stat_buffer_, cuda_leaf_hessian_stat_buffer_); } const bool use_l1 = config_->lambda_l1 > 0.0f; + const bool use_smoothing = config_->path_smooth > 0.0f; num_block = (num_leaves + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; - CalcRefitLeafOutputKernel<<>>( - num_leaves, cuda_leaf_gradient_stat_buffer_, cuda_leaf_hessian_stat_buffer_, - config_->lambda_l1, use_l1, config_->lambda_l2, shrinkage_rate, config_->refit_decay_rate, cuda_leaf_value); + + #define CalcRefitLeafOutputKernel_ARGS \ + num_leaves, cuda_leaf_gradient_stat_buffer_, cuda_leaf_hessian_stat_buffer_, num_data_in_leaf, \ + leaf_parent, left_child, right_child, \ + config_->lambda_l1, config_->lambda_l2, config_->path_smooth, \ + shrinkage_rate, config_->refit_decay_rate, cuda_leaf_value + + if (!use_l1) { + if (!use_smoothing) { + CalcRefitLeafOutputKernel + <<>>(CalcRefitLeafOutputKernel_ARGS); + } else { + CalcRefitLeafOutputKernel + <<>>(CalcRefitLeafOutputKernel_ARGS); + } + } else { + if (!use_smoothing) { + CalcRefitLeafOutputKernel + <<>>(CalcRefitLeafOutputKernel_ARGS); + } else { + CalcRefitLeafOutputKernel + <<>>(CalcRefitLeafOutputKernel_ARGS); + } + } } template diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index 8784410c1516..c4ea569247a1 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -52,9 +52,10 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { protected: void BeforeTrain() override; - void ReduceLeafStat(CUDATree* old_tree, const score_t* gradients, const score_t* hessians) const; + void ReduceLeafStat(CUDATree* old_tree, const score_t* gradients, const score_t* hessians, const data_size_t* num_data_in_leaf) const; - void LaunchReduceLeafStatKernel(const score_t* gradients, const score_t* hessians, + void LaunchReduceLeafStatKernel(const score_t* gradients, const score_t* 
hessians, const data_size_t* num_data_in_leaf, + const int* leaf_parent, const int* left_child, const int* right_child, const int num_leaves, const data_size_t num_data, double* cuda_leaf_value, const double shrinkage_rate) const; void ConstructBitsetForCategoricalSplit(const CUDASplitInfo* best_split_info); From 86e208a3f8ed56c1e2fcd402310b152be257380a Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 9 Nov 2021 09:39:22 +0000 Subject: [PATCH 115/166] virtual destructor for LightGBM::Tree --- include/LightGBM/tree.h | 2 +- src/io/cuda/cuda_tree.cpp | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 77fa6cb19e4e..853396d4b445 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -39,7 +39,7 @@ class Tree { */ Tree(const char* str, size_t* used_len); - ~Tree() noexcept = default; + virtual ~Tree() noexcept = default; /*! * \brief Performing a split on tree leaves. diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index 252e1847766b..f1d95e95f050 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -103,13 +103,13 @@ void CUDATree::InitCUDAMemory() { __FILE__, __LINE__); AllocateCUDAMemory(&cuda_leaf_weight_, + static_cast(max_leaves_), + __FILE__, + __LINE__); + AllocateCUDAMemory(&cuda_leaf_count_, static_cast(max_leaves_), __FILE__, __LINE__); - AllocateCUDAMemory(&cuda_leaf_count_, - static_cast(max_leaves_), - __FILE__, - __LINE__); AllocateCUDAMemory(&cuda_internal_count_, static_cast(max_leaves_), __FILE__, @@ -181,6 +181,11 @@ void CUDATree::InitCUDA() { internal_count_.size(), __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_leaf_count_, + leaf_count_.data(), + leaf_count_.size(), + __FILE__, + __LINE__); InitCUDAMemoryFromHostMemory(&cuda_split_gain_, split_gain_.data(), split_gain_.size(), From d888f1d39547e7f43234805f454d06953cb0f68b Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 12 Nov 2021 03:32:22 +0000 Subject: [PATCH 116/166] fix overlapped cat threshold in best split infos --- .../cuda/cuda_best_split_finder.cpp | 42 +++++++++++-- .../cuda/cuda_best_split_finder.cu | 48 ++++----------- .../cuda/cuda_best_split_finder.hpp | 9 ++- src/treelearner/cuda/cuda_data_partition.cpp | 26 ++++---- src/treelearner/cuda/cuda_data_partition.cu | 12 ++-- src/treelearner/cuda/cuda_data_partition.hpp | 2 +- .../cuda/cuda_single_gpu_tree_learner.cpp | 59 ++++++++++++++++++- .../cuda/cuda_single_gpu_tree_learner.cu | 15 ++--- .../cuda/cuda_single_gpu_tree_learner.hpp | 4 +- 9 files changed, 144 insertions(+), 73 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index aec614c3c977..0fb88395549d 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -238,8 +238,14 @@ void CUDABestSplitFinder::InitCUDAFeatureMetaInfo() { const size_t output_buffer_size = 2 * static_cast(num_tasks_); AllocateCUDAMemory(&cuda_best_split_info_, output_buffer_size, __FILE__, __LINE__); - AllocateCatVectors(cuda_leaf_best_split_info_, cuda_best_leaf_split_info_buffer_size); - AllocateCatVectors(cuda_best_split_info_, output_buffer_size); + + max_num_categories_in_split_ = std::min(max_cat_threshold_, max_num_categorical_bin_ / 2); + AllocateCUDAMemory(&cuda_cat_threshold_feature_, max_num_categories_in_split_ * output_buffer_size, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_cat_threshold_real_feature_, max_num_categories_in_split_ 
* output_buffer_size, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_cat_threshold_leaf_, max_num_categories_in_split_ * cuda_best_leaf_split_info_buffer_size, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_cat_threshold_real_leaf_, max_num_categories_in_split_ * cuda_best_leaf_split_info_buffer_size, __FILE__, __LINE__); + AllocateCatVectors(cuda_leaf_best_split_info_, cuda_cat_threshold_leaf_, cuda_cat_threshold_real_leaf_, cuda_best_leaf_split_info_buffer_size); + AllocateCatVectors(cuda_best_split_info_, cuda_cat_threshold_feature_, cuda_cat_threshold_real_feature_, output_buffer_size); } void CUDABestSplitFinder::ResetTrainingData( @@ -262,14 +268,38 @@ void CUDABestSplitFinder::ResetConfig(const Config* config) { min_data_in_leaf_ = config->min_data_in_leaf; min_sum_hessian_in_leaf_ = config->min_sum_hessian_in_leaf; min_gain_to_split_ = config->min_gain_to_split; + cat_smooth_ = config->cat_smooth; + cat_l2_ = config->cat_l2; + max_cat_threshold_ = config->max_cat_threshold; + min_data_per_group_ = config->min_data_per_group; + max_cat_to_onehot_ = config->max_cat_to_onehot; + extra_trees_ = config->extra_trees; + extra_seed_ = config->extra_seed; + use_smoothing_ = (config->path_smooth > 0.0f); + path_smooth_ = config->path_smooth; + const int num_task_blocks = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; - const size_t cuda_best_leaf_split_info_buffer_size = static_cast(num_task_blocks) * static_cast(num_leaves_); + size_t cuda_best_leaf_split_info_buffer_size = static_cast(num_task_blocks) * static_cast(num_leaves_); DeallocateCUDAMemory(&cuda_leaf_best_split_info_, __FILE__, __LINE__); AllocateCUDAMemory(&cuda_leaf_best_split_info_, cuda_best_leaf_split_info_buffer_size, __FILE__, __LINE__); - AllocateCatVectors(cuda_leaf_best_split_info_, cuda_best_leaf_split_info_buffer_size); + max_num_categories_in_split_ = std::min(max_cat_threshold_, max_num_categorical_bin_ / 2); + size_t total_cat_threshold_size = max_num_categories_in_split_ * cuda_best_leaf_split_info_buffer_size; + DeallocateCUDAMemory(&cuda_cat_threshold_leaf_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_cat_threshold_real_leaf_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_cat_threshold_leaf_, total_cat_threshold_size, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_cat_threshold_real_leaf_, total_cat_threshold_size, __FILE__, __LINE__); + AllocateCatVectors(cuda_leaf_best_split_info_, cuda_cat_threshold_leaf_, cuda_cat_threshold_real_leaf_, cuda_best_leaf_split_info_buffer_size); + + cuda_best_leaf_split_info_buffer_size = 2 * static_cast(num_tasks_); + total_cat_threshold_size = max_num_categories_in_split_ * cuda_best_leaf_split_info_buffer_size; + DeallocateCUDAMemory(&cuda_cat_threshold_feature_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_cat_threshold_real_feature_, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_cat_threshold_feature_, total_cat_threshold_size, __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_cat_threshold_real_feature_, total_cat_threshold_size, __FILE__, __LINE__); + AllocateCatVectors(cuda_best_split_info_, cuda_cat_threshold_feature_, cuda_cat_threshold_real_feature_, cuda_best_leaf_split_info_buffer_size); } void CUDABestSplitFinder::BeforeTrain(const std::vector& is_feature_used_bytree) { @@ -327,8 +357,8 @@ const CUDASplitInfo* CUDABestSplitFinder::FindBestFromAllSplits( return cuda_leaf_best_split_info_ + (*best_leaf_index); } -void CUDABestSplitFinder::AllocateCatVectors(CUDASplitInfo* cuda_split_infos, size_t len) const { - 
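// ---- editor's note (illustrative sketch, not part of the patch series) ----
// Layout used by the pooled categorical-threshold buffers introduced above
// (PATCH 116): instead of a device-side new[] per CUDASplitInfo, every entry i
// gets the disjoint slice [i * max_num_categories_in_split,
// (i + 1) * max_num_categories_in_split) of one pre-allocated pool, so threshold
// storage for different split infos can no longer overlap. Index helper with
// placeholder names, only to spell out the arithmetic the patch relies on.
#include <cstddef>
#include <cstdint>

inline std::uint32_t* CatThresholdSlice(std::uint32_t* pool, std::size_t entry_index,
                                        int max_num_categories_in_split) {
  return pool + entry_index * static_cast<std::size_t>(max_num_categories_in_split);
}
// required pool size (in elements): num_entries * max_num_categories_in_split
// ---- end editor's note ----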
LaunchAllocateCatVectorsKernel(cuda_split_infos, len); +void CUDABestSplitFinder::AllocateCatVectors(CUDASplitInfo* cuda_split_infos, uint32_t* cat_threshold_vec, int* cat_threshold_real_vec, size_t len) { + LaunchAllocateCatVectorsKernel(cuda_split_infos, cat_threshold_vec, cat_threshold_real_vec, len); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 7cf984721461..99537a9a2c12 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -615,7 +615,7 @@ __global__ void FindBestSplitsForLeafKernel( const unsigned int output_offset = IS_LARGER ? (task_index + num_tasks) : task_index; CUDASplitInfo* out = cuda_best_split_info + output_offset; CUDARandom* cuda_random = USE_RAND ? - (IS_LARGER ? cuda_randoms + task_index * 2 + 1: cuda_randoms + task_index * 2) : nullptr; + (IS_LARGER ? cuda_randoms + task_index * 2 + 1 : cuda_randoms + task_index * 2) : nullptr; if (is_feature_used_bytree[inner_feature_index]) { const hist_t* hist_ptr = (IS_LARGER ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + task->hist_offset * 2; if (task->is_categorical) { @@ -1505,21 +1505,7 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( if ((other_split_info->is_valid && smaller_leaf_split_info->is_valid && other_split_info->gain > smaller_leaf_split_info->gain) || (!smaller_leaf_split_info->is_valid && other_split_info->is_valid)) { - smaller_leaf_split_info->is_valid = other_split_info->is_valid; - smaller_leaf_split_info->inner_feature_index = other_split_info->inner_feature_index; - smaller_leaf_split_info->default_left = other_split_info->default_left; - smaller_leaf_split_info->threshold = other_split_info->threshold; - smaller_leaf_split_info->gain = other_split_info->gain; - smaller_leaf_split_info->left_sum_gradients = other_split_info->left_sum_gradients; - smaller_leaf_split_info->left_sum_hessians = other_split_info->left_sum_hessians; - smaller_leaf_split_info->left_count = other_split_info->left_count; - smaller_leaf_split_info->left_gain = other_split_info->left_gain; - smaller_leaf_split_info->left_value = other_split_info->left_value; - smaller_leaf_split_info->right_sum_gradients = other_split_info->right_sum_gradients; - smaller_leaf_split_info->right_sum_hessians = other_split_info->right_sum_hessians; - smaller_leaf_split_info->right_count = other_split_info->right_count; - smaller_leaf_split_info->right_gain = other_split_info->right_gain; - smaller_leaf_split_info->right_value = other_split_info->right_value; + *smaller_leaf_split_info = *other_split_info; } } } @@ -1533,21 +1519,7 @@ __global__ void SyncBestSplitForLeafKernelAllBlocks( if ((other_split_info->is_valid && larger_leaf_split_info->is_valid && other_split_info->gain > larger_leaf_split_info->gain) || (!larger_leaf_split_info->is_valid && other_split_info->is_valid)) { - larger_leaf_split_info->is_valid = other_split_info->is_valid; - larger_leaf_split_info->inner_feature_index = other_split_info->inner_feature_index; - larger_leaf_split_info->default_left = other_split_info->default_left; - larger_leaf_split_info->threshold = other_split_info->threshold; - larger_leaf_split_info->gain = other_split_info->gain; - larger_leaf_split_info->left_sum_gradients = other_split_info->left_sum_gradients; - larger_leaf_split_info->left_sum_hessians = other_split_info->left_sum_hessians; - larger_leaf_split_info->left_count = other_split_info->left_count; - 
larger_leaf_split_info->left_gain = other_split_info->left_gain; - larger_leaf_split_info->left_value = other_split_info->left_value; - larger_leaf_split_info->right_sum_gradients = other_split_info->right_sum_gradients; - larger_leaf_split_info->right_sum_hessians = other_split_info->right_sum_hessians; - larger_leaf_split_info->right_count = other_split_info->right_count; - larger_leaf_split_info->right_gain = other_split_info->right_gain; - larger_leaf_split_info->right_value = other_split_info->right_value; + *larger_leaf_split_info = *other_split_info; } } } @@ -1741,12 +1713,14 @@ void CUDABestSplitFinder::LaunchFindBestFromAllSplitsKernel( __global__ void AllocateCatVectorsKernel( CUDASplitInfo* cuda_split_infos, size_t len, const int max_num_categories_in_split, - const bool has_categorical_feature) { + const bool has_categorical_feature, + uint32_t* cat_threshold_vec, + int* cat_threshold_real_vec) { const size_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i < len) { if (has_categorical_feature) { - cuda_split_infos[i].cat_threshold = new uint32_t[max_num_categories_in_split]; - cuda_split_infos[i].cat_threshold_real = new int[max_num_categories_in_split]; + cuda_split_infos[i].cat_threshold = cat_threshold_vec + i * max_num_categories_in_split; + cuda_split_infos[i].cat_threshold_real = cat_threshold_real_vec + i * max_num_categories_in_split; cuda_split_infos[i].num_cat_threshold = 0; } else { cuda_split_infos[i].cat_threshold = nullptr; @@ -1756,11 +1730,11 @@ __global__ void AllocateCatVectorsKernel( } } -void CUDABestSplitFinder::LaunchAllocateCatVectorsKernel(CUDASplitInfo* cuda_split_infos, size_t len) const { - const int max_num_categories_in_split = min(max_cat_threshold_ / 2, max_num_categorical_bin_); +void CUDABestSplitFinder::LaunchAllocateCatVectorsKernel( + CUDASplitInfo* cuda_split_infos, uint32_t* cat_threshold_vec, int* cat_threshold_real_vec, size_t len) { const int num_blocks = (static_cast(len) + NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER - 1) / NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER; AllocateCatVectorsKernel<<>>( - cuda_split_infos, len, max_num_categories_in_split, has_categorical_feature_); + cuda_split_infos, len, max_num_categories_in_split_, has_categorical_feature_, cat_threshold_vec, cat_threshold_real_vec); } __global__ void InitCUDARandomKernel( diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index ac800228b8c9..fb27cb6d7ca5 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -131,9 +131,9 @@ class CUDABestSplitFinder { int* best_leaf_index, data_size_t* num_cat_threshold); - void AllocateCatVectors(CUDASplitInfo* cuda_split_infos, size_t len) const; + void AllocateCatVectors(CUDASplitInfo* cuda_split_infos, uint32_t* cat_threshold_vec, int* cat_threshold_real_vec, size_t len); - void LaunchAllocateCatVectorsKernel(CUDASplitInfo* cuda_split_infos, size_t len) const; + void LaunchAllocateCatVectorsKernel(CUDASplitInfo* cuda_split_infos, uint32_t* cat_threshold_vec, int* cat_threshold_real_vec, size_t len); void LaunchInitCUDARandomKernel(); @@ -190,6 +190,11 @@ class CUDABestSplitFinder { hist_t* cuda_feature_hist_hess_buffer_; hist_t* cuda_feature_hist_stat_buffer_; data_size_t* cuda_feature_hist_index_buffer_; + uint32_t* cuda_cat_threshold_leaf_; + int* cuda_cat_threshold_real_leaf_; + uint32_t* cuda_cat_threshold_feature_; + int* cuda_cat_threshold_real_feature_; + int max_num_categories_in_split_; // used for extremely 
randomized trees CUDAVector cuda_randoms_; diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 3732796b877e..4fa3c906cdfc 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -31,16 +31,19 @@ CUDADataPartition::CUDADataPartition( cuda_column_data_ = train_data->cuda_column_data(); is_categorical_feature_.resize(train_data->num_features(), false); - is_single_feature_in_group_.resize(train_data->num_features(), false); + is_single_feature_in_column_.resize(train_data->num_features(), false); for (int feature_index = 0; feature_index < train_data->num_features(); ++feature_index) { if (train_data->FeatureBinMapper(feature_index)->bin_type() == BinType::CategoricalBin) { is_categorical_feature_[feature_index] = true; } const int feature_group_index = train_data->Feature2Group(feature_index); - if (!train_data->IsMultiGroup(feature_group_index) && - (feature_index == 0 || train_data->Feature2Group(feature_index - 1) != feature_group_index) && + if (!train_data->IsMultiGroup(feature_group_index)) { + if ((feature_index == 0 || train_data->Feature2Group(feature_index - 1) != feature_group_index) && (feature_index == train_data->num_features() - 1 || train_data->Feature2Group(feature_index + 1) != feature_group_index)) { - is_single_feature_in_group_[feature_index] = true; + is_single_feature_in_column_[feature_index] = true; + } + } else { + is_single_feature_in_column_[feature_index] = true; } } @@ -212,13 +215,14 @@ void CUDADataPartition::GenDataToLeftBitVector( left_leaf_index, right_leaf_index); } else { - LaunchGenDataToLeftBitVectorKernel(num_data_in_leaf, - split_feature_index, - split_threshold, - split_default_left, - leaf_data_start, - left_leaf_index, - right_leaf_index); + LaunchGenDataToLeftBitVectorKernel( + num_data_in_leaf, + split_feature_index, + split_threshold, + split_default_left, + leaf_data_start, + left_leaf_index, + right_leaf_index); } } diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 287c1d54b20d..23966c2a3249 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -581,7 +581,8 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorCategoricalKernel( const data_size_t* data_indices_in_leaf = cuda_data_indices_ + leaf_data_start; const int column_index = cuda_column_data_->feature_to_column(split_feature_index); const uint8_t bit_type = cuda_column_data_->column_bit_type(column_index); - const uint32_t min_bin = cuda_column_data_->feature_min_bin(split_feature_index); + const bool is_single_feature_in_column = is_single_feature_in_column_[split_feature_index]; + const uint32_t min_bin = is_single_feature_in_column ? 
1 : cuda_column_data_->feature_min_bin(split_feature_index); const uint32_t max_bin = cuda_column_data_->feature_max_bin(split_feature_index); const uint32_t most_freq_bin = cuda_column_data_->feature_most_freq_bin(split_feature_index); const uint32_t default_bin = cuda_column_data_->feature_default_bin(split_feature_index); @@ -591,14 +592,13 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorCategoricalKernel( CopyFromCUDADeviceToHost(host_bitset.data(), bitset, bitset_len, __FILE__, __LINE__); uint8_t split_default_to_left = 0; int default_leaf_index = right_leaf_index; - const int is_single_feature_in_group = is_single_feature_in_group_[split_feature_index]; - if (Common::FindInBitset(host_bitset.data(), bitset_len, most_freq_bin)) { + if (most_freq_bin > 0 && Common::FindInBitset(host_bitset.data(), bitset_len, most_freq_bin)) { split_default_to_left = 1; default_leaf_index = left_leaf_index; } if (bit_type == 8) { const uint8_t* column_data = reinterpret_cast(column_data_pointer); - if (is_single_feature_in_group) { + if (is_single_feature_in_column) { GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); } else { @@ -607,7 +607,7 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorCategoricalKernel( } } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(column_data_pointer); - if (is_single_feature_in_group) { + if (is_single_feature_in_column) { GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); } else { @@ -616,7 +616,7 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorCategoricalKernel( } } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(column_data_pointer); - if (is_single_feature_in_group) { + if (is_single_feature_in_column) { GenDataToLeftBitVectorKernel_Categorical<<>>(GenBitVector_Categorical_ARGS); UpdateDataIndexToLeafIndexKernel_Categorical<<>>(UpdateDataIndexToLeafIndex_Categorical_ARGS); } else { diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index ab26493b6766..70a3356a4ff4 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -294,7 +294,7 @@ class CUDADataPartition { /*! \brief marks whether a feature is a categorical feature */ std::vector is_categorical_feature_; /*! \brief marks whether a feature is the only feature in its group */ - std::vector is_single_feature_in_group_; + std::vector is_single_feature_in_column_; // config information /*! \brief maximum number of leaves in a tree */ diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 62d04499545c..7be6efcde314 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -218,7 +218,7 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, &leaf_sum_hessians_[right_leaf_index], &sum_left_gradients, &sum_right_gradients); - CheckSplitValid(best_leaf_index_, right_leaf_index, sum_left_gradients, sum_right_gradients); + CheckSplitValid(leaf_best_split_feature_[best_leaf_index_], best_leaf_index_, right_leaf_index, sum_left_gradients, sum_right_gradients); smaller_leaf_index_ = (leaf_num_data_[best_leaf_index_] < leaf_num_data_[right_leaf_index] ? 
best_leaf_index_ : right_leaf_index); larger_leaf_index_ = (smaller_leaf_index_ == best_leaf_index_ ? right_leaf_index : best_leaf_index_); global_timer.Stop("CUDASingleGPUTreeLearner::Split"); @@ -409,7 +409,7 @@ void CUDASingleGPUTreeLearner::AllocateBitset() { if (bin_mapper->bin_type() == BinType::CategoricalBin) { const int offset = categorical_bin_offsets_[i]; for (int bin = 0; bin < bin_mapper->num_bin(); ++bin) { - categorical_bin_to_value_[offset + bin] = bin_mapper->BinToValue(bin); + categorical_bin_to_value_[offset + bin] = static_cast(bin_mapper->BinToValue(bin)); } } } @@ -424,6 +424,7 @@ void CUDASingleGPUTreeLearner::AllocateBitset() { } void CUDASingleGPUTreeLearner::CheckSplitValid( + const int inner_split_feature, const int left_leaf, const int right_leaf, const double split_sum_left_gradients, @@ -448,10 +449,64 @@ void CUDASingleGPUTreeLearner::CheckSplitValid( sum_right_gradients += gradients_[index]; sum_right_hessians += hessians_[index]; } + Log::Warning("inner_split_feature = %d", inner_split_feature); Log::Warning("sum_left_gradients = %f, split_sum_left_gradients = %f", sum_left_gradients, split_sum_left_gradients); Log::Warning("sum_left_hessians = %f, leaf_sum_hessians_[%d] = %f", sum_left_hessians, left_leaf, leaf_sum_hessians_[left_leaf]); Log::Warning("sum_right_gradients = %f, split_sum_right_gradients = %f", sum_right_gradients, split_sum_right_gradients); Log::Warning("sum_right_hessians = %f, leaf_sum_hessians_[%d] = %f", sum_right_hessians, right_leaf, leaf_sum_hessians_[right_leaf]); + + /*if (train_data_->FeatureBinMapper(inner_split_feature)->bin_type() == BinType::CategoricalBin) { + std::vector host_bitset_inner(cuda_bitset_inner_len_); + CopyFromCUDADeviceToHost(host_bitset_inner.data(), cuda_bitset_inner_, cuda_bitset_inner_len_, __FILE__, __LINE__); + std::vector host_left_data_indices(leaf_num_data_[left_leaf]); + std::vector host_right_data_indices(leaf_num_data_[right_leaf]); + CopyFromCUDADeviceToHost(host_left_data_indices.data(), cuda_data_partition_->cuda_data_indices() + leaf_data_start_[left_leaf], + static_cast(leaf_num_data_[left_leaf]), __FILE__, __LINE__); + CopyFromCUDADeviceToHost(host_right_data_indices.data(), cuda_data_partition_->cuda_data_indices() + leaf_data_start_[right_leaf], + static_cast(leaf_num_data_[right_leaf]), __FILE__, __LINE__); + BinIterator* iter = train_data_->FeatureIterator(inner_split_feature); + for (size_t i = 0; i < host_left_data_indices.size(); ++i) { + const data_size_t data_index = host_left_data_indices[i]; + const uint32_t bin = iter->RawGet(data_index); + const bool to_left = Common::FindInBitset(host_bitset_inner.data(), cuda_bitset_inner_len_, bin); + if (!to_left) { + Log::Warning("error !!! bin = %d found in left"); + } + } + for (size_t i = 0; i < host_right_data_indices.size(); ++i) { + const data_size_t data_index = host_right_data_indices[i]; + const uint32_t bin = iter->RawGet(data_index); + const bool to_right = (bin == 0 || !Common::FindInBitset(host_bitset_inner.data(), cuda_bitset_inner_len_, bin)); + if (!to_right) { + Log::Warning("error !!! 
bin = %d found in right"); + } + } + + // construct histogram manually + std::vector hist(500, 0.0f); + for (size_t i = 0; i < host_left_data_indices.size(); ++i) { + const data_size_t data_index = host_left_data_indices[i]; + const uint32_t bin = iter->RawGet(data_index); + const score_t gradient = gradients_[data_index]; + const score_t hessian = hessians_[data_index]; + hist[2 * bin] += gradient; + hist[2 * bin + 1] += hessian; + } + for (size_t i = 0; i < host_right_data_indices.size(); ++i) { + const data_size_t data_index = host_right_data_indices[i]; + const uint32_t bin = iter->RawGet(data_index); + const score_t gradient = gradients_[data_index]; + const score_t hessian = hessians_[data_index]; + hist[2 * bin] += gradient; + hist[2 * bin + 1] += hessian; + } + + Log::Warning("==================== manual histogram for leaf %d (====================", left_leaf); + for (size_t i = 0; i < 100; ++i) { + Log::Warning("bin %d, grad %f, hess %f", i, hist[2 * i], hist[2 * i + 1]); + } + }*/ + CHECK_LE(std::fabs(sum_left_gradients - split_sum_left_gradients), 1e-6f); CHECK_LE(std::fabs(sum_left_hessians - leaf_sum_hessians_[left_leaf]), 1e-6f); CHECK_LE(std::fabs(sum_right_gradients - split_sum_right_gradients), 1e-6f); diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu index 0df1965da09f..3423b915b7fe 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu @@ -198,10 +198,6 @@ __global__ void ReduceBlockMaxLen(size_t* out_len_buffer, const int num_blocks) template __global__ void CUDAConstructBitsetKernel(const CUDASplitInfo* best_split_info, uint32_t* out, size_t cuda_bitset_len) { const T* vals = nullptr; - for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < cuda_bitset_len; i += blockDim.x) { - out[i] = 0; - } - __syncthreads(); if (IS_INNER) { vals = reinterpret_cast(best_split_info->cat_threshold); } else { @@ -223,14 +219,17 @@ __global__ void SetRealThresholdKernel( const int* categorical_bin_to_value_ptr = categorical_bin_to_value + categorical_bin_offsets[best_split_info->inner_feature_index]; int* cat_threshold_real = best_split_info->cat_threshold_real; const uint32_t* cat_threshold = best_split_info->cat_threshold; - for (int i = 0; i < num_cat_threshold; ++i) { - cat_threshold_real[i] = categorical_bin_to_value_ptr[cat_threshold[i]]; + const int index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (index < num_cat_threshold) { + cat_threshold_real[index] = categorical_bin_to_value_ptr[cat_threshold[index]]; } } template void CUDAConstructBitset(const CUDASplitInfo* best_split_info, const int num_cat_threshold, uint32_t* out, size_t bitset_len) { const int num_blocks = (num_cat_threshold + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + // clear the bitset vector first + SetCUDAMemory(out, 0, bitset_len, __FILE__, __LINE__); CUDAConstructBitsetKernel<<>>(best_split_info, out, bitset_len); } @@ -246,7 +245,9 @@ size_t CUDABitsetLen(const CUDASplitInfo* best_split_info, const int num_cat_thr void CUDASingleGPUTreeLearner::LaunchConstructBitsetForCategoricalSplitKernel( const CUDASplitInfo* best_split_info) { - SetRealThresholdKernel<<<1, 1>>>(best_split_info, cuda_categorical_bin_to_value_, cuda_categorical_bin_offsets_); + const int num_blocks = (num_cat_threshold_ + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + SetRealThresholdKernel<<>> 
+ (best_split_info, cuda_categorical_bin_to_value_, cuda_categorical_bin_offsets_); cuda_bitset_inner_len_ = CUDABitsetLen(best_split_info, num_cat_threshold_, cuda_block_bitset_len_buffer_); CUDAConstructBitset(best_split_info, num_cat_threshold_, cuda_bitset_inner_, cuda_bitset_inner_len_); cuda_bitset_len_ = CUDABitsetLen(best_split_info, num_cat_threshold_, cuda_block_bitset_len_buffer_); diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index c4ea569247a1..680216c01758 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -64,7 +64,9 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { void AllocateBitset(); - void CheckSplitValid(const int left_leaf, const int right_leaf, const double sum_left_gradients, const double sum_right_gradients); + void CheckSplitValid( + const int inner_split_feature, + const int left_leaf, const int right_leaf, const double sum_left_gradients, const double sum_right_gradients); // GPU device ID int gpu_device_id_; From 5efe0fb78b734efc2e695aca401ddb5122c7291d Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 17 Nov 2021 06:24:03 +0000 Subject: [PATCH 117/166] reset histogram pointers in data partition and spllit finder in ResetConfig --- CMakeLists.txt | 2 +- include/LightGBM/cuda/cuda_utils.h | 3 +++ src/cuda/cuda_utils.cpp | 8 ++++++++ src/io/cuda/cuda_column_data.cpp | 4 ++-- src/io/cuda/cuda_metadata.cpp | 4 ++-- src/io/cuda/cuda_row_data.cpp | 4 ++-- src/io/cuda/cuda_tree.cpp | 4 ++-- src/treelearner/cuda/cuda_best_split_finder.cpp | 3 ++- src/treelearner/cuda/cuda_best_split_finder.hpp | 2 +- src/treelearner/cuda/cuda_data_partition.cpp | 3 ++- src/treelearner/cuda/cuda_data_partition.hpp | 2 +- src/treelearner/cuda/cuda_histogram_constructor.cu | 12 +++--------- .../cuda/cuda_histogram_constructor.hpp | 2 -- .../cuda/cuda_single_gpu_tree_learner.cpp | 14 +++++++++----- .../cuda/cuda_single_gpu_tree_learner.hpp | 5 ++++- 15 files changed, 42 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8706e87bf040..4c9068f0b88f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -358,7 +358,7 @@ if(USE_CUDA) src/io/cuda/*.cpp src/cuda/*.cpp src/cuda/*.cu -endif(USE_CUDA) +endif() ) add_executable(lightgbm src/main.cpp src/application/application.cpp ${SOURCES}) diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index a4efad6ebaca..46f26ff97018 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -28,6 +28,8 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = #define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); } +void SetCUDADevice(int gpu_device_id, const char* file, int line); + template void AllocateCUDAMemory(T** out_ptr, size_t size, const char* file, const int line) { void* tmp_ptr = nullptr; @@ -86,6 +88,7 @@ void SynchronizeCUDADevice(const char* file, const int line); template void SetCUDAMemory(T* dst_ptr, int value, size_t size, const char* file, const int line) { CUDASUCCESS_OR_FATAL_OUTER(cudaMemset(reinterpret_cast(dst_ptr), value, size * sizeof(T))); + SynchronizeCUDADevice(file, line); } template diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp index e99bd1ec78f6..67c1c3678f9d 100644 --- a/src/cuda/cuda_utils.cpp +++ b/src/cuda/cuda_utils.cpp @@ -18,6 +18,14 @@ void PrintLastCUDAError() { Log::Warning(error_name); } +void 
SetCUDADevice(int gpu_device_id, const char* file, int line) { + int cur_gpu_device_id = 0; + CUDASUCCESS_OR_FATAL_OUTER(cudaGetDevice(&cur_gpu_device_id)); + if (cur_gpu_device_id != gpu_device_id) { + CUDASUCCESS_OR_FATAL_OUTER(cudaSetDevice(gpu_device_id)); + } +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/io/cuda/cuda_column_data.cpp b/src/io/cuda/cuda_column_data.cpp index 48e02702f5f7..a1080cb2b902 100644 --- a/src/io/cuda/cuda_column_data.cpp +++ b/src/io/cuda/cuda_column_data.cpp @@ -13,9 +13,9 @@ CUDAColumnData::CUDAColumnData(const data_size_t num_data, const int gpu_device_ num_threads_ = OMP_NUM_THREADS(); num_data_ = num_data; if (gpu_device_id >= 0) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); + SetCUDADevice(gpu_device_id, __FILE__, __LINE__); } else { - CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + SetCUDADevice(0, __FILE__, __LINE__); } cuda_used_indices_ = nullptr; cuda_data_by_column_ = nullptr; diff --git a/src/io/cuda/cuda_metadata.cpp b/src/io/cuda/cuda_metadata.cpp index e269fb0c7c2d..1f83fd7efdf2 100644 --- a/src/io/cuda/cuda_metadata.cpp +++ b/src/io/cuda/cuda_metadata.cpp @@ -11,9 +11,9 @@ namespace LightGBM { CUDAMetadata::CUDAMetadata(const int gpu_device_id) { if (gpu_device_id >= 0) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); + SetCUDADevice(gpu_device_id, __FILE__, __LINE__); } else { - CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + SetCUDADevice(0, __FILE__, __LINE__); } cuda_label_ = nullptr; cuda_weights_ = nullptr; diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index 3ba3deff72b3..20172eb1d61f 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -23,9 +23,9 @@ CUDARowData::CUDARowData(const Dataset* train_data, num_feature_group_ = train_data->num_feature_groups(); num_feature_ = train_data->num_features(); if (gpu_device_id >= 0) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); + SetCUDADevice(gpu_device_id, __FILE__, __LINE__); } else { - CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + SetCUDADevice(0, __FILE__, __LINE__); } cuda_data_uint8_t_ = nullptr; cuda_data_uint16_t_ = nullptr; diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index f1d95e95f050..f318e66fd2c5 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -15,9 +15,9 @@ Tree(max_leaves, track_branch_features, is_linear), num_threads_per_block_add_prediction_to_score_(1024) { is_cuda_tree_ = true; if (gpu_device_id >= 0) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); + SetCUDADevice(gpu_device_id, __FILE__, __LINE__); } else { - CUDASUCCESS_OR_FATAL(cudaSetDevice(0)); + SetCUDADevice(0, __FILE__, __LINE__); } if (has_categorical_feature) { cuda_cat_boundaries_.Resize(max_leaves); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 0fb88395549d..f48e54a4ec6e 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -261,7 +261,7 @@ void CUDABestSplitFinder::ResetTrainingData( InitCUDAFeatureMetaInfo(); } -void CUDABestSplitFinder::ResetConfig(const Config* config) { +void CUDABestSplitFinder::ResetConfig(const Config* config, const hist_t* cuda_hist) { num_leaves_ = config->num_leaves; lambda_l1_ = config->lambda_l1; lambda_l2_ = config->lambda_l2; @@ -277,6 +277,7 @@ void CUDABestSplitFinder::ResetConfig(const Config* config) { extra_seed_ = config->extra_seed; use_smoothing_ = (config->path_smooth > 0.0f); path_smooth_ = 
config->path_smooth; + cuda_hist_ = cuda_hist; const int num_task_blocks = (num_tasks_ + NUM_TASKS_PER_SYNC_BLOCK - 1) / NUM_TASKS_PER_SYNC_BLOCK; size_t cuda_best_leaf_split_info_buffer_size = static_cast(num_task_blocks) * static_cast(num_leaves_); diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index fb27cb6d7ca5..fbdff366c1de 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -87,7 +87,7 @@ class CUDABestSplitFinder { const Dataset* train_data, const std::vector& feature_hist_offsets); - void ResetConfig(const Config* config); + void ResetConfig(const Config* config, const hist_t* cuda_hist); private: diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 4fa3c906cdfc..d48d024c9d24 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -361,9 +361,10 @@ void CUDADataPartition::ResetTrainingData(const Dataset* train_data, const int n cur_num_leaves_ = 1; } -void CUDADataPartition::ResetConfig(const Config* config) { +void CUDADataPartition::ResetConfig(const Config* config, hist_t* cuda_hist) { num_threads_ = OMP_NUM_THREADS(); num_leaves_ = config->num_leaves; + cuda_hist_ = cuda_hist; DeallocateCUDAMemory(&cuda_leaf_data_start_, __FILE__, __LINE__); DeallocateCUDAMemory(&cuda_leaf_data_end_, __FILE__, __LINE__); DeallocateCUDAMemory(&cuda_leaf_num_data_, __FILE__, __LINE__); diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 70a3356a4ff4..b5b7fa9eebc8 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -74,7 +74,7 @@ class CUDADataPartition { void ResetTrainingData(const Dataset* train_data, const int num_total_bin, hist_t* cuda_hist); - void ResetConfig(const Config* config); + void ResetConfig(const Config* config, hist_t* cuda_hist); void ResetByLeafPred(const std::vector& leaf_pred, int num_leaves); diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index f4e0ba66a9f3..9168d03f017f 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -42,19 +42,14 @@ __global__ void CUDAConstructHistogramDenseKernel( shared_hist[i] = 0.0f; } __syncthreads(); - const unsigned int threadIdx_y = threadIdx.y; const unsigned int blockIdx_y = blockIdx.y; const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * static_cast(blockDim.y))); - const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; - const data_size_t remainder = block_num_data % blockDim.y; - const data_size_t num_iteration_this = remainder == 0 ? 
num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); - data_size_t inner_data_index = static_cast(threadIdx_y); const int column_index = static_cast(threadIdx.x) + partition_column_start; if (threadIdx.x < static_cast(num_columns_in_partition)) { double* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); - for (data_size_t i = 0; i < num_iteration_this; ++i) { + for (data_size_t inner_data_index = static_cast(threadIdx.y); inner_data_index < block_num_data; inner_data_index += blockDim.y) { const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; const score_t grad = cuda_gradients[data_index]; const score_t hess = cuda_hessians[data_index]; @@ -63,7 +58,6 @@ __global__ void CUDAConstructHistogramDenseKernel( double* pos_ptr = shared_hist_ptr + pos; atomicAdd_block(pos_ptr, grad); atomicAdd_block(pos_ptr + 1, hess); - inner_data_index += blockDim.y; } } __syncthreads(); @@ -522,8 +516,8 @@ __global__ void SubtractHistogramKernel( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const CUDALeafSplitsStruct* cuda_larger_leaf_splits) { const unsigned int global_thread_index = threadIdx.x + blockIdx.x * blockDim.x; - const int cuda_larger_leaf_index_ref = cuda_larger_leaf_splits->leaf_index; - if (cuda_larger_leaf_index_ref >= 0) { + const int cuda_larger_leaf_index = cuda_larger_leaf_splits->leaf_index; + if (cuda_larger_leaf_index >= 0) { const hist_t* smaller_leaf_hist = cuda_smaller_leaf_splits->hist_in_leaf; hist_t* larger_leaf_hist = cuda_larger_leaf_splits->hist_in_leaf; if (global_thread_index < 2 * num_total_bin) { diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 86cbf64151a8..852974e743a8 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -59,8 +59,6 @@ class CUDAHistogramConstructor { const hist_t* cuda_hist() const { return cuda_hist_; } - hist_t* cuda_hist_pointer() const { return cuda_hist_; } - hist_t* cuda_hist_pointer() { return cuda_hist_; } private: diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 7be6efcde314..859d3cdb6a8f 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -33,7 +33,7 @@ void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_ num_threads_ = OMP_NUM_THREADS(); // use the first gpu by default gpu_device_id_ = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id_)); + SetCUDADevice(gpu_device_id_, __FILE__, __LINE__); cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_)); cuda_smaller_leaf_splits_->Init(); @@ -218,7 +218,9 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, &leaf_sum_hessians_[right_leaf_index], &sum_left_gradients, &sum_right_gradients); - CheckSplitValid(leaf_best_split_feature_[best_leaf_index_], best_leaf_index_, right_leaf_index, sum_left_gradients, sum_right_gradients); + CheckSplitValid(leaf_best_split_feature_[best_leaf_index_], leaf_best_split_threshold_[best_leaf_index_], + best_leaf_index_, right_leaf_index, sum_left_gradients, sum_right_gradients, + leaf_num_data_[best_leaf_index_], leaf_num_data_[right_leaf_index]); smaller_leaf_index_ = (leaf_num_data_[best_leaf_index_] < leaf_num_data_[right_leaf_index] ? 
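// ---- editor's note (illustrative sketch, not part of the patch series) ----
// The SubtractHistogramKernel touched above relies on LightGBM's histogram
// subtraction trick: every row of the parent leaf lands in exactly one child, so
// parent_hist = smaller_hist + larger_hist, and the larger child's histogram can
// be obtained bin by bin as parent - smaller instead of being rebuilt from the
// data. As the kernel's signature suggests, the larger leaf's buffer starts out
// holding the parent histogram and is updated in place. Host-side illustration
// with made-up names; entries are interleaved (gradient, hessian) pairs, two per
// bin, as in the kernels above.
#include <cstddef>
#include <vector>

using hist_t = double;

// On entry *parent_hist holds the parent leaf; on return it holds the larger child.
void SubtractHistogram(std::vector<hist_t>* parent_hist,
                       const std::vector<hist_t>& smaller_hist) {
  for (std::size_t i = 0; i < parent_hist->size(); ++i) {
    (*parent_hist)[i] -= smaller_hist[i];  // works for both gradient and hessian slots
  }
}
// ---- end editor's note ----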
best_leaf_index_ : right_leaf_index); larger_leaf_index_ = (smaller_leaf_index_ == best_leaf_index_ ? right_leaf_index : best_leaf_index_); global_timer.Stop("CUDASingleGPUTreeLearner::Split"); @@ -266,8 +268,8 @@ void CUDASingleGPUTreeLearner::ResetConfig(const Config* config) { leaf_sum_hessians_.resize(config_->num_leaves, 0.0f); } cuda_histogram_constructor_->ResetConfig(config); - cuda_best_split_finder_->ResetConfig(config); - cuda_data_partition_->ResetConfig(config); + cuda_best_split_finder_->ResetConfig(config, cuda_histogram_constructor_->cuda_hist()); + cuda_data_partition_->ResetConfig(config, cuda_histogram_constructor_->cuda_hist_pointer()); } void CUDASingleGPUTreeLearner::SetBaggingData(const Dataset* /*subset*/, @@ -425,10 +427,12 @@ void CUDASingleGPUTreeLearner::AllocateBitset() { void CUDASingleGPUTreeLearner::CheckSplitValid( const int inner_split_feature, + const uint32_t inner_threshold, const int left_leaf, const int right_leaf, const double split_sum_left_gradients, - const double split_sum_right_gradients) { + const double split_sum_right_gradients, + const data_size_t left_count, const data_size_t right_count) { std::vector left_data_indices(leaf_num_data_[left_leaf]); std::vector right_data_indices(leaf_num_data_[right_leaf]); CopyFromCUDADeviceToHost(left_data_indices.data(), diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index 680216c01758..9c2164f7378c 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -66,7 +66,10 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { void CheckSplitValid( const int inner_split_feature, - const int left_leaf, const int right_leaf, const double sum_left_gradients, const double sum_right_gradients); + const uint32_t inner_threshold, + const int left_leaf, const int right_leaf, + const double sum_left_gradients, const double sum_right_gradients, + const data_size_t left_count, const data_size_t right_count); // GPU device ID int gpu_device_id_; From 0bb88fb5d4eadbdfbd162bcf9ee7d1c82e51ffc0 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 17 Nov 2021 06:56:17 +0000 Subject: [PATCH 118/166] comment useless parameter --- src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index f271f1801486..474003599eb7 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -426,7 +426,7 @@ void CUDASingleGPUTreeLearner::AllocateBitset() { } void CUDASingleGPUTreeLearner::CheckSplitValid( - const int inner_split_feature, + const int /*inner_split_feature*/, const uint32_t /*inner_threshold*/, const int left_leaf, const int right_leaf, From 0678d9af5e01fb838d314be34f076b295a68d114 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 18 Nov 2021 05:41:14 +0000 Subject: [PATCH 119/166] fix reverse case when na is missing and default bin is zero --- .../cuda/cuda_best_split_finder.cu | 91 +++++++++--- src/treelearner/cuda/cuda_data_partition.cu | 136 ++++++++++++------ src/treelearner/cuda/cuda_data_partition.hpp | 40 ++++-- 3 files changed, 190 insertions(+), 77 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 99537a9a2c12..d464f0556990 100644 --- 
a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -168,10 +168,18 @@ __device__ void FindBestSplitsForLeafKernelInner( (task->skip_default_bin && (threadIdx_x + task->mfb_offset) == static_cast(task->default_bin)); const uint32_t feature_num_bin_minus_offset = task->num_bin - task->mfb_offset; if (!REVERSE) { - if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { - const unsigned int bin_offset = threadIdx_x << 1; - local_grad_hist = feature_hist_ptr[bin_offset]; - local_hess_hist = feature_hist_ptr[bin_offset + 1]; + if (task->na_as_missing && task->mfb_offset == 1) { + if (threadIdx_x < static_cast(task->num_bin) && threadIdx_x > 0) { + const unsigned int bin_offset = (threadIdx_x - 1) << 1; + local_grad_hist = feature_hist_ptr[bin_offset]; + local_hess_hist = feature_hist_ptr[bin_offset + 1]; + } + } else { + if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { + const unsigned int bin_offset = threadIdx_x << 1; + local_grad_hist = feature_hist_ptr[bin_offset]; + local_hess_hist = feature_hist_ptr[bin_offset + 1]; + } } } else { if (threadIdx_x >= static_cast(task->na_as_missing) && @@ -183,6 +191,15 @@ __device__ void FindBestSplitsForLeafKernelInner( } } __syncthreads(); + if (!REVERSE && task->na_as_missing && task->mfb_offset == 1) { + const hist_t sum_gradients_non_default = ShuffleReduceSum(local_grad_hist, shared_double_buffer, blockDim.x); + __syncthreads(); + const hist_t sum_hessians_non_default = ShuffleReduceSum(local_hess_hist, shared_double_buffer, blockDim.x); + if (threadIdx_x == 0) { + local_grad_hist += (sum_gradients - sum_gradients_non_default); + local_hess_hist += (sum_hessians - sum_hessians_non_default); + } + } if (threadIdx_x == 0) { local_hess_hist += kEpsilon; } @@ -214,7 +231,8 @@ __device__ void FindBestSplitsForLeafKernelInner( } } } else { - if (threadIdx_x <= feature_num_bin_minus_offset - 2 && !skip_sum) { + const uint32_t end = (task->na_as_missing && task->mfb_offset == 1) ? static_cast(task->num_bin - 2) : feature_num_bin_minus_offset - 2; + if (threadIdx_x <= end && !skip_sum) { const double sum_left_gradient = local_grad_hist; const double sum_left_hessian = local_hess_hist; const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); @@ -231,7 +249,9 @@ __device__ void FindBestSplitsForLeafKernelInner( // gain with split is worse than without split if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; - threshold_value = static_cast(threadIdx_x + task->mfb_offset); + threshold_value = (task->na_as_missing && task->mfb_offset == 1) ? 
+ static_cast(threadIdx_x) : + static_cast(threadIdx_x + task->mfb_offset); threshold_found = true; } } @@ -735,26 +755,48 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( } } __shared__ uint32_t best_thread_index; - __shared__ double shared_gain_buffer[32]; + __shared__ double shared_double_buffer[32]; __shared__ bool shared_found_buffer[32]; __shared__ uint32_t shared_thread_index_buffer[32]; const unsigned int threadIdx_x = threadIdx.x; const uint32_t feature_num_bin_minus_offset = task->num_bin - task->mfb_offset; if (!REVERSE) { - for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; ++bin) { - const bool skip_sum = - (task->skip_default_bin && (bin + task->mfb_offset) == static_cast(task->default_bin)); - if (!skip_sum) { - const unsigned int bin_offset = bin << 1; - hist_grad_buffer_ptr[bin] = feature_hist_ptr[bin_offset]; - hist_hess_buffer_ptr[bin] = feature_hist_ptr[bin_offset + 1]; - } else { - hist_grad_buffer_ptr[bin] = 0.0f; - hist_hess_buffer_ptr[bin] = 0.0f; + if (task->na_as_missing && task->mfb_offset == 1) { + uint32_t bin_start = threadIdx_x > 0 ? threadIdx_x : blockDim.x; + hist_t thread_sum_gradients = 0.0f; + hist_t thread_sum_hessians = 0.0f; + for (unsigned int bin = bin_start; bin < static_cast(task->num_bin); bin += blockDim.x) { + const unsigned int bin_offset = (bin - 1) << 1; + const hist_t grad = feature_hist_ptr[bin_offset]; + const hist_t hess = feature_hist_ptr[bin_offset + 1]; + hist_grad_buffer_ptr[bin] = grad; + hist_hess_buffer_ptr[bin] = hess; + thread_sum_gradients += grad; + thread_sum_hessians += hess; + } + const hist_t sum_gradients_non_default = ShuffleReduceSum(thread_sum_gradients, shared_double_buffer, blockDim.x); + __syncthreads(); + const hist_t sum_hessians_non_default = ShuffleReduceSum(thread_sum_hessians, shared_double_buffer, blockDim.x); + if (threadIdx_x == 0) { + hist_grad_buffer_ptr[0] = sum_gradients - sum_gradients_non_default; + hist_hess_buffer_ptr[0] = sum_hessians - sum_hessians_non_default; + } + } else { + for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; bin += blockDim.x) { + const bool skip_sum = + (task->skip_default_bin && (bin + task->mfb_offset) == static_cast(task->default_bin)); + if (!skip_sum) { + const unsigned int bin_offset = bin << 1; + hist_grad_buffer_ptr[bin] = feature_hist_ptr[bin_offset]; + hist_hess_buffer_ptr[bin] = feature_hist_ptr[bin_offset + 1]; + } else { + hist_grad_buffer_ptr[bin] = 0.0f; + hist_hess_buffer_ptr[bin] = 0.0f; + } } } } else { - for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; ++bin) { + for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; bin += blockDim.x) { const bool skip_sum = bin >= static_cast(task->na_as_missing) && (task->skip_default_bin && (task->num_bin - 1 - bin) == static_cast(task->default_bin)); if (!skip_sum) { @@ -777,7 +819,7 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( __syncthreads(); GlobalMemoryPrefixSum(hist_hess_buffer_ptr, static_cast(feature_num_bin_minus_offset)); if (REVERSE) { - for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; ++bin) { + for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; bin += blockDim.x) { const bool skip_sum = (bin >= static_cast(task->na_as_missing) && (task->skip_default_bin && (task->num_bin - 1 - bin) == static_cast(task->default_bin))); if (!skip_sum) { @@ -804,7 +846,8 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( } } } else { - for 
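// ---- editor's note (illustrative sketch, not part of the patch series) ----
// What the na_as_missing / mfb_offset == 1 branches added above compute: when the
// most frequent bin is skipped in the histogram (mfb_offset == 1) but the missing
// values map to that skipped bin, its gradient/hessian totals are stored nowhere,
// so the kernels recover them as the leaf totals minus the sum over the stored
// bins (the block-wide reductions above) and treat the remainder as the first bin
// of the scan. Scalar illustration with made-up names.
#include <vector>

struct BinStats { double grad; double hess; };

BinStats ReconstructSkippedBin(double leaf_sum_gradients, double leaf_sum_hessians,
                               const std::vector<BinStats>& stored_bins) {
  double grad_stored = 0.0, hess_stored = 0.0;
  for (const BinStats& b : stored_bins) {
    grad_stored += b.grad;
    hess_stored += b.hess;
  }
  // everything not accounted for by the stored bins belongs to the skipped bin
  return BinStats{leaf_sum_gradients - grad_stored, leaf_sum_hessians - hess_stored};
}
// ---- end editor's note ----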
(unsigned int bin = threadIdx_x; bin <= feature_num_bin_minus_offset - 2; ++bin) { + const uint32_t end = (task->na_as_missing && task->mfb_offset == 1) ? static_cast(task->num_bin - 2) : feature_num_bin_minus_offset - 2; + for (unsigned int bin = threadIdx_x; bin <= end; bin += blockDim.x) { const bool skip_sum = (task->skip_default_bin && (bin + task->mfb_offset) == static_cast(task->default_bin)); if (!skip_sum) { @@ -824,7 +867,8 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( // gain with split is worse than without split if (current_gain > min_gain_shift) { local_gain = current_gain - min_gain_shift; - threshold_value = static_cast(bin + task->mfb_offset); + threshold_value = (task->na_as_missing && task->mfb_offset == 1) ? + bin : static_cast(bin + task->mfb_offset); threshold_found = true; } } @@ -832,7 +876,7 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( } } __syncthreads(); - const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_gain_buffer, shared_found_buffer, shared_thread_index_buffer); + const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_double_buffer, shared_found_buffer, shared_thread_index_buffer); if (threadIdx_x == 0) { best_thread_index = result; } @@ -867,7 +911,8 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_right_gradient, sum_right_hessian, lambda_l1, lambda_l2, right_output); } else { - const unsigned int best_bin = static_cast(threshold_value - task->mfb_offset); + const unsigned int best_bin = (task->na_as_missing && task->mfb_offset == 1) ? + threshold_value : static_cast(threshold_value - task->mfb_offset); const double sum_left_gradient = hist_grad_buffer_ptr[best_bin]; const double sum_left_hessian = hist_hess_buffer_ptr[best_bin] - kEpsilon; const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 23966c2a3249..5453b49c1301 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -107,7 +107,7 @@ __device__ bool CUDAFindInBitset(const uint32_t* bits, int n, T pos) { default_leaf_index, \ missing_default_leaf_index -template +template __global__ void UpdateDataIndexToLeafIndexKernel( UpdateDataIndexToLeafIndexKernel_PARAMS, int* cuda_data_index_to_leaf_index) { @@ -119,7 +119,8 @@ __global__ void UpdateDataIndexToLeafIndexKernel( if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin)) { cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; - } else if (bin < min_bin || bin > max_bin) { + } else if ((USE_MIN_BIN && (bin < min_bin || bin > max_bin)) || + (!USE_MIN_BIN && bin == 0)) { if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO && MFB_IS_ZERO)) { cuda_data_index_to_leaf_index[global_data_index] = missing_default_leaf_index; } else { @@ -161,22 +162,23 @@ void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel( const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_to_left) { + const bool max_to_left, + const bool is_single_feature_in_column) { if (min_bin < max_bin) { if (!missing_is_zero) { LaunchUpdateDataIndexToLeafIndexKernel_Inner0 - (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left); + 
(UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left, is_single_feature_in_column); } else { LaunchUpdateDataIndexToLeafIndexKernel_Inner0 - (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left); + (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left, is_single_feature_in_column); } } else { if (!missing_is_zero) { LaunchUpdateDataIndexToLeafIndexKernel_Inner0 - (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left); + (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left, is_single_feature_in_column); } else { LaunchUpdateDataIndexToLeafIndexKernel_Inner0 - (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left); + (UpdateDataIndexToLeafIndex_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_to_left, is_single_feature_in_column); } } } @@ -187,13 +189,14 @@ void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner0( const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_to_left) { + const bool max_to_left, + const bool is_single_feature_in_column) { if (!missing_is_na) { LaunchUpdateDataIndexToLeafIndexKernel_Inner1 - (UpdateDataIndexToLeafIndex_ARGS, mfb_is_zero, mfb_is_na, max_to_left); + (UpdateDataIndexToLeafIndex_ARGS, mfb_is_zero, mfb_is_na, max_to_left, is_single_feature_in_column); } else { LaunchUpdateDataIndexToLeafIndexKernel_Inner1 - (UpdateDataIndexToLeafIndex_ARGS, mfb_is_zero, mfb_is_na, max_to_left); + (UpdateDataIndexToLeafIndex_ARGS, mfb_is_zero, mfb_is_na, max_to_left, is_single_feature_in_column); } } @@ -202,13 +205,14 @@ void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner1( UpdateDataIndexToLeafIndexKernel_PARAMS, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_to_left) { + const bool max_to_left, + const bool is_single_feature_in_column) { if (!mfb_is_zero) { LaunchUpdateDataIndexToLeafIndexKernel_Inner2 - (UpdateDataIndexToLeafIndex_ARGS, mfb_is_na, max_to_left); + (UpdateDataIndexToLeafIndex_ARGS, mfb_is_na, max_to_left, is_single_feature_in_column); } else { LaunchUpdateDataIndexToLeafIndexKernel_Inner2 - (UpdateDataIndexToLeafIndex_ARGS, mfb_is_na, max_to_left); + (UpdateDataIndexToLeafIndex_ARGS, mfb_is_na, max_to_left, is_single_feature_in_column); } } @@ -216,27 +220,42 @@ template - (UpdateDataIndexToLeafIndex_ARGS, max_to_left); + (UpdateDataIndexToLeafIndex_ARGS, max_to_left, is_single_feature_in_column); } else { LaunchUpdateDataIndexToLeafIndexKernel_Inner3 - (UpdateDataIndexToLeafIndex_ARGS, max_to_left); + (UpdateDataIndexToLeafIndex_ARGS, max_to_left, is_single_feature_in_column); } } template void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner3( UpdateDataIndexToLeafIndexKernel_PARAMS, - const bool max_to_left) { + const bool max_to_left, + const bool is_single_feature_in_column) { if (!max_to_left) { - UpdateDataIndexToLeafIndexKernel + LaunchUpdateDataIndexToLeafIndexKernel_Inner4 + (UpdateDataIndexToLeafIndex_ARGS, is_single_feature_in_column); + } else { + LaunchUpdateDataIndexToLeafIndexKernel_Inner4 + (UpdateDataIndexToLeafIndex_ARGS, is_single_feature_in_column); + } +} + +template +void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner4( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool is_single_feature_in_column) { + if (!is_single_feature_in_column) { + UpdateDataIndexToLeafIndexKernel <<>>( UpdateDataIndexToLeafIndex_ARGS, 
cuda_data_index_to_leaf_index_); } else { - UpdateDataIndexToLeafIndexKernel + UpdateDataIndexToLeafIndexKernel <<>>( UpdateDataIndexToLeafIndex_ARGS, cuda_data_index_to_leaf_index_); @@ -265,7 +284,7 @@ void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner3( split_default_to_left, \ split_missing_default_to_left -template +template __global__ void GenDataToLeftBitVectorKernel( GenDataToLeftBitVectorKernel_PARMS, uint16_t* block_to_left_offset, @@ -281,7 +300,8 @@ __global__ void GenDataToLeftBitVectorKernel( if ((MISSING_IS_ZERO && !MFB_IS_ZERO && bin == t_zero_bin) || (MISSING_IS_NA && !MFB_IS_NA && bin == max_bin)) { thread_to_left_offset_cnt = split_missing_default_to_left; - } else if ((bin < min_bin || bin > max_bin)) { + } else if ((USE_MIN_BIN && (bin < min_bin || bin > max_bin)) || + (!USE_MIN_BIN && bin == 0)) { if ((MISSING_IS_NA && MFB_IS_NA) || (MISSING_IS_ZERO || MFB_IS_ZERO)) { thread_to_left_offset_cnt = split_missing_default_to_left; } else { @@ -320,22 +340,23 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner( const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_bin_to_left) { + const bool max_bin_to_left, + const bool is_single_feature_in_column) { if (min_bin < max_bin) { if (!missing_is_zero) { LaunchGenDataToLeftBitVectorKernelInner0 - (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left); + (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left, is_single_feature_in_column); } else { LaunchGenDataToLeftBitVectorKernelInner0 - (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left); + (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left, is_single_feature_in_column); } } else { if (!missing_is_zero) { LaunchGenDataToLeftBitVectorKernelInner0 - (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left); + (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left, is_single_feature_in_column); } else { LaunchGenDataToLeftBitVectorKernelInner0 - (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left); + (GenBitVector_ARGS, missing_is_na, mfb_is_zero, mfb_is_na, max_bin_to_left, is_single_feature_in_column); } } } @@ -346,13 +367,14 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner0( const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_bin_to_left) { + const bool max_bin_to_left, + const bool is_single_feature_in_column) { if (!missing_is_na) { LaunchGenDataToLeftBitVectorKernelInner1 - (GenBitVector_ARGS, mfb_is_zero, mfb_is_na, max_bin_to_left); + (GenBitVector_ARGS, mfb_is_zero, mfb_is_na, max_bin_to_left, is_single_feature_in_column); } else { LaunchGenDataToLeftBitVectorKernelInner1 - (GenBitVector_ARGS, mfb_is_zero, mfb_is_na, max_bin_to_left); + (GenBitVector_ARGS, mfb_is_zero, mfb_is_na, max_bin_to_left, is_single_feature_in_column); } } @@ -361,13 +383,14 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner1( GenDataToLeftBitVectorKernel_PARMS, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_bin_to_left) { + const bool max_bin_to_left, + const bool is_single_feature_in_column) { if (!mfb_is_zero) { LaunchGenDataToLeftBitVectorKernelInner2 - (GenBitVector_ARGS, mfb_is_na, max_bin_to_left); + (GenBitVector_ARGS, mfb_is_na, max_bin_to_left, is_single_feature_in_column); } else { LaunchGenDataToLeftBitVectorKernelInner2 - (GenBitVector_ARGS, mfb_is_na, max_bin_to_left); + 
(GenBitVector_ARGS, mfb_is_na, max_bin_to_left, is_single_feature_in_column); } } @@ -375,30 +398,48 @@ template - (GenBitVector_ARGS, max_bin_to_left); + (GenBitVector_ARGS, max_bin_to_left, is_single_feature_in_column); } else { LaunchGenDataToLeftBitVectorKernelInner3 - (GenBitVector_ARGS, max_bin_to_left); + (GenBitVector_ARGS, max_bin_to_left, is_single_feature_in_column); } } template void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner3( GenDataToLeftBitVectorKernel_PARMS, - const bool max_bin_to_left) { + const bool max_bin_to_left, + const bool is_single_feature_in_column) { if (!max_bin_to_left) { - GenDataToLeftBitVectorKernel + LaunchGenDataToLeftBitVectorKernelInner4 + (GenBitVector_ARGS, is_single_feature_in_column); + } else { + LaunchGenDataToLeftBitVectorKernelInner4 + + (GenBitVector_ARGS, is_single_feature_in_column); + } +} + +template +void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner4( + GenDataToLeftBitVectorKernel_PARMS, + const bool is_single_feature_in_column) { + Log::Warning("is_single_feature_in_column = %d", static_cast(is_single_feature_in_column)); + if (!is_single_feature_in_column) { + GenDataToLeftBitVectorKernel + <<>>(GenBitVector_ARGS, cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_); } else { GenDataToLeftBitVectorKernel - + <<>>(GenBitVector_ARGS, cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_); } @@ -416,6 +457,7 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel( const bool missing_is_na = static_cast(cuda_column_data_->feature_missing_is_na(split_feature_index)); const bool mfb_is_zero = static_cast(cuda_column_data_->feature_mfb_is_zero(split_feature_index)); const bool mfb_is_na = static_cast(cuda_column_data_->feature_mfb_is_na(split_feature_index)); + const bool is_single_feature_in_column = is_single_feature_in_column_[split_feature_index]; const uint32_t default_bin = cuda_column_data_->feature_default_bin(split_feature_index); const uint32_t most_freq_bin = cuda_column_data_->feature_most_freq_bin(split_feature_index); const uint32_t min_bin = cuda_column_data_->feature_min_bin(split_feature_index); @@ -456,14 +498,16 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel( missing_is_na, mfb_is_zero, mfb_is_na, - max_bin_to_left); + max_bin_to_left, + is_single_feature_in_column); LaunchUpdateDataIndexToLeafIndexKernel( UpdateDataIndexToLeafIndex_ARGS, missing_is_zero, missing_is_na, mfb_is_zero, mfb_is_na, - max_bin_to_left); + max_bin_to_left, + is_single_feature_in_column); } else if (bit_type == 16) { const uint16_t* column_data = reinterpret_cast(column_data_pointer); LaunchGenDataToLeftBitVectorKernelInner( @@ -472,14 +516,16 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel( missing_is_na, mfb_is_zero, mfb_is_na, - max_bin_to_left); + max_bin_to_left, + is_single_feature_in_column); LaunchUpdateDataIndexToLeafIndexKernel( UpdateDataIndexToLeafIndex_ARGS, missing_is_zero, missing_is_na, mfb_is_zero, mfb_is_na, - max_bin_to_left); + max_bin_to_left, + is_single_feature_in_column); } else if (bit_type == 32) { const uint32_t* column_data = reinterpret_cast(column_data_pointer); LaunchGenDataToLeftBitVectorKernelInner( @@ -488,14 +534,16 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel( missing_is_na, mfb_is_zero, mfb_is_na, - max_bin_to_left); + max_bin_to_left, + is_single_feature_in_column); LaunchUpdateDataIndexToLeafIndexKernel( UpdateDataIndexToLeafIndex_ARGS, 
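// A sketch of the bin-range test introduced above in GenDataToLeftBitVectorKernel
// (the helper name is illustrative). When several features are bundled into one
// column, a stored bin outside [min_bin, max_bin] belongs to a different feature and
// is routed like the default/most-frequent bin; when the split feature is the only
// one in its column, min_bin is effectively 1 and only bin == 0 plays that role,
// which is what the new USE_MIN_BIN template parameter selects between.
#include <cstdint>
inline bool IsDefaultOrOtherFeatureBin(const bool use_min_bin, const uint32_t bin,
                                       const uint32_t min_bin, const uint32_t max_bin) {
  return (use_min_bin && (bin < min_bin || bin > max_bin)) ||
         (!use_min_bin && bin == 0);
}
// end of sketch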
missing_is_zero, missing_is_na, mfb_is_zero, mfb_is_na, - max_bin_to_left); + max_bin_to_left, + is_single_feature_in_column); } } diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index b5b7fa9eebc8..c015f6d589d9 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -188,7 +188,8 @@ class CUDADataPartition { const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_bin_to_left); + const bool max_bin_to_left, + const bool is_single_feature_in_column); template void LaunchGenDataToLeftBitVectorKernelInner0( @@ -196,25 +197,34 @@ class CUDADataPartition { const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_bin_to_left); + const bool max_bin_to_left, + const bool is_single_feature_in_column); template void LaunchGenDataToLeftBitVectorKernelInner1( GenDataToLeftBitVectorKernel_PARMS, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_bin_to_left); + const bool max_bin_to_left, + const bool is_single_feature_in_column); template void LaunchGenDataToLeftBitVectorKernelInner2( GenDataToLeftBitVectorKernel_PARMS, const bool mfb_is_na, - const bool max_bin_to_left); + const bool max_bin_to_left, + const bool is_single_feature_in_column); template void LaunchGenDataToLeftBitVectorKernelInner3( GenDataToLeftBitVectorKernel_PARMS, - const bool max_bin_to_left); + const bool max_bin_to_left, + const bool is_single_feature_in_column); + + template + void LaunchGenDataToLeftBitVectorKernelInner4( + GenDataToLeftBitVectorKernel_PARMS, + const bool is_single_feature_in_column); #undef GenDataToLeftBitVectorKernel_PARMS @@ -238,7 +248,8 @@ class CUDADataPartition { const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_to_left); + const bool max_to_left, + const bool is_single_feature_in_column); template void LaunchUpdateDataIndexToLeafIndexKernel_Inner0( @@ -246,25 +257,34 @@ class CUDADataPartition { const bool missing_is_na, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_to_left); + const bool max_to_left, + const bool is_single_feature_in_column); template void LaunchUpdateDataIndexToLeafIndexKernel_Inner1( UpdateDataIndexToLeafIndexKernel_PARAMS, const bool mfb_is_zero, const bool mfb_is_na, - const bool max_to_left); + const bool max_to_left, + const bool is_single_feature_in_column); template void LaunchUpdateDataIndexToLeafIndexKernel_Inner2( UpdateDataIndexToLeafIndexKernel_PARAMS, const bool mfb_is_na, - const bool max_to_left); + const bool max_to_left, + const bool is_single_feature_in_column); template void LaunchUpdateDataIndexToLeafIndexKernel_Inner3( UpdateDataIndexToLeafIndexKernel_PARAMS, - const bool max_to_left); + const bool max_to_left, + const bool is_single_feature_in_column); + + template + void LaunchUpdateDataIndexToLeafIndexKernel_Inner4( + UpdateDataIndexToLeafIndexKernel_PARAMS, + const bool is_single_feature_in_column); #undef UpdateDataIndexToLeafIndexKernel_PARAMS From 26130d908cbf789b4ae18c768305f36be3367da8 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 18 Nov 2021 08:35:37 +0000 Subject: [PATCH 120/166] fix mfb_is_na and mfb_is_zero and is_single_feature_column --- src/io/dataset.cpp | 44 +++++++++++++++++---- src/treelearner/cuda/cuda_data_partition.cu | 10 ++--- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index e001fccbb3f1..c4b09696f88b 100644 --- a/src/io/dataset.cpp 
+++ b/src/io/dataset.cpp @@ -1570,12 +1570,27 @@ void Dataset::CreateCUDAColumnData() { if (feature_bin_mapper->missing_type() == MissingType::Zero) { feature_missing_is_zero[feature_index] = 1; feature_missing_is_na[feature_index] = 0; + if (feature_default_bin[feature_index] == feature_most_freq_bins[feature_index]) { + feature_mfb_is_zero[feature_index] = 1; + } else { + feature_mfb_is_zero[feature_index] = 0; + } + feature_mfb_is_na[feature_index] = 0; } else if (feature_bin_mapper->missing_type() == MissingType::NaN) { feature_missing_is_zero[feature_index] = 0; feature_missing_is_na[feature_index] = 1; + feature_mfb_is_zero[feature_index] = 0; + if (feature_most_freq_bins[feature_index] + feature_min_bins[feature_index] == feature_max_bins[feature_index] && + feature_most_freq_bins[feature_index] > 0) { + feature_mfb_is_na[feature_index] = 1; + } else { + feature_mfb_is_na[feature_index] = 0; + } } else { feature_missing_is_zero[feature_index] = 0; feature_missing_is_na[feature_index] = 0; + feature_mfb_is_zero[feature_index] = 0; + feature_mfb_is_na[feature_index] = 0; } ++feature_index; } @@ -1601,15 +1616,30 @@ void Dataset::CreateCUDAColumnData() { feature_most_freq_bins[feature_index] = most_freq_bin; feature_default_bin[feature_index] = feature_bin_mapper->GetDefaultBin(); if (feature_bin_mapper->missing_type() == MissingType::Zero) { - feature_missing_is_zero[feature_index] = 1; - feature_missing_is_na[feature_index] = 0; - } else if (feature_bin_mapper->missing_type() == MissingType::NaN) { - feature_missing_is_zero[feature_index] = 0; - feature_missing_is_na[feature_index] = 1; + feature_missing_is_zero[feature_index] = 1; + feature_missing_is_na[feature_index] = 0; + if (feature_default_bin[feature_index] == feature_most_freq_bins[feature_index]) { + feature_mfb_is_zero[feature_index] = 1; } else { - feature_missing_is_zero[feature_index] = 0; - feature_missing_is_na[feature_index] = 0; + feature_mfb_is_zero[feature_index] = 0; } + feature_mfb_is_na[feature_index] = 0; + } else if (feature_bin_mapper->missing_type() == MissingType::NaN) { + feature_missing_is_zero[feature_index] = 0; + feature_missing_is_na[feature_index] = 1; + feature_mfb_is_zero[feature_index] = 0; + if (feature_most_freq_bins[feature_index] + feature_min_bins[feature_index] == feature_max_bins[feature_index] && + feature_most_freq_bins[feature_index] > 0) { + feature_mfb_is_na[feature_index] = 1; + } else { + feature_mfb_is_na[feature_index] = 0; + } + } else { + feature_missing_is_zero[feature_index] = 0; + feature_missing_is_na[feature_index] = 0; + feature_mfb_is_zero[feature_index] = 0; + feature_mfb_is_na[feature_index] = 0; + } ++feature_index; } ++num_columns; diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 5453b49c1301..820a7b0a2efb 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -250,12 +250,12 @@ void CUDADataPartition::LaunchUpdateDataIndexToLeafIndexKernel_Inner4( UpdateDataIndexToLeafIndexKernel_PARAMS, const bool is_single_feature_in_column) { if (!is_single_feature_in_column) { - UpdateDataIndexToLeafIndexKernel + UpdateDataIndexToLeafIndexKernel <<>>( UpdateDataIndexToLeafIndex_ARGS, cuda_data_index_to_leaf_index_); } else { - UpdateDataIndexToLeafIndexKernel + UpdateDataIndexToLeafIndexKernel <<>>( UpdateDataIndexToLeafIndex_ARGS, cuda_data_index_to_leaf_index_); @@ -434,12 +434,12 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernelInner4( 
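// A sketch of how the dataset.cpp hunks above derive the per-feature "most frequent
// bin" flags in Dataset::CreateCUDAColumnData (struct and function names are
// illustrative). Assumptions mirrored from the diff: when missing values are encoded
// as zero, the most frequent bin represents zero exactly when it equals the default
// bin; when missing values are NaN, the NaN bin is the last bin of the feature's
// range in the column, so the feature-relative most frequent bin is the NaN bin when
// most_freq_bin + min_bin == max_bin and most_freq_bin > 0.
#include <cstdint>
struct MostFreqBinFlags { bool mfb_is_zero; bool mfb_is_na; };

inline MostFreqBinFlags ComputeMostFreqBinFlags(
    const bool missing_is_zero, const bool missing_is_na, const uint32_t default_bin,
    const uint32_t most_freq_bin, const uint32_t min_bin, const uint32_t max_bin) {
  if (missing_is_zero) {
    return {default_bin == most_freq_bin, false};
  } else if (missing_is_na) {
    return {false, most_freq_bin + min_bin == max_bin && most_freq_bin > 0};
  }
  return {false, false};
}
// end of sketch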
Log::Warning("is_single_feature_in_column = %d", static_cast(is_single_feature_in_column)); if (!is_single_feature_in_column) { GenDataToLeftBitVectorKernel - + <<>>(GenBitVector_ARGS, cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_); } else { GenDataToLeftBitVectorKernel - + <<>>(GenBitVector_ARGS, cuda_block_to_left_offset_, cuda_block_data_to_left_offset_, cuda_block_data_to_right_offset_); } @@ -460,7 +460,7 @@ void CUDADataPartition::LaunchGenDataToLeftBitVectorKernel( const bool is_single_feature_in_column = is_single_feature_in_column_[split_feature_index]; const uint32_t default_bin = cuda_column_data_->feature_default_bin(split_feature_index); const uint32_t most_freq_bin = cuda_column_data_->feature_most_freq_bin(split_feature_index); - const uint32_t min_bin = cuda_column_data_->feature_min_bin(split_feature_index); + const uint32_t min_bin = is_single_feature_in_column ? 1 : cuda_column_data_->feature_min_bin(split_feature_index); const uint32_t max_bin = cuda_column_data_->feature_max_bin(split_feature_index); uint32_t th = split_threshold + min_bin; uint32_t t_zero_bin = min_bin + default_bin; From d49e92af18880b521eb6fa1358e3cb3ab5195528 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 19 Nov 2021 04:44:59 +0000 Subject: [PATCH 121/166] remove debug log --- python-package/lightgbm/basic.py | 4 ++-- python-package/lightgbm/dask.py | 22 ++++++++++----------- src/treelearner/cuda/cuda_data_partition.cu | 1 - 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index a4358b9ef40c..15e373baab04 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3598,7 +3598,7 @@ def refit(self, data, label, decay_rate=0.9, **kwargs): predictor = self._to_predictor(deepcopy(kwargs)) leaf_preds = predictor.predict(data, -1, pred_leaf=True) nrow, ncol = leaf_preds.shape - out_is_linear = ctypes.c_int(0) + out_is_linear = ctypes.c_bool(False) _safe_call(_LIB.LGBM_BoosterGetLinear( self.handle, ctypes.byref(out_is_linear))) @@ -3607,7 +3607,7 @@ def refit(self, data, label, decay_rate=0.9, **kwargs): params=self.params, default_value=None ) - new_params["linear_tree"] = bool(out_is_linear.value) + new_params["linear_tree"] = out_is_linear.value train_set = Dataset(data, label, silent=True, params=new_params) new_params['refit_decay_rate'] = decay_rate new_booster = Booster(new_params, train_set) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 27fedacb301e..b985c08757c7 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -11,7 +11,7 @@ from copy import deepcopy from enum import Enum, auto from functools import partial -from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union from urllib.parse import urlparse import numpy as np @@ -21,8 +21,8 @@ from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, dask_Array, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, dask_Series, default_client, delayed, pd_DataFrame, pd_Series, wait) -from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _LGBM_ScikitCustomEvalFunction, - _lgbmmodel_doc_custom_eval_note, _lgbmmodel_doc_fit, _lgbmmodel_doc_predict) +from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, 
_lgbmmodel_doc_custom_eval_note, + _lgbmmodel_doc_fit, _lgbmmodel_doc_predict) _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series] _DaskMatrixLike = Union[dask_Array, dask_DataFrame] @@ -400,7 +400,7 @@ def _train( eval_class_weight: Optional[List[Union[dict, str]]] = None, eval_init_score: Optional[List[_DaskCollection]] = None, eval_group: Optional[List[_DaskVectorLike]] = None, - eval_metric: Optional[Union[_LGBM_ScikitCustomEvalFunction, str, List[Union[_LGBM_ScikitCustomEvalFunction, str]]]] = None, + eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None, eval_at: Optional[Iterable[int]] = None, **kwargs: Any ) -> LGBMModel: @@ -1029,7 +1029,7 @@ def _lgb_dask_fit( eval_class_weight: Optional[List[Union[dict, str]]] = None, eval_init_score: Optional[List[_DaskCollection]] = None, eval_group: Optional[List[_DaskVectorLike]] = None, - eval_metric: Optional[Union[_LGBM_ScikitCustomEvalFunction, str, List[Union[_LGBM_ScikitCustomEvalFunction, str]]]] = None, + eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None, eval_at: Optional[Iterable[int]] = None, early_stopping_rounds: Optional[int] = None, **kwargs: Any @@ -1096,7 +1096,7 @@ def __init__( learning_rate: float = 0.1, n_estimators: int = 100, subsample_for_bin: int = 200000, - objective: Optional[str] = None, + objective: Optional[Union[Callable, str]] = None, class_weight: Optional[Union[dict, str]] = None, min_split_gain: float = 0., min_child_weight: float = 1e-3, @@ -1165,7 +1165,7 @@ def fit( eval_sample_weight: Optional[List[_DaskVectorLike]] = None, eval_class_weight: Optional[List[Union[dict, str]]] = None, eval_init_score: Optional[List[_DaskCollection]] = None, - eval_metric: Optional[Union[_LGBM_ScikitCustomEvalFunction, str, List[Union[_LGBM_ScikitCustomEvalFunction, str]]]] = None, + eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None, early_stopping_rounds: Optional[int] = None, **kwargs: Any ) -> "DaskLGBMClassifier": @@ -1281,7 +1281,7 @@ def __init__( learning_rate: float = 0.1, n_estimators: int = 100, subsample_for_bin: int = 200000, - objective: Optional[str] = None, + objective: Optional[Union[Callable, str]] = None, class_weight: Optional[Union[dict, str]] = None, min_split_gain: float = 0., min_child_weight: float = 1e-3, @@ -1348,7 +1348,7 @@ def fit( eval_names: Optional[List[str]] = None, eval_sample_weight: Optional[List[_DaskVectorLike]] = None, eval_init_score: Optional[List[_DaskVectorLike]] = None, - eval_metric: Optional[Union[_LGBM_ScikitCustomEvalFunction, str, List[Union[_LGBM_ScikitCustomEvalFunction, str]]]] = None, + eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None, early_stopping_rounds: Optional[int] = None, **kwargs: Any ) -> "DaskLGBMRegressor": @@ -1446,7 +1446,7 @@ def __init__( learning_rate: float = 0.1, n_estimators: int = 100, subsample_for_bin: int = 200000, - objective: Optional[str] = None, + objective: Optional[Union[Callable, str]] = None, class_weight: Optional[Union[dict, str]] = None, min_split_gain: float = 0., min_child_weight: float = 1e-3, @@ -1516,7 +1516,7 @@ def fit( eval_sample_weight: Optional[List[_DaskVectorLike]] = None, eval_init_score: Optional[List[_DaskVectorLike]] = None, eval_group: Optional[List[_DaskVectorLike]] = None, - eval_metric: Optional[Union[_LGBM_ScikitCustomEvalFunction, str, List[Union[_LGBM_ScikitCustomEvalFunction, str]]]] = None, + eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None, eval_at: 
Iterable[int] = (1, 2, 3, 4, 5), early_stopping_rounds: Optional[int] = None, **kwargs: Any diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 820a7b0a2efb..1aa19367106a 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -431,7 +431,6 @@ template (is_single_feature_in_column)); if (!is_single_feature_in_column) { GenDataToLeftBitVectorKernel From 3214d687a5ac2b76d016207df721bbb8154d4eb9 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 23 Nov 2021 04:55:34 +0000 Subject: [PATCH 122/166] fix cat_l2 when one-hot fix gradient copy when data subset is used --- src/boosting/gbdt.cpp | 4 +++- src/treelearner/cuda/cuda_best_split_finder.cu | 9 ++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index ea1530491f84..b4aeaac9059d 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -395,7 +395,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto grad = gradients + offset; auto hess = hessians + offset; // need to copy gradients for bagging subset. - if (is_use_subset_ && bag_data_cnt_ < num_data_) { + if (is_use_subset_ && bag_data_cnt_ < num_data_ && + !(LGBM_config_::current_learner == use_cuda_learner && config_->num_gpu == 1 && + (config_->tree_learner == std::string("serial") || Network::num_machines() == 1))) { for (int i = 0; i < bag_data_cnt_; ++i) { gradients_[offset + i] = grad[bag_data_indices_[i]]; hessians_[offset + i] = hess[bag_data_indices_[i]]; diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index d464f0556990..6645eb0f04a4 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -350,7 +350,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( __shared__ uint32_t best_thread_index; const double cnt_factor = num_data / sum_hessians; const double min_gain_shift = parent_gain + min_gain_to_split; - const double l2 = lambda_l2 + cat_l2; + double l2 = lambda_l2; double local_gain = min_gain_shift; bool threshold_found = false; @@ -437,6 +437,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( __shared__ uint16_t shared_mem_buffer_uint16[32]; __shared__ double shared_mem_buffer_double[32]; __shared__ int used_bin; + l2 += cat_l2; uint16_t is_valid_bin = 0; int best_dir = 0; double best_sum_left_gradient = 0.0f; @@ -621,7 +622,6 @@ __global__ void FindBestSplitsForLeafKernel( const double cat_l2, const int max_cat_threshold, const int min_data_per_group, - const int max_cat_to_onehot, // output CUDASplitInfo* cuda_best_split_info) { const unsigned int task_index = blockIdx.x; @@ -976,7 +976,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( __shared__ uint32_t best_thread_index; const double cnt_factor = num_data / sum_hessians; const double min_gain_shift = parent_gain + min_gain_to_split; - const double l2 = lambda_l2 + cat_l2; + double l2 = lambda_l2; double local_gain = kMinScore; bool threshold_found = false; @@ -1061,6 +1061,7 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( } else { __shared__ uint16_t shared_mem_buffer_uint16[32]; __shared__ int used_bin; + l2 += cat_l2; uint16_t is_valid_bin = 0; int best_dir = 0; double best_sum_left_gradient = 0.0f; @@ -1246,7 +1247,6 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( const double cat_l2, const 
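// A sketch of the cat_l2 fix applied above in the categorical split kernels (the
// helper name is illustrative). The intended behaviour, matching the CPU learner: a
// one-hot categorical split is regularized with lambda_l2 only, and the extra cat_l2
// penalty applies only to the sorted many-category search, hence
// `double l2 = lambda_l2;` up front and `l2 += cat_l2;` inside the multi-category
// branch.
inline double EffectiveCategoricalL2(const bool is_one_hot_split,
                                     const double lambda_l2, const double cat_l2) {
  double l2 = lambda_l2;
  if (!is_one_hot_split) {
    l2 += cat_l2;
  }
  return l2;
}
// end of sketch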
int max_cat_threshold, const int min_data_per_group, - const int max_cat_to_onehot, // output CUDASplitInfo* cuda_best_split_info, // buffer @@ -1394,7 +1394,6 @@ __global__ void FindBestSplitsForLeafKernel_GlobalMemory( cat_l2_, \ max_cat_threshold_, \ min_data_per_group_, \ - max_cat_to_onehot_, \ cuda_best_split_info_ #define GlobalMemory_Buffer_ARGS \ From 85ea4088390277c01eebb35a9fff1b3f34420cfc Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 1 Dec 2021 06:59:26 +0000 Subject: [PATCH 123/166] switch shared histogram size according to CUDA version --- include/LightGBM/cuda/cuda_row_data.hpp | 20 +++++++- src/io/cuda/cuda_row_data.cpp | 12 ++++- .../cuda/cuda_histogram_constructor.cpp | 10 ++-- .../cuda/cuda_histogram_constructor.cu | 51 ++++++++++++------- .../cuda/cuda_histogram_constructor.hpp | 17 ++++--- .../cuda/cuda_single_gpu_tree_learner.cpp | 2 +- 6 files changed, 77 insertions(+), 35 deletions(-) diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index 5ff97b0defd8..607321403909 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -18,15 +18,25 @@ #include "../train_share_states.h" -#define SHRAE_HIST_SIZE (6144) #define COPY_SUBROW_BLOCK_SIZE_ROW_DATA (1024) +#ifndef CUDART_VERSION +#error CUDART_VERSION Undefined! +#elif CUDART_VERSION == 10000 +#define DP_SHARED_HIST_SIZE (5560) +#else +#define DP_SHARED_HIST_SIZE (6144) +#endif +#define SP_SHARED_HIST_SIZE (DP_SHARED_HIST_SIZE * 2) + namespace LightGBM { class CUDARowData { public: CUDARowData(const Dataset* train_data, - const TrainingShareStates* train_share_state, const int gpu_device_id); + const TrainingShareStates* train_share_state, + const int gpu_device_id, + const bool gpu_use_dp); ~CUDARowData(); @@ -76,6 +86,8 @@ class CUDARowData { const uint32_t* cuda_partition_hist_offsets() const { return cuda_partition_hist_offsets_; } + int shared_hist_size() const { return shared_hist_size_; } + private: void DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state); @@ -134,6 +146,10 @@ class CUDARowData { std::vector large_bin_partitions_; /*! \brief index of partitions with small bins */ std::vector small_bin_partitions_; + /*! \brief shared memory size used by histogram */ + int shared_hist_size_; + /*! 
\brief whether to use double precision in histograms per block */ + bool gpu_use_dp_; // CUDA memory diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index 20172eb1d61f..afe530b7205b 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -11,10 +11,18 @@ namespace LightGBM { CUDARowData::CUDARowData(const Dataset* train_data, const TrainingShareStates* train_share_state, - const int gpu_device_id): gpu_device_id_(gpu_device_id) { + const int gpu_device_id, + const bool gpu_use_dp): +gpu_device_id_(gpu_device_id), +gpu_use_dp_(gpu_use_dp) { num_threads_ = OMP_NUM_THREADS(); num_data_ = train_data->num_data(); const auto& feature_hist_offsets = train_share_state->feature_hist_offsets(); + if (gpu_use_dp_) { + shared_hist_size_ = DP_SHARED_HIST_SIZE; + } else { + shared_hist_size_ = SP_SHARED_HIST_SIZE; + } if (feature_hist_offsets.empty()) { num_total_bin_ = 0; } else { @@ -169,7 +177,7 @@ void CUDARowData::Init(const Dataset* train_data, TrainingShareStates* train_sha } void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingShareStates* share_state) { - const uint32_t max_num_bin_per_partition = SHRAE_HIST_SIZE / 2; + const uint32_t max_num_bin_per_partition = shared_hist_size_ / 2; const std::vector& column_hist_offsets = share_state->column_hist_offsets(); std::vector feature_group_num_feature_offsets; int offsets = 0; diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 92053d898728..7e6be1c1069c 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -19,14 +19,16 @@ CUDAHistogramConstructor::CUDAHistogramConstructor( const std::vector& feature_hist_offsets, const int min_data_in_leaf, const double min_sum_hessian_in_leaf, - const int gpu_device_id): + const int gpu_device_id, + const bool gpu_use_dp): num_data_(train_data->num_data()), num_features_(train_data->num_features()), num_leaves_(num_leaves), num_threads_(num_threads), min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf), - gpu_device_id_(gpu_device_id) { + gpu_device_id_(gpu_device_id), + gpu_use_dp_(gpu_use_dp) { InitFeatureMetaInfo(train_data, feature_hist_offsets); cuda_row_data_.reset(nullptr); cuda_feature_num_bins_ = nullptr; @@ -98,7 +100,7 @@ void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStat InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bins_, feature_most_freq_bins_.data(), feature_most_freq_bins_.size(), __FILE__, __LINE__); - cuda_row_data_.reset(new CUDARowData(train_data, share_state, gpu_device_id_)); + cuda_row_data_.reset(new CUDARowData(train_data, share_state, gpu_device_id_, gpu_use_dp_)); cuda_row_data_->Init(train_data, share_state); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); @@ -171,7 +173,7 @@ void CUDAHistogramConstructor::ResetTrainingData(const Dataset* train_data, Trai InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bins_, feature_most_freq_bins_.data(), feature_most_freq_bins_.size(), __FILE__, __LINE__); - cuda_row_data_.reset(new CUDARowData(train_data, share_states, gpu_device_id_)); + cuda_row_data_.reset(new CUDARowData(train_data, share_states, gpu_device_id_, gpu_use_dp_)); cuda_row_data_->Init(train_data, share_states); InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size(), __FILE__, __LINE__); diff --git 
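// The arithmetic behind the sizes above, spelled out as a sketch (the constants here
// are illustrative mirrors of DP_SHARED_HIST_SIZE / SP_SHARED_HIST_SIZE). A histogram
// entry is a (gradient, hessian) pair, so a buffer of shared_hist_size values holds
// shared_hist_size / 2 bins, which is the max_num_bin_per_partition used when
// dividing feature groups. 6144 doubles or 12288 floats both occupy 49152 bytes,
// i.e. the common 48 KiB static shared-memory budget of a block; the #if shown
// earlier falls back to a smaller 5560-entry double buffer for CUDA toolkits <= 10.0.
constexpr int kDPSharedHistSize = 6144;                        // doubles per block
constexpr int kSPSharedHistSize = kDPSharedHistSize * 2;       // floats per block
constexpr int kMaxBinsPerPartitionDP = kDPSharedHistSize / 2;  // (grad, hess) pairs
static_assert(kDPSharedHistSize * sizeof(double) == 48 * 1024, "48 KiB shared memory");
static_assert(kSPSharedHistSize * sizeof(float) == 48 * 1024, "48 KiB shared memory");
// end of sketch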
a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index 9168d03f017f..de688e9fcfbc 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -14,7 +14,7 @@ namespace LightGBM { -template +template __global__ void CUDAConstructHistogramDenseKernel( const CUDALeafSplitsStruct* smaller_leaf_splits, const score_t* cuda_gradients, @@ -28,7 +28,7 @@ __global__ void CUDAConstructHistogramDenseKernel( const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; - __shared__ double shared_hist[SHRAE_HIST_SIZE]; + __shared__ HIST_TYPE shared_hist[SHARED_HIST_SIZE]; const unsigned int num_threads_per_block = blockDim.x * blockDim.y; const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1]; @@ -48,14 +48,14 @@ __global__ void CUDAConstructHistogramDenseKernel( data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * static_cast(blockDim.y))); const int column_index = static_cast(threadIdx.x) + partition_column_start; if (threadIdx.x < static_cast(num_columns_in_partition)) { - double* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); + HIST_TYPE* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); for (data_size_t inner_data_index = static_cast(threadIdx.y); inner_data_index < block_num_data; inner_data_index += blockDim.y) { const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; const score_t grad = cuda_gradients[data_index]; const score_t hess = cuda_hessians[data_index]; const uint32_t bin = static_cast(data_ptr[data_index * num_columns_in_partition + threadIdx.x]); const uint32_t pos = bin << 1; - double* pos_ptr = shared_hist_ptr + pos; + HIST_TYPE* pos_ptr = shared_hist_ptr + pos; atomicAdd_block(pos_ptr, grad); atomicAdd_block(pos_ptr + 1, hess); } @@ -67,7 +67,7 @@ __global__ void CUDAConstructHistogramDenseKernel( } } -template +template __global__ void CUDAConstructHistogramSparseKernel( const CUDALeafSplitsStruct* smaller_leaf_splits, const score_t* cuda_gradients, @@ -81,7 +81,7 @@ __global__ void CUDAConstructHistogramSparseKernel( const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; - __shared__ double shared_hist[SHRAE_HIST_SIZE]; + __shared__ HIST_TYPE shared_hist[SHARED_HIST_SIZE]; const unsigned int num_threads_per_block = blockDim.x * blockDim.y; const DATA_PTR_TYPE* block_row_ptr = row_ptr + blockIdx.x * (num_data + 1); const BIN_TYPE* data_ptr = data + partition_ptr[blockIdx.x]; @@ -112,7 +112,7 @@ __global__ void CUDAConstructHistogramSparseKernel( const score_t hess = cuda_hessians[data_index]; const uint32_t bin = static_cast(data_ptr[row_start + threadIdx.x]); const uint32_t pos = bin << 1; - double* pos_ptr = shared_hist + pos; + HIST_TYPE* pos_ptr = shared_hist + pos; atomicAdd_block(pos_ptr, grad); atomicAdd_block(pos_ptr + 1, hess); } @@ -247,6 +247,19 @@ __global__ void CUDAConstructHistogramSparseKernel_GlobalMemory( } void 
CUDAHistogramConstructor::LaunchConstructHistogramKernel( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf) { + if (cuda_row_data_->shared_hist_size() == DP_SHARED_HIST_SIZE && gpu_use_dp_) { + LaunchConstructHistogramKernelInner(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + } else if (cuda_row_data_->shared_hist_size() == SP_SHARED_HIST_SIZE && !gpu_use_dp_) { + LaunchConstructHistogramKernelInner(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + } else { + Log::Fatal("Unknown shared histogram size %d", cuda_row_data_->shared_hist_size()); + } +} + +template +void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const data_size_t num_data_in_smaller_leaf) { int grid_dim_x = 0; @@ -260,7 +273,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( if (cuda_row_data_->is_sparse()) { if (cuda_row_data_->bit_type() == 8) { if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint8(), @@ -269,7 +282,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_row_data_->cuda_partition_hist_offsets(), num_data_); } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint8(), @@ -278,7 +291,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_row_data_->cuda_partition_hist_offsets(), num_data_); } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint8(), @@ -289,7 +302,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( } } else if (cuda_row_data_->bit_type() == 16) { if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint16(), @@ -298,7 +311,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_row_data_->cuda_partition_hist_offsets(), num_data_); } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint16(), @@ -307,7 +320,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_row_data_->cuda_partition_hist_offsets(), num_data_); } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint16(), @@ -318,7 +331,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( } } else if (cuda_row_data_->bit_type() == 32) { if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint32(), @@ -327,7 +340,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_row_data_->cuda_partition_hist_offsets(), 
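// A compact sketch of the precision dispatch shown above (names and launch
// configuration are illustrative; the real launcher also switches on bit_type and
// row_ptr_bit_type). The host picks the histogram value type together with the
// matching shared-memory size, and both become template parameters so the kernel's
// __shared__ buffer has a compile-time extent.
template <typename HIST_TYPE, int SHARED_HIST_SIZE>
__global__ void HistogramKernelSketch(const float* gradients, const float* hessians,
                                      HIST_TYPE* global_hist) {
  __shared__ HIST_TYPE shared_hist[SHARED_HIST_SIZE];
  // ... accumulate (grad, hess) pairs into shared_hist with atomicAdd_block,
  // then flush shared_hist into global_hist ...
}

inline void LaunchHistogramKernelSketch(const bool gpu_use_dp, const dim3 grid,
                                        const dim3 block, const float* gradients,
                                        const float* hessians, void* global_hist) {
  if (gpu_use_dp) {
    HistogramKernelSketch<double, 6144><<<grid, block>>>(
        gradients, hessians, static_cast<double*>(global_hist));
  } else {
    HistogramKernelSketch<float, 12288><<<grid, block>>>(
        gradients, hessians, static_cast<float*>(global_hist));
  }
}
// end of sketch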
num_data_); } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint32(), @@ -336,7 +349,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_row_data_->cuda_partition_hist_offsets(), num_data_); } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( + CUDAConstructHistogramSparseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint32(), @@ -348,7 +361,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( } } else { if (cuda_row_data_->bit_type() == 8) { - CUDAConstructHistogramDenseKernel<<>>( + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint8(), @@ -357,7 +370,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_row_data_->cuda_feature_partition_column_index_offsets(), num_data_); } else if (cuda_row_data_->bit_type() == 16) { - CUDAConstructHistogramDenseKernel<<>>( + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint16(), @@ -366,7 +379,7 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( cuda_row_data_->cuda_feature_partition_column_index_offsets(), num_data_); } else if (cuda_row_data_->bit_type() == 32) { - CUDAConstructHistogramDenseKernel<<>>( + CUDAConstructHistogramDenseKernel<<>>( cuda_smaller_leaf_splits, cuda_gradients_, cuda_hessians_, cuda_row_data_->cuda_data_uint32(), diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 852974e743a8..11e6894e2f72 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -17,7 +17,6 @@ #include "cuda_leaf_splits.hpp" -#define SHRAE_HIST_SIZE (6144) #define NUM_DATA_PER_THREAD (400) #define NUM_THRADS_PER_BLOCK (504) #define NUM_FEATURE_PER_THREAD_GROUP (28) @@ -37,7 +36,8 @@ class CUDAHistogramConstructor { const std::vector& feature_hist_offsets, const int min_data_in_leaf, const double min_sum_hessian_in_leaf, - const int gpu_device_id); + const int gpu_device_id, + const bool gpu_use_dp); ~CUDAHistogramConstructor(); @@ -71,14 +71,14 @@ class CUDAHistogramConstructor { int* block_dim_y, const data_size_t num_data_in_smaller_leaf); - void LaunchConstructHistogramKernel( + template + void LaunchConstructHistogramKernelInner( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const data_size_t num_data_in_smaller_leaf); - void LaunchSparseConstructHistogramKernel( - const dim3 grid_dim, - const dim3 block_dim, - const CUDALeafSplitsStruct* cuda_smaller_leaf_splits); + void LaunchConstructHistogramKernel( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf); void LaunchSubtractHistogramKernel( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, @@ -142,7 +142,10 @@ class CUDAHistogramConstructor { /*! \brief hessians on CUDA */ const score_t* cuda_hessians_; + /*! \brief GPU device index */ const int gpu_device_id_; + /*! 
\brief use double precision histogram per block */ + const bool gpu_use_dp_; }; } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 474003599eb7..dc99acf54553 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -42,7 +42,7 @@ void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_ cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, config_->num_leaves, num_threads_, share_state_->feature_hist_offsets(), - config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf, gpu_device_id_)); + config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf, gpu_device_id_, config_->gpu_use_dp)); cuda_histogram_constructor_->Init(train_data_, share_state_.get()); const auto& feature_hist_offsets = share_state_->feature_hist_offsets(); From 2af0f5d5fb16ffcaa0cf615d72d6c7ae0284fadf Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 1 Dec 2021 08:42:21 +0000 Subject: [PATCH 124/166] gpu_use_dp=true when cuda test --- include/LightGBM/config.h | 4 ++-- include/LightGBM/cuda/cuda_row_data.hpp | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 50371f3a2d91..cb60d62a211c 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -210,7 +210,7 @@ struct Config { // desc = **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up // desc = **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support - std::string device_type = "cpu"; + std::string device_type = "cuda"; // [doc-only] // alias = random_seed, random_state @@ -1020,7 +1020,7 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) // desc = **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported - bool gpu_use_dp = false; + bool gpu_use_dp = true; // check = >0 // desc = number of GPUs diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index 607321403909..08b885eac418 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -20,9 +20,7 @@ #define COPY_SUBROW_BLOCK_SIZE_ROW_DATA (1024) -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! 
-#elif CUDART_VERSION == 10000 +#if CUDART_VERSION <= 10000 #define DP_SHARED_HIST_SIZE (5560) #else #define DP_SHARED_HIST_SIZE (6144) From d0a628f5b8988431cf2c4d45a193c57b1366e9a8 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 1 Dec 2021 08:55:58 +0000 Subject: [PATCH 125/166] revert modification in config.h --- .ci/test.sh | 2 ++ include/LightGBM/config.h | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.ci/test.sh b/.ci/test.sh index 3f0510aa75e8..654eaa54e028 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -182,6 +182,8 @@ if [[ $TASK == "gpu" ]]; then elif [[ $TASK == "cuda" ]]; then sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done + sed -i'.bak' 's/std::string device_type = "cpu";/std::string gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h + grep -q 'std::string gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done if [[ $METHOD == "pip" ]]; then cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1 diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index cb60d62a211c..50371f3a2d91 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -210,7 +210,7 @@ struct Config { // desc = **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up // desc = **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. 
You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support - std::string device_type = "cuda"; + std::string device_type = "cpu"; // [doc-only] // alias = random_seed, random_state @@ -1020,7 +1020,7 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) // desc = **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported - bool gpu_use_dp = true; + bool gpu_use_dp = false; // check = >0 // desc = number of GPUs From e0018ea47ec1ea7ecba55840c55093ddf457b599 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 1 Dec 2021 09:11:55 +0000 Subject: [PATCH 126/166] fix setting of gpu_use_dp=true in .ci/test.sh --- .ci/test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/test.sh b/.ci/test.sh index 654eaa54e028..ba642029ab53 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -182,8 +182,8 @@ if [[ $TASK == "gpu" ]]; then elif [[ $TASK == "cuda" ]]; then sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done - sed -i'.bak' 's/std::string device_type = "cpu";/std::string gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h - grep -q 'std::string gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done + sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h + grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done if [[ $METHOD == "pip" ]]; then cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1 From e54b51ae1564ad648304d09ea20a5bf6695828ae Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 1 Dec 2021 09:20:20 +0000 Subject: [PATCH 127/166] fix linter errors --- include/LightGBM/cuda/cuda_random.hpp | 4 ++-- include/LightGBM/cuda/cuda_utils.h | 6 +++--- src/cuda/cuda_utils.cpp | 2 +- src/treelearner/cuda/cuda_best_split_finder.cpp | 2 ++ src/treelearner/cuda/cuda_best_split_finder.cu | 4 +++- src/treelearner/cuda/cuda_best_split_finder.hpp | 2 -- src/treelearner/cuda/cuda_data_partition.cpp | 1 + src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp | 3 ++- 8 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/LightGBM/cuda/cuda_random.hpp b/include/LightGBM/cuda/cuda_random.hpp index c9a194e2c93e..6c28e44bc2d3 100644 --- a/include/LightGBM/cuda/cuda_random.hpp +++ b/include/LightGBM/cuda/cuda_random.hpp @@ -69,6 +69,6 @@ class CUDARandom { } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA -#endif // LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ +#endif // LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 46f26ff97018..bbf8bb69a524 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -12,10 +12,10 @@ #include #include -#include - #include +#include + namespace LightGBM { #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, 
__LINE__); } @@ -109,7 +109,7 @@ class CUDAVector { data_ = nullptr; } - CUDAVector(size_t size) { + explicit CUDAVector(size_t size) { size_ = size; AllocateCUDAMemory(&data_, size_, __FILE__, __LINE__); } diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp index 67c1c3678f9d..d0fa773eec8b 100644 --- a/src/cuda/cuda_utils.cpp +++ b/src/cuda/cuda_utils.cpp @@ -15,7 +15,7 @@ void SynchronizeCUDADevice(const char* file, const int line) { void PrintLastCUDAError() { const char* error_name = cudaGetErrorName(cudaGetLastError()); - Log::Warning(error_name); + Log::Warning(error_name); } void SetCUDADevice(int gpu_device_id, const char* file, int line) { diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index f48e54a4ec6e..eefabbbe2e2f 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -6,6 +6,8 @@ #ifdef USE_CUDA +#include + #include "cuda_best_split_finder.hpp" #include "cuda_leaf_splits.hpp" diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 6645eb0f04a4..2b39cfdd4ba6 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -6,6 +6,8 @@ #ifdef USE_CUDA +#include + #include #include "cuda_best_split_finder.hpp" @@ -797,7 +799,7 @@ __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( } } else { for (unsigned int bin = threadIdx_x; bin < feature_num_bin_minus_offset; bin += blockDim.x) { - const bool skip_sum = bin >= static_cast(task->na_as_missing) && + const bool skip_sum = bin >= static_cast(task->na_as_missing) && (task->skip_default_bin && (task->num_bin - 1 - bin) == static_cast(task->default_bin)); if (!skip_sum) { const unsigned int read_index = feature_num_bin_minus_offset - 1 - bin; diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index fbdff366c1de..4ae9aded5c3d 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -37,7 +37,6 @@ struct SplitFindTask { uint8_t mfb_offset; uint32_t num_bin; uint32_t default_bin; - //CUDARandom* cuda_random; int rand_threshold; }; @@ -90,7 +89,6 @@ class CUDABestSplitFinder { void ResetConfig(const Config* config, const hist_t* cuda_hist); private: - #define LaunchFindBestSplitsForLeafKernel_PARAMS \ const CUDALeafSplitsStruct* smaller_leaf_splits, \ const CUDALeafSplitsStruct* larger_leaf_splits, \ diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index d48d024c9d24..d6bcdfa953c7 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -7,6 +7,7 @@ #ifdef USE_CUDA #include +#include #include "cuda_data_partition.hpp" diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index dc99acf54553..2617f7924290 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -14,6 +14,7 @@ #include #include +#include #include namespace LightGBM { @@ -385,7 +386,7 @@ void CUDASingleGPUTreeLearner::AllocateBitset() { max_cat_num_bin = std::max(bin_mapper->num_bin(), max_cat_num_bin); } } - // std::max(..., 1UL) to avoid error in the case when there are NaN's in the categorical values + // std::max(..., 1UL) to avoid error in the 
case when there are NaN's in the categorical values const size_t cuda_bitset_max_size = std::max(static_cast((max_cat_value + 31) / 32), 1UL); const size_t cuda_bitset_inner_max_size = std::max(static_cast((max_cat_num_bin + 31) / 32), 1UL); AllocateCUDAMemory(&cuda_bitset_, cuda_bitset_max_size, __FILE__, __LINE__); From 541235fc239354905a069872b189778abd0a8afc Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 1 Dec 2021 09:26:56 +0000 Subject: [PATCH 128/166] fix linter error remove useless change --- src/objective/multiclass_objective.hpp | 2 -- src/treelearner/cuda/cuda_single_gpu_tree_learner.cu | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/objective/multiclass_objective.hpp b/src/objective/multiclass_objective.hpp index cc4d2c849a54..88aa0ee040e6 100644 --- a/src/objective/multiclass_objective.hpp +++ b/src/objective/multiclass_objective.hpp @@ -267,8 +267,6 @@ class MulticlassOVA: public ObjectiveFunction { } private: - MulticlassOVA() {} - /*! \brief Number of data */ data_size_t num_data_; /*! \brief Number of classes */ diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu index 3423b915b7fe..8a558ddc43d1 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu @@ -10,6 +10,8 @@ #include "cuda_single_gpu_tree_learner.hpp" +#include + namespace LightGBM { __global__ void ReduceLeafStatKernel_SharedMemory( From a2ead3c6a08046934f75d9a8d0265489ff0cc8f0 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 1 Dec 2021 11:22:56 +0000 Subject: [PATCH 129/166] recover main.cpp --- src/main.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 4d69c53a1aec..8034da826811 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -8,16 +8,10 @@ #include "network/linkers.h" -int main(int /*argc*/, char** /*argv*/) { +int main(int argc, char** argv) { bool success = false; try { - const std::string config_str = std::string("config=train.conf"); - char* argv = new char[config_str.size() + 1]; - for (size_t i = 0; i < config_str.size(); ++i) { - argv[i] = config_str[i]; - } - argv[config_str.size()] = '\0'; - LightGBM::Application app(2, &argv - 1); + LightGBM::Application app(argc, argv); app.Run(); #ifdef USE_MPI From 2a81af63b4f95361b07ac4e4f98b4cbea90f47b0 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Dec 2021 03:44:55 +0000 Subject: [PATCH 130/166] separate cuda_exp and cuda --- .ci/setup.sh | 2 +- .ci/test.sh | 15 +++-- .github/workflows/cuda_exp.yml | 99 ++++++++++++++++++++++++++++++++ src/application/application.cpp | 2 +- src/boosting/gbdt.cpp | 4 +- src/io/config.cpp | 16 +++--- src/io/dataset.cpp | 9 +-- src/io/dataset_loader.cpp | 2 +- src/treelearner/tree_learner.cpp | 16 ++++-- 9 files changed, 138 insertions(+), 27 deletions(-) create mode 100644 .github/workflows/cuda_exp.yml diff --git a/.ci/setup.sh b/.ci/setup.sh index ef84a51cd587..024872fae7cf 100755 --- a/.ci/setup.sh +++ b/.ci/setup.sh @@ -82,7 +82,7 @@ else # Linux echo libamdocl64.so > $OPENCL_VENDOR_PATH/amdocl64.icd fi ARCH=$(uname -m) - if [[ $TASK == "cuda" ]]; then + if [[ $TASK == "cuda" ] || [ $TASK == "cuda_exp" ]]; then echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections apt-get update apt-get install --no-install-recommends -y \ diff --git a/.ci/test.sh b/.ci/test.sh index ba642029ab53..c68795731c6b 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -179,11 +179,16 @@ if [[ $TASK 
== "gpu" ]]; then elif [[ $METHOD == "source" ]]; then cmake -DUSE_GPU=ON -DOpenCL_INCLUDE_DIR=$AMDAPPSDK_PATH/include/ .. fi -elif [[ $TASK == "cuda" ]]; then - sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h - grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done - sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h - grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done +elif [[ $TASK == "cuda" ] || [ $TASK == "cuda_exp" ]]; then + if [[ $TASK == "cuda" ]]; then + sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h + grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done + else + sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda_exp";/' $BUILD_DIRECTORY/include/LightGBM/config.h + grep -q 'std::string device_type = "cuda_exp"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done + sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h + grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done + fi if [[ $METHOD == "pip" ]]; then cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1 diff --git a/.github/workflows/cuda_exp.yml b/.github/workflows/cuda_exp.yml new file mode 100644 index 000000000000..48e07016b17e --- /dev/null +++ b/.github/workflows/cuda_exp.yml @@ -0,0 +1,99 @@ +name: CUDA Experimental Version + +on: + push: + branches: + - master + pull_request: + branches: + - master + +env: + github_actions: 'true' + os_name: linux + task: cuda_exp + conda_env: test-env + +jobs: + test: + name: cuda ${{ matrix.cuda_version }} ${{ matrix.method }} (linux, ${{ matrix.compiler }}, Python ${{ matrix.python_version }}) + runs-on: [self-hosted, linux] + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: + - method: source + compiler: gcc + python_version: 3.7 + cuda_version: "11.4.2" + - method: pip + compiler: clang + python_version: 3.8 + cuda_version: "10.0" + steps: + - name: Setup or update software on host machine + run: | + sudo apt-get update + sudo apt-get install --no-install-recommends -y \ + apt-transport-https \ + ca-certificates \ + curl \ + git \ + gnupg-agent \ + lsb-release \ + software-properties-common + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - + sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" -y + curl -sL https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - + curl -sL https://nvidia.github.io/nvidia-docker/$(. 
/etc/os-release;echo $ID$VERSION_ID)/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list + sudo apt-get update + sudo apt-get install --no-install-recommends -y \ + containerd.io \ + docker-ce \ + docker-ce-cli \ + nvidia-docker2 + sudo chmod a+rw /var/run/docker.sock + sudo systemctl restart docker + - name: Remove old folder with repository + run: sudo rm -rf $GITHUB_WORKSPACE + - name: Checkout repository + uses: actions/checkout@v1 + with: + fetch-depth: 5 + submodules: true + - name: Setup and run tests + run: | + export ROOT_DOCKER_FOLDER=/LightGBM + cat > docker.env < docker-script.sh <first_metric_only; shrinkage_rate_ = config_->learning_rate; - if (config_->device_type == std::string("cuda")) { + if (config_->device_type == std::string("cuda") || config_->device_type == std::string("cuda_exp")) { LGBM_config_::current_learner = use_cuda_learner; } @@ -816,7 +816,7 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { double average_bag_rate = (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; is_use_subset_ = false; - if (config_->device_type != std::string("cuda")) { + if (config_->device_type != std::string("cuda") && config_->device_type != std::string("cuda_exp")) { const int group_threshold_usesubset = 100; if (average_bag_rate <= 0.5 && (train_data_->num_feature_groups() < group_threshold_usesubset)) { diff --git a/src/io/config.cpp b/src/io/config.cpp index 8cd6a1163580..73fc7d7bdef9 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -128,6 +128,8 @@ void GetDeviceType(const std::unordered_map& params, s *device_type = "gpu"; } else if (value == std::string("cuda")) { *device_type = "cuda"; + } else if (value == std::string("cuda_exp")) { + *device_type = "cuda_exp"; } else { Log::Fatal("Unknown device type %s", value.c_str()); } @@ -208,7 +210,7 @@ void Config::Set(const std::unordered_map& params) { GetObjectiveType(params, &objective); GetMetricType(params, objective, &metric); GetDeviceType(params, &device_type); - if (device_type == std::string("cuda")) { + if (device_type == std::string("cuda") || device_type == std::string("cuda_exp")) { LGBM_config_::current_device = lgbm_device_cuda; } GetTreeLearnerType(params, &tree_learner); @@ -331,22 +333,20 @@ void Config::CheckParamConflict() { num_leaves = static_cast(full_num_leaves); } } - // force col-wise for gpu, and non-single GPU CUDA version - if (device_type == std::string("gpu") || - (device_type == std::string("cuda") && (num_gpu > 1 || tree_learner != std::string("serial")))) { + if (device_type == std::string("gpu") || device_type == std::string("cuda")) { + // force col-wise for gpu, and cuda version force_col_wise = true; force_row_wise = false; if (deterministic) { Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); } - } else if (device_type == std::string("cuda")) { - // force row-wise for single GPU CUDA version + } else if (device_type == std::string("cuda_exp")) { + // force row-wise for cuda_exp version force_col_wise = false; force_row_wise = true; } // force gpu_use_dp for non-single GPU CUDA version - if (device_type == std::string("cuda") && - (num_gpu > 1 || tree_learner != std::string("serial")) && !gpu_use_dp) { + if (device_type == std::string("cuda")) { Log::Warning("CUDA currently requires double precision calculations."); gpu_use_dp = true; } diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index c4b09696f88b..0545564a5665 100644 --- a/src/io/dataset.cpp +++ 
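// A sketch of the device-dependent adjustments made above in
// Config::CheckParamConflict (field names follow the diff; the free function is
// illustrative). The OpenCL "gpu" and original "cuda" learners build column-wise
// histograms, the experimental "cuda_exp" learner builds row-wise histograms, and
// the original "cuda" learner additionally requires double-precision histograms.
inline void ApplyDeviceDefaultsSketch(Config* config) {
  if (config->device_type == std::string("gpu") ||
      config->device_type == std::string("cuda")) {
    config->force_col_wise = true;
    config->force_row_wise = false;
  } else if (config->device_type == std::string("cuda_exp")) {
    config->force_col_wise = false;
    config->force_row_wise = true;
  }
  if (config->device_type == std::string("cuda")) {
    config->gpu_use_dp = true;
  }
}
// end of sketch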
b/src/io/dataset.cpp @@ -349,7 +349,8 @@ void Dataset::Construct(std::vector>* bin_mappers, std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { - bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); + bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda") + || io_config.device_type == std::string("cuda_exp"); features_in_group = FastFeatureBundling( *bin_mappers, sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast(total_sample_cnt), @@ -439,7 +440,7 @@ void Dataset::FinishLoad() { } } #ifdef USE_CUDA - if (device_type_ == std::string("cuda")) { + if (device_type_ == std::string("cuda_exp")) { CreateCUDAColumnData(); metadata_.CreateCUDAMetadata(gpu_device_id_); } else { @@ -849,7 +850,7 @@ void Dataset::CopySubrow(const Dataset* fullset, gpu_device_id_ = fullset->gpu_device_id_; #ifdef USE_CUDA - if (device_type_ == std::string("cuda")) { + if (device_type_ == std::string("cuda_exp")) { global_timer.Start("prepare subset cuda column data"); if (cuda_column_data_ == nullptr) { cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_)); @@ -1499,7 +1500,7 @@ void Dataset::AddFeaturesFrom(Dataset* other) { } } #ifdef USE_CUDA - if (device_type_ == std::string("cuda")) { + if (device_type_ == std::string("cuda_exp")) { CreateCUDAColumnData(); } else { cuda_column_data_ = nullptr; diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 94301de15df4..1848debea4ae 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -273,7 +273,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac #ifdef USE_CUDA dataset->device_type_ = config_.device_type; dataset->gpu_device_id_ = config_.gpu_device_id; - if (config_.device_type == std::string("cuda")) { + if (config_.device_type == std::string("cuda_exp")) { dataset->CreateCUDAColumnData(); dataset->metadata_.CreateCUDAMetadata(dataset->gpu_device_id_); } else { diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index 9d6f313e54c0..ee3d16a51c1b 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -41,11 +41,7 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con } } else if (device_type == std::string("cuda")) { if (learner_type == std::string("serial")) { - if (config->num_gpu == 1) { - return new CUDASingleGPUTreeLearner(config); - } else { - return new CUDATreeLearner(config); - } + return new CUDATreeLearner(config); } else if (learner_type == std::string("feature")) { return new FeatureParallelTreeLearner(config); } else if (learner_type == std::string("data")) { @@ -53,6 +49,16 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con } else if (learner_type == std::string("voting")) { return new VotingParallelTreeLearner(config); } + } else if (device_type == std::string("cuda_exp")) { + if (learner_type == std::string("serial")) { + if (config->num_gpu == 1) { + return new CUDASingleGPUTreeLearner(config); + } else { + Log::Fatal("cuda_exp only supports training on a single GPU."); + } + } else { + Log::Fatal("cuda_exp only supports training on a single machine."); + } } return nullptr; } From 988107583f635a02934bc31218e76085fe1ebb70 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Dec 2021 03:50:29 +0000 Subject: [PATCH 131/166] 
fix ci bash scripts add description for cuda_exp --- .ci/setup.sh | 2 +- .ci/test.sh | 2 +- docs/Parameters.rst | 6 +++++- include/LightGBM/config.h | 4 +++- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.ci/setup.sh b/.ci/setup.sh index 024872fae7cf..20cd8b237f5b 100755 --- a/.ci/setup.sh +++ b/.ci/setup.sh @@ -82,7 +82,7 @@ else # Linux echo libamdocl64.so > $OPENCL_VENDOR_PATH/amdocl64.icd fi ARCH=$(uname -m) - if [[ $TASK == "cuda" ] || [ $TASK == "cuda_exp" ]]; then + if [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections apt-get update apt-get install --no-install-recommends -y \ diff --git a/.ci/test.sh b/.ci/test.sh index c68795731c6b..275ee97f5781 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -179,7 +179,7 @@ if [[ $TASK == "gpu" ]]; then elif [[ $METHOD == "source" ]]; then cmake -DUSE_GPU=ON -DOpenCL_INCLUDE_DIR=$AMDAPPSDK_PATH/include/ .. fi -elif [[ $TASK == "cuda" ] || [ $TASK == "cuda_exp" ]]; then +elif [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then if [[ $TASK == "cuda" ]]; then sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 7ace90d9b34d..f911d158f570 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -199,7 +199,7 @@ Core Parameters - **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors -- ``device_type`` :raw-html:`🔗︎`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, ``cuda``, aliases: ``device`` +- ``device_type`` :raw-html:`🔗︎`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, ``cuda``, ``cuda_exp``, aliases: ``device`` - device for the tree learning, you can use GPU to achieve the faster learning @@ -209,6 +209,10 @@ Core Parameters - **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support + - **Note**: ``cuda_exp`` is an experimental CUDA version, the installation guide for ``cuda_exp`` is identical with ``cuda`` + + - **Note**: ``cuda_exp`` is faster than ``cuda`` and will replace ``cuda`` in the future + - ``seed`` :raw-html:`🔗︎`, default = ``None``, type = int, aliases: ``random_seed``, ``random_state`` - this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc. diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 50371f3a2d91..7025e8104e1e 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -204,12 +204,14 @@ struct Config { // [doc-only] // type = enum - // options = cpu, gpu, cuda + // options = cpu, gpu, cuda, cuda_exp // alias = device // desc = device for the tree learning, you can use GPU to achieve the faster learning // desc = **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up // desc = **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. 
You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support + // desc = **Note**: ``cuda_exp`` is an experimental CUDA version, the installation guide for ``cuda_exp`` is identical with ``cuda`` + // desc = **Note**: ``cuda_exp`` is faster than ``cuda`` and will replace ``cuda`` in the future std::string device_type = "cpu"; // [doc-only] From 52b1e88ddee12fae732c30506276396207195337 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 14 Dec 2021 07:56:32 +0000 Subject: [PATCH 132/166] add USE_CUDA_EXP flag --- .ci/test.sh | 6 +++++- CMakeLists.txt | 19 ++++++++++++++----- docs/Installation-Guide.rst | 2 ++ include/LightGBM/bin.h | 4 ++-- include/LightGBM/cuda/cuda_algorithms.hpp | 4 ++-- include/LightGBM/cuda/cuda_column_data.hpp | 4 ++-- include/LightGBM/cuda/cuda_metadata.hpp | 4 ++-- include/LightGBM/cuda/cuda_random.hpp | 4 ++-- include/LightGBM/cuda/cuda_row_data.hpp | 4 ++-- include/LightGBM/cuda/cuda_split_info.hpp | 4 ++-- include/LightGBM/cuda/cuda_tree.hpp | 4 ++-- include/LightGBM/cuda/cuda_utils.h | 19 ++++++++++++++----- include/LightGBM/cuda/vector_cudahost.h | 6 +++--- include/LightGBM/dataset.h | 16 ++++++++-------- include/LightGBM/train_share_states.h | 8 ++++---- python-package/setup.py | 8 ++++++-- src/boosting/gbdt.cpp | 4 +--- src/boosting/gbdt.h | 2 +- src/cuda/cuda_algorithms.cu | 4 ++-- src/cuda/cuda_utils.cpp | 4 ++-- src/io/cuda/cuda_column_data.cpp | 4 ++-- src/io/cuda/cuda_column_data.cu | 4 ++-- src/io/cuda/cuda_metadata.cpp | 4 ++-- src/io/cuda/cuda_row_data.cpp | 4 ++-- src/io/cuda/cuda_tree.cpp | 4 ++-- src/io/cuda/cuda_tree.cu | 4 ++-- src/io/dataset.cpp | 16 ++++++++-------- src/io/dataset_loader.cpp | 4 ++-- src/io/dense_bin.hpp | 2 +- src/io/metadata.cpp | 8 ++++---- src/io/multi_val_dense_bin.cpp | 4 ++-- src/io/multi_val_dense_bin.hpp | 4 ++-- src/io/multi_val_sparse_bin.cpp | 4 ++-- src/io/multi_val_sparse_bin.hpp | 4 ++-- .../cuda/cuda_best_split_finder.cpp | 4 ++-- .../cuda/cuda_best_split_finder.cu | 4 ++-- .../cuda/cuda_best_split_finder.hpp | 4 ++-- src/treelearner/cuda/cuda_data_partition.cpp | 4 ++-- src/treelearner/cuda/cuda_data_partition.cu | 4 ++-- src/treelearner/cuda/cuda_data_partition.hpp | 4 ++-- .../cuda/cuda_histogram_constructor.cpp | 4 ++-- .../cuda/cuda_histogram_constructor.cu | 4 ++-- .../cuda/cuda_histogram_constructor.hpp | 4 ++-- src/treelearner/cuda/cuda_leaf_splits.cpp | 4 ++-- src/treelearner/cuda/cuda_leaf_splits.cu | 4 ++-- src/treelearner/cuda/cuda_leaf_splits.hpp | 4 ++-- .../cuda/cuda_single_gpu_tree_learner.cpp | 4 ++-- .../cuda/cuda_single_gpu_tree_learner.cu | 4 ++-- .../cuda/cuda_single_gpu_tree_learner.hpp | 10 +++++----- src/treelearner/serial_tree_learner.cpp | 2 +- src/treelearner/serial_tree_learner.h | 4 ++-- 51 files changed, 149 insertions(+), 123 deletions(-) diff --git a/.ci/test.sh b/.ci/test.sh index 275ee97f5781..13c99c9c632b 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -200,7 +200,11 @@ elif [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then pytest $BUILD_DIRECTORY/tests || exit -1 exit 0 elif [[ $METHOD == "source" ]]; then - cmake -DUSE_CUDA=ON .. + if [[ $TASK == "cuda" ]]; then + cmake -DUSE_CUDA=ON .. + else + cmake -DUSE_CUDA_EXP=ON .. 
+ fi fi elif [[ $TASK == "mpi" ]]; then if [[ $METHOD == "pip" ]]; then diff --git a/CMakeLists.txt b/CMakeLists.txt index afca52de129b..c2e41680bcb1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,7 @@ option(USE_SWIG "Enable SWIG to generate Java API" OFF) option(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF) option(USE_TIMETAG "Set to ON to output time costs" OFF) option(USE_CUDA "Enable CUDA-accelerated training (EXPERIMENTAL)" OFF) +option(USE_CUDA_EXP "Enable CUDA-accelerated training with more acceleration (EXPERIMENTAL)" ON) option(USE_DEBUG "Set to ON for Debug mode" OFF) option(USE_SANITIZER "Use santizer flags" OFF) set( @@ -28,7 +29,7 @@ if(__INTEGRATE_OPENCL) cmake_minimum_required(VERSION 3.11) elseif(USE_GPU OR APPLE) cmake_minimum_required(VERSION 3.2) -elseif(USE_CUDA) +elseif(USE_CUDA OR USE_CUDA_EXP) cmake_minimum_required(VERSION 3.16) else() cmake_minimum_required(VERSION 3.0) @@ -133,7 +134,7 @@ else() add_definitions(-DUSE_SOCKET) endif() -if(USE_CUDA) +if(USE_CUDA OR USE_CUDA_EXP) set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}") enable_language(CUDA) set(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE) @@ -171,7 +172,7 @@ if(__INTEGRATE_OPENCL) endif() endif() -if(USE_CUDA) +if(USE_CUDA OR USE_CUDA_EXP) find_package(CUDA 9.0 REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall") @@ -199,7 +200,12 @@ if(USE_CUDA) endif() message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - add_definitions(-DUSE_CUDA) + if(USE_CUDA) + add_definitions(-DUSE_CUDA) + elseif(USE_CUDA_EXP) + add_definitions(-DUSE_CUDA_EXP) + endif() + if(NOT DEFINED CMAKE_CUDA_STANDARD) set(CMAKE_CUDA_STANDARD 11) set(CMAKE_CUDA_STANDARD_REQUIRED ON) @@ -371,6 +377,9 @@ file( src/treelearner/*.cpp if(USE_CUDA) src/treelearner/*.cu +endif() +if(USE_CUDA_EXP) + src/treelearner/*.cu src/treelearner/cuda/*.cpp src/treelearner/cuda/*.cu src/io/cuda/*.cu @@ -499,7 +508,7 @@ if(__INTEGRATE_OPENCL) target_link_libraries(lightgbm_objs PUBLIC ${INTEGRATED_OPENCL_LIBRARIES}) endif() -if(USE_CUDA) +if(USE_CUDA OR USE_CUDA_EXP) # Disable cmake warning about policy CMP0104. Refer to issue #3754 and PR #4268. # Custom target properties does not propagate, thus we need to specify for # each target that contains or depends on cuda source. diff --git a/docs/Installation-Guide.rst b/docs/Installation-Guide.rst index 876002268aaa..f65c58c5f200 100644 --- a/docs/Installation-Guide.rst +++ b/docs/Installation-Guide.rst @@ -634,6 +634,8 @@ To build LightGBM CUDA version, run the following commands: cmake -DUSE_CUDA=1 .. make -j4 +Recently, a new CUDA version with better efficiency has been implemented as an experimental feature. To build the new CUDA version, replace ``-DUSE_CUDA`` with ``-DUSE_CUDA_EXP`` in the above commands. + **Note**: glibc >= 2.14 is required. **Note**: In some rare cases you may need to install OpenMP runtime library separately (use your package manager and search for ``lib[g|i]omp`` for doing this).
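As a minimal sketch of the workflow described by the Installation-Guide.rst addition above (assuming a Linux host with the CUDA toolkit installed, the LightGBM sources already checked out, and the same build-directory layout as the guide's existing CUDA steps), building the experimental version only swaps the CMake flag:

    # configure and build the experimental CUDA version instead of -DUSE_CUDA=1
    mkdir build
    cd build
    cmake -DUSE_CUDA_EXP=1 ..
    make -j4

Note that, per the add_definitions hunk in the CMakeLists.txt diff above, USE_CUDA takes precedence over USE_CUDA_EXP when both are ON, so only one of the two flags should be enabled for a given build.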
diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index c91f9c8d6366..66be49a911e4 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -474,13 +474,13 @@ class MultiValBin { virtual MultiValBin* Clone() = 0; - #ifdef USE_CUDA + #ifdef USE_CUDA_EXP virtual const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, const void** out_data_ptr, uint8_t* data_ptr_bit_type) const = 0; - #endif // USE_CUDA + #endif // USE_CUDA_EXP }; inline uint32_t BinMapper::ValueToBin(double value) const { diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index a26f703083e1..6b953e2bce5c 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_ #define LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include #include @@ -388,5 +388,5 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP #endif // LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_ diff --git a/include/LightGBM/cuda/cuda_column_data.hpp b/include/LightGBM/cuda/cuda_column_data.hpp index fd3683586053..5438f0103abc 100644 --- a/include/LightGBM/cuda/cuda_column_data.hpp +++ b/include/LightGBM/cuda/cuda_column_data.hpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #ifndef LIGHTGBM_CUDA_COLUMN_DATA_HPP_ #define LIGHTGBM_CUDA_COLUMN_DATA_HPP_ @@ -137,4 +137,4 @@ class CUDAColumnData { #endif // LIGHTGBM_CUDA_COLUMN_DATA_HPP_ -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/include/LightGBM/cuda/cuda_metadata.hpp b/include/LightGBM/cuda/cuda_metadata.hpp index 9abeb69f7468..09f8f27a4b59 100644 --- a/include/LightGBM/cuda/cuda_metadata.hpp +++ b/include/LightGBM/cuda/cuda_metadata.hpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #ifndef LIGHTGBM_CUDA_META_DATA_HPP_ #define LIGHTGBM_CUDA_META_DATA_HPP_ @@ -49,4 +49,4 @@ class CUDAMetadata { #endif // LIGHTGBM_CUDA_META_DATA_HPP_ -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/include/LightGBM/cuda/cuda_random.hpp b/include/LightGBM/cuda/cuda_random.hpp index 6c28e44bc2d3..1f07d64452da 100644 --- a/include/LightGBM/cuda/cuda_random.hpp +++ b/include/LightGBM/cuda/cuda_random.hpp @@ -5,7 +5,7 @@ #ifndef LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ #define LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include #include @@ -69,6 +69,6 @@ class CUDARandom { } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP #endif // LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index 08b885eac418..3013883abd6a 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #ifndef LIGHTGBM_CUDA_ROW_DATA_HPP_ #define LIGHTGBM_CUDA_ROW_DATA_HPP_ @@ -186,4 +186,4 @@ class CUDARowData { } // namespace LightGBM #endif // LIGHTGBM_CUDA_COLUMN_DATA_HPP_ -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/include/LightGBM/cuda/cuda_split_info.hpp b/include/LightGBM/cuda/cuda_split_info.hpp index 748d25b3dbe5..ec4afe538ef5 100644 --- a/include/LightGBM/cuda/cuda_split_info.hpp +++ b/include/LightGBM/cuda/cuda_split_info.hpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_SPLIT_INFO_HPP_ #define LIGHTGBM_TREELEARNER_CUDA_CUDA_SPLIT_INFO_HPP_ @@ -105,4 +105,4 @@ class CUDASplitInfo { #endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_SPLIT_INFO_HPP_ -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index b8fb47f38b8f..3bd5972dbc2c 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #ifndef LIGHTGBM_IO_CUDA_CUDA_TREE_HPP_ #define LIGHTGBM_IO_CUDA_CUDA_TREE_HPP_ @@ -139,4 +139,4 @@ class CUDATree : public Tree { #endif // LIGHTGBM_IO_CUDA_CUDA_TREE_HPP_ -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index bbf8bb69a524..8a9fd7398f46 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -1,23 +1,30 @@ /*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA +/*! + * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */ #ifndef LIGHTGBM_CUDA_CUDA_UTILS_H_ #define LIGHTGBM_CUDA_CUDA_UTILS_H_ +#if defined(USE_CUDA) || defined(USE_CUDA_EXP) #include #include #include - #include +#endif // USE_CUDA || USE_CUDA_EXP +#ifdef USE_CUDA_EXP #include +#endif // USE_CUDA_EXP namespace LightGBM { +#if defined(USE_CUDA) || defined(USE_CUDA_EXP) #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { if (code != cudaSuccess) { @@ -25,7 +32,9 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = if (abort) exit(code); } } +#endif // USE_CUDA || USE_CUDA_EXP +#ifdef USE_CUDA_EXP #define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); } void SetCUDADevice(int gpu_device_id, const char* file, int line); @@ -172,8 +181,8 @@ class CUDAVector { size_t size_; }; +#endif // USE_CUDA_EXP + } // namespace LightGBM #endif // LIGHTGBM_CUDA_CUDA_UTILS_H_ - -#endif // USE_CUDA diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index bd488d793d09..4eb9f8b6d876 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -7,7 +7,7 @@ #include -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_CUDA_EXP) #include #include #endif @@ -43,7 +43,7 @@ struct CHAllocator { T* ptr; if (n == 0) return NULL; n = (n + kAlignedSize - 1) & -kAlignedSize; - #ifdef USE_CUDA + #if defined(USE_CUDA) || defined(USE_CUDA_EXP) if (LGBM_config_::current_device == lgbm_device_cuda) { cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); if (ret != cudaSuccess) { @@ -62,7 +62,7 @@ struct CHAllocator { void deallocate(T* p, std::size_t n) { (void)n; // UNUSED if (p == NULL) return; - #ifdef USE_CUDA + #if defined(USE_CUDA) || defined(USE_CUDA_EXP) if (LGBM_config_::current_device == lgbm_device_cuda) { cudaPointerAttributes attributes; cudaPointerGetAttributes(&attributes, p); diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 29157e988e75..051b60c6a942 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -214,13 +214,13 @@ class Metadata { /*! \brief Disable copy */ Metadata(const Metadata&) = delete; - #ifdef USE_CUDA + #ifdef USE_CUDA_EXP CUDAMetadata* cuda_metadata() const { return cuda_metadata_.get(); } void CreateCUDAMetadata(const int gpu_device_id); - #endif // USE_CUDA + #endif // USE_CUDA_EXP private: /*! 
\brief Load initial scores from file */ @@ -258,9 +258,9 @@ class Metadata { bool weight_load_from_file_; bool query_load_from_file_; bool init_score_load_from_file_; - #ifdef USE_CUDA + #ifdef USE_CUDA_EXP std::unique_ptr cuda_metadata_; - #endif // USE_CUDA + #endif // USE_CUDA_EXP }; @@ -786,13 +786,13 @@ class Dataset { return feature_groups_[feature_group_index]->feature_min_bin(sub_feature_index); } - #ifdef USE_CUDA + #ifdef USE_CUDA_EXP const CUDAColumnData* cuda_column_data() const { return cuda_column_data_.get(); } - #endif // USE_CUDA + #endif // USE_CUDA_EXP private: void CreateCUDAColumnData(); @@ -840,9 +840,9 @@ class Dataset { std::string device_type_; int gpu_device_id_; - #ifdef USE_CUDA + #ifdef USE_CUDA_EXP std::unique_ptr cuda_column_data_; - #endif // USE_CUDA + #endif // USE_CUDA_EXP std::string parser_config_str_; }; diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h index 93cbf60ff5dd..c86e45605345 100644 --- a/include/LightGBM/train_share_states.h +++ b/include/LightGBM/train_share_states.h @@ -126,7 +126,7 @@ class MultiValBinWrapper { } - #ifdef USE_CUDA + #ifdef USE_CUDA_EXP const void* GetRowWiseData( uint8_t* bit_type, size_t* total_size, @@ -142,7 +142,7 @@ class MultiValBinWrapper { return multi_val_bin_->GetRowWiseData(bit_type, total_size, is_sparse, out_data_ptr, data_ptr_bit_type); } } - #endif // USE_CUDA + #endif // USE_CUDA_EXP private: bool is_use_subcol_ = false; @@ -233,7 +233,7 @@ struct TrainingShareStates { } - #ifdef USE_CUDA + #ifdef USE_CUDA_EXP const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, @@ -248,7 +248,7 @@ struct TrainingShareStates { return nullptr; } } - #endif // USE_CUDA + #endif // USE_CUDA_EXP private: std::vector feature_hist_offsets_; diff --git a/python-package/setup.py b/python-package/setup.py index 56c67ffa3c39..4c239b60b37e 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -104,6 +104,7 @@ def compile_cpp( use_mingw: bool = False, use_gpu: bool = False, use_cuda: bool = False, + use_cuda_exp: bool = False, use_mpi: bool = False, use_hdfs: bool = False, boost_root: Optional[str] = None, @@ -144,6 +145,8 @@ def compile_cpp( cmake_cmd.append(f"-DOpenCL_LIBRARY={opencl_library}") elif use_cuda: cmake_cmd.append("-DUSE_CUDA=ON") + elif use_cuda_exp: + cmake_cmd.append("-DUSE_CUDA_EXP=ON") if use_mpi: cmake_cmd.append("-DUSE_MPI=ON") if nomp: @@ -163,7 +166,7 @@ def compile_cpp( else: status = 1 lib_path = CURRENT_DIR / "compile" / "windows" / "x64" / "DLL" / "lib_lightgbm.dll" - if not any((use_gpu, use_cuda, use_mpi, use_hdfs, nomp, bit32, integrated_opencl)): + if not any((use_gpu, use_cuda, use_cuda_exp, use_mpi, use_hdfs, nomp, bit32, integrated_opencl)): logger.info("Starting to compile with MSBuild from existing solution file.") platform_toolsets = ("v142", "v141", "v140") for pt in platform_toolsets: @@ -222,6 +225,7 @@ def initialize_options(self) -> None: self.integrated_opencl = False self.gpu = False self.cuda = False + self.cuda_exp = False self.boost_root = None self.boost_dir = None self.boost_include_dir = None @@ -245,7 +249,7 @@ def run(self) -> None: LOG_PATH.touch() if not self.precompile: copy_files(integrated_opencl=self.integrated_opencl, use_gpu=self.gpu) - compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_mpi=self.mpi, + compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_cuda_exp=self.cuda_exp, use_mpi=self.mpi, use_hdfs=self.hdfs, boost_root=self.boost_root, 
boost_dir=self.boost_dir, boost_include_dir=self.boost_include_dir, boost_librarydir=self.boost_librarydir, opencl_include_dir=self.opencl_include_dir, opencl_library=self.opencl_library, diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index ed7a2da6483f..7053e77557d0 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -395,9 +395,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto grad = gradients + offset; auto hess = hessians + offset; // need to copy gradients for bagging subset. - if (is_use_subset_ && bag_data_cnt_ < num_data_ && - !(LGBM_config_::current_learner == use_cuda_learner && config_->num_gpu == 1 && - (config_->tree_learner == std::string("serial") || Network::num_machines() == 1))) { + if (is_use_subset_ && bag_data_cnt_ < num_data_ && config_->device_type != std::string("cuda_exp")) { for (int i = 0; i < bag_data_cnt_; ++i) { gradients_[offset + i] = grad[bag_data_indices_[i]]; hessians_[offset + i] = hess[bag_data_indices_[i]]; diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index efeacfbfaef0..7dcc0a44c5ed 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -488,7 +488,7 @@ class GBDT : public GBDTBase { /*! \brief Parser config file content */ std::string parser_config_str_ = ""; -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_CUDA_EXP) /*! \brief First order derivative of training data */ std::vector> gradients_; /*! \brief Second order derivative of training data */ diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index ad1c8b2a2278..9bc52ceaedc8 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include @@ -79,4 +79,4 @@ void ShufflePrefixSumGlobal(uint64_t* values, size_t len, uint64_t* block_prefix } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp index d0fa773eec8b..051ea3ed2128 100644 --- a/src/cuda/cuda_utils.cpp +++ b/src/cuda/cuda_utils.cpp @@ -5,7 +5,7 @@ #include -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP namespace LightGBM { @@ -28,4 +28,4 @@ void SetCUDADevice(int gpu_device_id, const char* file, int line) { } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/io/cuda/cuda_column_data.cpp b/src/io/cuda/cuda_column_data.cpp index a1080cb2b902..c4b0bb62e584 100644 --- a/src/io/cuda/cuda_column_data.cpp +++ b/src/io/cuda/cuda_column_data.cpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include @@ -308,4 +308,4 @@ void CUDAColumnData::InitColumnMetaInfo() { } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/io/cuda/cuda_column_data.cu b/src/io/cuda/cuda_column_data.cu index 75ff6234e09e..3ab70e9a5758 100644 --- a/src/io/cuda/cuda_column_data.cu +++ b/src/io/cuda/cuda_column_data.cu @@ -4,7 +4,7 @@ */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include @@ -58,4 +58,4 @@ void CUDAColumnData::LaunchCopySubrowKernel(void* const* in_cuda_data_by_column) } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/io/cuda/cuda_metadata.cpp b/src/io/cuda/cuda_metadata.cpp index 1f83fd7efdf2..3ac657cf9955 100644 --- a/src/io/cuda/cuda_metadata.cpp +++ b/src/io/cuda/cuda_metadata.cpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include @@ -73,4 +73,4 @@ void CUDAMetadata::Init(const std::vector& label, } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index afe530b7205b..bed2793398c4 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include @@ -416,4 +416,4 @@ void CUDARowData::InitSparseData(const BIN_TYPE* host_data, } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index f318e66fd2c5..975edb44c9d5 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include @@ -307,4 +307,4 @@ void CUDATree::SyncLeafOutputFromCUDAToHost() { } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index 67e60e6d7e51..d51308ae9942 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -4,7 +4,7 @@ */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include @@ -305,4 +305,4 @@ void CUDATree::LaunchAddBiasKernel(const double val) { } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 0545564a5665..89694a118b3f 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -439,14 +439,14 @@ void Dataset::FinishLoad() { feature_groups_[i]->FinishLoad(); } } - #ifdef USE_CUDA + #ifdef USE_CUDA_EXP if (device_type_ == std::string("cuda_exp")) { CreateCUDAColumnData(); metadata_.CreateCUDAMetadata(gpu_device_id_); } else { cuda_column_data_.reset(nullptr); } - #endif // USE_CUDA + #endif // USE_CUDA_EXP is_finish_load_ = true; } @@ -849,7 +849,7 @@ void Dataset::CopySubrow(const Dataset* fullset, device_type_ = fullset->device_type_; gpu_device_id_ = fullset->gpu_device_id_; - #ifdef USE_CUDA + #ifdef USE_CUDA_EXP if (device_type_ == std::string("cuda_exp")) { global_timer.Start("prepare subset cuda column data"); if (cuda_column_data_ == nullptr) { @@ -861,7 +861,7 @@ void Dataset::CopySubrow(const Dataset* fullset, global_timer.Stop("copy subset cuda column data"); global_timer.Stop("prepare subset cuda column data"); } - #endif // USE_CUDA + #endif // USE_CUDA_EXP } bool Dataset::SetFloatField(const char* field_name, const float* field_data, @@ -1499,13 +1499,13 @@ void Dataset::AddFeaturesFrom(Dataset* other) { raw_data_.push_back(other->raw_data_[i]); } } - #ifdef USE_CUDA + #ifdef USE_CUDA_EXP if (device_type_ == std::string("cuda_exp")) { CreateCUDAColumnData(); } else { cuda_column_data_ = nullptr; } - #endif // USE_CUDA + #endif // USE_CUDA_EXP } const void* Dataset::GetColWiseData( @@ -1527,7 +1527,7 @@ const void* Dataset::GetColWiseData( return feature_groups_[feature_group_index]->GetColWiseData(sub_feature_index, bit_type, is_sparse, bin_iterator); } -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP void Dataset::CreateCUDAColumnData() { cuda_column_data_.reset(new CUDAColumnData(num_data_, gpu_device_id_)); int num_columns = 0; @@ -1662,6 +1662,6 @@ void Dataset::CreateCUDAColumnData() { feature_to_column); } -#endif // USE_CUDA +#endif // USE_CUDA_EXP } // namespace LightGBM diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 1848debea4ae..63e4883d2ce4 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -270,7 +270,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac is_load_from_binary = true; Log::Info("Load from binary file %s", bin_filename.c_str()); dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), rank, num_machines, &num_global_data, &used_data_indices)); - #ifdef USE_CUDA + #ifdef USE_CUDA_EXP dataset->device_type_ = config_.device_type; dataset->gpu_device_id_ = config_.gpu_device_id; if (config_.device_type == std::string("cuda_exp")) { @@ -279,7 +279,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac } else { dataset->cuda_column_data_ = nullptr; } - #endif // USE_CUDA + #endif // USE_CUDA_EXP } // check meta data dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); diff --git a/src/io/dense_bin.hpp 
b/src/io/dense_bin.hpp index 0ebcdc1a6181..5d95d9dc6073 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -467,7 +467,7 @@ class DenseBin : public Bin { private: data_size_t num_data_; -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_CUDA_EXP) std::vector> data_; #else std::vector> data_; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 087ced92a833..3ef0edbe405d 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -18,9 +18,9 @@ Metadata::Metadata() { weight_load_from_file_ = false; query_load_from_file_ = false; init_score_load_from_file_ = false; - #ifdef USE_CUDA + #ifdef USE_CUDA_EXP cuda_metadata_ = nullptr; - #endif // USE_CUDA + #endif // USE_CUDA_EXP } void Metadata::Init(const char* data_filename) { @@ -475,12 +475,12 @@ void Metadata::LoadQueryWeights() { } } -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP void Metadata::CreateCUDAMetadata(const int gpu_device_id) { cuda_metadata_.reset(new CUDAMetadata(gpu_device_id)); cuda_metadata_->Init(label_, weights_, query_boundaries_, query_weights_, init_score_, queries_); } -#endif // USE_CUDA +#endif // USE_CUDA_EXP void Metadata::LoadFromMemory(const void* memory) { const char* mem_ptr = reinterpret_cast(memory); diff --git a/src/io/multi_val_dense_bin.cpp b/src/io/multi_val_dense_bin.cpp index d1d70f8e3bed..f6cf41b9bb21 100644 --- a/src/io/multi_val_dense_bin.cpp +++ b/src/io/multi_val_dense_bin.cpp @@ -8,7 +8,7 @@ namespace LightGBM { -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP template <> const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, @@ -59,6 +59,6 @@ const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, return to_return; } -#endif // USE_CUDA +#endif // USE_CUDA_EXP } // namespace LightGBM diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index b4fbfbe673aa..8de9cf305952 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -211,13 +211,13 @@ class MultiValDenseBin : public MultiValBin { MultiValDenseBin* Clone() override; - #ifdef USE_CUDA + #ifdef USE_CUDA_EXP const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, const void** out_data_ptr, uint8_t* data_ptr_bit_type) const override; - #endif // USE_CUDA + #endif // USE_CUDA_EXP private: data_size_t num_data_; diff --git a/src/io/multi_val_sparse_bin.cpp b/src/io/multi_val_sparse_bin.cpp index 55d8e82492ad..359bf31c1053 100644 --- a/src/io/multi_val_sparse_bin.cpp +++ b/src/io/multi_val_sparse_bin.cpp @@ -7,7 +7,7 @@ namespace LightGBM { -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP template <> const void* MultiValSparseBin::GetRowWiseData( @@ -153,6 +153,6 @@ const void* MultiValSparseBin::GetRowWiseData( return to_return; } -#endif // USE_CUDA +#endif // USE_CUDA_EXP } // namespace LightGBM diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index eaa30ef0a0cc..80acbb681ab6 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -292,13 +292,13 @@ class MultiValSparseBin : public MultiValBin { MultiValSparseBin* Clone() override; - #ifdef USE_CUDA + #ifdef USE_CUDA_EXP const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, const void** out_data_ptr, uint8_t* data_ptr_bit_type) const override; - #endif // USE_CUDA + #endif // USE_CUDA_EXP private: data_size_t num_data_; diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index eefabbbe2e2f..51589a673aa8 100644 --- 
a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include @@ -366,4 +366,4 @@ void CUDABestSplitFinder::AllocateCatVectors(CUDASplitInfo* cuda_split_infos, ui } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 2b39cfdd4ba6..e52a2fb90dd6 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include @@ -1802,4 +1802,4 @@ void CUDABestSplitFinder::LaunchInitCUDARandomKernel() { } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 4ae9aded5c3d..bd3ef063864f 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_CUDA_BEST_SPLIT_FINDER_HPP_ #define LIGHTGBM_CUDA_BEST_SPLIT_FINDER_HPP_ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include #include @@ -202,5 +202,5 @@ class CUDABestSplitFinder { } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP #endif // LIGHTGBM_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index d6bcdfa953c7..2321d1112c52 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include #include @@ -399,4 +399,4 @@ void CUDADataPartition::ResetByLeafPred(const std::vector& leaf_pred, int n } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 1aa19367106a..a2cd87eac6cf 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include "cuda_data_partition.hpp" @@ -1071,4 +1071,4 @@ void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double* leaf_valu } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index c015f6d589d9..ba5c4a9150df 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ #define LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include #include @@ -390,5 +390,5 @@ class CUDADataPartition { } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP #endif // LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 7e6be1c1069c..83227165af19 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -4,7 +4,7 @@ * license information. 
*/ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include "cuda_histogram_constructor.hpp" @@ -193,4 +193,4 @@ void CUDAHistogramConstructor::ResetConfig(const Config* config) { } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index de688e9fcfbc..d5274ff975b4 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include "cuda_histogram_constructor.hpp" @@ -594,4 +594,4 @@ void CUDAHistogramConstructor::LaunchSubtractHistogramKernel( } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 11e6894e2f72..3ddf9083eb87 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ #define LIGHTGBM_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include #include @@ -150,5 +150,5 @@ class CUDAHistogramConstructor { } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP #endif // LIGHTGBM_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index 6aa020d9ea0d..9d093f0f164b 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include "cuda_leaf_splits.hpp" @@ -68,4 +68,4 @@ void CUDALeafSplits::Resize(const data_size_t num_data) { } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index 29e42f67ead9..15c2983ef1d2 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -5,7 +5,7 @@ */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include "cuda_leaf_splits.hpp" #include @@ -126,4 +126,4 @@ void CUDALeafSplits::LaunchInitValuesKernal( } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index 1a54b1153c6b..db473a0bbb91 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_CUDA_LEAF_SPLITS_HPP_ #define LIGHTGBM_CUDA_LEAF_SPLITS_HPP_ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include #include @@ -156,5 +156,5 @@ class CUDALeafSplits { } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP #endif // LIGHTGBM_CUDA_LEAF_SPLITS_HPP_ diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 2617f7924290..ef0982386e33 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -4,7 +4,7 @@ * license information. 
*/ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include "cuda_single_gpu_tree_learner.hpp" @@ -520,4 +520,4 @@ void CUDASingleGPUTreeLearner::CheckSplitValid( } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu index 8a558ddc43d1..f4a87de499cb 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include @@ -258,4 +258,4 @@ void CUDASingleGPUTreeLearner::LaunchConstructBitsetForCategoricalSplitKernel( } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index 9c2164f7378c..48191f895a6b 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -9,7 +9,7 @@ #include #include -#ifdef USE_CUDA +#ifdef USE_CUDA_EXP #include "cuda_leaf_splits.hpp" #include "cuda_histogram_constructor.hpp" @@ -123,7 +123,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { } // namespace LightGBM -#else // USE_CUDA +#else // USE_CUDA_EXP // When GPU support is not compiled in, quit with an error message @@ -133,12 +133,12 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { public: #pragma warning(disable : 4702) explicit CUDASingleGPUTreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { - Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" - "Please recompile with CMake option -DUSE_CUDA=1"); + Log::Fatal("CUDA Tree Learner experimental version was not enabled in this build.\n" + "Please recompile with CMake option -DUSE_CUDA_EXP=1"); } }; } // namespace LightGBM -#endif // USE_CUDA +#endif // USE_CUDA_EXP #endif // LIGHTGBM_NEW_CUDA_TREE_LEARNER_HPP_ diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 304c712f0723..3af2ff0f4bf3 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -340,7 +340,7 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree, const std::set* fo } bool use_subtract = parent_leaf_histogram_array_ != nullptr; -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_CUDA_EXP) if (LGBM_config_::current_learner == use_cpu_learner) { SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); } else { diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 7dfadf05d119..7d05debbc12b 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -206,12 +206,12 @@ class SerialTreeLearner: public TreeLearner { std::unique_ptr smaller_leaf_splits_; /*! \brief stores best thresholds for all feature for larger leaf */ std::unique_ptr larger_leaf_splits_; -#ifdef USE_GPU +#if defined(USE_GPU) /*! \brief gradients of current iteration, ordered for cache optimized, aligned to 4K page */ std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */ std::vector> ordered_hessians_; -#elif USE_CUDA +#elif defined(USE_CUDA) || defined(USE_CUDA_EXP) /*! \brief gradients of current iteration, ordered for cache optimized */ std::vector> ordered_gradients_; /*! 
\brief hessians of current iteration, ordered for cache optimized */ From c2a0be879493df9955a7c750f60882978fc1e378 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 24 Dec 2021 03:46:42 +0000 Subject: [PATCH 133/166] switch off USE_CUDA_EXP --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c2e41680bcb1..8d026b4ac897 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ option(USE_SWIG "Enable SWIG to generate Java API" OFF) option(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF) option(USE_TIMETAG "Set to ON to output time costs" OFF) option(USE_CUDA "Enable CUDA-accelerated training (EXPERIMENTAL)" OFF) -option(USE_CUDA_EXP "Enable CUDA-accelerated training with more acceleration (EXPERIMENTAL)" ON) +option(USE_CUDA_EXP "Enable CUDA-accelerated training with more acceleration (EXPERIMENTAL)" OFF) option(USE_DEBUG "Set to ON for Debug mode" OFF) option(USE_SANITIZER "Use santizer flags" OFF) set( From fbc3760f5dc95036b8e0bb95bba959adfc54f7f9 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 24 Dec 2021 04:12:17 +0000 Subject: [PATCH 134/166] revert changes in python-packages --- python-package/lightgbm/basic.py | 2 +- python-package/lightgbm/dask.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 841677bab2bf..64f1cb31edaa 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3530,7 +3530,7 @@ def refit(self, data, label, decay_rate=0.9, **kwargs): predictor = self._to_predictor(deepcopy(kwargs)) leaf_preds = predictor.predict(data, -1, pred_leaf=True) nrow, ncol = leaf_preds.shape - out_is_linear = ctypes.c_bool(False) + out_is_linear = ctypes.c_int(0) _safe_call(_LIB.LGBM_BoosterGetLinear( self.handle, ctypes.byref(out_is_linear))) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 4cab3cbf0332..062422286a47 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -11,7 +11,7 @@ from copy import deepcopy from enum import Enum, auto from functools import partial -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union from urllib.parse import urlparse import numpy as np @@ -21,8 +21,8 @@ from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, dask_Array, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, dask_Series, default_client, delayed, pd_DataFrame, pd_Series, wait) -from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _lgbmmodel_doc_custom_eval_note, - _lgbmmodel_doc_fit, _lgbmmodel_doc_predict) +from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _LGBM_ScikitCustomEvalFunction, + _lgbmmodel_doc_custom_eval_note, _lgbmmodel_doc_fit, _lgbmmodel_doc_predict) _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series] _DaskMatrixLike = Union[dask_Array, dask_DataFrame] @@ -404,7 +404,7 @@ def _train( eval_class_weight: Optional[List[Union[dict, str]]] = None, eval_init_score: Optional[List[_DaskCollection]] = None, eval_group: Optional[List[_DaskVectorLike]] = None, - eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None, + eval_metric: Optional[Union[_LGBM_ScikitCustomEvalFunction, str, List[Union[_LGBM_ScikitCustomEvalFunction, str]]]] = None, 
eval_at: Optional[Iterable[int]] = None, **kwargs: Any ) -> LGBMModel: @@ -1036,7 +1036,7 @@ def _lgb_dask_fit( eval_class_weight: Optional[List[Union[dict, str]]] = None, eval_init_score: Optional[List[_DaskCollection]] = None, eval_group: Optional[List[_DaskVectorLike]] = None, - eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None, + eval_metric: Optional[Union[_LGBM_ScikitCustomEvalFunction, str, List[Union[_LGBM_ScikitCustomEvalFunction, str]]]] = None, eval_at: Optional[Iterable[int]] = None, **kwargs: Any ) -> "_DaskLGBMModel": @@ -1099,7 +1099,7 @@ def __init__( learning_rate: float = 0.1, n_estimators: int = 100, subsample_for_bin: int = 200000, - objective: Optional[Union[Callable, str]] = None, + objective: Optional[str] = None, class_weight: Optional[Union[dict, str]] = None, min_split_gain: float = 0., min_child_weight: float = 1e-3, @@ -1275,7 +1275,7 @@ def __init__( learning_rate: float = 0.1, n_estimators: int = 100, subsample_for_bin: int = 200000, - objective: Optional[Union[Callable, str]] = None, + objective: Optional[str] = None, class_weight: Optional[Union[dict, str]] = None, min_split_gain: float = 0., min_child_weight: float = 1e-3, @@ -1431,7 +1431,7 @@ def __init__( learning_rate: float = 0.1, n_estimators: int = 100, subsample_for_bin: int = 200000, - objective: Optional[Union[Callable, str]] = None, + objective: Optional[str] = None, class_weight: Optional[Union[dict, str]] = None, min_split_gain: float = 0., min_child_weight: float = 1e-3, @@ -1499,7 +1499,7 @@ def fit( eval_sample_weight: Optional[List[_DaskVectorLike]] = None, eval_init_score: Optional[List[_DaskVectorLike]] = None, eval_group: Optional[List[_DaskVectorLike]] = None, - eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None, + eval_metric: Optional[Union[_LGBM_ScikitCustomEvalFunction, str, List[Union[_LGBM_ScikitCustomEvalFunction, str]]]] = None, eval_at: Iterable[int] = (1, 2, 3, 4, 5), **kwargs: Any ) -> "DaskLGBMRanker": From c58635b2860a5e63e31a8fb97fa88a2597dfd2fc Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 24 Dec 2021 06:45:32 +0000 Subject: [PATCH 135/166] more careful separation for USE_CUDA_EXP --- include/LightGBM/config.h | 4 ++-- include/LightGBM/train_share_states.h | 4 ++++ include/LightGBM/tree.h | 4 ++++ src/application/predictor.hpp | 2 +- src/boosting/gbdt.cpp | 4 ---- src/io/config.cpp | 4 ++-- src/io/dataset.cpp | 4 ---- src/io/dataset_loader.cpp | 2 +- src/io/train_share_states.cpp | 2 ++ src/io/tree.cpp | 4 ++++ 10 files changed, 20 insertions(+), 14 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 9643b343dbbb..dc2224a872b5 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -212,7 +212,7 @@ struct Config { // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support // desc = **Note**: ``cuda_exp`` is an experimental CUDA version, the installation guide for ``cuda_exp`` is identical with ``cuda`` // desc = **Note**: ``cuda_exp`` is faster than ``cuda`` and will replace ``cuda`` in the future - std::string device_type = "cpu"; + std::string device_type = "cuda_exp"; // [doc-only] // alias = random_seed, random_state @@ -1023,7 +1023,7 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) // desc = **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported 
- bool gpu_use_dp = false; + bool gpu_use_dp = true; // check = >0 // desc = number of GPUs diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h index c86e45605345..5c14c7d51a47 100644 --- a/include/LightGBM/train_share_states.h +++ b/include/LightGBM/train_share_states.h @@ -183,7 +183,9 @@ struct TrainingShareStates { const std::vector& feature_hist_offsets() const { return feature_hist_offsets_; } + #ifdef USE_CUDA_EXP const std::vector& column_hist_offsets() const { return column_hist_offsets_; } + #endif // USE_CUDA_EXP bool IsSparseRowwise() { return (multi_val_bin_wrapper_ != nullptr && multi_val_bin_wrapper_->IsSparse()); @@ -252,7 +254,9 @@ struct TrainingShareStates { private: std::vector feature_hist_offsets_; + #ifdef USE_CUDA_EXP std::vector column_hist_offsets_; + #endif // USE_CUDA_EXP int num_hist_total_bin_ = 0; std::unique_ptr multi_val_bin_wrapper_; std::vector> hist_buf_; diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 853396d4b445..6ff0370e2ea6 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -319,7 +319,9 @@ class Tree { inline bool is_linear() const { return is_linear_; } + #ifdef USE_CUDA_EXP inline bool is_cuda_tree() const { return is_cuda_tree_; } + #endif // USE_CUDA_EXP inline void SetIsLinear(bool is_linear) { is_linear_ = is_linear; @@ -530,8 +532,10 @@ class Tree { std::vector> leaf_features_; /* \brief features used in leaf linear models; indexing is relative to used_features_ */ std::vector> leaf_features_inner_; + #ifdef USE_CUDA_EXP /*! \brief Marks whether this tree is a CUDATree */ bool is_cuda_tree_; + #endif // USE_CUDA_EXP }; inline void Tree::Split(int leaf, int feature, int real_feature, diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp index e26665813893..d1a8aca4d041 100644 --- a/src/application/predictor.hpp +++ b/src/application/predictor.hpp @@ -161,7 +161,7 @@ class Predictor { * \param data_filename Filename of data * \param result_filename Filename of output result */ - virtual void Predict(const char* data_filename, const char* result_filename, bool header, bool disable_shape_check, bool precise_float_parser) { + void Predict(const char* data_filename, const char* result_filename, bool header, bool disable_shape_check, bool precise_float_parser) { auto writer = VirtualFileWriter::Make(result_filename); if (!writer->Init()) { Log::Fatal("Prediction results file %s cannot be created", result_filename); diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 7053e77557d0..55e6ef512bc2 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -255,14 +255,10 @@ void GBDT::Bagging(int iter) { } else { // get subset tmp_subset_->ReSize(bag_data_cnt_); - global_timer.Start("GBDT::CopySubrow"); tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); - global_timer.Stop("GBDT::CopySubrow"); - global_timer.Start("GBDT::SetBaggingData"); tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), bag_data_cnt_); - global_timer.Stop("GBDT::SetBaggingData"); } } } diff --git a/src/io/config.cpp b/src/io/config.cpp index 73fc7d7bdef9..550c0e944a85 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -345,8 +345,8 @@ void Config::CheckParamConflict() { force_col_wise = false; force_row_wise = true; } - // force gpu_use_dp for non-single GPU CUDA version - if (device_type == std::string("cuda")) { + // force gpu_use_dp for CUDA + if (device_type == std::string("cuda") && !gpu_use_dp) { 
Log::Warning("CUDA currently requires double precision calculations."); gpu_use_dp = true; } diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 89694a118b3f..c207294c79de 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -851,15 +851,11 @@ void Dataset::CopySubrow(const Dataset* fullset, #ifdef USE_CUDA_EXP if (device_type_ == std::string("cuda_exp")) { - global_timer.Start("prepare subset cuda column data"); if (cuda_column_data_ == nullptr) { cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_)); metadata_.CreateCUDAMetadata(gpu_device_id_); } - global_timer.Start("copy subset cuda column data"); cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); - global_timer.Stop("copy subset cuda column data"); - global_timer.Stop("prepare subset cuda column data"); } #endif // USE_CUDA_EXP } diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 9e41adb5f972..f2704a5155c3 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -272,9 +272,9 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac is_load_from_binary = true; Log::Info("Load from binary file %s", bin_filename.c_str()); dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), rank, num_machines, &num_global_data, &used_data_indices)); - #ifdef USE_CUDA_EXP dataset->device_type_ = config_.device_type; dataset->gpu_device_id_ = config_.gpu_device_id; + #ifdef USE_CUDA_EXP if (config_.device_type == std::string("cuda_exp")) { dataset->CreateCUDAColumnData(); dataset->metadata_.CreateCUDAMetadata(dataset->gpu_device_id_); diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index 8a3ff5b1e91e..199424733f80 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -382,7 +382,9 @@ void TrainingShareStates::CalcBinOffsets(const std::vector(feature_hist_offsets_.back()); } + #ifdef USE_CUDA_EXP column_hist_offsets_ = *offsets; + #endif // USE_CUDA_EXP } void TrainingShareStates::SetMultiValBin(MultiValBin* bin, data_size_t num_data, diff --git a/src/io/tree.cpp b/src/io/tree.cpp index 7d45c66c540c..8c6f9a63973e 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -53,7 +53,9 @@ Tree::Tree(int max_leaves, bool track_branch_features, bool is_linear) leaf_features_.resize(max_leaves_); leaf_features_inner_.resize(max_leaves_); } + #ifdef USE_CUDA_EXP is_cuda_tree_ = false; + #endif // USE_CUDA_EXP } int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, @@ -735,7 +737,9 @@ Tree::Tree(const char* str, size_t* used_len) { is_linear_ = false; } + #ifdef USE_CUDA_EXP is_cuda_tree_ = false; + #endif // USE_CUDA_EXP if ((num_leaves_ <= 1) && !is_linear_) { return; From 93d595065933258d4d8b92d09664a968b64651e6 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 3 Jan 2022 09:32:58 +0000 Subject: [PATCH 136/166] fix CUDARowData::DivideCUDAFeatureGroups fix set fields for cuda metadata --- include/LightGBM/cuda/cuda_metadata.hpp | 12 ++++++--- src/io/cuda/cuda_metadata.cpp | 35 ++++++++++++++++++------- src/io/cuda/cuda_row_data.cpp | 1 + src/io/dataset.cpp | 6 ++--- src/io/metadata.cpp | 27 ++++++++++++++++++- src/treelearner/serial_tree_learner.cpp | 6 ++--- 6 files changed, 68 insertions(+), 19 deletions(-) diff --git a/include/LightGBM/cuda/cuda_metadata.hpp b/include/LightGBM/cuda/cuda_metadata.hpp index 09f8f27a4b59..7c77bef6ba97 100644 --- a/include/LightGBM/cuda/cuda_metadata.hpp +++ b/include/LightGBM/cuda/cuda_metadata.hpp 
@@ -25,8 +25,15 @@ class CUDAMetadata { const std::vector& weight, const std::vector& query_boundaries, const std::vector& query_weights, - const std::vector& init_score, - const std::vector& queries); + const std::vector& init_score); + + void SetLabel(const label_t* label, data_size_t len); + + void SetWeights(const label_t* weights, data_size_t len); + + void SetQuery(const data_size_t* query, const label_t* query_weights, data_size_t num_queries); + + void SetInitScore(const double* init_score, data_size_t len); const label_t* cuda_label() const { return cuda_label_; } @@ -42,7 +49,6 @@ class CUDAMetadata { data_size_t* cuda_query_boundaries_; label_t* cuda_query_weights_; double* cuda_init_score_; - data_size_t* cuda_queries_; }; } // namespace LightGBM diff --git a/src/io/cuda/cuda_metadata.cpp b/src/io/cuda/cuda_metadata.cpp index 3ac657cf9955..ead4114b067f 100644 --- a/src/io/cuda/cuda_metadata.cpp +++ b/src/io/cuda/cuda_metadata.cpp @@ -20,7 +20,6 @@ CUDAMetadata::CUDAMetadata(const int gpu_device_id) { cuda_query_boundaries_ = nullptr; cuda_query_weights_ = nullptr; cuda_init_score_ = nullptr; - cuda_queries_ = nullptr; } CUDAMetadata::~CUDAMetadata() { @@ -29,15 +28,13 @@ CUDAMetadata::~CUDAMetadata() { DeallocateCUDAMemory(&cuda_query_boundaries_, __FILE__, __LINE__); DeallocateCUDAMemory(&cuda_query_weights_, __FILE__, __LINE__); DeallocateCUDAMemory(&cuda_init_score_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_queries_, __FILE__, __LINE__); } void CUDAMetadata::Init(const std::vector& label, const std::vector& weight, const std::vector& query_boundaries, const std::vector& query_weights, - const std::vector& init_score, - const std::vector& queries) { + const std::vector& init_score) { if (label.size() == 0) { cuda_label_ = nullptr; } else { @@ -63,14 +60,34 @@ void CUDAMetadata::Init(const std::vector& label, } else { InitCUDAMemoryFromHostMemory(&cuda_init_score_, init_score.data(), init_score.size(), __FILE__, __LINE__); } - if (queries.size() == 0) { - cuda_queries_ = nullptr; - } else { - InitCUDAMemoryFromHostMemory(&cuda_queries_, queries.data(), queries.size(), __FILE__, __LINE__); - } SynchronizeCUDADevice(__FILE__, __LINE__); } +void CUDAMetadata::SetLabel(const label_t* label, data_size_t len) { + DeallocateCUDAMemory(&cuda_label_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_label_, label, static_cast(len), __FILE__, __LINE__); +} + +void CUDAMetadata::SetWeights(const label_t* weights, data_size_t len) { + DeallocateCUDAMemory(&cuda_weights_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_weights_, weights, static_cast(len), __FILE__, __LINE__); +} + +void CUDAMetadata::SetQuery(const data_size_t* query_boundaries, const label_t* query_weights, data_size_t num_queries) { + Log::Warning("error !!! 
setting query!!!"); + DeallocateCUDAMemory(&cuda_query_boundaries_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_query_boundaries_, query_boundaries, static_cast(num_queries) + 1, __FILE__, __LINE__); + if (query_weights != nullptr) { + DeallocateCUDAMemory(&cuda_query_weights_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_query_weights_, query_weights, static_cast(num_queries), __FILE__, __LINE__); + } +} + +void CUDAMetadata::SetInitScore(const double* init_score, data_size_t len) { + DeallocateCUDAMemory(&cuda_init_score_, __FILE__, __LINE__); + InitCUDAMemoryFromHostMemory(&cuda_init_score_, init_score, static_cast(len), __FILE__, __LINE__); +} + } // namespace LightGBM #endif // USE_CUDA_EXP diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index bed2793398c4..68c965b39f1d 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -186,6 +186,7 @@ void CUDARowData::DivideCUDAFeatureGroups(const Dataset* train_data, TrainingSha const int feature_group_index = train_data->Feature2Group(feature_index); if (prev_group_index == -1 || feature_group_index != prev_group_index) { feature_group_num_feature_offsets.emplace_back(offsets); + prev_group_index = feature_group_index; } ++offsets; } diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index c207294c79de..c9bacb71ef35 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -339,12 +339,12 @@ void Dataset::Construct(std::vector>* bin_mappers, auto features_in_group = NoGroup(used_features); auto is_sparse = io_config.is_enable_sparse; - if (io_config.device_type == std::string("cuda")) { + if (io_config.device_type == std::string("cuda") || io_config.device_type == std::string("cuda_exp")) { LGBM_config_::current_device = lgbm_device_cuda; - if (is_sparse) { + if (io_config.device_type == std::string("cuda") && is_sparse) { Log::Warning("Using sparse features with CUDA is currently not supported."); + is_sparse = false; } - is_sparse = false; } std::vector group_is_multi_val(used_features.size(), 0); diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 3ef0edbe405d..edd001ee2fa1 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -305,6 +305,11 @@ void Metadata::SetInitScore(const double* init_score, data_size_t len) { init_score_[i] = Common::AvoidInf(init_score[i]); } init_score_load_from_file_ = false; + #ifdef USE_CUDA_EXP + if (cuda_metadata_ != nullptr) { + cuda_metadata_->SetInitScore(init_score_.data(), len); + } + #endif // USE_CUDA_EXP } void Metadata::SetLabel(const label_t* label, data_size_t len) { @@ -321,6 +326,11 @@ void Metadata::SetLabel(const label_t* label, data_size_t len) { for (data_size_t i = 0; i < num_data_; ++i) { label_[i] = Common::AvoidInf(label[i]); } + #ifdef USE_CUDA_EXP + if (cuda_metadata_ != nullptr) { + cuda_metadata_->SetLabel(label_.data(), len); + } + #endif // USE_CUDA_EXP } void Metadata::SetWeights(const label_t* weights, data_size_t len) { @@ -343,6 +353,11 @@ void Metadata::SetWeights(const label_t* weights, data_size_t len) { } LoadQueryWeights(); weight_load_from_file_ = false; + #ifdef USE_CUDA_EXP + if (cuda_metadata_ != nullptr) { + cuda_metadata_->SetWeights(weights_.data(), len); + } + #endif // USE_CUDA_EXP } void Metadata::SetQuery(const data_size_t* query, data_size_t len) { @@ -369,6 +384,16 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) { } LoadQueryWeights(); query_load_from_file_ = false; + #ifdef USE_CUDA_EXP + if (cuda_metadata_ != nullptr) { + if 
(query_weights_.size() > 0) { + CHECK_EQ(query_weights_.size(), static_cast(num_queries_)); + cuda_metadata_->SetQuery(query_boundaries_.data(), query_weights_.data(), num_queries_); + } else { + cuda_metadata_->SetQuery(query_boundaries_.data(), nullptr, num_queries_); + } + } + #endif // USE_CUDA_EXP } void Metadata::LoadWeights() { @@ -478,7 +503,7 @@ void Metadata::LoadQueryWeights() { #ifdef USE_CUDA_EXP void Metadata::CreateCUDAMetadata(const int gpu_device_id) { cuda_metadata_.reset(new CUDAMetadata(gpu_device_id)); - cuda_metadata_->Init(label_, weights_, query_boundaries_, query_weights_, init_score_, queries_); + cuda_metadata_->Init(label_, weights_, query_boundaries_, query_weights_, init_score_); } #endif // USE_CUDA_EXP diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 3af2ff0f4bf3..e69abf91b671 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -340,15 +340,15 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree, const std::set* fo } bool use_subtract = parent_leaf_histogram_array_ != nullptr; -#if defined(USE_CUDA) || defined(USE_CUDA_EXP) +#ifdef USE_CUDA if (LGBM_config_::current_learner == use_cpu_learner) { SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); } else { ConstructHistograms(is_feature_used, use_subtract); } -#else +#else // USE_CUDA ConstructHistograms(is_feature_used, use_subtract); -#endif +#endif // USE_CUDA FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree); } From 9f6aa8a2a06defeb036c2ec79b63641d07ba32cd Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 3 Jan 2022 09:46:42 +0000 Subject: [PATCH 137/166] revert config.h --- include/LightGBM/config.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index dc2224a872b5..9643b343dbbb 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -212,7 +212,7 @@ struct Config { // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support // desc = **Note**: ``cuda_exp`` is an experimental CUDA version, the installation guide for ``cuda_exp`` is identical with ``cuda`` // desc = **Note**: ``cuda_exp`` is faster than ``cuda`` and will replace ``cuda`` in the future - std::string device_type = "cuda_exp"; + std::string device_type = "cpu"; // [doc-only] // alias = random_seed, random_state @@ -1023,7 +1023,7 @@ struct Config { // desc = set this to ``true`` to use double precision math on GPU (by default single precision is used) // desc = **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported - bool gpu_use_dp = true; + bool gpu_use_dp = false; // check = >0 // desc = number of GPUs From 12d81613616e4bb53043d5cf124ccb843d55e4df Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 3 Jan 2022 12:45:21 +0000 Subject: [PATCH 138/166] fix test settings for cuda experimental version --- .ci/test.sh | 12 ++++++++++-- python-package/setup.py | 3 +++ src/io/cuda/cuda_metadata.cpp | 1 - tests/python_package_test/test_dask.py | 3 ++- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/.ci/test.sh b/.ci/test.sh index 8f20d16a958e..b2688dde1f22 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -184,11 +184,19 @@ elif [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then fi if [[ $METHOD == "pip" ]]; then cd $BUILD_DIRECTORY/python-package && python setup.py sdist 
|| exit -1 - pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1 + if [[ $TASK == "cuda" ]]; then + pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1 + else + pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda-exp || exit -1 + fi pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1 exit 0 elif [[ $METHOD == "wheel" ]]; then - cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda || exit -1 + if [[ $TASK == "cuda" ]]; then + cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda || exit -1 + else + cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda-exp || exit -1 + fi pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER*.whl -v || exit -1 pytest $BUILD_DIRECTORY/tests || exit -1 exit 0 diff --git a/python-package/setup.py b/python-package/setup.py index 6d31db9c1089..6a0878fbe0ae 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -21,6 +21,7 @@ ('integrated-opencl', None, 'Compile integrated OpenCL version'), ('gpu', 'g', 'Compile GPU version'), ('cuda', None, 'Compile CUDA version'), + ('cuda-exp', None, 'Compile CUDA Experimental version'), ('mpi', None, 'Compile MPI version'), ('nomp', None, 'Compile version without OpenMP support'), ('hdfs', 'h', 'Compile HDFS version'), @@ -274,6 +275,7 @@ def initialize_options(self) -> None: self.integrated_opencl = False self.gpu = False self.cuda = False + self.cuda_exp = False self.boost_root = None self.boost_dir = None self.boost_include_dir = None @@ -295,6 +297,7 @@ def finalize_options(self) -> None: install.integrated_opencl = self.integrated_opencl install.gpu = self.gpu install.cuda = self.cuda + install.cuda_exp = self.cuda_exp install.boost_root = self.boost_root install.boost_dir = self.boost_dir install.boost_include_dir = self.boost_include_dir diff --git a/src/io/cuda/cuda_metadata.cpp b/src/io/cuda/cuda_metadata.cpp index ead4114b067f..2a3dd380254a 100644 --- a/src/io/cuda/cuda_metadata.cpp +++ b/src/io/cuda/cuda_metadata.cpp @@ -74,7 +74,6 @@ void CUDAMetadata::SetWeights(const label_t* weights, data_size_t len) { } void CUDAMetadata::SetQuery(const data_size_t* query_boundaries, const label_t* query_weights, data_size_t num_queries) { - Log::Warning("error !!! 
setting query!!!"); DeallocateCUDAMemory(&cuda_query_boundaries_, __FILE__, __LINE__); InitCUDAMemoryFromHostMemory(&cuda_query_boundaries_, query_boundaries, static_cast(num_queries) + 1, __FILE__, __LINE__); if (query_weights != nullptr) { diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index b4a948070420..5f124aaae124 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -61,7 +61,8 @@ pytestmark = [ pytest.mark.skipif(getenv('TASK', '') == 'mpi', reason='Fails to run with MPI interface'), - pytest.mark.skipif(getenv('TASK', '') == 'gpu', reason='Fails to run with GPU interface') + pytest.mark.skipif(getenv('TASK', '') == 'gpu', reason='Fails to run with GPU interface'), + pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Fails to run with CUDA Experimental interface') ] From 354845e540011ab77860caef2831a7e7af777f0d Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 4 Jan 2022 07:42:06 +0000 Subject: [PATCH 139/166] skip some tests due to unsupported features or differences in implementation details for CUDA Experimental version --- tests/python_package_test/test_basic.py | 3 ++- tests/python_package_test/test_engine.py | 9 +++++++++ tests/python_package_test/test_sklearn.py | 4 ++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 18a8403eba85..9f82253d2dc6 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -4,6 +4,7 @@ from pathlib import Path import numpy as np +from os import getenv import pytest from scipy import sparse from sklearn.datasets import dump_svmlight_file, load_svmlight_file @@ -14,7 +15,7 @@ from .utils import load_breast_cancer - +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_basic(tmp_path): X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index c4dcacd2e0c4..fa9781609cb4 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -9,6 +9,7 @@ from pathlib import Path import numpy as np +from os import getenv import psutil import pytest from scipy.sparse import csr_matrix, isspmatrix_csc, isspmatrix_csr @@ -569,6 +570,7 @@ def test_multi_class_error(): assert results['training']['multi_error@2'][-1] == pytest.approx(0) +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_auc_mu(): # should give same result as binary auc for 2 classes X, y = load_digits(n_class=10, return_X_y=True) @@ -1469,6 +1471,7 @@ def generate_trainset_for_monotone_constraints_tests(x3_to_category=True): return trainset +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Monotone constraints are not yet supported by CUDA Experimental version') @pytest.mark.parametrize("test_with_categorical_variable", [True, False]) def test_monotone_constraints(test_with_categorical_variable): def is_increasing(y): @@ -1558,6 +1561,7 @@ def has_interaction(treef): assert are_interactions_enforced(constrained_model, feature_sets) +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Monotone constraints are not yet supported by CUDA 
Experimental version') def test_monotone_penalty(): def are_first_splits_non_monotone(tree, n, monotone_constraints): if n <= 0: @@ -1597,6 +1601,7 @@ def are_there_monotone_splits(tree, monotone_constraints): # test if a penalty as high as the depth indeed prohibits all monotone splits +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Monotone constraints are not yet supported by CUDA Experimental version') def test_monotone_penalty_max(): max_depth = 5 monotone_constraints = [1, -1, 0] @@ -2279,6 +2284,7 @@ def test_model_size(): pytest.skipTest('not enough RAM') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_get_split_value_histogram(): X, y = load_boston(return_X_y=True) lgb_train = lgb.Dataset(X, y, categorical_feature=[2]) @@ -2359,6 +2365,7 @@ def test_get_split_value_histogram(): gbm.get_split_value_histogram(2) +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_early_stopping_for_only_first_metric(): def metrics_combination_train_regression(valid_sets, metric_list, assumed_iteration, @@ -2756,6 +2763,7 @@ def _imptcs_to_numpy(X, impcts_dict): assert tree_df.loc[0, col] is None +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Interaction constraints are not yet supported by CUDA Experimental version') def test_interaction_constraints(): X, y = load_boston(return_X_y=True) num_features = X.shape[1] @@ -3144,6 +3152,7 @@ def hook(obj): assert "LV" in dumped_model_str +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Forced splits are not yet supported by CUDA Experimental version') def test_force_split_with_feature_fraction(tmp_path): X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index d4112078f39e..9ea43a62f167 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -5,6 +5,7 @@ import joblib import numpy as np +from os import getenv import pytest from pkg_resources import parse_version from sklearn import __version__ as sk_version @@ -109,6 +110,7 @@ def test_regression(): assert gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1] == pytest.approx(ret) +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_multiclass(): X, y = load_digits(n_class=10, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -121,6 +123,7 @@ def test_multiclass(): assert gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1] == pytest.approx(ret) +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_lambdarank(): rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train')) @@ -1099,6 +1102,7 @@ def test_nan_handle(): np.testing.assert_allclose(gbm.evals_result_['training']['l2'], np.nan) +@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_first_metric_only(): def 
fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_only): From cb49dd1599a78614b80b56a1065975b3ea62d0a3 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 4 Jan 2022 07:50:15 +0000 Subject: [PATCH 140/166] fix lint issue by adding a blank line --- tests/python_package_test/test_basic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 9f82253d2dc6..1dc121fd1bc3 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -15,6 +15,7 @@ from .utils import load_breast_cancer + @pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_basic(tmp_path): X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), From 2e2c6963b30d5c3090e4adcd9055b02cbd278020 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 4 Jan 2022 07:56:40 +0000 Subject: [PATCH 141/166] fix lint errors by resorting imports --- tests/python_package_test/test_basic.py | 2 +- tests/python_package_test/test_engine.py | 2 +- tests/python_package_test/test_sklearn.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 1dc121fd1bc3..d4a6ef005894 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -4,8 +4,8 @@ from pathlib import Path import numpy as np -from os import getenv import pytest +from os import getenv from scipy import sparse from sklearn.datasets import dump_svmlight_file, load_svmlight_file from sklearn.model_selection import train_test_split diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index fa9781609cb4..242938180197 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -9,9 +9,9 @@ from pathlib import Path import numpy as np -from os import getenv import psutil import pytest +from os import getenv from scipy.sparse import csr_matrix, isspmatrix_csc, isspmatrix_csr from sklearn.datasets import load_svmlight_file, make_multilabel_classification from sklearn.metrics import average_precision_score, log_loss, mean_absolute_error, mean_squared_error, roc_auc_score diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 9ea43a62f167..1825f8e2ad21 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -5,8 +5,8 @@ import joblib import numpy as np -from os import getenv import pytest +from os import getenv from pkg_resources import parse_version from sklearn import __version__ as sk_version from sklearn.base import clone From 343367485e8221b457615cf79cc9d28cd496878d Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 4 Jan 2022 08:32:54 +0000 Subject: [PATCH 142/166] fix lint errors by resorting imports --- tests/python_package_test/test_basic.py | 2 +- tests/python_package_test/test_engine.py | 2 +- tests/python_package_test/test_sklearn.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index d4a6ef005894..c32be874023d 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -1,11 +1,11 @@ # coding: utf-8 import filecmp import numbers +from os import getenv 
from pathlib import Path import numpy as np import pytest -from os import getenv from scipy import sparse from sklearn.datasets import dump_svmlight_file, load_svmlight_file from sklearn.model_selection import train_test_split diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 242938180197..2ae01f454e6c 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -6,12 +6,12 @@ import pickle import platform import random +from os import getenv from pathlib import Path import numpy as np import psutil import pytest -from os import getenv from scipy.sparse import csr_matrix, isspmatrix_csc, isspmatrix_csr from sklearn.datasets import load_svmlight_file, make_multilabel_classification from sklearn.metrics import average_precision_score, log_loss, mean_absolute_error, mean_squared_error, roc_auc_score diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 1825f8e2ad21..22b441b96481 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -1,12 +1,12 @@ # coding: utf-8 import itertools import math +from os import getenv from pathlib import Path import joblib import numpy as np import pytest -from os import getenv from pkg_resources import parse_version from sklearn import __version__ as sk_version from sklearn.base import clone From c72d55557bbba0fc3b31fb7cc23aea4c99b398d9 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 4 Jan 2022 08:54:29 +0000 Subject: [PATCH 143/166] fix lint errors by resorting imports --- tests/python_package_test/test_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index b24ffc8f5a4e..25d7bd78ffa6 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -1,8 +1,8 @@ # coding: utf-8 import filecmp import numbers -from os import getenv import re +from os import getenv from pathlib import Path import numpy as np From 63a9dc1a81620567213fbdf1d3bcaf9344f79b3a Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 5 Jan 2022 02:38:15 +0000 Subject: [PATCH 144/166] merge cuda.yml and cuda_exp.yml --- .github/workflows/cuda.yml | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 822ea2d44b82..8cef022a87ef 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -11,7 +11,6 @@ on: env: github_actions: 'true' os_name: linux - task: cuda conda_env: test-env jobs: @@ -27,14 +26,32 @@ jobs: compiler: gcc python_version: 3.7 cuda_version: "11.4.2" + tree_learner: cuda - method: pip compiler: clang python_version: 3.8 cuda_version: "10.0" + tree_learner: cuda - method: wheel compiler: gcc python_version: 3.9 cuda_version: "9.0" + tree_learner: cuda + - method: source + compiler: gcc + python_version: 3.7 + cuda_version: "11.4.2" + tree_learner: cuda_exp + - method: pip + compiler: clang + python_version: 3.8 + cuda_version: "10.0" + tree_learner: cuda_exp + - method: wheel + compiler: gcc + python_version: 3.7 + cuda_version: "11.4.2" + tree_learner: cuda_exp steps: - name: Setup or update software on host machine run: | @@ -73,7 +90,7 @@ jobs: GITHUB_ACTIONS=${{ env.github_actions }} OS_NAME=${{ env.os_name }} COMPILER=${{ matrix.compiler }} - TASK=${{ env.task }} + TASK=${{ matrix.tree_learner }} METHOD=${{ matrix.method }} 
CONDA_ENV=${{ env.conda_env }} PYTHON_VERSION=${{ matrix.python_version }} From 31ac33be953d6cf80099eeb044ca9179a8f3b746 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 5 Jan 2022 02:39:47 +0000 Subject: [PATCH 145/166] update python version in cuda.yml --- .github/workflows/cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 8cef022a87ef..adc57efa1da3 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -49,7 +49,7 @@ jobs: tree_learner: cuda_exp - method: wheel compiler: gcc - python_version: 3.7 + python_version: 3.9 cuda_version: "11.4.2" tree_learner: cuda_exp steps: From 5f1f38d015bb2cde555ae15844f8dd54f09b6cb0 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 5 Jan 2022 02:41:24 +0000 Subject: [PATCH 146/166] remove cuda_exp.yml --- .github/workflows/cuda_exp.yml | 99 ---------------------------------- 1 file changed, 99 deletions(-) delete mode 100644 .github/workflows/cuda_exp.yml diff --git a/.github/workflows/cuda_exp.yml b/.github/workflows/cuda_exp.yml deleted file mode 100644 index 48e07016b17e..000000000000 --- a/.github/workflows/cuda_exp.yml +++ /dev/null @@ -1,99 +0,0 @@ -name: CUDA Experimental Version - -on: - push: - branches: - - master - pull_request: - branches: - - master - -env: - github_actions: 'true' - os_name: linux - task: cuda_exp - conda_env: test-env - -jobs: - test: - name: cuda ${{ matrix.cuda_version }} ${{ matrix.method }} (linux, ${{ matrix.compiler }}, Python ${{ matrix.python_version }}) - runs-on: [self-hosted, linux] - timeout-minutes: 60 - strategy: - fail-fast: false - matrix: - include: - - method: source - compiler: gcc - python_version: 3.7 - cuda_version: "11.4.2" - - method: pip - compiler: clang - python_version: 3.8 - cuda_version: "10.0" - steps: - - name: Setup or update software on host machine - run: | - sudo apt-get update - sudo apt-get install --no-install-recommends -y \ - apt-transport-https \ - ca-certificates \ - curl \ - git \ - gnupg-agent \ - lsb-release \ - software-properties-common - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - - sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" -y - curl -sL https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - - curl -sL https://nvidia.github.io/nvidia-docker/$(. 
/etc/os-release;echo $ID$VERSION_ID)/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list - sudo apt-get update - sudo apt-get install --no-install-recommends -y \ - containerd.io \ - docker-ce \ - docker-ce-cli \ - nvidia-docker2 - sudo chmod a+rw /var/run/docker.sock - sudo systemctl restart docker - - name: Remove old folder with repository - run: sudo rm -rf $GITHUB_WORKSPACE - - name: Checkout repository - uses: actions/checkout@v1 - with: - fetch-depth: 5 - submodules: true - - name: Setup and run tests - run: | - export ROOT_DOCKER_FOLDER=/LightGBM - cat > docker.env < docker-script.sh < Date: Thu, 6 Jan 2022 02:51:56 +0000 Subject: [PATCH 147/166] remove unrelated changes --- src/io/dataset.cpp | 8 ++++---- src/treelearner/serial_tree_learner.cpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index c9bacb71ef35..16256af0d7e2 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -1161,10 +1161,10 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, template void Dataset::ConstructHistogramsInner( - const std::vector& is_feature_used, const data_size_t* data_indices, - data_size_t num_data, const score_t* gradients, const score_t* hessians, - score_t* ordered_gradients, score_t* ordered_hessians, - TrainingShareStates* share_state, hist_t* hist_data) const { + const std::vector& is_feature_used, const data_size_t* data_indices, + data_size_t num_data, const score_t* gradients, const score_t* hessians, + score_t* ordered_gradients, score_t* ordered_hessians, + TrainingShareStates* share_state, hist_t* hist_data) const { if (!share_state->is_col_wise) { return ConstructHistogramsMultiVal( data_indices, num_data, gradients, hessians, share_state, hist_data); diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index e69abf91b671..304c712f0723 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -346,9 +346,9 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree, const std::set* fo } else { ConstructHistograms(is_feature_used, use_subtract); } -#else // USE_CUDA +#else ConstructHistograms(is_feature_used, use_subtract); -#endif // USE_CUDA +#endif FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree); } From b0084249634c595bf895173df557eecbc39e00c7 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 16 Feb 2022 14:46:26 +0000 Subject: [PATCH 148/166] fix compilation warnings fix cuda exp ci task name --- .github/workflows/cuda.yml | 4 ++-- include/LightGBM/config.h | 20 ++++++++++++++++++++ include/LightGBM/cuda/vector_cudahost.h | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index adc57efa1da3..8969b89c41e3 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -15,7 +15,7 @@ env: jobs: test: - name: cuda ${{ matrix.cuda_version }} ${{ matrix.method }} (linux, ${{ matrix.compiler }}, Python ${{ matrix.python_version }}) + name: cuda ${{ matrix.tree_learner }} ${{ matrix.cuda_version }} ${{ matrix.method }} (linux, ${{ matrix.compiler }}, Python ${{ matrix.python_version }}) runs-on: [self-hosted, linux] timeout-minutes: 60 strategy: @@ -90,7 +90,7 @@ jobs: GITHUB_ACTIONS=${{ env.github_actions }} OS_NAME=${{ env.os_name }} COMPILER=${{ matrix.compiler }} - TASK=${{ matrix.tree_learner }} + TASK=${{ env.task }} METHOD=${{ matrix.method }} CONDA_ENV=${{ env.conda_env }} 
PYTHON_VERSION=${{ matrix.python_version }} diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 9643b343dbbb..1df35f3a33dc 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -81,9 +81,11 @@ struct Config { static void KV2Map(std::unordered_map* params, const char* kv); static std::unordered_map Str2Map(const char* parameters); + #ifndef __NVCC__ #pragma region Parameters #pragma region Core Parameters + #endif // __NVCC__ // [no-save] // [doc-only] @@ -230,9 +232,11 @@ struct Config { // desc = **Note**: to avoid potential instability due to numerical issues, please set ``force_col_wise=true`` or ``force_row_wise=true`` when setting ``deterministic=true`` bool deterministic = false; + #ifndef __NVCC__ #pragma endregion #pragma region Learning Control Parameters + #endif // __NVCC__ // desc = used only with ``cpu`` device type // desc = set this to ``true`` to force col-wise histogram building @@ -570,11 +574,13 @@ struct Config { // desc = **Note**: can be used only in CLI version int snapshot_freq = -1; + #ifndef __NVCC__ #pragma endregion #pragma region IO Parameters #pragma region Dataset Parameters + #endif // __NVCC__ // alias = linear_trees // desc = fit piecewise linear gradient boosting tree @@ -728,9 +734,11 @@ struct Config { // desc = **Note**: ``lightgbm-transform`` is not maintained by LightGBM's maintainers. Bug reports or feature requests should go to `issues page `__ std::string parser_config_file = ""; + #ifndef __NVCC__ #pragma endregion #pragma region Predict Parameters + #endif // __NVCC__ // [no-save] // desc = used only in ``prediction`` task @@ -800,9 +808,11 @@ struct Config { // desc = **Note**: can be used only in CLI version std::string output_result = "LightGBM_predict_result.txt"; + #ifndef __NVCC__ #pragma endregion #pragma region Convert Parameters + #endif // __NVCC__ // [no-save] // desc = used only in ``convert_model`` task @@ -818,11 +828,13 @@ struct Config { // desc = **Note**: can be used only in CLI version std::string convert_model = "gbdt_prediction.cpp"; + #ifndef __NVCC__ #pragma endregion #pragma endregion #pragma region Objective Parameters + #endif // __NVCC__ // desc = used only in ``rank_xendcg`` objective // desc = random seed for objectives, if random process is needed @@ -902,9 +914,11 @@ struct Config { // desc = separate by ``,`` std::vector label_gain; + #ifndef __NVCC__ #pragma endregion #pragma region Metric Parameters + #endif // __NVCC__ // [doc-only] // alias = metrics, metric_types @@ -976,9 +990,11 @@ struct Config { // desc = if not specified, will use equal weights for all classes std::vector auc_mu_weights; + #ifndef __NVCC__ #pragma endregion #pragma region Network Parameters + #endif // __NVCC__ // check = >0 // alias = num_machine @@ -1007,9 +1023,11 @@ struct Config { // desc = list of machines in the following format: ``ip1:port1,ip2:port2`` std::string machines = ""; + #ifndef __NVCC__ #pragma endregion #pragma region GPU Parameters + #endif // __NVCC__ // desc = OpenCL platform ID. 
Usually each GPU vendor exposes one OpenCL platform // desc = ``-1`` means the system-wide default platform @@ -1030,9 +1048,11 @@ struct Config { // desc = **Note**: can be used only in CUDA implementation int num_gpu = 1; + #ifndef __NVCC__ #pragma endregion #pragma endregion + #endif // __NVCC__ size_t file_load_progress_interval_bytes = size_t(10) * 1024 * 1024 * 1024; diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 4eb9f8b6d876..37e4f4843b05 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -42,7 +42,7 @@ struct CHAllocator { T* allocate(std::size_t n) { T* ptr; if (n == 0) return NULL; - n = (n + kAlignedSize - 1) & -kAlignedSize; + n = (n + kAlignedSize - 1) / kAlignedSize * kAlignedSize; #if defined(USE_CUDA) || defined(USE_CUDA_EXP) if (LGBM_config_::current_device == lgbm_device_cuda) { cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); From d77dd23e0bbb54a69654a0039c20326c72ca0a77 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 22 Feb 2022 05:48:40 +0000 Subject: [PATCH 149/166] recover task --- .github/workflows/cuda.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 6a53df95a58b..4335140f4243 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -11,6 +11,7 @@ on: env: github_actions: 'true' os_name: linux + task: cuda conda_env: test-env jobs: From 6a9d5307e3d36c40df056e9a2ec52369bf7205cc Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 22 Feb 2022 09:02:46 +0000 Subject: [PATCH 150/166] use multi-level template in histogram construction check split only in debug mode --- include/LightGBM/cuda/cuda_row_data.hpp | 27 +- src/io/cuda/cuda_row_data.cpp | 57 ++++ .../cuda/cuda_histogram_constructor.cu | 321 +++++------------- .../cuda/cuda_histogram_constructor.hpp | 15 + .../cuda/cuda_single_gpu_tree_learner.cpp | 71 +--- .../cuda/cuda_single_gpu_tree_learner.hpp | 7 +- 6 files changed, 166 insertions(+), 332 deletions(-) diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index 3013883abd6a..f1b5a84bedb8 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -48,6 +48,15 @@ class CUDARowData { void CopySubrowAndSubcol(const CUDARowData* full_set, const data_size_t* used_indices, const data_size_t num_used_indices, const std::vector& is_feature_used, const Dataset* train_data); + template + const BIN_TYPE* GetBin() const; + + template + const PTR_TYPE* GetPartitionPtr() const; + + template + const PTR_TYPE* GetRowPtr() const; + int NumLargeBinPartition() const { return static_cast(large_bin_partitions_.size()); } int num_feature_partitions() const { return num_feature_partitions_; } @@ -60,24 +69,6 @@ class CUDARowData { uint8_t row_ptr_bit_type() const { return row_ptr_bit_type_; } - const uint8_t* cuda_data_uint8() const { return cuda_data_uint8_t_; } - - const uint16_t* cuda_data_uint16() const { return cuda_data_uint16_t_; } - - const uint32_t* cuda_data_uint32() const { return cuda_data_uint32_t_; } - - const uint16_t* cuda_row_ptr_uint16() const { return cuda_row_ptr_uint16_t_; } - - const uint32_t* cuda_row_ptr_uint32() const { return cuda_row_ptr_uint32_t_; } - - const uint64_t* cuda_row_ptr_uint64() const { return cuda_row_ptr_uint64_t_; } - - const uint16_t* cuda_partition_ptr_uint16() const { return cuda_partition_ptr_uint16_t_; } - - const uint32_t* 
cuda_partition_ptr_uint32() const { return cuda_partition_ptr_uint32_t_; } - - const uint64_t* cuda_partition_ptr_uint64() const { return cuda_partition_ptr_uint64_t_; } - const int* cuda_feature_partition_column_index_offsets() const { return cuda_feature_partition_column_index_offsets_; } const uint32_t* cuda_column_hist_offsets() const { return cuda_column_hist_offsets_; } diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index 68c965b39f1d..c5896f76465e 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -415,6 +415,63 @@ void CUDARowData::InitSparseData(const BIN_TYPE* host_data, } } +template +const BIN_TYPE* CUDARowData::GetBin() const { + if (bit_type_ == 8) { + return reinterpret_cast(cuda_data_uint8_t_); + } else if (bit_type_ == 16) { + return reinterpret_cast(cuda_data_uint16_t_); + } else if (bit_type_ == 32) { + return reinterpret_cast(cuda_data_uint32_t_); + } else { + Log::Fatal("Unknown bit_type %d for GetBin.", bit_type_); + } +} + +template const uint8_t* CUDARowData::GetBin() const; + +template const uint16_t* CUDARowData::GetBin() const; + +template const uint32_t* CUDARowData::GetBin() const; + +template +const PTR_TYPE* CUDARowData::GetRowPtr() const { + if (row_ptr_bit_type_ == 16) { + return reinterpret_cast(cuda_row_ptr_uint16_t_); + } else if (row_ptr_bit_type_ == 32) { + return reinterpret_cast(cuda_row_ptr_uint32_t_); + } else if (row_ptr_bit_type_ == 64) { + return reinterpret_cast(cuda_row_ptr_uint64_t_); + } else { + Log::Fatal("Unknown row_ptr_bit_type = %d for GetRowPtr.", row_ptr_bit_type_); + } +} + +template const uint16_t* CUDARowData::GetRowPtr() const; + +template const uint32_t* CUDARowData::GetRowPtr() const; + +template const uint64_t* CUDARowData::GetRowPtr() const; + +template +const PTR_TYPE* CUDARowData::GetPartitionPtr() const { + if (row_ptr_bit_type_ == 16) { + return reinterpret_cast(cuda_partition_ptr_uint16_t_); + } else if (row_ptr_bit_type_ == 32) { + return reinterpret_cast(cuda_partition_ptr_uint32_t_); + } else if (row_ptr_bit_type_ == 64) { + return reinterpret_cast(cuda_partition_ptr_uint64_t_); + } else { + Log::Fatal("Unknown row_ptr_bit_type = %d for GetPartitionPtr.", row_ptr_bit_type_); + } +} + +template const uint16_t* CUDARowData::GetPartitionPtr() const; + +template const uint32_t* CUDARowData::GetPartitionPtr() const; + +template const uint64_t* CUDARowData::GetPartitionPtr() const; + } // namespace LightGBM #endif // USE_CUDA_EXP diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index d5274ff975b4..3bee9a74925a 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -260,6 +260,47 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( template void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf) { + if (cuda_row_data_->bit_type() == 8) { + LaunchConstructHistogramKernelInner0(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + } else if (cuda_row_data_->bit_type() == 16) { + LaunchConstructHistogramKernelInner0(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + } else if (cuda_row_data_->bit_type() == 32) { + LaunchConstructHistogramKernelInner0(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + } else { + Log::Fatal("Unknown bit_type = %d", cuda_row_data_->bit_type()); + } +} + +template +void 
CUDAHistogramConstructor::LaunchConstructHistogramKernelInner0( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf) { + if (cuda_row_data_->row_ptr_bit_type() == 16) { + LaunchConstructHistogramKernelInner1(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + } else if (cuda_row_data_->row_ptr_bit_type() == 32) { + LaunchConstructHistogramKernelInner1(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + } else if (cuda_row_data_->row_ptr_bit_type() == 64) { + LaunchConstructHistogramKernelInner1(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + } else { + Log::Fatal("Unknown row_ptr_bit_type = %d", cuda_row_data_->row_ptr_bit_type()); + } +} + +template +void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner1( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf) { + if (cuda_row_data_->NumLargeBinPartition() == 0) { + LaunchConstructHistogramKernelInner2(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + } else { + LaunchConstructHistogramKernelInner2(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + } +} + +template +void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner2( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const data_size_t num_data_in_smaller_leaf) { int grid_dim_x = 0; @@ -269,257 +310,47 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner( CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_in_smaller_leaf); dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); - if (cuda_row_data_->NumLargeBinPartition() == 0) { + if (!USE_GLOBAL_MEM_BUFFER) { if (cuda_row_data_->is_sparse()) { - if (cuda_row_data_->bit_type() == 8) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } - } else if (cuda_row_data_->bit_type() == 16) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), - 
cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } - } else if (cuda_row_data_->bit_type() == 32) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); - } - } + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->GetBin(), + cuda_row_data_->GetRowPtr(), + cuda_row_data_->GetPartitionPtr(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_); } else { - if (cuda_row_data_->bit_type() == 8) { - CUDAConstructHistogramDenseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_); - } else if (cuda_row_data_->bit_type() == 16) { - CUDAConstructHistogramDenseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_); - } else if (cuda_row_data_->bit_type() == 32) { - CUDAConstructHistogramDenseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_); - } + CUDAConstructHistogramDenseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->GetBin(), + cuda_row_data_->cuda_column_hist_offsets(), + cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_); } } else { if (cuda_row_data_->is_sparse()) { - if (cuda_row_data_->bit_type() == 8) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint16(), - 
cuda_row_data_->cuda_partition_ptr_uint16(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, - cuda_hist_buffer_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, - cuda_hist_buffer_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, - cuda_hist_buffer_); - } - } else if (cuda_row_data_->bit_type() == 16) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, - cuda_hist_buffer_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, - cuda_hist_buffer_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, - cuda_hist_buffer_); - } - } else if (cuda_row_data_->bit_type() == 32) { - if (cuda_row_data_->row_ptr_bit_type() == 16) { - CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint16(), - cuda_row_data_->cuda_partition_ptr_uint16(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, - cuda_hist_buffer_); - } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint32(), - cuda_row_data_->cuda_partition_ptr_uint32(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, - cuda_hist_buffer_); - } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_row_ptr_uint64(), - cuda_row_data_->cuda_partition_ptr_uint64(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, - cuda_hist_buffer_); - } - } + CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->GetBin(), + cuda_row_data_->GetRowPtr(), + cuda_row_data_->GetPartitionPtr(), + 
cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, + cuda_hist_buffer_); } else { - if (cuda_row_data_->bit_type() == 8) { - CUDAConstructHistogramDenseKernel_GlobalMemory<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint8(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_, - cuda_hist_buffer_); - } else if (cuda_row_data_->bit_type() == 16) { - CUDAConstructHistogramDenseKernel_GlobalMemory<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint16(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_, - cuda_hist_buffer_); - } else if (cuda_row_data_->bit_type() == 32) { - CUDAConstructHistogramDenseKernel_GlobalMemory<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->cuda_data_uint32(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_, - cuda_hist_buffer_); - } + CUDAConstructHistogramDenseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->GetBin(), + cuda_row_data_->cuda_column_hist_offsets(), + cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_, + cuda_hist_buffer_); } } } diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 3ddf9083eb87..9ff0f91d7741 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -76,6 +76,21 @@ class CUDAHistogramConstructor { const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const data_size_t num_data_in_smaller_leaf); + template + void LaunchConstructHistogramKernelInner0( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf); + + template + void LaunchConstructHistogramKernelInner1( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf); + + template + void LaunchConstructHistogramKernelInner2( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const data_size_t num_data_in_smaller_leaf); + void LaunchConstructHistogramKernel( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, const data_size_t num_data_in_smaller_leaf); diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index ef0982386e33..f8a4fcd92f9f 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -219,9 +219,9 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, &leaf_sum_hessians_[right_leaf_index], &sum_left_gradients, &sum_right_gradients); - CheckSplitValid(leaf_best_split_feature_[best_leaf_index_], leaf_best_split_threshold_[best_leaf_index_], - best_leaf_index_, right_leaf_index, sum_left_gradients, sum_right_gradients, - leaf_num_data_[best_leaf_index_], leaf_num_data_[right_leaf_index]); + #ifdef DEBUG + CheckSplitValid(best_leaf_index_, right_leaf_index, sum_left_gradients, sum_right_gradients); + #endif // DEBUG smaller_leaf_index_ = 
(leaf_num_data_[best_leaf_index_] < leaf_num_data_[right_leaf_index] ? best_leaf_index_ : right_leaf_index); larger_leaf_index_ = (smaller_leaf_index_ == best_leaf_index_ ? right_leaf_index : best_leaf_index_); global_timer.Stop("CUDASingleGPUTreeLearner::Split"); @@ -426,14 +426,12 @@ void CUDASingleGPUTreeLearner::AllocateBitset() { cuda_bitset_inner_len_ = 0; } +#ifdef DEBUG void CUDASingleGPUTreeLearner::CheckSplitValid( - const int /*inner_split_feature*/, - const uint32_t /*inner_threshold*/, const int left_leaf, const int right_leaf, const double split_sum_left_gradients, - const double split_sum_right_gradients, - const data_size_t /*left_count*/, const data_size_t /*right_count*/) { + const double split_sum_right_gradients) { std::vector left_data_indices(leaf_num_data_[left_leaf]); std::vector right_data_indices(leaf_num_data_[right_leaf]); CopyFromCUDADeviceToHost(left_data_indices.data(), @@ -454,69 +452,12 @@ void CUDASingleGPUTreeLearner::CheckSplitValid( sum_right_gradients += gradients_[index]; sum_right_hessians += hessians_[index]; } - /*Log::Warning("inner_split_feature = %d", inner_split_feature); - Log::Warning("sum_left_gradients = %f, split_sum_left_gradients = %f", sum_left_gradients, split_sum_left_gradients); - Log::Warning("sum_left_hessians = %f, leaf_sum_hessians_[%d] = %f", sum_left_hessians, left_leaf, leaf_sum_hessians_[left_leaf]); - Log::Warning("sum_right_gradients = %f, split_sum_right_gradients = %f", sum_right_gradients, split_sum_right_gradients); - Log::Warning("sum_right_hessians = %f, leaf_sum_hessians_[%d] = %f", sum_right_hessians, right_leaf, leaf_sum_hessians_[right_leaf]);*/ - - /*if (train_data_->FeatureBinMapper(inner_split_feature)->bin_type() == BinType::CategoricalBin) { - std::vector host_bitset_inner(cuda_bitset_inner_len_); - CopyFromCUDADeviceToHost(host_bitset_inner.data(), cuda_bitset_inner_, cuda_bitset_inner_len_, __FILE__, __LINE__); - std::vector host_left_data_indices(leaf_num_data_[left_leaf]); - std::vector host_right_data_indices(leaf_num_data_[right_leaf]); - CopyFromCUDADeviceToHost(host_left_data_indices.data(), cuda_data_partition_->cuda_data_indices() + leaf_data_start_[left_leaf], - static_cast(leaf_num_data_[left_leaf]), __FILE__, __LINE__); - CopyFromCUDADeviceToHost(host_right_data_indices.data(), cuda_data_partition_->cuda_data_indices() + leaf_data_start_[right_leaf], - static_cast(leaf_num_data_[right_leaf]), __FILE__, __LINE__); - BinIterator* iter = train_data_->FeatureIterator(inner_split_feature); - for (size_t i = 0; i < host_left_data_indices.size(); ++i) { - const data_size_t data_index = host_left_data_indices[i]; - const uint32_t bin = iter->RawGet(data_index); - const bool to_left = Common::FindInBitset(host_bitset_inner.data(), cuda_bitset_inner_len_, bin); - if (!to_left) { - Log::Warning("error !!! bin = %d found in left"); - } - } - for (size_t i = 0; i < host_right_data_indices.size(); ++i) { - const data_size_t data_index = host_right_data_indices[i]; - const uint32_t bin = iter->RawGet(data_index); - const bool to_right = (bin == 0 || !Common::FindInBitset(host_bitset_inner.data(), cuda_bitset_inner_len_, bin)); - if (!to_right) { - Log::Warning("error !!! 
bin = %d found in right"); - } - } - - // construct histogram manually - std::vector hist(500, 0.0f); - for (size_t i = 0; i < host_left_data_indices.size(); ++i) { - const data_size_t data_index = host_left_data_indices[i]; - const uint32_t bin = iter->RawGet(data_index); - const score_t gradient = gradients_[data_index]; - const score_t hessian = hessians_[data_index]; - hist[2 * bin] += gradient; - hist[2 * bin + 1] += hessian; - } - for (size_t i = 0; i < host_right_data_indices.size(); ++i) { - const data_size_t data_index = host_right_data_indices[i]; - const uint32_t bin = iter->RawGet(data_index); - const score_t gradient = gradients_[data_index]; - const score_t hessian = hessians_[data_index]; - hist[2 * bin] += gradient; - hist[2 * bin + 1] += hessian; - } - - Log::Warning("==================== manual histogram for leaf %d (====================", left_leaf); - for (size_t i = 0; i < 100; ++i) { - Log::Warning("bin %d, grad %f, hess %f", i, hist[2 * i], hist[2 * i + 1]); - } - }*/ - CHECK_LE(std::fabs(sum_left_gradients - split_sum_left_gradients), 1e-6f); CHECK_LE(std::fabs(sum_left_hessians - leaf_sum_hessians_[left_leaf]), 1e-6f); CHECK_LE(std::fabs(sum_right_gradients - split_sum_right_gradients), 1e-6f); CHECK_LE(std::fabs(sum_right_hessians - leaf_sum_hessians_[right_leaf]), 1e-6f); } +#endif // DEBUG } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index 48191f895a6b..4eddac491ac3 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -64,12 +64,11 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { void AllocateBitset(); + #ifdef DEBUG void CheckSplitValid( - const int inner_split_feature, - const uint32_t inner_threshold, const int left_leaf, const int right_leaf, - const double sum_left_gradients, const double sum_right_gradients, - const data_size_t left_count, const data_size_t right_count); + const double sum_left_gradients, const double sum_right_gradients); + #endif // DEBUG // GPU device ID int gpu_device_id_; From 4adca58a5f90ab11d4153680da436ea6f546a003 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 22 Feb 2022 09:20:02 +0000 Subject: [PATCH 151/166] ignore NVCC related lines in parameter_generator.py --- helpers/parameter_generator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/helpers/parameter_generator.py b/helpers/parameter_generator.py index 4932a1a07316..a749645c6c9b 100644 --- a/helpers/parameter_generator.py +++ b/helpers/parameter_generator.py @@ -34,6 +34,10 @@ def get_parameter_infos( member_infos: List[List[Dict[str, List]]] = [] with open(config_hpp) as config_hpp_file: for line in config_hpp_file: + if line.strip() == "#ifndef __NVCC__": + continue + if line.strip() == "#endif // __NVCC__": + continue if "#pragma region Parameters" in line: is_inparameter = True elif "#pragma region" in line and "Parameters" in line: From 1e233425d54d3b115705c8dfa34fcd1743df6204 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 23 Feb 2022 03:27:16 +0000 Subject: [PATCH 152/166] update job name for CUDA tests --- .github/workflows/cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 4335140f4243..ab9a6f842ff0 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -16,7 +16,7 @@ env: jobs: test: - name: cuda ${{ matrix.tree_learner }} ${{ matrix.cuda_version }} ${{ 
matrix.method }} (linux, ${{ matrix.compiler }}, Python ${{ matrix.python_version }}) + name: device_type ${{ matrix.tree_learner }} cuda_version ${{ matrix.cuda_version }} ${{ matrix.method }} (linux, ${{ matrix.compiler }}, Python ${{ matrix.python_version }}) runs-on: [self-hosted, linux] timeout-minutes: 60 strategy: From f44b881e94e6a759d62708024af5b1b5dd6c85a4 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 8 Mar 2022 05:11:29 +0000 Subject: [PATCH 153/166] apply review suggestions --- .github/workflows/cuda.yml | 4 ++-- CMakeLists.txt | 9 ++++++--- docs/Installation-Guide.rst | 2 +- helpers/parameter_generator.py | 4 +--- include/LightGBM/cuda/cuda_metadata.hpp | 2 +- include/LightGBM/cuda/cuda_row_data.hpp | 8 ++++---- include/LightGBM/cuda/cuda_split_info.hpp | 9 +++------ include/LightGBM/cuda/cuda_tree.hpp | 8 ++++---- include/LightGBM/cuda/cuda_utils.h | 7 +------ include/LightGBM/cuda/vector_cudahost.h | 2 +- 10 files changed, 24 insertions(+), 31 deletions(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index ab9a6f842ff0..eacbf0d279cd 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -16,7 +16,7 @@ env: jobs: test: - name: device_type ${{ matrix.tree_learner }} cuda_version ${{ matrix.cuda_version }} ${{ matrix.method }} (linux, ${{ matrix.compiler }}, Python ${{ matrix.python_version }}) + name: ${{ matrix.tree_learner }} ${{ matrix.cuda_version }} ${{ matrix.method }} (linux, ${{ matrix.compiler }}, Python ${{ matrix.python_version }}) runs-on: [self-hosted, linux] timeout-minutes: 60 strategy: @@ -39,7 +39,7 @@ jobs: cuda_version: "9.0" tree_learner: cuda - method: source - compiler: gcc + compiler: clang python_version: "3.8" cuda_version: "11.5.1" tree_learner: cuda_exp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d026b4ac897..146db7147e5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,7 +173,11 @@ if(__INTEGRATE_OPENCL) endif() if(USE_CUDA OR USE_CUDA_EXP) - find_package(CUDA 9.0 REQUIRED) + if (USE_CUDA) + find_package(CUDA 9.0 REQUIRED) + else() + find_package(CUDA 10.0 REQUIRED) + endif() include_directories(${CUDA_INCLUDE_DIRS}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall") @@ -375,11 +379,10 @@ file( src/objective/*.cpp src/network/*.cpp src/treelearner/*.cpp -if(USE_CUDA) +if(USE_CUDA OR USE_CUDA_EXP) src/treelearner/*.cu endif() if(USE_CUDA_EXP) - src/treelearner/*.cu src/treelearner/cuda/*.cpp src/treelearner/cuda/*.cu src/io/cuda/*.cu diff --git a/docs/Installation-Guide.rst b/docs/Installation-Guide.rst index d224d8eec7d3..596daf80dc52 100644 --- a/docs/Installation-Guide.rst +++ b/docs/Installation-Guide.rst @@ -636,7 +636,7 @@ To build LightGBM CUDA version, run the following commands: cmake -DUSE_CUDA=1 .. make -j4 -Recently, a new CUDA version with better efficiency is implemented as an experimental feature. To build the new CUDA version, replace -DUSE_CUDA with -DUSE_CUDA_EXP in the above commands. +Recently, a new CUDA version with better efficiency is implemented as an experimental feature. To build the new CUDA version, replace ``-DUSE_CUDA`` with ``-DUSE_CUDA_EXP`` in the above commands. Please note that new version requires **CUDA** 10.0 or later libraries. **Note**: glibc >= 2.14 is required. 
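As a minimal compile-time illustration of the documented requirement above (only the USE_CUDA_EXP macro and the CUDART_VERSION constant come from the patches and the CUDA runtime headers; the error message and where such a check would live are assumptions, not part of this series), a translation unit built for the experimental learner could enforce the CUDA 10.0 floor like this:

    // Sketch only: the -DUSE_CUDA_EXP CMake option defines USE_CUDA_EXP, and
    // cuda_runtime.h provides CUDART_VERSION (10000 corresponds to CUDA 10.0),
    // so the toolkit requirement can be checked when the file is compiled.
    #ifdef USE_CUDA_EXP
    #include <cuda_runtime.h>
    #if CUDART_VERSION < 10000
    #error "The experimental CUDA version (cuda_exp) requires CUDA 10.0 or later"
    #endif
    #endif  // USE_CUDA_EXP
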
diff --git a/helpers/parameter_generator.py b/helpers/parameter_generator.py index a749645c6c9b..abc770bc8e43 100644 --- a/helpers/parameter_generator.py +++ b/helpers/parameter_generator.py @@ -34,9 +34,7 @@ def get_parameter_infos( member_infos: List[List[Dict[str, List]]] = [] with open(config_hpp) as config_hpp_file: for line in config_hpp_file: - if line.strip() == "#ifndef __NVCC__": - continue - if line.strip() == "#endif // __NVCC__": + if line.strip() in {"#ifndef __NVCC__", "#endif // __NVCC__"}: continue if "#pragma region Parameters" in line: is_inparameter = True diff --git a/include/LightGBM/cuda/cuda_metadata.hpp b/include/LightGBM/cuda/cuda_metadata.hpp index 7c77bef6ba97..5837a0fcb653 100644 --- a/include/LightGBM/cuda/cuda_metadata.hpp +++ b/include/LightGBM/cuda/cuda_metadata.hpp @@ -11,7 +11,7 @@ #include #include -#include "../meta.h" +#include namespace LightGBM { diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index f1b5a84bedb8..58507c045b25 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -16,11 +16,11 @@ #include -#include "../train_share_states.h" +#include #define COPY_SUBROW_BLOCK_SIZE_ROW_DATA (1024) -#if CUDART_VERSION <= 10000 +#if CUDART_VERSION == 10000 #define DP_SHARED_HIST_SIZE (5560) #else #define DP_SHARED_HIST_SIZE (6144) @@ -123,7 +123,7 @@ class CUDARowData { int max_num_column_per_partition_; /*! \brief number of partitions */ int num_feature_partitions_; - /*! \brief used when bagging with subset, number of used indice */ + /*! \brief used when bagging with subset, number of used indices */ data_size_t num_used_indices_; /*! \brief used when bagging with subset, number of total elements */ uint64_t num_total_elements_; @@ -175,6 +175,6 @@ class CUDARowData { }; } // namespace LightGBM -#endif // LIGHTGBM_CUDA_COLUMN_DATA_HPP_ +#endif // LIGHTGBM_CUDA_ROW_DATA_HPP_ #endif // USE_CUDA_EXP diff --git a/include/LightGBM/cuda/cuda_split_info.hpp b/include/LightGBM/cuda/cuda_split_info.hpp index ec4afe538ef5..5c525b431548 100644 --- a/include/LightGBM/cuda/cuda_split_info.hpp +++ b/include/LightGBM/cuda/cuda_split_info.hpp @@ -6,8 +6,8 @@ #ifdef USE_CUDA_EXP -#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_SPLIT_INFO_HPP_ -#define LIGHTGBM_TREELEARNER_CUDA_CUDA_SPLIT_INFO_HPP_ +#ifndef LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_ +#define LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_ #include @@ -39,15 +39,12 @@ class CUDASplitInfo { int* cat_threshold_real = nullptr; __device__ CUDASplitInfo() { - printf("default constructor is called\n"); num_cat_threshold = 0; cat_threshold = nullptr; cat_threshold_real = nullptr; - printf("default constructor is called, num_cat_threshold = %d\n", num_cat_threshold); } __device__ ~CUDASplitInfo() { - printf("default destructor is called\n"); if (num_cat_threshold > 0) { if (cat_threshold != nullptr) { cudaFree(cat_threshold); @@ -103,6 +100,6 @@ class CUDASplitInfo { } // namespace LightGBM -#endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_SPLIT_INFO_HPP_ +#endif // LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_ #endif // USE_CUDA_EXP diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index 3bd5972dbc2c..aa09df4140f1 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -5,13 +5,13 @@ #ifdef USE_CUDA_EXP -#ifndef LIGHTGBM_IO_CUDA_CUDA_TREE_HPP_ -#define LIGHTGBM_IO_CUDA_CUDA_TREE_HPP_ +#ifndef LIGHTGBM_CUDA_CUDA_TREE_HPP_ +#define LIGHTGBM_CUDA_CUDA_TREE_HPP_ #include #include #include 
-#include "../bin.h" +#include namespace LightGBM { @@ -137,6 +137,6 @@ class CUDATree : public Tree { } // namespace LightGBM -#endif // LIGHTGBM_IO_CUDA_CUDA_TREE_HPP_ +#endif // LIGHTGBM_CUDA_CUDA_TREE_HPP_ #endif // USE_CUDA_EXP diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 8a9fd7398f46..f1c28213d9f3 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -1,10 +1,5 @@ /*! - * Copyright (c) 2021 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ - -/*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Copyright (c) 2020-2021 IBM Corporation, Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 37e4f4843b05..566972cb3683 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -42,7 +42,7 @@ struct CHAllocator { T* allocate(std::size_t n) { T* ptr; if (n == 0) return NULL; - n = (n + kAlignedSize - 1) / kAlignedSize * kAlignedSize; + n = SIZE_ALIGNED(n); #if defined(USE_CUDA) || defined(USE_CUDA_EXP) if (LGBM_config_::current_device == lgbm_device_cuda) { cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); From d7b65c4f651f6514f422a57c70cbdc8f34708912 Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Wed, 9 Mar 2022 13:15:33 +0800 Subject: [PATCH 154/166] Update .github/workflows/cuda.yml Co-authored-by: Nikita Titov --- .github/workflows/cuda.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index eacbf0d279cd..9a31d67e1815 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -48,11 +48,6 @@ jobs: python_version: "3.9" cuda_version: "10.0" tree_learner: cuda_exp - - method: wheel - compiler: gcc - python_version: "3.10" - cuda_version: "11.5.1" - tree_learner: cuda_exp steps: - name: Setup or update software on host machine run: | From a6a51fd70f782f9a547e5332d9cb45f3be2fac9e Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Wed, 9 Mar 2022 13:15:42 +0800 Subject: [PATCH 155/166] Update .github/workflows/cuda.yml Co-authored-by: Nikita Titov --- .github/workflows/cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 9a31d67e1815..47b5a50242bf 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -39,7 +39,7 @@ jobs: cuda_version: "9.0" tree_learner: cuda - method: source - compiler: clang + compiler: gcc python_version: "3.8" cuda_version: "11.5.1" tree_learner: cuda_exp From 9135582b4aca9a25125fc4ed66a198df6906a8bb Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 9 Mar 2022 05:18:41 +0000 Subject: [PATCH 156/166] update header --- include/LightGBM/cuda/vector_cudahost.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 566972cb3683..7c6e219cbbd9 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -1,5 +1,5 @@ /*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Copyright (c) 2020 IBM Corporation, Microsoft Corporation. All rights reserved. * Licensed under the MIT License. 
See LICENSE file in the project root for license information. */ #ifndef LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ From cd101ae79d0e0f16442b2d4cf8d286132862fd7f Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 9 Mar 2022 05:23:39 +0000 Subject: [PATCH 157/166] remove useless TODOs --- include/LightGBM/cuda/cuda_algorithms.hpp | 2 -- src/treelearner/cuda/cuda_best_split_finder.cu | 1 - 2 files changed, 3 deletions(-) diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index 6b953e2bce5c..a4e91eb9bf38 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -110,7 +110,6 @@ void ShufflePrefixSumGlobal(T* values, size_t len, T* block_prefix_sum_buffer); template __device__ __forceinline__ T ShuffleReduceSumWarp(T value, const data_size_t len) { if (len > 0) { - // TODO(shiyu1994): check how mask works const uint32_t mask = 0xffffffff; for (int offset = warpSize / 2; offset > 0; offset >>= 1) { value += __shfl_down_sync(mask, value, offset); @@ -141,7 +140,6 @@ __device__ __forceinline__ T ShuffleReduceSum(T value, T* shared_mem_buffer, con template __device__ __forceinline__ T ShuffleReduceMaxWarp(T value, const data_size_t len) { if (len > 0) { - // TODO(shiyu1994): check how mask works const uint32_t mask = 0xffffffff; for (int offset = warpSize / 2; offset > 0; offset >>= 1) { value = max(value, __shfl_down_sync(mask, value, offset)); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index e52a2fb90dd6..f66a338c73d5 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -464,7 +464,6 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( used_bin = local_used_bin; } __syncthreads(); - // TODO(shiyu1994): with more threads, this kernel may use out registers BitonicArgSort_1024(shared_value_buffer, shared_index_buffer, bin_end); __syncthreads(); const int max_num_cat = min(max_cat_threshold, (used_bin + 1) / 2); From 9af98ace805226daf82c306df498ed52d8b50e32 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 9 Mar 2022 05:31:35 +0000 Subject: [PATCH 158/166] remove [TODO(shiyu1994): constrain the split with min_data_in_group] and record in #5062 --- src/treelearner/cuda/cuda_best_split_finder.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index f66a338c73d5..e11fe436a320 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -491,7 +491,6 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( double sum_left_gradient = ShufflePrefixSum(grad, shared_mem_buffer_double); __syncthreads(); double sum_left_hessian = ShufflePrefixSum(hess, shared_mem_buffer_double); - // TODO(shiyu1994): constrain the split with min_data_in_group if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { const data_size_t left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); const double sum_right_gradient = sum_gradients - sum_left_gradient; @@ -531,7 +530,6 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner( sum_left_gradient = ShufflePrefixSum(grad, shared_mem_buffer_double); __syncthreads(); sum_left_hessian = ShufflePrefixSum(hess, shared_mem_buffer_double); - // TODO(shiyu1994): constrain the split with min_data_in_group if (threadIdx_x < used_bin && threadIdx_x < max_num_cat) { const data_size_t 
left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); const double sum_right_gradient = sum_gradients - sum_left_gradient; @@ -1113,7 +1111,6 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( GlobalMemoryPrefixSum(hist_grad_buffer_ptr, static_cast(bin_end)); __syncthreads(); GlobalMemoryPrefixSum(hist_hess_buffer_ptr, static_cast(bin_end)); - // TODO(shiyu1994): constrain the split with min_data_in_group for (int bin = static_cast(threadIdx_x); bin < used_bin && bin < max_num_cat; bin += static_cast(blockDim.x)) { const double sum_left_gradient = hist_grad_buffer_ptr[bin]; const double sum_left_hessian = hist_hess_buffer_ptr[bin]; @@ -1153,7 +1150,6 @@ __device__ void FindBestSplitsForLeafKernelCategoricalInner_GlobalMemory( GlobalMemoryPrefixSum(hist_grad_buffer_ptr, static_cast(bin_end)); __syncthreads(); GlobalMemoryPrefixSum(hist_hess_buffer_ptr, static_cast(bin_end)); - // TODO(shiyu1994): constrain the split with min_data_in_group for (int bin = static_cast(threadIdx_x); bin < used_bin && bin < max_num_cat; bin += static_cast(blockDim.x)) { const double sum_left_gradient = hist_grad_buffer_ptr[bin]; const double sum_left_hessian = hist_hess_buffer_ptr[bin]; From e34fccef03b0f92b45500cfd14e0e0d98f3d67cf Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 9 Mar 2022 05:42:11 +0000 Subject: [PATCH 159/166] #include for USE_CUDA_EXP only --- include/LightGBM/cuda/cuda_utils.h | 2 +- src/cuda/cuda_utils.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index f1c28213d9f3..682c370a3aef 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -10,11 +10,11 @@ #include #include #include -#include #endif // USE_CUDA || USE_CUDA_EXP #ifdef USE_CUDA_EXP #include +#include #endif // USE_CUDA_EXP namespace LightGBM { diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp index 051ea3ed2128..d0dfd3bc0db6 100644 --- a/src/cuda/cuda_utils.cpp +++ b/src/cuda/cuda_utils.cpp @@ -3,10 +3,10 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#include - #ifdef USE_CUDA_EXP +#include + namespace LightGBM { void SynchronizeCUDADevice(const char* file, const int line) { From 499639da894e280c4a8e4293f84880d6410c9f17 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 9 Mar 2022 05:44:33 +0000 Subject: [PATCH 160/166] fix include order --- include/LightGBM/cuda/cuda_metadata.hpp | 4 ++-- include/LightGBM/cuda/cuda_row_data.hpp | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/LightGBM/cuda/cuda_metadata.hpp b/include/LightGBM/cuda/cuda_metadata.hpp index 5837a0fcb653..a72d03f02592 100644 --- a/include/LightGBM/cuda/cuda_metadata.hpp +++ b/include/LightGBM/cuda/cuda_metadata.hpp @@ -9,10 +9,10 @@ #define LIGHTGBM_CUDA_META_DATA_HPP_ #include -#include - #include +#include + namespace LightGBM { class CUDAMetadata { diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index 58507c045b25..8f5e2f8a0e03 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -12,12 +12,11 @@ #include #include #include +#include #include #include -#include - #define COPY_SUBROW_BLOCK_SIZE_ROW_DATA (1024) #if CUDART_VERSION == 10000 From 6fe487487c329609e697d2d9e144ea320e64cc48 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 9 Mar 2022 05:57:39 +0000 Subject: [PATCH 161/166] fix include order --- include/LightGBM/cuda/cuda_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 682c370a3aef..ee88c52a0404 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -13,8 +13,8 @@ #endif // USE_CUDA || USE_CUDA_EXP #ifdef USE_CUDA_EXP -#include #include +#include #endif // USE_CUDA_EXP namespace LightGBM { From 3cf4c7424f3b62a3a52101cc7098b2a13605ec59 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 9 Mar 2022 06:19:00 +0000 Subject: [PATCH 162/166] remove extra space --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 146db7147e5d..05e8dbaf5da5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,7 +173,7 @@ if(__INTEGRATE_OPENCL) endif() if(USE_CUDA OR USE_CUDA_EXP) - if (USE_CUDA) + if(USE_CUDA) find_package(CUDA 9.0 REQUIRED) else() find_package(CUDA 10.0 REQUIRED) From 34fdfe47072308ef5b7ecfa9d8792257e48da38d Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 15 Mar 2022 07:08:48 +0000 Subject: [PATCH 163/166] address review comments --- include/LightGBM/dataset.h | 5 ++--- python-package/README.rst | 4 ++++ src/boosting/gbdt.cpp | 2 +- src/cuda/cuda_utils.cpp | 4 ++-- src/treelearner/cuda/cuda_best_split_finder.hpp | 11 +++++------ src/treelearner/cuda/cuda_data_partition.hpp | 15 +++++++-------- .../cuda/cuda_histogram_constructor.hpp | 6 +++--- src/treelearner/cuda/cuda_leaf_splits.hpp | 6 +++--- .../cuda/cuda_single_gpu_tree_learner.hpp | 6 +++--- tests/python_package_test/test_basic.py | 6 +++--- 10 files changed, 33 insertions(+), 32 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 051b60c6a942..7ef878c55725 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -6,6 +6,8 @@ #define LIGHTGBM_DATASET_H_ #include +#include +#include #include #include #include @@ -22,9 +24,6 @@ #include #include -#include -#include - namespace LightGBM { /*! 
\brief forward declaration */ diff --git a/python-package/README.rst b/python-package/README.rst index 94b3db2adb90..aaa55bc668b9 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -123,6 +123,8 @@ All requirements from `Build from Sources section <#build-from-sources>`__ apply **CUDA** library (version 9.0 or higher) is needed: details for installation can be found in `Installation Guide `__. +Recently, a new CUDA version with better efficiency is implemented as an experimental feature. To build the new CUDA version, replace ``--cuda`` with ``--cuda-exp`` in the above commands. Please note that new version requires **CUDA** 10.0 or later libraries. + Build HDFS Version ~~~~~~~~~~~~~~~~~~ @@ -198,6 +200,8 @@ Run ``python setup.py install --gpu`` to enable GPU support. All requirements fr Run ``python setup.py install --cuda`` to enable CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well. +Run ``python setup.py install --cuda-exp`` to enable the new experimental version of CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well. + Run ``python setup.py install --hdfs`` to enable HDFS support. All requirements from `Build HDFS Version section <#build-hdfs-version>`__ apply for this installation option as well. Run ``python setup.py install --bit32``, if you want to use 32-bit version. All requirements from `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__ apply for this installation option as well. diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 55e6ef512bc2..cba1517084f6 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -810,7 +810,7 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { double average_bag_rate = (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; is_use_subset_ = false; - if (config_->device_type != std::string("cuda") && config_->device_type != std::string("cuda_exp")) { + if (config_->device_type != std::string("cuda_exp")) { const int group_threshold_usesubset = 100; if (average_bag_rate <= 0.5 && (train_data_->num_feature_groups() < group_threshold_usesubset)) { diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp index d0dfd3bc0db6..bab1e1b8ff37 100644 --- a/src/cuda/cuda_utils.cpp +++ b/src/cuda/cuda_utils.cpp @@ -1,5 +1,5 @@ /*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ @@ -15,7 +15,7 @@ void SynchronizeCUDADevice(const char* file, const int line) { void PrintLastCUDAError() { const char* error_name = cudaGetErrorName(cudaGetLastError()); - Log::Warning(error_name); + Log::Fatal(error_name); } void SetCUDADevice(int gpu_device_id, const char* file, int line) { diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index bd3ef063864f..13c02a944e65 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -4,19 +4,18 @@ * license information. 
*/ -#ifndef LIGHTGBM_CUDA_BEST_SPLIT_FINDER_HPP_ -#define LIGHTGBM_CUDA_BEST_SPLIT_FINDER_HPP_ +#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_ +#define LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_ #ifdef USE_CUDA_EXP #include +#include +#include #include #include -#include -#include - #include "cuda_leaf_splits.hpp" #define NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER (256) @@ -203,4 +202,4 @@ class CUDABestSplitFinder { } // namespace LightGBM #endif // USE_CUDA_EXP -#endif // LIGHTGBM_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ +#endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_ diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index ba5c4a9150df..45b781d178f4 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -3,20 +3,19 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. */ -#ifndef LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ -#define LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ +#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_ +#define LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_ #ifdef USE_CUDA_EXP -#include -#include #include - -#include - #include #include #include +#include +#include + +#include #include "cuda_leaf_splits.hpp" @@ -391,4 +390,4 @@ class CUDADataPartition { } // namespace LightGBM #endif // USE_CUDA_EXP -#endif // LIGHTGBM_CUDA_DATA_SPLITTER_HPP_ +#endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_ diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 9ff0f91d7741..e364003ed934 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -3,8 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. */ -#ifndef LIGHTGBM_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ -#define LIGHTGBM_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ +#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ +#define LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ #ifdef USE_CUDA_EXP @@ -166,4 +166,4 @@ class CUDAHistogramConstructor { } // namespace LightGBM #endif // USE_CUDA_EXP -#endif // LIGHTGBM_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ +#endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index db473a0bbb91..fe04cf5bcace 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -3,8 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. */ -#ifndef LIGHTGBM_CUDA_LEAF_SPLITS_HPP_ -#define LIGHTGBM_CUDA_LEAF_SPLITS_HPP_ +#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_ +#define LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_ #ifdef USE_CUDA_EXP @@ -157,4 +157,4 @@ class CUDALeafSplits { } // namespace LightGBM #endif // USE_CUDA_EXP -#endif // LIGHTGBM_CUDA_LEAF_SPLITS_HPP_ +#endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_ diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index 4eddac491ac3..1b32b14cfba1 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -3,8 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for * license information. 
*/ -#ifndef LIGHTGBM_NEW_CUDA_TREE_LEARNER_HPP_ -#define LIGHTGBM_NEW_CUDA_TREE_LEARNER_HPP_ +#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_SINGLE_GPU_TREE_LEARNER_HPP_ +#define LIGHTGBM_TREELEARNER_CUDA_CUDA_SINGLE_GPU_TREE_LEARNER_HPP_ #include #include @@ -140,4 +140,4 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { } // namespace LightGBM #endif // USE_CUDA_EXP -#endif // LIGHTGBM_NEW_CUDA_TREE_LEARNER_HPP_ +#endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_SINGLE_GPU_TREE_LEARNER_HPP_ diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 25d7bd78ffa6..4c18983773cb 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -17,7 +17,6 @@ from .utils import load_breast_cancer -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') def test_basic(tmp_path): X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2) @@ -49,8 +48,9 @@ def test_basic(tmp_path): assert bst.current_iteration() == 20 assert bst.num_trees() == 20 assert bst.num_model_per_iteration() == 1 - assert bst.lower_bound() == pytest.approx(-2.9040190126976606) - assert bst.upper_bound() == pytest.approx(3.3182142872462883) + if getenv('TASK', '') != 'cuda_exp': + assert bst.lower_bound() == pytest.approx(-2.9040190126976606) + assert bst.upper_bound() == pytest.approx(3.3182142872462883) tname = tmp_path / "svm_light.dat" model_file = tmp_path / "model.txt" From 3bb91aebd72de3b399a07fedcae0def8b188c7b1 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 15 Mar 2022 07:15:22 +0000 Subject: [PATCH 164/166] add warning when cuda_exp is used together with deterministic --- src/io/config.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/io/config.cpp b/src/io/config.cpp index 550c0e944a85..090ce79b830f 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -344,6 +344,9 @@ void Config::CheckParamConflict() { // force row-wise for cuda_exp version force_col_wise = false; force_row_wise = true; + if (deterministic) { + Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); + } } // force gpu_use_dp for CUDA if (device_type == std::string("cuda") && !gpu_use_dp) { From e47d009f6502369c9295bf4012bb3a4831ecc46f Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 21 Mar 2022 06:51:02 +0000 Subject: [PATCH 165/166] add comment about gpu_use_dp in .ci/test.sh --- .ci/test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.ci/test.sh b/.ci/test.sh index 1f7fd6aadb7e..0ba52ecbb998 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -197,6 +197,7 @@ elif [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then else sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda_exp";/' $BUILD_DIRECTORY/include/LightGBM/config.h grep -q 'std::string device_type = "cuda_exp"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done + # by default ``gpu_use_dp=false`` for efficiency. 
change to ``true`` here for exact results in ci tests sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done fi From 53430dd8026ef1593d310a18f6022e8e288ae4b7 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 21 Mar 2022 07:15:41 +0000 Subject: [PATCH 166/166] revert changing order of included headers --- include/LightGBM/dataset.h | 5 +++-- src/treelearner/cuda/cuda_best_split_finder.hpp | 5 +++-- src/treelearner/cuda/cuda_data_partition.hpp | 7 ++++--- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 7ef878c55725..051b60c6a942 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -6,8 +6,6 @@ #define LIGHTGBM_DATASET_H_ #include -#include -#include #include #include #include @@ -24,6 +22,9 @@ #include #include +#include +#include + namespace LightGBM { /*! \brief forward declaration */ diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 13c02a944e65..3efc6011c83b 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -10,12 +10,13 @@ #ifdef USE_CUDA_EXP #include -#include -#include #include #include +#include +#include + #include "cuda_leaf_splits.hpp" #define NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER (256) diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 45b781d178f4..c4c58f3ebac0 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -9,14 +9,15 @@ #ifdef USE_CUDA_EXP #include -#include -#include -#include #include #include #include +#include +#include +#include + #include "cuda_leaf_splits.hpp" #define FILL_INDICES_BLOCK_SIZE_DATA_PARTITION (1024)
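Note on the histogram-kernel refactor earlier in this series: it replaces the hand-written branches over bit_type() and row_ptr_bit_type() with templated launcher methods (LaunchConstructHistogramKernelInner0/1/2), so that the launch sites can use typed accessors such as GetBin(), GetRowPtr() and GetPartitionPtr(). A condensed, self-contained sketch of that dispatch pattern follows; it is illustrative only, and the free-function names in it are hypothetical rather than taken from the patches:

    // Sketch of run-time to compile-time dispatch: each level switches on one stored
    // bit width and forwards to a more specialized template, so the innermost launch
    // (here just a stub) sees concrete integer types for bins and row pointers.
    #include <cstdint>
    #include <stdexcept>

    template <typename BIN_T, typename ROW_PTR_T>
    void LaunchSparseHistogramKernel(const void* bins, const void* row_ptr) {
      // In the real code this is where a kernel such as
      // CUDAConstructHistogramSparseKernel<BIN_T, ROW_PTR_T><<<grid, block>>>(...)
      // would be launched with pointers obtained from typed accessors.
      (void)bins; (void)row_ptr;
    }

    template <typename BIN_T>
    void DispatchOnRowPtrBitType(int row_ptr_bit_type, const void* bins, const void* row_ptr) {
      switch (row_ptr_bit_type) {
        case 16: LaunchSparseHistogramKernel<BIN_T, std::uint16_t>(bins, row_ptr); break;
        case 32: LaunchSparseHistogramKernel<BIN_T, std::uint32_t>(bins, row_ptr); break;
        case 64: LaunchSparseHistogramKernel<BIN_T, std::uint64_t>(bins, row_ptr); break;
        default: throw std::runtime_error("unsupported row_ptr_bit_type");
      }
    }

    void DispatchOnBinBitType(int bit_type, int row_ptr_bit_type, const void* bins, const void* row_ptr) {
      switch (bit_type) {
        case 8:  DispatchOnRowPtrBitType<std::uint8_t>(row_ptr_bit_type, bins, row_ptr); break;
        case 16: DispatchOnRowPtrBitType<std::uint16_t>(row_ptr_bit_type, bins, row_ptr); break;
        case 32: DispatchOnRowPtrBitType<std::uint32_t>(row_ptr_bit_type, bins, row_ptr); break;
        default: throw std::runtime_error("unsupported bit_type");
      }
    }

With this arrangement the combinatorial growth of launch sites (bin width times row-pointer width times dense/sparse) lives in a handful of small switch statements instead of being repeated verbatim for every kernel variant, which is what the large block of deleted branches at the top of this patch achieved.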