Skip to content

Commit

Permalink
Merge pull request #155 from sony/feature/20190522-logsoftmax-and-fp32-softmax
Browse files Browse the repository at this point in the history

Add CUDNN LogSoftmax and disable fp32 Softmax
  • Loading branch information
AkioHayakawa-sony authored May 22, 2019
2 parents fea69db + b04979d commit 3cb68ed
Show file tree
Hide file tree
Showing 9 changed files with 223 additions and 77 deletions.
4 changes: 3 additions & 1 deletion build-tools/code_generator/function_types_cudnn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ ReLU:
# float: [float]
Softmax:
float: [float]
half: [Half]
LogSoftmax:
float: [float]
# half: [Half]
# ELU:
# float: [float]
# SELU:
Expand Down
22 changes: 22 additions & 0 deletions include/nbla/cuda/cudnn/cudnn.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,28 @@ class CudnnPooling {
const void *beta, void *dx) const;
};

/**
CUDNN softmax function wrapper

Bundles the cuDNN softmax forward/backward calls with the input and output
tensor descriptors configured for a given input shape and softmax axis.
*/
class CudnnSoftmax {
  CudnnTensorDescriptor input_desc_;   // Descriptor for the input tensor x.
  CudnnTensorDescriptor output_desc_;  // Descriptor for the output tensor y.
  cudnnSoftmaxAlgorithm_t algo_;       // e.g. CUDNN_SOFTMAX_ACCURATE or _LOG.
  int device_;                         // CUDA device id used to fetch a handle.

public:
  typedef shared_ptr<CudnnSoftmax> Ptr;
  // Sets up 4D tensor descriptors treating `axis` as the channel dimension.
  CudnnSoftmax(const Shape_t &inshape, int axis, cudnnSoftmaxAlgorithm_t algo,
               cudnnDataType_t dtype, int device);
  // Factory returning a shared pointer to a configured wrapper.
  static Ptr create(const Shape_t &inshape, int axis,
                    cudnnSoftmaxAlgorithm_t algo, cudnnDataType_t dtype,
                    int device);
  // Computes y = alpha * softmax(x) + beta * y (channel mode).
  void forward(const void *alpha, const void *x, const void *beta,
               void *y) const;
  // Computes dx = alpha * grad + beta * dx from outputs y and gradients dy.
  void backward(const void *alpha, const void *y, const void *dy,
                const void *beta, void *dx) const;
};

/** cuDNN Convolution resource cache.
*/
struct NBLA_CUDA_API CudnnConvResource {
Expand Down
51 changes: 51 additions & 0 deletions include/nbla/cuda/cudnn/function/log_softmax.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Copyright (c) 2017 Sony Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef __NBLA_CUDA_CUDNN_FUNCTION_LOG_SOFTMAX_HPP__
#define __NBLA_CUDA_CUDNN_FUNCTION_LOG_SOFTMAX_HPP__

#include <nbla/cuda/common.hpp>
#include <nbla/cuda/cuda.hpp>
#include <nbla/cuda/cudnn/cudnn.hpp>
#include <nbla/function/log_softmax.hpp>

namespace nbla {

/** @copydoc LogSoftmax
@note Implemented via cuDNN's softmax with the CUDNN_SOFTMAX_LOG algorithm
(set in setup_impl). TODO: Make the algorithm selectable by context.
*/
template <typename T> class LogSoftmaxCudaCudnn : public LogSoftmax<T> {
public:
  typedef typename CudaType<T>::type Tw;  // On-device storage type for T.

  explicit LogSoftmaxCudaCudnn(const Context &ctx, int axis)
      : LogSoftmax<T>(ctx, axis), device_(std::stoi(ctx.device_id)) {}
  virtual string name() { return "LogSoftmaxCudaCudnn"; }
  virtual vector<string> allowed_array_classes() {
    return SingletonManager::get<Cuda>()->array_classes();
  }

protected:
  int device_;                       // CUDA device id parsed from the context.
  CudnnSoftmax::Ptr cudnn_softmax_;  // cuDNN wrapper; created in setup_impl.
  virtual void setup_impl(const Variables &inputs, const Variables &outputs);
  virtual void forward_impl(const Variables &inputs, const Variables &outputs);
  virtual void backward_impl(const Variables &inputs, const Variables &outputs,
                             const vector<bool> &propagate_down,
                             const vector<bool> &accum);
};
}
#endif
19 changes: 4 additions & 15 deletions include/nbla/cuda/cudnn/function/softmax.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,34 +24,23 @@ namespace nbla {

/** @copydoc Softmax
@note The default algorithm is set as ACCURATE. TODO: Set an algorithm by
context.
@note The default algorithm is set as ACCURATE.
*/
template <typename T> class SoftmaxCudaCudnn : public Softmax<T> {
public:
typedef typename CudaType<T>::type Tw;

explicit SoftmaxCudaCudnn(const Context &ctx, int axis)
: Softmax<T>(ctx, axis), device_(std::stoi(ctx.device_id)) {
NBLA_CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc_));
NBLA_CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc_));
}
virtual ~SoftmaxCudaCudnn() {
NBLA_CUDNN_CHECK(cudnnDestroyTensorDescriptor(input_desc_));
NBLA_CUDNN_CHECK(cudnnDestroyTensorDescriptor(output_desc_));
}
: Softmax<T>(ctx, axis), device_(std::stoi(ctx.device_id)) {}
virtual string name() { return "SoftmaxCudaCudnn"; }
virtual vector<string> allowed_array_classes() {
return SingletonManager::get<Cuda>()->array_classes();
}
void set_cudnn_softmax_algorithm(std::string algorithm);

protected:
int device_;
cudnnHandle_t cudnn_handle_;
cudnnTensorDescriptor_t input_desc_;
cudnnTensorDescriptor_t output_desc_;
cudnnSoftmaxAlgorithm_t algorithm_;
CudnnSoftmax::Ptr cudnn_softmax_;

virtual void setup_impl(const Variables &inputs, const Variables &outputs);
virtual void forward_impl(const Variables &inputs, const Variables &outputs);
virtual void backward_impl(const Variables &inputs, const Variables &outputs,
Expand Down
45 changes: 45 additions & 0 deletions src/nbla/cuda/cudnn/cudnn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,51 @@ void CudnnPooling::backward(const void *alpha, const void *y, const void *dy,
output_desc_.desc, dy, input_desc_.desc, x, beta, input_desc_.desc, dx));
}

//////////////////////////////
// CUDNN Softmax wrapper
//////////////////////////////
CudnnSoftmax::CudnnSoftmax(const Shape_t &inshape, int axis,
                           cudnnSoftmaxAlgorithm_t algo, cudnnDataType_t dtype,
                           int device)
    : algo_(algo), device_(device) {
  // Collapse the input shape into an NCHW view where C is the softmax axis.
  // Assumes ndi::inner_size returns the product of dims from `axis` onward,
  // so N covers the leading dims and H the trailing ones (W is fixed to 1).
  const size_t total = std::accumulate(inshape.cbegin(), inshape.cend(),
                                       (size_t)1, std::multiplies<size_t>());
  const size_t inner = ndi::inner_size(inshape, axis);
  const int N = total / inner;   // Leading (batch) size.
  const int C = inshape[axis];   // Size of the softmax axis.
  const int H = total / (N * C); // Trailing size.
  const int W = 1;
  // Dense row-major strides for the NCHW view above.
  const int sw = 1;
  const int sh = W * sw;
  const int sc = H * sh;
  const int sn = C * sc;
  // Input and output share the same geometry and strides.
  for (auto *desc : {&input_desc_, &output_desc_}) {
    NBLA_CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(desc->desc, dtype, N, C, H,
                                                  W, sn, sc, sh, sw));
  }
}
CudnnSoftmax::Ptr CudnnSoftmax::create(const Shape_t &inshape, int axis,
                                       cudnnSoftmaxAlgorithm_t algo,
                                       cudnnDataType_t dtype, int device) {
  // Factory helper: forwards every argument to the constructor.
  auto instance = make_shared<CudnnSoftmax>(inshape, axis, algo, dtype, device);
  return instance;
}
void CudnnSoftmax::forward(const void *alpha, const void *x, const void *beta,
                           void *y) const {
  // y = alpha * softmax(x) + beta * y, along the channel axis of the
  // descriptors configured in the constructor.
  auto *manager = SingletonManager::get<CudnnHandleManager>();
  NBLA_CUDNN_CHECK(cudnnSoftmaxForward(
      manager->handle(device_), algo_, CUDNN_SOFTMAX_MODE_CHANNEL, alpha,
      input_desc_.desc, x, beta, output_desc_.desc, y));
}
void CudnnSoftmax::backward(const void *alpha, const void *y, const void *dy,
                            const void *beta, void *dx) const {
  // dx = alpha * grad + beta * dx, computed from the forward output y and
  // its gradient dy (both use the output descriptor).
  auto *manager = SingletonManager::get<CudnnHandleManager>();
  NBLA_CUDNN_CHECK(cudnnSoftmaxBackward(manager->handle(device_), algo_,
                                        CUDNN_SOFTMAX_MODE_CHANNEL, alpha,
                                        output_desc_.desc, y, output_desc_.desc,
                                        dy, beta, input_desc_.desc, dx));
}

//////////////////////////////
// cuDNN Handle implementation
//////////////////////////////
Expand Down
62 changes: 62 additions & 0 deletions src/nbla/cuda/cudnn/function/generic/log_softmax.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Copyright (c) 2017 Sony Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// log_softmax.cu

#include <algorithm>
#include <nbla/array.hpp>
#include <nbla/cuda/common.hpp>
#include <nbla/cuda/cudnn/cudnn.hpp>
#include <nbla/cuda/cudnn/function/log_softmax.hpp>
#include <nbla/variable.hpp>

namespace nbla {

template <typename T>
void LogSoftmaxCudaCudnn<T>::setup_impl(const Variables &inputs,
                                        const Variables &outputs) {
  // Let the reference implementation validate shapes and set up outputs.
  LogSoftmax<T>::setup_impl(inputs, outputs);
  // Build the cuDNN wrapper with the LOG algorithm for this input shape.
  const auto dtype = cudnn_data_type<T>::type();
  cudnn_softmax_ = CudnnSoftmax::create(inputs[0]->shape(), this->axis_,
                                        CUDNN_SOFTMAX_LOG, dtype,
                                        this->device_);
}

template <class T>
void LogSoftmaxCudaCudnn<T>::forward_impl(const Variables &inputs,
                                          const Variables &outputs) {
  NBLA_CHECK(cudnn_softmax_, error_code::value, "setup not called.");
  const Tw *x = inputs[0]->get_data_pointer<Tw>(this->ctx_);
  Tw *y = outputs[0]->cast_data_and_get_pointer<Tw>(this->ctx_, true);
  // y = 1 * log_softmax(x) + 0 * y.
  const auto one = get_cudnn_scalar_arg<T>(1);
  const auto zero = get_cudnn_scalar_arg<T>(0);
  cudnn_softmax_->forward(&one, x, &zero, y);
}

template <class T>
void LogSoftmaxCudaCudnn<T>::backward_impl(const Variables &inputs,
                                           const Variables &outputs,
                                           const vector<bool> &propagate_down,
                                           const vector<bool> &accum) {
  // Nothing to do when the input gradient is not requested.
  if (!propagate_down[0]) {
    return;
  }
  NBLA_CHECK(cudnn_softmax_, error_code::value, "setup not called.");
  const Tw *y = outputs[0]->get_data_pointer<Tw>(this->ctx_);
  const Tw *dy = outputs[0]->get_grad_pointer<Tw>(this->ctx_);
  Tw *dx = inputs[0]->cast_grad_and_get_pointer<Tw>(this->ctx_, !accum[0]);
  const auto one = get_cudnn_scalar_arg<T>(1);
  // beta = 1 accumulates into the existing gradient; beta = 0 overwrites.
  const auto beta = get_cudnn_scalar_arg<T>(accum[0] ? 1 : 0);
  cudnn_softmax_->backward(&one, y, dy, &beta, dx);
}
} // namespace nbla
57 changes: 14 additions & 43 deletions src/nbla/cuda/cudnn/function/generic/softmax.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,37 +27,21 @@ template <typename T>
void SoftmaxCudaCudnn<T>::setup_impl(const Variables &inputs,
const Variables &outputs) {
Softmax<T>::setup_impl(inputs, outputs);
cudnn_handle_ = SingletonManager::get<CudnnHandleManager>()->handle(device_);
int N = this->size0_;
int C = this->size1_;
int H = this->size2_;
int W = 1;
const int stride_w = 1;
const int stride_h = W * stride_w;
const int stride_c = H * stride_h;
const int stride_n = C * stride_c;
NBLA_CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(
input_desc_, cudnn_data_type<T>::type(), N, C, H, W, stride_n, stride_c,
stride_h, stride_w));
NBLA_CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(
output_desc_, cudnn_data_type<T>::type(), N, C, H, W, stride_n, stride_c,
stride_h, stride_w));
// default algorithm setting.
// TODO: set by context.
set_cudnn_softmax_algorithm("ACCURATE");
auto dtype = cudnn_data_type<T>::type();
cudnn_softmax_ =
CudnnSoftmax::create(inputs[0]->shape(), this->axis_,
CUDNN_SOFTMAX_ACCURATE, dtype, this->device_);
}

template <class T>
void SoftmaxCudaCudnn<T>::forward_impl(const Variables &inputs,
const Variables &outputs) {
cuda_set_device(std::stoi(this->ctx_.device_id));
const Tw *x = inputs[0]->get_data_pointer<Tw>(this->ctx_);
Tw *y = outputs[0]->cast_data_and_get_pointer<Tw>(this->ctx_, true);
NBLA_CHECK(cudnn_softmax_, error_code::value, "setup not called.");
auto x = inputs[0]->get_data_pointer<Tw>(this->ctx_);
auto y = outputs[0]->cast_data_and_get_pointer<Tw>(this->ctx_, true);
auto alpha = get_cudnn_scalar_arg<T>(1);
auto beta = get_cudnn_scalar_arg<T>(0);
NBLA_CUDNN_CHECK(cudnnSoftmaxForward(cudnn_handle_, algorithm_,
CUDNN_SOFTMAX_MODE_CHANNEL, &alpha,
input_desc_, x, &beta, output_desc_, y));
cudnn_softmax_->forward(&alpha, x, &beta, y);
}

template <class T>
Expand All @@ -68,25 +52,12 @@ void SoftmaxCudaCudnn<T>::backward_impl(const Variables &inputs,
if (!propagate_down[0]) {
return;
}
cuda_set_device(std::stoi(this->ctx_.device_id));
const Tw *y = outputs[0]->get_data_pointer<Tw>(this->ctx_);
const Tw *dy = outputs[0]->get_grad_pointer<Tw>(this->ctx_);
Tw *dx = inputs[0]->cast_grad_and_get_pointer<Tw>(this->ctx_, !accum[0]);
NBLA_CHECK(cudnn_softmax_, error_code::value, "setup not called.");
auto y = outputs[0]->get_data_pointer<Tw>(this->ctx_);
auto dy = outputs[0]->get_grad_pointer<Tw>(this->ctx_);
auto dx = inputs[0]->cast_grad_and_get_pointer<Tw>(this->ctx_, !accum[0]);
auto alpha = get_cudnn_scalar_arg<T>(1);
auto beta = get_cudnn_scalar_arg<T>(accum[0] ? 1 : 0);
NBLA_CUDNN_CHECK(cudnnSoftmaxBackward(
cudnn_handle_, algorithm_, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha,
output_desc_, y, output_desc_, dy, &beta, input_desc_, dx));
}

template <class T>
void SoftmaxCudaCudnn<T>::set_cudnn_softmax_algorithm(std::string algorithm) {
if (algorithm == "FAST") {
algorithm_ = CUDNN_SOFTMAX_FAST;
} else if (algorithm == "ACCURATE") {
algorithm_ = CUDNN_SOFTMAX_ACCURATE;
} else {
NBLA_ERROR(error_code::target_specific, "Specified unsupported algorithm");
}
}
cudnn_softmax_->backward(&alpha, y, dy, &beta, dx);
}
} // namespace nbla
10 changes: 6 additions & 4 deletions src/nbla/cuda/function/generic/softmax.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,21 @@ namespace nbla {
template <typename T>
__global__ void kernel_softmax_forward(const int size0x2_, const int size1_,
const int size2_, const T *x, T *y) {
typedef typename CudaTypeForceFloat<T>::type AccumType;
NBLA_CUDA_KERNEL_LOOP(idx, size0x2_) {
const int i0 = idx / size2_;
const int i2 = idx % size2_;
// compute maximum
T max_x = nbla::numeric_limits_cuda<T>::min();
AccumType max_x = -nbla::numeric_limits_cuda<T>::max();
for (int i1 = 0; i1 < size1_; ++i1) {
const int k = (i0 * size1_ + i1) * size2_ + i2;
max_x = max(max_x, x[k]);
}
// Compute exponential and sum
T exp_sum = T(0);
AccumType exp_sum = T(0);
for (int i1 = 0; i1 < size1_; ++i1) {
const int k = (i0 * size1_ + i1) * size2_ + i2;
const T tmp = std::exp(x[k] - max_x);
const AccumType tmp = std::exp(x[k] - max_x);
y[k] = tmp;
exp_sum += tmp;
}
Expand All @@ -55,11 +56,12 @@ template <typename T, bool accum>
__global__ void kernel_softmax_backward(const int size0x2_, const int size1_,
const int size2_, const T *y,
const T *dy, T *dx) {
typedef typename CudaTypeForceFloat<T>::type AccumType;
NBLA_CUDA_KERNEL_LOOP(idx, size0x2_) {
const int i0 = idx / size2_;
const int i2 = idx % size2_;
// compute sum of dy * y
T dyy_sum = T(0);
AccumType dyy_sum = T(0);
for (int i1 = 0; i1 < size1_; ++i1) {
const int k = (i0 * size1_ + i1) * size2_ + i2;
dyy_sum += dy[k] * y[k];
Expand Down
Loading

0 comments on commit 3cb68ed

Please sign in to comment.