From a281f443f864a08613d7fedc7190cfa5320bd978 Mon Sep 17 00:00:00 2001 From: Yukio Oobuchi Date: Sat, 15 Sep 2018 07:01:19 +0900 Subject: [PATCH] Fix misspell with aspell. --- CMakeLists.txt | 4 ++-- CONTRIBUTING.md | 2 +- build-tools/code_generator/generate.py | 10 +++++----- build-tools/make/build.mk | 12 ++++++------ doc/build/build.md | 4 ++-- doc/build/build_distributed.md | 8 ++++---- doc/build/build_windows.md | 4 ++-- doc/build/quick_build_tools.md | 2 +- docker/README.md | 2 +- docker/tutorial/run-nnabla-examples.ipynb | 2 +- examples/cpp/mnist_training/README.md | 6 +++--- include/nbla/cuda/array/cuda_array.hpp | 4 ++-- include/nbla/cuda/common.hpp | 2 +- .../cuda/communicator/data_parallel_communicator.hpp | 2 +- .../multi_process_data_parallel_communicator.hpp | 2 +- include/nbla/cuda/cudnn/cudnn.hpp | 8 ++++---- include/nbla/cuda/cudnn/function/convolution.hpp | 2 +- include/nbla/cuda/cudnn/function/deconvolution.hpp | 2 +- .../nbla/cuda/cudnn/function/function_impl.hpp.tmpl | 4 ++-- include/nbla/cuda/cudnn/function/softmax.hpp | 2 +- include/nbla/cuda/function/function_impl.hpp.tmpl | 2 +- include/nbla/cuda/function/interpolate.hpp | 2 +- include/nbla/cuda/utils/device_reduce.cuh | 4 ++-- python/setup.py | 2 +- python/src/nnabla_ext/cuda/init.pyx | 4 ++-- src/nbla/cuda/array/cuda_array.cpp | 2 +- src/nbla/cuda/array/cuda_array.cu | 2 +- .../cuda/communicator/data_parallel_communicator.cu | 2 +- .../multi_process_data_parallel_communicator.cu | 6 +++--- src/nbla/cuda/cuda.cpp | 2 +- .../cudnn/function/generic/batch_normalization.cu | 2 +- src/nbla/cuda/cudnn/function/generic/convolution.cu | 6 +++--- .../cudnn/function/generic/function_impl.cu.tmpl | 8 ++++---- .../cuda/function/generic/batch_normalization.cu | 2 +- src/nbla/cuda/function/generic/clip_grad_by_value.cu | 2 +- .../cuda/function/generic/depthwise_convolution.cu | 2 +- src/nbla/cuda/function/generic/function_impl.cu.tmpl | 8 ++++---- .../test/multi_process_data_parallel_communicator.cu | 6 +++--- 38 files changed, 74 insertions(+), 74 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 981a72088..5235eeef4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,7 +95,7 @@ if(BUILD_CPP_LIB) if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release" CACHE STRING - "Build type release is default on single-configuration build system like GnuMake." + "Build type release is default on single-configuration build system like GNU make." FORCE) endif() @@ -192,7 +192,7 @@ if(BUILD_CPP_LIB) message(FATAL_ERROR "Python build_ext compiler inference is only supported on Win, Unix or Apple.") endif() - message("Python build_ext compiler is infered as '${NBLA_PYTHON_BUILD_EXT_COMPILER}'.") + message("Python build_ext compiler is inferred as '${NBLA_PYTHON_BUILD_EXT_COMPILER}'.") message("You can specify a compiler manually setting a variable" " NBLA_PYTHON_BUILD_EXT_COMPILER. You can see a list of supported" " compiler by `python setup.py build_ext --help-compiler`.") diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 78082c968..79bcc3217 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,4 +5,4 @@ CUDA extension follows [the instruction found in NNabla](https://github.com/sony ## Development Guide * [Adding a new function (layer implementation)](doc/contributing/add_function.md). -* [Adding a new solver (gradient descent algorighm implemenation)](doc/contributing/add_solver.md). +* [Adding a new solver (gradient descent algorithm implementation)](doc/contributing/add_solver.md). 
diff --git a/build-tools/code_generator/generate.py b/build-tools/code_generator/generate.py index a24f1f5cb..0d5284eb5 100644 --- a/build-tools/code_generator/generate.py +++ b/build-tools/code_generator/generate.py @@ -78,7 +78,7 @@ def generate(): base, 'python/src/nnabla_ext/cudnn/_version.py.tmpl'), rootdir=base) - # Generate function skeltons + # Generate function skeletons func_src_template = join( base, 'src/nbla/cuda/function/generic/function_impl.cu.tmpl') @@ -91,16 +91,16 @@ def generate(): func_header_template_cudnn = join( base, 'include/nbla/cuda/cudnn/function/function_impl.hpp.tmpl') - utils.generate_skelton_function_impl( + utils.generate_skeleton_function_impl( function_info, function_types, ext_info={}, template=func_src_template, output_format='%s.cu') - utils.generate_skelton_function_impl( + utils.generate_skeleton_function_impl( function_info, function_types, ext_info={}, template=func_header_template, output_format='%s.hpp') - utils.generate_skelton_function_impl( + utils.generate_skeleton_function_impl( function_info, function_types_cudnn, ext_info={}, template=func_src_template_cudnn, output_format='%s.cu') - utils.generate_skelton_function_impl( + utils.generate_skeleton_function_impl( function_info, function_types_cudnn, ext_info={}, template=func_header_template_cudnn, output_format='%s.hpp') diff --git a/build-tools/make/build.mk b/build-tools/make/build.mk index 1ef6d26a9..4e7392acc 100644 --- a/build-tools/make/build.mk +++ b/build-tools/make/build.mk @@ -40,7 +40,7 @@ nnabla-ext-cuda-clean-all: ######################################################################################################################## # cpplib -.PHNOY: nnabla-ext-cuda-cpplib +.PHONY: nnabla-ext-cuda-cpplib nnabla-ext-cuda-cpplib: mkdir -p $(BUILD_EXT_CUDA_DIRECTORY_CPPLIB) cd $(BUILD_EXT_CUDA_DIRECTORY_CPPLIB) \ @@ -57,7 +57,7 @@ nnabla-ext-cuda-cpplib: $(NNABLA_EXT_CUDA_DIRECTORY) $(MAKE) -C $(BUILD_EXT_CUDA_DIRECTORY_CPPLIB) -j$(PARALLEL_BUILD_NUM) -.PHNOY: nnabla-ext-cuda-cpplib-multi-gpu +.PHONY: nnabla-ext-cuda-cpplib-multi-gpu nnabla-ext-cuda-cpplib-multi-gpu: mkdir -p $(BUILD_EXT_CUDA_DIRECTORY_CPPLIB_MULTI_GPU) cd $(BUILD_EXT_CUDA_DIRECTORY_CPPLIB_MULTI_GPU) \ @@ -77,7 +77,7 @@ nnabla-ext-cuda-cpplib-multi-gpu: ######################################################################################################################## # wheel -.PHNOY: nnabla-ext-cuda-wheel +.PHONY: nnabla-ext-cuda-wheel nnabla-ext-cuda-wheel: $(call with-virtualenv, \ $(NNABLA_EXT_CUDA_DIRECTORY), \ @@ -85,7 +85,7 @@ nnabla-ext-cuda-wheel: -f build-tools/make/build.mk, \ nnabla-ext-cuda-wheel-local) -.PHNOY: nnabla-ext-cuda-wheel-local +.PHONY: nnabla-ext-cuda-wheel-local nnabla-ext-cuda-wheel-local: nnabla-install \ $(BUILD_DIRECTORY_CPPLIB)/lib/libnnabla.so \ $(BUILD_EXT_CUDA_DIRECTORY_CPPLIB)/lib/libnnabla_cuda.so @@ -105,7 +105,7 @@ nnabla-ext-cuda-wheel-local: nnabla-install \ $(NNABLA_EXT_CUDA_DIRECTORY) \ && $(MAKE) -C $(BUILD_EXT_CUDA_DIRECTORY_WHEEL) wheel -.PHNOY: nnabla-ext-cuda-wheel-multi-gpu +.PHONY: nnabla-ext-cuda-wheel-multi-gpu nnabla-ext-cuda-wheel-multi-gpu: \ nnabla-cpplib \ nnabla-wheel \ @@ -161,7 +161,7 @@ nnabla-ext-cuda-test-local: nnabla-install nnabla-ext-cuda-install && PYTHONPATH=$(NNABLA_EXT_CUDA_DIRECTORY)/python/test \ python -m pytest $(NNABLA_DIRECTORY)/python/test -.PHNOY: nnabla-ext-cuda-multi-gpu-test-local +.PHONY: nnabla-ext-cuda-multi-gpu-test-local nnabla-ext-cuda-multi-gpu-test-local: nnabla-ext-cuda-multi-gpu-install cd 
$(BUILD_EXT_CUDA_DIRECTORY_WHEEL_MULTI_GPU) \ && PYTHONPATH=$(NNABLA_EXT_CUDA_DIRECTORY)/python/test:$(NNABLA_DIRECTORY)/python/test \ diff --git a/doc/build/build.md b/doc/build/build.md index b43b2e722..bf0bcb15c 100644 --- a/doc/build/build.md +++ b/doc/build/build.md @@ -7,9 +7,9 @@ This document shows how to install CUDA extension on Ubuntu 16.04 LTS. This proc ## Prerequisites -In addition to NNabla's requirements, CUDA extension requires CUDA setup has done on your system. If you don't have CUDA on your system, follow the procedure desribed below. +In addition to NNabla's requirements, CUDA extension requires CUDA setup has done on your system. If you don't have CUDA on your system, follow the procedure described below. -Download and install CUDA and cuDNN library (both runtime library and developement library). Please follow the instruction in the document provided by NVIDIA. Do NOT see any instruction provided by any third party. They are often incorrect or based on old instructions, that could destroy your system. +Download and install CUDA and cuDNN library (both runtime library and development library). Please follow the instruction in the document provided by NVIDIA. Do NOT see any instruction provided by any third party. They are often incorrect or based on old instructions, that could destroy your system. * [CUDA toolkit](https://developer.nvidia.com/cuda-downloads) * [cuDNN library](https://developer.nvidia.com/rdp/cudnn-download) (Registration required) diff --git a/doc/build/build_distributed.md b/doc/build/build_distributed.md index 212c7183d..c59a8abed 100644 --- a/doc/build/build_distributed.md +++ b/doc/build/build_distributed.md @@ -15,7 +15,7 @@ In addition to [requirements of NNabla without distributed execution](build.md), In order to use the distributed training, the only difference, when building, is the procedure described here. -Download `nccl `_ according to your environemnt, +Download `nccl `_ according to your environment, then install it manually in case of ubuntu16.04, ```shell @@ -25,13 +25,13 @@ sudo apt-get install libnccl2 libnccl-dev ``` For developer, if you want to use another nccl not publicly distributed, -specify **NCCL_HOME** environment variable as the folloing. +specify **NCCL_HOME** environment variable as the following. ```shell export NCCL_HOME=${path}/build ``` -Here, we assume the directry structure, +Here, we assume the directory structure, * ${path}/build/include * ${path}/build/lib @@ -66,7 +66,7 @@ CUDA includes: /usr/local/cuda-8.0/include;/usr/lib/openmpi/include/openmpi/opal ## Unit test -Follow the unit test section in [Build CUDA extension](build.md). Now you could see the communicater +Follow the unit test section in [Build CUDA extension](build.md). Now you could see the communicator test passed. ``` diff --git a/doc/build/build_windows.md b/doc/build/build_windows.md index 81ff86717..cfe9e4fb8 100644 --- a/doc/build/build_windows.md +++ b/doc/build/build_windows.md @@ -2,10 +2,10 @@ ## Prerequisites -In addition to NNabla's requirements, CUDA extension requires CUDA setup has done on your system. If you don't have CUDA on your system, follow the procedure desribed below. +In addition to NNabla's requirements, CUDA extension requires CUDA setup has done on your system. If you don't have CUDA on your system, follow the procedure described below. -Download and install CUDA and cuDNN library (both runtime library and developement library). Please follow the instruction in the document provided by NVIDIA. 
Do NOT see any instruction provided by any third party. They are often incorrect or based on old instructions, that could destroy your system. +Download and install CUDA and cuDNN library (both runtime library and development library). Please follow the instruction in the document provided by NVIDIA. Do NOT see any instruction provided by any third party. They are often incorrect or based on old instructions, that could destroy your system. * [CUDA toolkit](https://developer.nvidia.com/cuda-downloads) * [cuDNN library](https://developer.nvidia.com/rdp/cudnn-download) (Registration required) diff --git a/doc/build/quick_build_tools.md b/doc/build/quick_build_tools.md index 5c30080e0..606a3469c 100644 --- a/doc/build/quick_build_tools.md +++ b/doc/build/quick_build_tools.md @@ -47,7 +47,7 @@ Install CUDA8.0, CUDA9.0, CUDA9.1 from following site. - https://developer.nvidia.com/cuda-toolkit-archive -Get several versions of cuDNN from following site. (Registration requried) +Get several versions of cuDNN from following site. (Registration required) - cuDNN - https://developer.nvidia.com/rdp/cudnn-download diff --git a/docker/README.md b/docker/README.md index 06f4d50e0..1259f6906 100644 --- a/docker/README.md +++ b/docker/README.md @@ -38,6 +38,6 @@ nvidia-docker run -it --rm -p 8888:8888 nnabla/nnabla-ext-cuda:tutorial jupyter You can connect the jupyter server with your browser by accessing `http://:8888`. The login password is `nnabla`. -After logging in, the page lists a directory that contains jupyter `.ipynb` tutorials and the `nnabla-examples/` foler. +After logging in, the page lists a directory that contains jupyter `.ipynb` tutorials and the `nnabla-examples/` folder. You can open any tutorial by clicking a `.ipynb` file. A DCGAN in `nnabla-examples` is demonstrated in `run-nnabla-examples.ipynb`. diff --git a/docker/tutorial/run-nnabla-examples.ipynb b/docker/tutorial/run-nnabla-examples.ipynb index d6b73d412..4222d7c06 100644 --- a/docker/tutorial/run-nnabla-examples.ipynb +++ b/docker/tutorial/run-nnabla-examples.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The following command executes training of Deep Convolutional GAN on MNIST dataset. It takes a couple of minites on an average GPU." + "The following command executes training of Deep Convolutional GAN on MNIST dataset. It takes a couple of minutes on an average GPU." ] }, { diff --git a/examples/cpp/mnist_training/README.md b/examples/cpp/mnist_training/README.md index 3af102f29..bbe725dd0 100644 --- a/examples/cpp/mnist_training/README.md +++ b/examples/cpp/mnist_training/README.md @@ -17,7 +17,7 @@ If you tried the CPU version of this script, you must have downloaded MNIST data Please copy them to this directory. -## Create NNP file of an initialized model for MNIST classifcation. +## Create NNP file of an initialized model for MNIST classification. You might also have an NNP file of the initialized model in `nnabla/examples/cpp/mnist_training`. Please copy it to this directory. @@ -31,9 +31,9 @@ export NNABLA_DIR='path to your nnabla directory' make ``` -The above command generates an executable `mnist_training_cuda` at the current directry. +The above command generates an executable `mnist_training_cuda` at the current directory. -The build file `GNUMakefile` is simple. +The build file `GNUmakefile` is simple. It links `libnnabla.so`, `libnnabla_utils.so`, `libnnabla_utils.so` and `libz.so` with the executable generated from `main.cpp`, and compiles with C++11 option `-std=c++11`. 
It also needs to include path to `mnist_training.hpp` which is located in `nnabla/examples/cpp/mnist_training` directory. diff --git a/include/nbla/cuda/array/cuda_array.hpp b/include/nbla/cuda/array/cuda_array.hpp index eb3b0867d..95557469e 100644 --- a/include/nbla/cuda/array/cuda_array.hpp +++ b/include/nbla/cuda/array/cuda_array.hpp @@ -31,7 +31,7 @@ using std::shared_ptr; class CudaArray : public Array { protected: int device_; - /* Holding CudaMemory until the instance is destoryed to prevent freeing. + /* Holding CudaMemory until the instance is destroyed to prevent freeing. */ shared_ptr inuse_memory_; @@ -54,7 +54,7 @@ NBLA_CUDA_API void synchronizer_cpu_array_cuda_array(Array *src, Array *dst); /** Array allocated on CUDA device with Memory Pool -This is a necessary ingredient for imperative programing interface of +This is a necessary ingredient for imperative programming interface of neural networks (aka define-by-run or dynamic). Memory allocation of CUDA is not asynchronous. Hence, allocating memory region between each function will lead thread synchronization that will block executions of diff --git a/include/nbla/cuda/common.hpp b/include/nbla/cuda/common.hpp index f328a3a03..08c49f64b 100644 --- a/include/nbla/cuda/common.hpp +++ b/include/nbla/cuda/common.hpp @@ -162,7 +162,7 @@ CUBLAS_TYPE_T(HalfCuda, HALF); /** Block size */ #define NBLA_CUDA_GET_BLOCKS(num) NBLA_CEIL_INT_DIV(num, NBLA_CUDA_NUM_THREADS) -/** Get an appropreate block size given a size of elements. +/** Get an appropriate block size given a size of elements. The kernel is assumed to contain a grid-strided loop. */ diff --git a/include/nbla/cuda/communicator/data_parallel_communicator.hpp b/include/nbla/cuda/communicator/data_parallel_communicator.hpp index 1002ac693..f97ca55c6 100644 --- a/include/nbla/cuda/communicator/data_parallel_communicator.hpp +++ b/include/nbla/cuda/communicator/data_parallel_communicator.hpp @@ -127,7 +127,7 @@ class NBLA_API DataParallelCommunicatorNccl protected: void wait_by_devices_synchronization(); void wait_by_streams_synchronization(); - void divide_by_num_divices(bool division); + void divide_by_num_devices(bool division); DISABLE_COPY_AND_ASSIGN(DataParallelCommunicatorNccl); }; diff --git a/include/nbla/cuda/communicator/multi_process_data_parallel_communicator.hpp b/include/nbla/cuda/communicator/multi_process_data_parallel_communicator.hpp index 97ce2ff05..8f14c811e 100644 --- a/include/nbla/cuda/communicator/multi_process_data_parallel_communicator.hpp +++ b/include/nbla/cuda/communicator/multi_process_data_parallel_communicator.hpp @@ -215,7 +215,7 @@ class NBLA_API MultiProcessDataParallelCommunicatorNccl void wait_by_device_synchronization(); void wait_by_streams_synchronization(); - void divide_by_num_divices(bool division); + void divide_by_num_devices(bool division); DISABLE_COPY_AND_ASSIGN(MultiProcessDataParallelCommunicatorNccl); }; diff --git a/include/nbla/cuda/cudnn/cudnn.hpp b/include/nbla/cuda/cudnn/cudnn.hpp index 3b4609252..dcfa05a88 100644 --- a/include/nbla/cuda/cudnn/cudnn.hpp +++ b/include/nbla/cuda/cudnn/cudnn.hpp @@ -62,7 +62,7 @@ template <> class cudnn_data_type { static cudnnDataType_t type() { return CUDNN_DATA_HALF; } }; -/** Convret cuDNN enum dtype to NNabla enum dtype. +/** Convert cuDNN enum dtype to NNabla enum dtype. 
*/ inline dtypes get_dtype_by_cudnn_data_type(cudnnDataType_t dtype) { switch (dtype) { @@ -135,7 +135,7 @@ inline string cudnn_status_to_string(cudnnStatus_t status) { http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnSetTensorNdDescriptor -According to the doc above, cudnnSetTensorNdDescriptor does not suport a tensor +According to the doc above, cudnnSetTensorNdDescriptor does not support a tensor less than 4 dimensions. This wrapper function adds unused dimensions with a value of 1 at last. @@ -203,7 +203,7 @@ struct NBLA_CUDA_API CudnnConvResource { cudnnTensorDescriptor_t y_desc; ///< Output desc. cudnnTensorDescriptor_t b_desc; ///< Bias desc. cudnnTensorDescriptor_t b_desc_deconv; ///< Bias desc for deconvolution. - cudnnFilterDescriptor_t w_desc; ///< Wegiht desc. + cudnnFilterDescriptor_t w_desc; ///< Weight desc. cudnnConvolutionDescriptor_t conv_desc; ///< Conv desc. cudnnConvolutionFwdAlgo_t fwd_algo; ///< Best forward algorithm found. cudnnConvolutionBwdFilterAlgo_t @@ -236,7 +236,7 @@ class NBLA_CUDA_API CudnnHandleManager { ~CudnnHandleManager(); /** - Get cuDNN handle for devive. + Get cuDNN handle for device. */ cudnnHandle_t handle(int device = -1); diff --git a/include/nbla/cuda/cudnn/function/convolution.hpp b/include/nbla/cuda/cudnn/function/convolution.hpp index e3124bce9..934667d64 100644 --- a/include/nbla/cuda/cudnn/function/convolution.hpp +++ b/include/nbla/cuda/cudnn/function/convolution.hpp @@ -39,7 +39,7 @@ template class ConvolutionCudaCudnn : public Convolution { // NOTE: dilation > 1 is not supported by cudnn. (2016.10.19) for (int i = 0; i < dilation.size(); ++i) { if (dilation[i] > 1) { - // Fall back to origianl CUDA implementation if dilation > 1. + // Fall back to original CUDA implementation if dilation > 1. // Setting fall_back_func_ overwrites behaviors of setup, forward and // backward functions by the specified function class instance. std::cout << "Falling back to ConvolutionCuda since dilation > 1 is " diff --git a/include/nbla/cuda/cudnn/function/deconvolution.hpp b/include/nbla/cuda/cudnn/function/deconvolution.hpp index 970ff50e1..d3da846e1 100644 --- a/include/nbla/cuda/cudnn/function/deconvolution.hpp +++ b/include/nbla/cuda/cudnn/function/deconvolution.hpp @@ -42,7 +42,7 @@ template class DeconvolutionCudaCudnn : public Deconvolution { // NOTE: dilation > 1 is not supported by cudnn. (2016.10.19) for (int i = 0; i < dilation.size(); ++i) { if (dilation[i] > 1) { - // Fall back to origianl CUDA implementation if dilation > 1. + // Fall back to original CUDA implementation if dilation > 1. // Setting fall_back_func_ overwrites behaviors of setup, forward and // backward functions by the specified function class instance. std::cout << "Falling back to DeconvolutionCuda since dilation > 1 is " diff --git a/include/nbla/cuda/cudnn/function/function_impl.hpp.tmpl b/include/nbla/cuda/cudnn/function/function_impl.hpp.tmpl index 35a4c96ba..59aeeb85b 100644 --- a/include/nbla/cuda/cudnn/function/function_impl.hpp.tmpl +++ b/include/nbla/cuda/cudnn/function/function_impl.hpp.tmpl @@ -38,7 +38,7 @@ template <${dec_targs}> class ${name}CudaCudnn : public ${name}<${targs}> { public: /* TODO: remove this help message. Typedef of CUDA scalar types used in source file. 
- This template function class might be instanciated for each CPU scalar types + This template function class might be instantiated for each CPU scalar types (double, float, nbla::Half), however, for Half, CUDA kernel functions must use nbla::HalfCuda in which a bunch of device operator functions are overloaded. nbla::CudaType::type will translate nbla::Half @@ -67,7 +67,7 @@ public: % for oname in outputs.keys(): NBLA_CUDNN_CHECK(cudnnDestroyTensorDescriptor(${oname}_desc_)); % endfor - // TODO: Destoy other descriptors + // TODO: Destroy other descriptors } virtual string name() { return "${name}CudaCudnn"; } virtual vector allowed_array_classes() { diff --git a/include/nbla/cuda/cudnn/function/softmax.hpp b/include/nbla/cuda/cudnn/function/softmax.hpp index 656d42352..bc6453a19 100644 --- a/include/nbla/cuda/cudnn/function/softmax.hpp +++ b/include/nbla/cuda/cudnn/function/softmax.hpp @@ -24,7 +24,7 @@ namespace nbla { /** @copydoc Softmax -@note The default algrithm is set as ACCURATE. TODO: Set an algorithm by +@note The default algorithm is set as ACCURATE. TODO: Set an algorithm by context. */ template class SoftmaxCudaCudnn : public Softmax { diff --git a/include/nbla/cuda/function/function_impl.hpp.tmpl b/include/nbla/cuda/function/function_impl.hpp.tmpl index 8294a9136..009fdca2e 100644 --- a/include/nbla/cuda/function/function_impl.hpp.tmpl +++ b/include/nbla/cuda/function/function_impl.hpp.tmpl @@ -32,7 +32,7 @@ template <${dec_targs}> class ${name}Cuda : public ${name}<${targs}> { public: /* TODO: remove this help message. Typedef of CUDA scalar types used in source file. - This template function class might be instanciated for each CPU scalar types + This template function class might be instantiated for each CPU scalar types (double, float, nbla::Half), however, for Half, CUDA kernel functions must use nbla::HalfCuda in which a bunch of device operator functions are overloaded. nbla::CudaType::type will translate nbla::Half diff --git a/include/nbla/cuda/function/interpolate.hpp b/include/nbla/cuda/function/interpolate.hpp index 4a367c967..16868f4ff 100644 --- a/include/nbla/cuda/function/interpolate.hpp +++ b/include/nbla/cuda/function/interpolate.hpp @@ -24,7 +24,7 @@ template class InterpolateCuda : public Interpolate { public: /* TODO: remove this help message. Typedef of CUDA scalar types used in source file. - This template function class might be instanciated for each CPU scalar types + This template function class might be instantiated for each CPU scalar types (double, float, nbla::Half), however, for Half, CUDA kernel functions must use nbla::HalfCuda in which a bunch of device operator functions are overloaded. nbla::CudaType::type will translate nbla::Half diff --git a/include/nbla/cuda/utils/device_reduce.cuh b/include/nbla/cuda/utils/device_reduce.cuh index 8a6b94b29..85fc1832a 100644 --- a/include/nbla/cuda/utils/device_reduce.cuh +++ b/include/nbla/cuda/utils/device_reduce.cuh @@ -21,10 +21,10 @@ namespace nbla { -/** Geric block-wise reduction kernel. +/** Generic block-wise reduction kernel. @param[in] N Number of valid input items. -@param[in,out] op Reduciton operator class. TODO: doc. +@param[in,out] op Reduction operator class. TODO: doc. 
*/ template __global__ void kernel_reduce_per_block(const int N, ReduceOp op, diff --git a/python/setup.py b/python/setup.py index 1754540c5..362800f9a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -203,7 +203,7 @@ def get_setup_config(root_dir): version=__version__, author_email=__email__, url="https://github.com/sony/nnabla-ext-cuda", - license='Apache Licence 2.0', + license='Apache License 2.0', classifiers=[ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', diff --git a/python/src/nnabla_ext/cuda/init.pyx b/python/src/nnabla_ext/cuda/init.pyx index b9f019b9c..407a1a6a6 100644 --- a/python/src/nnabla_ext/cuda/init.pyx +++ b/python/src/nnabla_ext/cuda/init.pyx @@ -44,7 +44,7 @@ def clear_memory_cache(): ############################################################################### -# Array pereference API +# Array preference API # TODO: Move these to C++ ############################################################################### _original_array_classes = cuda_array_classes() @@ -94,7 +94,7 @@ def device_synchronize(str device): def get_device_count(): """Call ``cudaGetDeviceCount`` in runtime API`. - Retuns: + Returns: int: Number of devices available. """ diff --git a/src/nbla/cuda/array/cuda_array.cpp b/src/nbla/cuda/array/cuda_array.cpp index d5de92b60..b5fedbd1e 100644 --- a/src/nbla/cuda/array/cuda_array.cpp +++ b/src/nbla/cuda/array/cuda_array.cpp @@ -72,7 +72,7 @@ Context CudaArray::filter_context(const Context &ctx) { ///////////////////////////////////// void synchronizer_cuda_array_cpu_array(Array *src, Array *dst) { if (src->dtype() != dst->dtype()) { - // if dtype mismatces, transfer gpu-cpu first, then convert dtype. + // if dtype mismatches, transfer gpu-cpu first, then convert dtype. Context ctx = dst->context(); unique_ptr tmp(new CpuCachedArray(src->size(), src->dtype(), ctx)); synchronizer_cuda_array_cpu_array(src, tmp.get()); diff --git a/src/nbla/cuda/array/cuda_array.cu b/src/nbla/cuda/array/cuda_array.cu index 4f43c2d70..197324890 100644 --- a/src/nbla/cuda/array/cuda_array.cu +++ b/src/nbla/cuda/array/cuda_array.cu @@ -68,7 +68,7 @@ void cuda_array_copy(const Array *src, Array *dst) { thrust_copy(src, dst); return; } - // Inter-devcie copy. + // Inter-device copy. std::unique_ptr src_tmp; // At first convert dtype on source device if necessary. 
diff --git a/src/nbla/cuda/communicator/data_parallel_communicator.cu b/src/nbla/cuda/communicator/data_parallel_communicator.cu index 4ddd16443..3a40bfb45 100644 --- a/src/nbla/cuda/communicator/data_parallel_communicator.cu +++ b/src/nbla/cuda/communicator/data_parallel_communicator.cu @@ -310,7 +310,7 @@ void DataParallelCommunicatorNccl::wait_by_streams_synchronization() { } template -void DataParallelCommunicatorNccl::divide_by_num_divices(bool division) { +void DataParallelCommunicatorNccl::divide_by_num_devices(bool division) { if (division) { for (int i = 0; i < device_ids_.size(); ++i) { auto device_id = device_ids_[i]; diff --git a/src/nbla/cuda/communicator/multi_process_data_parallel_communicator.cu b/src/nbla/cuda/communicator/multi_process_data_parallel_communicator.cu index fc61a98e6..ea2226de5 100644 --- a/src/nbla/cuda/communicator/multi_process_data_parallel_communicator.cu +++ b/src/nbla/cuda/communicator/multi_process_data_parallel_communicator.cu @@ -743,7 +743,7 @@ MultiProcessDataParallelCommunicatorNccl::AllReduceCallback:: unpack_stream_(parent.nonblocking_streams_[2]) { dtypes dtype = get_dtype(); - /* Split gpu_memory into buffers of size n_params_thoreshold */ + /* Split gpu_memory into buffers of size n_params_threshold */ Tc *buff = this->gpu_memory_->cast(dtype, this->parent_.ctx_)->pointer(); for (size_t i = 0; i < this->gpu_memory_->size() / this->n_params_threshold_; ++i) { @@ -817,7 +817,7 @@ void MultiProcessDataParallelCommunicatorNccl::AllReduceCallback:: /* Use a next GPU workspace in the packing phase */ this->release_workspace(this->workspace_, this->unpack_stream_); this->workspace_ = this->allocate_workspace(this->pack_stream_); - /* Notes: workspace_.n_param_buffered is initilaized in the above + /* Notes: workspace_.n_param_buffered is initialized in the above * function. */ } } @@ -895,7 +895,7 @@ void MultiProcessDataParallelCommunicatorNccl< /* Notify that this workspace is currently not used (i.e., unpacking phase is * completed). */ NBLA_CUDA_CHECK(cudaEventRecord(*workspace.event, stream)); - /* Store GPU memory into unused meomry space set. */ + /* Store GPU memory into unused memory space set. */ this->buffers_.emplace(workspace.gpu_buffer, workspace.event); } diff --git a/src/nbla/cuda/cuda.cpp b/src/nbla/cuda/cuda.cpp index 66199e3d2..85cecf3c5 100644 --- a/src/nbla/cuda/cuda.cpp +++ b/src/nbla/cuda/cuda.cpp @@ -86,7 +86,7 @@ std::shared_ptr Cuda::cuda_event(unsigned int flags, int device) { std::default_delete deleter; return std::shared_ptr( new cudaEvent_t(event), [this, device, flags, deleter](cudaEvent_t *ptr) { - /* This lambda funtion is a custum deleter of the std::shared_ptr. + /* This lambda function is a custom deleter of the std::shared_ptr. * It is invoked when deleting the managed cudaEvent_t. */ diff --git a/src/nbla/cuda/cudnn/function/generic/batch_normalization.cu b/src/nbla/cuda/cudnn/function/generic/batch_normalization.cu index a7da0c906..4f643849a 100644 --- a/src/nbla/cuda/cudnn/function/generic/batch_normalization.cu +++ b/src/nbla/cuda/cudnn/function/generic/batch_normalization.cu @@ -168,7 +168,7 @@ void BatchNormalizationCudaCudnn::backward_impl_batch( propagate_down, accum); return; } - // Commont inputs wrt. gradient. + // Common inputs wrt. gradient. 
const Tw *dy = outputs[0]->get_grad_pointer(this->ctx_); const void *m = batch_mean->data()->get(DRV_BN_T(), this->ctx_)->const_pointer(); diff --git a/src/nbla/cuda/cudnn/function/generic/convolution.cu b/src/nbla/cuda/cudnn/function/generic/convolution.cu index da74b24ca..33b616908 100644 --- a/src/nbla/cuda/cudnn/function/generic/convolution.cu +++ b/src/nbla/cuda/cudnn/function/generic/convolution.cu @@ -186,7 +186,7 @@ void ConvolutionCudaCudnn::backward_impl(const Variables &inputs, // Manually selecting algorithms is not supported for now. /* // Basically this functions is not invoked, -// because it is choosen by cudnnGetConvolutionForwardAlgorithm() +// because it is chosen by cudnnGetConvolutionForwardAlgorithm() template void ConvolutionCudaCudnn::set_cudnn_convolution_forward_algorithm( std::string algorithm) { @@ -215,7 +215,7 @@ void ConvolutionCudaCudnn::set_cudnn_convolution_forward_algorithm( } // Basically this functions is not invoked, -// because it is choosen by cudnnGetConvolutionBackwardFilterAlgorithm() +// because it is chosen by cudnnGetConvolutionBackwardFilterAlgorithm() template void ConvolutionCudaCudnn::set_cudnn_convolution_backward_filter_algorithm( std::string algorithm) { @@ -240,7 +240,7 @@ void ConvolutionCudaCudnn::set_cudnn_convolution_backward_filter_algorithm( } // Basically this functions is not invoked, -// because it is choosen by cudnnGetConvolutionBackwardDataAlgorithm() +// because it is chosen by cudnnGetConvolutionBackwardDataAlgorithm() template void ConvolutionCudaCudnn::set_cudnn_convolution_backward_data_algorithm( std::string algorithm) { diff --git a/src/nbla/cuda/cudnn/function/generic/function_impl.cu.tmpl b/src/nbla/cuda/cudnn/function/generic/function_impl.cu.tmpl index ce548be10..439a947e2 100644 --- a/src/nbla/cuda/cudnn/function/generic/function_impl.cu.tmpl +++ b/src/nbla/cuda/cudnn/function/generic/function_impl.cu.tmpl @@ -34,7 +34,7 @@ void ${name}CudaCudnn<${targs}>::setup_impl(const Variables &inputs, const Variables &outputs) { /* TODO: Write a setup implementation. - Note that, although it is called only when a compuation graph is + Note that, although it is called only when a computation graph is constructed in a static computation graph, in a dynamic computation graph, it's called every time. Keep the setup computation light for the performance (caching heavy computation, device synchronization in GPU etc.) @@ -66,14 +66,14 @@ void ${name}CudaCudnn<${targs}>::forward_impl(const Variables &inputs, /* TODO: remove this help message. The type `Variables` is a typedef of `vector`. - The `Variable` class owns storages of data (storage for forward propagation) + The `Variable` class owns storage of data (storage for forward propagation) and grad (for backprop) respectively. You can get a raw device pointer of a scalar type (template type suffixed with `cu`. See ${snake_name}.hpp for definitions.) of the storage using: - - `cosnt T* Variable::get_{data|grad}_pointer(ctx)` for read-only access. + - `const T* Variable::get_{data|grad}_pointer(ctx)` for read-only access. - `T* Variable::cast_{data|grad}_and_get_pointer(ctx)` for r/w access. By this, automatic type conversion would occur if data was held in a @@ -146,7 +146,7 @@ pp = '!(%s)' % ' || '.join(pp) /** TODO: remove this help message. The backward error signals are propagated through the graph, and the - error from decsendant functions are set in the grad region of the output variables. 
+ error from descendant functions are set in the grad region of the output variables. */ // Gradient of outputs % for i, (vout_name, vout) in enumerate(outputs.items()): diff --git a/src/nbla/cuda/function/generic/batch_normalization.cu b/src/nbla/cuda/function/generic/batch_normalization.cu index 52c7287af..5169ecb10 100644 --- a/src/nbla/cuda/function/generic/batch_normalization.cu +++ b/src/nbla/cuda/function/generic/batch_normalization.cu @@ -191,7 +191,7 @@ void BatchNormalizationCuda::backward_impl_batch( batch_mean = outputs[1]; batch_var = outputs[2]; } - // Commont inputs wrt. gradient. + // Common inputs wrt. gradient. const Tc *dy = outputs[0]->get_grad_pointer(this->ctx_); const Tc *m = batch_mean->get_data_pointer(this->ctx_); const Tc *v = batch_var->get_data_pointer(this->ctx_); diff --git a/src/nbla/cuda/function/generic/clip_grad_by_value.cu b/src/nbla/cuda/function/generic/clip_grad_by_value.cu index e080185fa..d038e8335 100644 --- a/src/nbla/cuda/function/generic/clip_grad_by_value.cu +++ b/src/nbla/cuda/function/generic/clip_grad_by_value.cu @@ -65,7 +65,7 @@ void ClipGradByValueCuda::backward_impl(const Variables &inputs, const vector &propagate_down, const vector &accum) { cuda_set_device(this->device_); - // No backward to min and max varialbes. + // No backward to min and max variables. if (!propagate_down[0]) { return; } diff --git a/src/nbla/cuda/function/generic/depthwise_convolution.cu b/src/nbla/cuda/function/generic/depthwise_convolution.cu index 788650ec6..808cd64ce 100644 --- a/src/nbla/cuda/function/generic/depthwise_convolution.cu +++ b/src/nbla/cuda/function/generic/depthwise_convolution.cu @@ -412,7 +412,7 @@ void DepthwiseConvolutionCuda::setup_impl(const Variables &inputs, cudaDeviceProp prop; cudaGetDeviceProperties(&prop, std::stoi(this->ctx_.device_id)); - // TODO: See the funcion definition of `max_threads_per_block_for_half` + // TODO: See the function definition of `max_threads_per_block_for_half` // found above. max_threads_per_block_ = max_threads_per_block_for_half::reduce(prop.maxThreadsPerBlock); diff --git a/src/nbla/cuda/function/generic/function_impl.cu.tmpl b/src/nbla/cuda/function/generic/function_impl.cu.tmpl index 2c4bea208..f8830c466 100644 --- a/src/nbla/cuda/function/generic/function_impl.cu.tmpl +++ b/src/nbla/cuda/function/generic/function_impl.cu.tmpl @@ -38,7 +38,7 @@ void ${name}Cuda<${targs}>::setup_impl(const Variables &inputs, done. See the base class implementation in nnabla, and add additional setup code if necessary. - Note that, although it is called only when a compuation graph is + Note that, although it is called only when a computation graph is constructed in a static computation graph, in a dynamic computation graph, it's called every time. Keep the setup computation light for the performance (caching heavy computation, device synchronization in GPU etc.) @@ -62,14 +62,14 @@ void ${name}Cuda<${targs}>::forward_impl(const Variables &inputs, /* TODO: remove this help message. The type `Variables` is a typedef of `vector`. - The `Variable` class owns storages of data (storage for forward propagation) + The `Variable` class owns storage of data (storage for forward propagation) and grad (for backprop) respectively. You can get a raw device pointer of a scalar type (template type suffixed with `cu`. See ${snake_name}.hpp for definitions.) of the storage using: - - `cosnt T* Variable::get_{data|grad}_pointer(ctx)` for read-only access. + - `const T* Variable::get_{data|grad}_pointer(ctx)` for read-only access. 
- `T* Variable::cast_{data|grad}_and_get_pointer(ctx)` for r/w access. By this, automatic type conversion would occur if data was held in a @@ -142,7 +142,7 @@ pp = '!(%s)' % ' || '.join(pp) /** TODO: remove this help message. The backward error signals are propagated through the graph, and the - error from decsendant functions are set in the grad region of the output variables. + error from descendant functions are set in the grad region of the output variables. */ // Gradient of outputs % for i, (vout_name, vout) in enumerate(outputs.items()): diff --git a/src/nbla/cuda/test/multi_process_data_parallel_communicator.cu b/src/nbla/cuda/test/multi_process_data_parallel_communicator.cu index 33c9101f0..14cd06bef 100644 --- a/src/nbla/cuda/test/multi_process_data_parallel_communicator.cu +++ b/src/nbla/cuda/test/multi_process_data_parallel_communicator.cu @@ -16,16 +16,16 @@ namespace nbla { -__global__ void cuda_incerement_vector(float *a) { +__global__ void cuda_increment_vector(float *a) { int i = blockIdx.x * blockDim.x + threadIdx.x; a[i] = a[i] + 1; } void increment_vector(cudaStream_t stream, float *vec, size_t size) { if (size <= 512) { - cuda_incerement_vector<<<1, size, 0, stream>>>(vec); + cuda_increment_vector<<<1, size, 0, stream>>>(vec); } else { - cuda_incerement_vector<<>>(vec); + cuda_increment_vector<<>>(vec); } } }
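For reference, a misspelling pass like the one in this patch can be reproduced with aspell's `list` mode. The commands below are a minimal sketch under the assumption that GNU Aspell with the `ccpp` filter is available; the exact invocation used for this patch is not recorded, and the files shown are just examples from the tree touched above.

```shell
# Spell-check comments and string literals of a C++/CUDA header (ccpp filter mode).
cat include/nbla/cuda/common.hpp | aspell --mode=ccpp list | sort -u

# Spell-check a markdown build document as plain text.
cat doc/build/build.md | aspell list | sort -u
```

`aspell list` prints every word it does not recognize, so the output still needs manual review: project-specific identifiers such as `cuDNN`, `NNabla`, or `nccl` are reported alongside genuine misspellings like the ones corrected here.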