diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index 1f6e5a2ab16..0f01216da52 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -117,133 +117,6 @@ void apply(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
-template <typename ValueType>
-void compute_dot(std::shared_ptr<const CudaExecutor> exec,
-                 const matrix::Dense<ValueType> *x,
-                 const matrix::Dense<ValueType> *y,
-                 matrix::Dense<ValueType> *result)
-{
-    if (cublas::is_supported<ValueType>::value) {
-        // TODO: write a custom kernel which does this more efficiently
-        for (size_type col = 0; col < x->get_size()[1]; ++col) {
-            cublas::dot(exec->get_cublas_handle(), x->get_size()[0],
-                        x->get_const_values() + col, x->get_stride(),
-                        y->get_const_values() + col, y->get_stride(),
-                        result->get_values() + col);
-        }
-    } else {
-        // TODO: these are tuning parameters obtained experimentally, once
-        // we decide how to handle this uniformly, they should be modified
-        // appropriately
-        constexpr auto work_per_thread = 32;
-        constexpr auto block_size = 1024;
-
-        constexpr auto work_per_block = work_per_thread * block_size;
-        const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block);
-        const dim3 block_dim{config::warp_size, 1,
-                             block_size / config::warp_size};
-        Array<ValueType> work(exec, grid_dim.x);
-        // TODO: write a kernel which does this more efficiently
-        for (size_type col = 0; col < x->get_size()[1]; ++col) {
-            kernel::compute_partial_dot<block_size><<<grid_dim, block_dim>>>(
-                x->get_size()[0], as_cuda_type(x->get_const_values() + col),
-                x->get_stride(), as_cuda_type(y->get_const_values() + col),
-                y->get_stride(), as_cuda_type(work.get_data()));
-            kernel::finalize_sum_reduce_computation<block_size>
-                <<<1, block_dim>>>(grid_dim.x,
-                                   as_cuda_type(work.get_const_data()),
-                                   as_cuda_type(result->get_values() + col));
-        }
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
-
-
-template <typename ValueType>
-void compute_conj_dot(std::shared_ptr<const CudaExecutor> exec,
-                      const matrix::Dense<ValueType> *x,
-                      const matrix::Dense<ValueType> *y,
-                      matrix::Dense<ValueType> *result)
-{
-    if (cublas::is_supported<ValueType>::value) {
-        // TODO: write a custom kernel which does this more efficiently
-        for (size_type col = 0; col < x->get_size()[1]; ++col) {
-            cublas::conj_dot(exec->get_cublas_handle(), x->get_size()[0],
-                             x->get_const_values() + col, x->get_stride(),
-                             y->get_const_values() + col, y->get_stride(),
-                             result->get_values() + col);
-        }
-    } else {
-        // TODO: these are tuning parameters obtained experimentally, once
-        // we decide how to handle this uniformly, they should be modified
-        // appropriately
-        constexpr auto work_per_thread = 32;
-        constexpr auto block_size = 1024;
-
-        constexpr auto work_per_block = work_per_thread * block_size;
-        const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block);
-        const dim3 block_dim{config::warp_size, 1,
-                             block_size / config::warp_size};
-        Array<ValueType> work(exec, grid_dim.x);
-        // TODO: write a kernel which does this more efficiently
-        for (size_type col = 0; col < x->get_size()[1]; ++col) {
-            kernel::compute_partial_conj_dot<block_size>
-                <<<grid_dim, block_dim>>>(
-                    x->get_size()[0], as_cuda_type(x->get_const_values() + col),
-                    x->get_stride(), as_cuda_type(y->get_const_values() + col),
-                    y->get_stride(), as_cuda_type(work.get_data()));
-            kernel::finalize_sum_reduce_computation<block_size>
-                <<<1, block_dim>>>(grid_dim.x,
-                                   as_cuda_type(work.get_const_data()),
-                                   as_cuda_type(result->get_values() + col));
-        }
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
-
-
-template <typename ValueType>
-void compute_norm2(std::shared_ptr<const CudaExecutor> exec,
-                   const matrix::Dense<ValueType> *x,
-                   matrix::Dense<remove_complex<ValueType>> *result)
-{
-    if (cublas::is_supported<ValueType>::value) {
-        for (size_type col = 0; col < x->get_size()[1]; ++col) {
-            cublas::norm2(exec->get_cublas_handle(), x->get_size()[0],
-                          x->get_const_values() + col, x->get_stride(),
-                          result->get_values() + col);
-        }
-    } else {
-        using norm_type = remove_complex<ValueType>;
-        // TODO: these are tuning parameters obtained experimentally, once
-        // we decide how to handle this uniformly, they should be modified
-        // appropriately
-        constexpr auto work_per_thread = 32;
-        constexpr auto block_size = 1024;
-
-        constexpr auto work_per_block = work_per_thread * block_size;
-        const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block);
-        const dim3 block_dim{config::warp_size, 1,
-                             block_size / config::warp_size};
-        Array<norm_type> work(exec, grid_dim.x);
-        // TODO: write a kernel which does this more efficiently
-        for (size_type col = 0; col < x->get_size()[1]; ++col) {
-            kernel::compute_partial_norm2<block_size><<<grid_dim, block_dim>>>(
-                x->get_size()[0], as_cuda_type(x->get_const_values() + col),
-                x->get_stride(), as_cuda_type(work.get_data()));
-            kernel::finalize_sqrt_reduce_computation<block_size>
-                <<<1, block_dim>>>(grid_dim.x,
-                                   as_cuda_type(work.get_const_data()),
-                                   as_cuda_type(result->get_values() + col));
-        }
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const CudaExecutor> exec,
                     const matrix::Dense<ValueType> *source,