NVIDIA · luitjens · Feb 1, 2023 · Jan 21, 2023
diff --git a/bench/00_transform/svd_power.cu b/bench/00_transform/svd_power.cu
@@ -0,0 +1,69 @@
+#include "matx.h"
+#include <nvbench/nvbench.cuh>
+#include "matx/core/nvtx.h"
+
+using namespace matx;
+
+using svd_types =
+    nvbench::type_list<float, double, cuda::std::complex<float>, cuda::std::complex<double>>;
+
+/* SVD benchmarks */
+// Benchmarks batched svdpi() for one element type. Tensor extents are read from the
+// "batch", "rows" and "cols" axes of `state`; the type_list argument only selects ValueType.
+template <typename ValueType>
+void svdpi_batch(nvbench::state &state, nvbench::type_list<ValueType>)
+{
+  using AType = ValueType;
+  using SType = typename inner_op_type_t<AType>::type;
+
+  // Setup and warm-up run on stream 0, which is also registered with nvbench.
+  cudaStream_t stream = 0;
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
+
+  int batch = static_cast<int>(state.get_int64("batch"));
+  int m = static_cast<int>(state.get_int64("rows"));
+  int n = static_cast<int>(state.get_int64("cols"));
+  int r = std::min(n, m);
+  auto A = make_tensor<AType>({batch, m, n});
+  auto U = make_tensor<AType>({batch, m, r});
+  auto VT = make_tensor<AType>({batch, r, n});
+  auto S = make_tensor<SType>({batch, r});
+
+  // One generator backs both the x0 argument of svdpi() and the contents of A.
+  randomGenerator_t<AType> gen(batch * m * n, 0);
+  auto x0 = gen.GetTensorView({batch, r}, NORMAL);
+
+  // Iteration count derived from the constants below and the column count n.
+  SType epsilon = SType(.0001);
+  SType delta = SType(.97);
+  SType lamda = SType(2);
+  auto bound = SType(4) * log(SType(2 * n) / delta) / (epsilon * delta);
+  int iterations = int(log(bound) / (SType(2) * lamda));
+
+  auto random = gen.GetTensorView({batch, m, n}, NORMAL);
+  (A = random).run(stream);
+  A.PrefetchDevice(stream);
+  U.PrefetchDevice(stream);
+  S.PrefetchDevice(stream);
+  VT.PrefetchDevice(stream);
+  // Zero the outputs before the first call.
+  (U = 0).run(stream);
+  (S = 0).run(stream);
+  (VT = 0).run(stream);
+
+  // Untimed warm-up call, followed by a device-wide sync before measurement starts.
+  nvtxRangePushA("Warmup");
+  svdpi(U, S, VT, A, x0, iterations, stream, r);
+  cudaDeviceSynchronize();
+  nvtxRangePop();
+
+  MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
+  state.exec([&](nvbench::launch &launch) {
+    svdpi(U, S, VT, A, x0, iterations, launch.get_stream(), r); });
+  MATX_NVTX_END_RANGE( 1 )
+}
+// Register svdpi_batch for every type in svd_types, swept over the int64 axes below.
+NVBENCH_BENCH_TYPES(svdpi_batch, NVBENCH_TYPE_AXES(svd_types))
+  .add_int64_axis("cols", {4, 16, 64})
+  .add_int64_axis("rows", {4})
+  .add_int64_axis("batch", {3000});
diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt
@@ -4,6 +4,7 @@ set (bench_sources
     00_transform/conv.cu
     00_transform/cub.cu
     00_transform/einsum.cu
+    00_transform/svd_power.cu
     00_operators/operators.cu
     00_operators/reduction.cu
     01_radar/SingleChanSimplePipeline.cu

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -10,6 +10,7 @@ set(examples
     spectrogram 
     spectrogram_graph
     spherical_harmonics 
+    svd_power
     black_scholes)
 
 add_library(example_lib INTERFACE)

diff --git a/examples/svd_power.cu b/examples/svd_power.cu
@@ -0,0 +1,156 @@
+////////////////////////////////////////////////////////////////////////////////
+// BSD 3-Clause License
+//
+// Copyright (c) 2021, NVIDIA Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from
+//    this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/////////////////////////////////////////////////////////////////////////////////
+
+#include "matx.h"
+#include <cassert>
+#include <cstdio>
+#include <math.h>
+
+using namespace matx;
+
+// Example: batched SVD via power iteration (svdpi) on randomly filled matrices.
+// With batch == 1 the factors, their orthogonality products and the reconstruction
+// residual A - U*diag(S)*VT are printed as well; larger batches only run svdpi().
+int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
+{
+  MATX_ENTER_HANDLER();
+
+  // Element type of A, U and VT; the commented-out alias selects a real-valued run.
+  //using AType = float;
+  using AType = cuda::std::complex<float>;
+  using SType = float;
+
+  SType epsilon = SType(.0001);
+  SType delta = SType(.97);
+  SType lamda = SType(2);
+
+  cudaStream_t stream = 0;
+  int batch = 2;
+  int m = 4;
+  int n = 4;
+
+  // r = min(m, n) sizes U, S, VT and x0 and is passed as the last svdpi() argument.
+  int r = std::min(n,m);
+
+  auto A = make_tensor<AType>({batch, m, n});
+  auto U = make_tensor<AType>({batch, m, r});
+  auto VT = make_tensor<AType>({batch, r, n});
+  auto S = make_tensor<SType>({batch, r});
+
+  randomGenerator_t<AType> gen(A.TotalSize(),0);
+  auto x0 = gen.GetTensorView({batch, r}, NORMAL);
+
+  // The formula-based count is evaluated but then overridden by a fixed value of 10,
+  // which is what svdpi() actually receives.
+  int iterations = int(log(SType(4) * log ( SType(2 * n) / delta) / (epsilon * delta)) / (SType(2) * lamda));
+  iterations = 10;
+  printf("iterations: %d\n", iterations);
+
+  // Fill A from the same generator using the NORMAL distribution.
+  auto random = gen.GetTensorView({batch, m, n}, NORMAL);
+  (A = random).run(stream);
+
+  A.PrefetchDevice(stream);
+  U.PrefetchDevice(stream);
+  S.PrefetchDevice(stream);
+  VT.PrefetchDevice(stream);
+
+  (U = 0).run(stream);
+  (S = 0).run(stream);
+  (VT = 0).run(stream);
+
+  svdpi(U, S, VT, A, x0, iterations, stream, r);
+
+  cudaDeviceSynchronize();
+
+  // Detailed verification output is only produced for a single matrix.
+  if(batch == 1) {
+    auto UD = make_tensor<AType>({batch, m, r});
+    auto UDVT = make_tensor<AType>({batch, m, n});
+    auto UUT = make_tensor<AType>({batch, m, m});
+    auto UTU = make_tensor<AType>({batch, r, r});
+    auto VVT = make_tensor<AType>({batch, n, n});
+    auto VTV = make_tensor<AType>({batch, r, r});
+
+    printf("S\n");
+    Print(S);
+    printf("U\n");
+    Print(U);
+    printf("VT\n");
+    Print(VT);
+
+    // U is m x r, so U * U^H (m x m) can only be the identity when r == m, i.e. m <= n.
+    if( m <=  n) {
+      printf("UUT:\n");
+      matmul(UUT, U, conj(transpose(U)), stream);
+      Print(UUT);
+    }
+
+    printf("UTU:\n");
+    matmul(UTU, conj(transpose(U)) , U, stream);
+    Print(UTU);
+
+    // VT is r x n, so VT^H * VT (n x n) can only be the identity when r == n, i.e. n <= m.
+    // Note that this is the opposite bound from the UUT check above.
+    if( n <= m) {
+      printf("VVT:\n");
+      matmul(VVT, conj(transpose(VT)), VT, stream);
+      Print(VVT);
+    }
+
+    printf("VTV:\n");
+    matmul(VTV, VT, conj(transpose(VT)), stream); // works on r x r
+    Print(VTV);
+
+    // Clone S along the row dimension so that U * D scales column j of U by S[j],
+    // which is equivalent to multiplying U by the diagonal matrix of singular values.
+    std::array<index_t, U.Rank()> Dshape;
+    Dshape.fill(matxKeepDim);
+    Dshape[U.Rank()-2] = m;
+    auto D = clone<U.Rank()>(S, Dshape);
+    (UD = U * D).run(stream);
+    matmul(UDVT, UD, VT, stream);
+
+    printf("A\n");
+    Print(A);
+    printf("UDV\n");
+    Print(UDVT);
+
+    // Overwrite A with the reconstruction residual (expected near zero on convergence).
+    (A = A - UDVT).run(stream);
+    printf("A-UDVT\n");
+    Print(A);
+  }
+
+  CUDA_CHECK_LAST_ERROR();
+  MATX_EXIT_HANDLER();
+}
diff --git a/include/matx/generators/diag.h b/include/matx/generators/diag.h
@@ -139,6 +139,6 @@ namespace matx
    */
   template <typename T = int, int RANK> inline auto eye(const index_t (&s)[RANK])
   {
-    return eye(detail::to_array(s));
+    return eye<T>(detail::to_array(s));
   }
 } // end namespace matx
diff --git a/include/matx/generators/random.h b/include/matx/generators/random.h
@@ -202,9 +202,8 @@ template <typename T> class randomGenerator_t {
    * @returns
    *   A randomTensorView_t with given parameters
    */
-  template <int RANK, typename ShapeType,
-           std::enable_if_t<!std::is_array_v<typename remove_cvref<ShapeType>::type>, bool> = true>
-  inline auto GetTensorView(ShapeType &&shape, Distribution_t dist,
+  template <std::size_t RANK>
+  inline auto GetTensorView(const std::array<index_t, RANK> &shape, Distribution_t dist,
                             T alpha = 1, T beta = 0)
 {
   return randomTensorView_t<T, RANK>(shape, states_, dist, alpha, beta);
@@ -228,7 +227,7 @@ template <typename T> class randomGenerator_t {
   inline auto GetTensorView(const index_t (&sizes)[RANK], Distribution_t dist,
                             T alpha = 1, T beta = 0)
   {
-    return GetTensorView<RANK>(detail::to_array(sizes), dist, alpha, beta);
+    return GetTensorView(detail::to_array(sizes), dist, alpha, beta);
   }
 
   /**

diff --git a/include/matx/operators/clone.h b/include/matx/operators/clone.h
@@ -115,7 +115,7 @@ namespace matx
    * Each element is either the size of the cloned dimension or matxKeepDim to be from the source tensor
    * @return operator to compute the cloned value
    */
-  template <int Rank, typename Op>
+  template <std::size_t Rank, typename Op>
     auto __MATX_INLINE__ clone(Op t, const std::array<index_t, Rank> &shape)
     {
       if constexpr (is_tensor_view_v<Op>) {