diff --git a/Spiral Compilation Tests/cuda_experiments/tensor1/corecuda/base.spi b/Spiral Compilation Tests/cuda_experiments/tensor1/corecuda/base.spi
index 673b4198..32da9cb4 100644
--- a/Spiral Compilation Tests/cuda_experiments/tensor1/corecuda/base.spi
+++ b/Spiral Compilation Tests/cuda_experiments/tensor1/corecuda/base.spi
@@ -27,7 +27,7 @@ type runConfig = {
 // Executes the lambda on the GPU device.
 inl run' ({gridDim blockDim} : runConfig) f =
     // Global statements only get executed once.
-    global "raw_module = cp.RawModule(code=kernel, backend='nvcc', options=(\"-I G:/cutlass-3.3.0/include\",))"
+    global "raw_module = cp.RawModule(code=kernel, backend='nvcc', options=(\"-I G:\\\\nvidia-mathdx-24.01.0\\\\nvidia\\\\mathdx\\\\24.01\\\\include\", \"-I G:\\\\nvidia-mathdx-24.01.0\\\\nvidia\\\\mathdx\\\\24.01\\\\include\\\\cublasdx\\\\include\", \"-I G:\\\\nvidia-mathdx-24.01.0\\\\nvidia\\\\mathdx\\\\24.01\\\\external\\\\cutlass\\\\include\"))"
     inl kernel_i, vars = join_backend Cuda
         global "template <typename el, int dim> struct array { el v[dim]; };"
         f ()
diff --git a/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.py b/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.py
index cdc7de4c..f739266b 100644
--- a/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.py
+++ b/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.py
@@ -5,10 +5,11 @@
 #include
 #include
 template <typename el, int dim> struct array { el v[dim]; };
+#include <cublasdx.hpp>
+using namespace cublasdx;
+constexpr auto t_mode = cublasdx::transpose_mode::non_transposed;
 extern "C" __global__ void entry0() {
-    extern __shared__ int v0[];
-    typename cutlass::Operator::SharedStorage * v1;
-    v1 = reinterpret_cast<typename cutlass::Operator::SharedStorage *>(v0);
+    using GEMM = decltype(Size<32, 32, 32>() + Precision<double>() + Type<type::real>() + TransposeMode<t_mode, t_mode>() + Function<function::MM>() + SM<700>() + Block());
     return ;
 }
 """
@@ -17,7 +18,7 @@ from typing import NamedTuple, Union, Callable, Tuple
 i8 = i16 = i32 = i64 = u8 = u16 = u32 = u64 = int; f32 = f64 = float; char = string = str
 
-raw_module = cp.RawModule(code=kernel, backend='nvcc', options=("-I G:/cutlass-3.3.0/include",))
+raw_module = cp.RawModule(code=kernel, backend='nvcc', options=("-I G:\\nvidia-mathdx-24.01.0\\nvidia\\mathdx\\24.01\\include", "-I G:\\nvidia-mathdx-24.01.0\\nvidia\\mathdx\\24.01\\include\\cublasdx\\include", "-I G:\\nvidia-mathdx-24.01.0\\nvidia\\mathdx\\24.01\\external\\cutlass\\include"))
 def main():
     v0 = 0
     raw_module.get_function(f"entry{v0}")((1, 1, 1),(1, 1, 1),())
 
diff --git a/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.spi b/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.spi
index 5be20c0d..5a98f745 100644
--- a/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.spi
+++ b/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.spi
@@ -22,9 +22,9 @@ inl main() =
     inl blocks = 1
     inl grids = 1 // divup (length out) blocks
     run grids blocks (fun () =>
-        // Dynamic shared memory base pointer
-        inl shared_storage_base : $"int *" = $"extern __shared__ int v$[]"
-        // Declare pointer to dynamic shared memory.
-        inl shared_storage : $"typename cutlass::Operator::SharedStorage *" = $"reinterpret_cast<typename cutlass::Operator::SharedStorage *>(!shared_storage_base)"
+        global "#include <cublasdx.hpp>"
+        global "using namespace cublasdx;"
+        global "constexpr auto t_mode = cublasdx::transpose_mode::non_transposed;"
+        $"using GEMM = decltype(Size<32, 32, 32>() + Precision<double>() + Type<type::real>() + TransposeMode<t_mode, t_mode>() + Function<function::MM>() + SM<700>() + Block())"
         ()
     )
\ No newline at end of file
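
Note: the rewritten entry0 kernel only instantiates the cuBLASDx GEMM descriptor and returns, so this change is effectively a compile-only smoke test of the new mathdx include paths. For reference, here is a minimal sketch of how the same descriptor would be driven to an actual block-level multiply, following the shared-memory API shown in the cuBLASDx introduction example. The kernel name gemm_kernel, its alpha/a/b/beta/c parameters, and the staging loops are illustrative assumptions, not part of this diff.

#include <cublasdx.hpp>
using namespace cublasdx;

constexpr auto t_mode = cublasdx::transpose_mode::non_transposed;

using GEMM = decltype(Size<32, 32, 32>()
                      + Precision<double>()
                      + Type<type::real>()
                      + TransposeMode<t_mode, t_mode>()
                      + Function<function::MM>()
                      + SM<700>()
                      + Block());

// Sketch only: a block-level C = alpha * A * B + beta * C using the descriptor above.
extern "C" __global__ void gemm_kernel(double alpha, const double* a, const double* b,
                                       double beta, double* c) {
    // Dynamic shared memory holds the A, B and C tiles used by the block-wide GEMM.
    extern __shared__ __align__(16) char smem[];
    auto* sa = reinterpret_cast<double*>(smem);
    auto* sb = sa + GEMM::a_size;
    auto* sc = sb + GEMM::b_size;

    // Cooperatively stage the operands into shared memory (layout details glossed over).
    const unsigned tid  = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
    const unsigned step = blockDim.x * blockDim.y * blockDim.z;
    for (unsigned i = tid; i < GEMM::a_size; i += step) sa[i] = a[i];
    for (unsigned i = tid; i < GEMM::b_size; i += step) sb[i] = b[i];
    for (unsigned i = tid; i < GEMM::c_size; i += step) sc[i] = c[i];
    __syncthreads();

    // The whole thread block computes the multiply collectively.
    GEMM().execute(alpha, sa, sb, beta, sc);
    __syncthreads();

    // Write the accumulated C tile back to global memory.
    for (unsigned i = tid; i < GEMM::c_size; i += step) c[i] = sc[i];
}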
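
On the host side, a real run would also need to launch with the block shape and dynamic shared memory the descriptor expects (GEMM::block_dim and GEMM::shared_memory_size), rather than the (1, 1, 1),(1, 1, 1),() smoke-test launch in matmul.py. A hedged CuPy sketch, with the block shape and tile sizes assumed here rather than queried from the C++ side:

import numpy as np
import cupy as cp

m = n = k = 32                               # matches Size<32, 32, 32>
a = cp.random.rand(m, k, dtype=cp.float64)   # operand layout/leading dims glossed over
b = cp.random.rand(k, n, dtype=cp.float64)
c = cp.zeros((m, n), dtype=cp.float64)

gemm = raw_module.get_function("gemm_kernel")
block_dim = (256, 1, 1)                      # assumed; the real value is GEMM::block_dim
shared_bytes = (m * k + k * n + m * n) * 8   # A, B and C tiles in double precision
gemm((1, 1, 1), block_dim,
     (np.float64(1.0), a, b, np.float64(0.0), c),
     shared_mem=shared_bytes)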