diff --git a/Spiral Compilation Tests/cuda_experiments/tensor1/corecuda/base.spi b/Spiral Compilation Tests/cuda_experiments/tensor1/corecuda/base.spi
index 673b4198..32da9cb4 100644
--- a/Spiral Compilation Tests/cuda_experiments/tensor1/corecuda/base.spi
+++ b/Spiral Compilation Tests/cuda_experiments/tensor1/corecuda/base.spi
@@ -27,7 +27,7 @@ type runConfig = {
 // Executes the lambda on the GPU device.
 inl run' ({gridDim blockDim} : runConfig) f =
     // Global statements only get executed once.
-    global "raw_module = cp.RawModule(code=kernel, backend='nvcc', options=(\"-I G:/cutlass-3.3.0/include\",))"
+    global "raw_module = cp.RawModule(code=kernel, backend='nvcc', options=(\"-I G:\\\\nvidia-mathdx-24.01.0\\\\nvidia\\\\mathdx\\\\24.01\\\\include\", \"-I G:\\\\nvidia-mathdx-24.01.0\\\\nvidia\\\\mathdx\\\\24.01\\\\include\\\\cublasdx\\\\include\", \"-I G:\\\\nvidia-mathdx-24.01.0\\\\nvidia\\\\mathdx\\\\24.01\\\\external\\\\cutlass\\\\include\"))"
     inl kernel_i, vars = join_backend Cuda
         global "template <typename el, int dim> struct array { el v[dim]; };"
         f ()
diff --git a/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.py b/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.py
index cdc7de4c..f739266b 100644
--- a/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.py
+++ b/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.py
@@ -5,10 +5,11 @@
 #include
 #include
 template <typename el, int dim> struct array { el v[dim]; };
+#include <cublasdx.hpp>
+using namespace cublasdx;
+constexpr auto t_mode = cublasdx::transpose_mode::non_transposed;
 extern "C" __global__ void entry0() {
-    extern __shared__ int v0[];
-    typename cutlass::Operator::SharedStorage * v1;
-    v1 = reinterpret_cast<typename cutlass::Operator::SharedStorage *>(v0);
+    using GEMM = decltype(Size<32, 32, 32>() + Precision<double>() + Type<type::real>() + TransposeMode<t_mode, t_mode>() + Function<function::MM>() + SM<700>() + Block());
     return ;
 }
 """
@@ -17,7 +18,7 @@ from typing import NamedTuple, Union, Callable, Tuple
 i8 = i16 = i32 = i64 = u8 = u16 = u32 = u64 = int; f32 = f64 = float; char = string = str
 
-raw_module = cp.RawModule(code=kernel, backend='nvcc', options=("-I G:/cutlass-3.3.0/include",))
+raw_module = cp.RawModule(code=kernel, backend='nvcc', options=("-I G:\\nvidia-mathdx-24.01.0\\nvidia\\mathdx\\24.01\\include", "-I G:\\nvidia-mathdx-24.01.0\\nvidia\\mathdx\\24.01\\include\\cublasdx\\include", "-I G:\\nvidia-mathdx-24.01.0\\nvidia\\mathdx\\24.01\\external\\cutlass\\include"))
 def main():
     v0 = 0
     raw_module.get_function(f"entry{v0}")((1, 1, 1),(1, 1, 1),())
 
diff --git a/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.spi b/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.spi
index 5be20c0d..5a98f745 100644
--- a/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.spi
+++ b/Spiral Compilation Tests/cuda_experiments/tensor1/matmul.spi
@@ -22,9 +22,9 @@ inl main() =
     inl blocks = 1
     inl grids = 1 // divup (length out) blocks
     run grids blocks (fun () =>
-        // Dynamic shared memory base pointer
-        inl shared_storage_base : $"int *" = $"extern __shared__ int v$[]"
-        // Declare pointer to dynamic shared memory.
-        inl shared_storage : $"typename cutlass::Operator::SharedStorage *" = $"reinterpret_cast<typename cutlass::Operator::SharedStorage *>(!shared_storage_base)"
+        global "#include <cublasdx.hpp>"
+        global "using namespace cublasdx;"
+        global "constexpr auto t_mode = cublasdx::transpose_mode::non_transposed;"
+        $"using GEMM = decltype(Size<32, 32, 32>() + Precision<double>() + Type<type::real>() + TransposeMode<t_mode, t_mode>() + Function<function::MM>() + SM<700>() + Block())"
         ()
     )
\ No newline at end of file
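
Note: the rewritten entry0 kernel only instantiates the cuBLASDx GEMM descriptor and returns, so this change is effectively a compile-only smoke test of the new mathdx include paths. For reference, here is a minimal sketch of how the same descriptor would be driven to an actual block-level multiply, following the shared-memory API shown in the cuBLASDx introduction example. The kernel name gemm_kernel, its alpha/a/b/beta/c parameters, and the staging loops are illustrative assumptions, not part of this diff.

#include <cublasdx.hpp>
using namespace cublasdx;

constexpr auto t_mode = cublasdx::transpose_mode::non_transposed;

using GEMM = decltype(Size<32, 32, 32>()
                      + Precision<double>()
                      + Type<type::real>()
                      + TransposeMode<t_mode, t_mode>()
                      + Function<function::MM>()
                      + SM<700>()
                      + Block());

// Sketch only: a block-level C = alpha * A * B + beta * C using the descriptor above.
extern "C" __global__ void gemm_kernel(double alpha, const double* a, const double* b,
                                       double beta, double* c) {
    // Dynamic shared memory holds the A, B and C tiles used by the block-wide GEMM.
    extern __shared__ __align__(16) char smem[];
    auto* sa = reinterpret_cast<double*>(smem);
    auto* sb = sa + GEMM::a_size;
    auto* sc = sb + GEMM::b_size;

    // Cooperatively stage the operands into shared memory (layout details glossed over).
    const unsigned tid  = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
    const unsigned step = blockDim.x * blockDim.y * blockDim.z;
    for (unsigned i = tid; i < GEMM::a_size; i += step) sa[i] = a[i];
    for (unsigned i = tid; i < GEMM::b_size; i += step) sb[i] = b[i];
    for (unsigned i = tid; i < GEMM::c_size; i += step) sc[i] = c[i];
    __syncthreads();

    // The whole thread block computes the multiply collectively.
    GEMM().execute(alpha, sa, sb, beta, sc);
    __syncthreads();

    // Write the accumulated C tile back to global memory.
    for (unsigned i = tid; i < GEMM::c_size; i += step) c[i] = sc[i];
}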
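
On the host side, a real run would also need to launch with the block shape and dynamic shared memory the descriptor expects (GEMM::block_dim and GEMM::shared_memory_size), rather than the (1, 1, 1),(1, 1, 1),() smoke-test launch in matmul.py. A hedged CuPy sketch, with the block shape and tile sizes assumed here rather than queried from the C++ side:

import numpy as np
import cupy as cp

m = n = k = 32                               # matches Size<32, 32, 32>
a = cp.random.rand(m, k, dtype=cp.float64)   # operand layout/leading dims glossed over
b = cp.random.rand(k, n, dtype=cp.float64)
c = cp.zeros((m, n), dtype=cp.float64)

gemm = raw_module.get_function("gemm_kernel")
block_dim = (256, 1, 1)                      # assumed; the real value is GEMM::block_dim
shared_bytes = (m * k + k * n + m * n) * 8   # A, B and C tiles in double precision
gemm((1, 1, 1), block_dim,
     (np.float64(1.0), a, b, np.float64(0.0), c),
     shared_mem=shared_bytes)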