NVIDIA · cliffburdick · Mar 17, 2023 · Mar 16, 2023
diff --git a/include/matx/core/dlpack.h b/include/matx/core/dlpack.h
@@ -0,0 +1,232 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file dlpack.h
+ * \brief The common header of DLPack.
+ */
+#ifndef DLPACK_DLPACK_H_
+#define DLPACK_DLPACK_H_
+
+/**
+ * \brief Compatibility with C++
+ */
+#ifdef __cplusplus
+#define DLPACK_EXTERN_C extern "C"
+#else
+#define DLPACK_EXTERN_C
+#endif
+
+/*! \brief The current version of dlpack */
+#define DLPACK_VERSION 80
+
+/*! \brief The current ABI version of dlpack */
+#define DLPACK_ABI_VERSION 1
+
+/*! \brief DLPACK_DLL prefix for windows */
+#ifdef _WIN32
+#ifdef DLPACK_EXPORTS
+#define DLPACK_DLL __declspec(dllexport)
+#else
+#define DLPACK_DLL __declspec(dllimport)
+#endif
+#else
+#define DLPACK_DLL
+#endif
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!
+ * \brief The device type in DLDevice.
+ */
+#ifdef __cplusplus
+typedef enum : int32_t {
+#else
+typedef enum {
+#endif
+  /*! \brief CPU device */
+  kDLCPU = 1,
+  /*! \brief CUDA GPU device */
+  kDLCUDA = 2,
+  /*!
+   * \brief Pinned CUDA CPU memory by cudaMallocHost
+   */
+  kDLCUDAHost = 3,
+  /*! \brief OpenCL devices. */
+  kDLOpenCL = 4,
+  /*! \brief Vulkan buffer for next generation graphics. */
+  kDLVulkan = 7,
+  /*! \brief Metal for Apple GPU. */
+  kDLMetal = 8,
+  /*! \brief Verilog simulator buffer */
+  kDLVPI = 9,
+  /*! \brief ROCm GPUs for AMD GPUs */
+  kDLROCM = 10,
+  /*!
+   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
+   */
+  kDLROCMHost = 11,
+  /*!
+   * \brief Reserved extension device type,
+   * used to quickly test an extension device.
+   * The semantics can differ depending on the implementation.
+   */
+  kDLExtDev = 12,
+  /*!
+   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
+   */
+  kDLCUDAManaged = 13,
+  /*!
+   * \brief Unified shared memory allocated on a oneAPI non-partitioned
+   * device. A call to the oneAPI runtime is required to determine the device
+   * type, the USM allocation type and the sycl context it is bound to.
+   *
+   */
+  kDLOneAPI = 14,
+  /*! \brief GPU support for next generation WebGPU standard. */
+  kDLWebGPU = 15,
+  /*! \brief Qualcomm Hexagon DSP */
+  kDLHexagon = 16,
+} DLDeviceType;
+
+/*!
+ * \brief A device (device type plus device index) for a tensor or operator.
+ */
+typedef struct {
+  /*! \brief The device type used in the device. */
+  DLDeviceType device_type;
+  /*!
+   * \brief The device index.
+   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
+   */
+  int32_t device_id;
+} DLDevice;
+
+/*!
+ * \brief The type code options for the `code` field of DLDataType.
+ */
+typedef enum {
+  /*! \brief signed integer */
+  kDLInt = 0U,
+  /*! \brief unsigned integer */
+  kDLUInt = 1U,
+  /*! \brief IEEE floating point */
+  kDLFloat = 2U,
+  /*!
+   * \brief Opaque handle type, reserved for testing purposes.
+   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
+   */
+  kDLOpaqueHandle = 3U,
+  /*! \brief bfloat16 */
+  kDLBfloat = 4U,
+  /*!
+   * \brief complex number
+   * (C/C++/Python layout: compact struct per complex number)
+   */
+  kDLComplex = 5U,
+  /*! \brief boolean */
+  kDLBool = 6U,
+} DLDataTypeCode;
+
+/*!
+ * \brief The data type the tensor can hold. The data type is assumed to follow the
+ * native endianness. An explicit error message should be raised when attempting to
+ * export an array with non-native endianness.
+ *
+ *  Examples
+ *   - float: type_code = 2, bits = 32, lanes = 1
+ *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
+ *   - int8: type_code = 0, bits = 8, lanes = 1
+ *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
+ */
+typedef struct {
+  /*!
+   * \brief Type code of base types.
+   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+   * footprint, but the value should be one of DLDataTypeCode enum values.
+   * */
+  uint8_t code;
+  /*!
+   * \brief Number of bits per lane, common choices are 8, 16, 32.
+   */
+  uint8_t bits;
+  /*! \brief Number of lanes in the type, used for vector types. */
+  uint16_t lanes;
+} DLDataType;
+
+/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */
+typedef struct {
+  /*!
+   * \brief The data pointer points to the allocated data. This will be CUDA
+   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
+   * types. This pointer is always aligned to 256 bytes as in CUDA. The
+   * `byte_offset` field should be used to point to the beginning of the data.
+   *
+   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
+   * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
+   * on CPU/CUDA/ROCm, and always use `byte_offset=0`.  This must be fixed
+   * (after which this note will be updated); at the moment it is recommended
+   * to not rely on the data pointer being correctly aligned.
+   *
+   * For given DLTensor, the size of memory required to store the contents of
+   * data is calculated as follows:
+   *
+   * \code{.c}
+   * static inline size_t GetDataSize(const DLTensor* t) {
+   *   size_t size = 1;
+   *   for (int32_t i = 0; i < t->ndim; ++i) {
+   *     size *= t->shape[i];
+   *   }
+   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+   *   return size;
+   * }
+   * \endcode
+   */
+  void* data;
+  /*! \brief The device of the tensor */
+  DLDevice device;
+  /*! \brief Number of dimensions */
+  int32_t ndim;
+  /*! \brief The data type of the pointer */
+  DLDataType dtype;
+  /*! \brief The shape of the tensor */
+  int64_t* shape;
+  /*!
+   * \brief strides of the tensor (in number of elements, not bytes)
+   *  can be NULL, indicating tensor is compact and row-major.
+   */
+  int64_t* strides;
+  /*! \brief The offset in bytes to the beginning pointer to data */
+  uint64_t byte_offset;
+} DLTensor;
+
+/*!
+ * \brief C Tensor object, manage memory of DLTensor. This data structure is
+ *  intended to facilitate the borrowing of DLTensor by another framework. It is
+ *  not meant to transfer the tensor. When the borrowing framework doesn't need
+ *  the tensor, it should call the deleter to notify the host that the resource
+ *  is no longer needed.
+ */
+typedef struct DLManagedTensor {
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+  /*! \brief the context of the original host framework of DLManagedTensor in
+   *   which DLManagedTensor is used in the framework. It can also be NULL.
+   */
+  void * manager_ctx;
+  /*! \brief Destructor, signature void (*)(struct DLManagedTensor*) - this should
+   *   be called to destruct manager_ctx which holds the DLManagedTensor. It can be
+   *   NULL if there is no way for the caller to provide a reasonable destructor.
+   *   The destructor deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensor * self);
+} DLManagedTensor;
+#ifdef __cplusplus
+}  // DLPACK_EXTERN_C
+#endif
+#endif  // DLPACK_DLPACK_H_
diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h
@@ -45,6 +45,7 @@
 #include "matx/core/storage.h"
 #include "matx/core/tensor_impl.h"
 #include "matx/core/tensor_utils.h"
+#include "matx/core/dlpack.h"
 #include "matx/kernels/utility.cuh"
 
 static constexpr int MAX_TENSOR_DIM = 4;
@@ -1744,6 +1745,91 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
       }, tup);
   }   
 
+  /**
+   * @brief Get a DLPack v0.8 structure representing the tensor
+   *
+   * DLPack is a commonly-used tensor memory layout format for moving tensors between libraries. This function
+   * returns a newly allocated DLManagedTensor describing this tensor_t. The tensor data is not copied and
+   * manager_ctx is left null (NOTE(review): nothing here appears to keep the storage alive for the consumer --
+   * confirm). The caller is responsible for freeing the descriptor by calling ->deleter(self).
+   *
+   * @returns Pointer to a new DLManagedTensor. The caller must call the deleter function when finished.
+   */
+  DLManagedTensor *GetDLPackTensor() const {
+    CUpointer_attribute attr[] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL};
+    CUmemorytype mem_type;
+    int dev_ord;
+    void *data[2] = {&mem_type, &dev_ord};
+
+    // Work out the device first, before any allocation, so that nothing is leaked if one of the
+    // checks below aborts this call (NOTE(review): assumes the MATX_ASSERT macros throw -- confirm).
+    DLDevice device;
+    device.device_type = kDLCPU;
+    device.device_id = 0;
+
+    // Determine where this memory resides
+    auto kind = GetPointerKind(this->ldata_);
+    auto mem_res = cuPointerGetAttributes(sizeof(attr)/sizeof(attr[0]), attr, data, reinterpret_cast<CUdeviceptr>(this->ldata_));
+    MATX_ASSERT_STR_EXP(mem_res, CUDA_SUCCESS, matxCudaError, "Error returned from cuPointerGetAttributes");
+    if (kind == MATX_INVALID_MEMORY) {
+      // No record of this pointer: rely on what the driver reported
+      if (mem_type == CU_MEMORYTYPE_DEVICE) {
+        device.device_type = kDLCUDA;
+        device.device_id = dev_ord;
+      }
+      else {
+        device.device_type = kDLCPU;
+      }
+    }
+    else {
+      // We have a record of this pointer and can map it from the record
+      switch (kind) {
+        case MATX_MANAGED_MEMORY:
+        case MATX_DEVICE_MEMORY:
+        case MATX_ASYNC_DEVICE_MEMORY:
+          device.device_type = kDLCUDA;
+          device.device_id = dev_ord;
+          break;
+        case MATX_HOST_MEMORY:
+          device.device_type = kDLCUDAHost;
+          device.device_id = dev_ord;
+          break;
+        case MATX_HOST_MALLOC_MEMORY:
+          device.device_type = kDLCPU;
+          break;
+        default:
+          MATX_ASSERT_STR(false, matxCudaError, "Cannot determine kind of memory");
+          break;
+      }
+    }
+
+    auto mt = new DLManagedTensor;
+    DLTensor *t = &mt->dl_tensor;
+    t->data = static_cast<void*>(this->ldata_);
+    t->device = device;
+    t->ndim = RANK;
+    t->dtype = detail::TypeToDLPackType<T>();
+    t->shape = new int64_t[RANK];
+    t->strides = new int64_t[RANK];
+    for (int r = 0; r < RANK; r++) {
+      t->shape[r] = this->Size(r);
+      t->strides[r] = this->Stride(r);
+    }
+    t->byte_offset = 0;
+
+    mt->manager_ctx = nullptr;
+    //mt->flags = 0; // Only for v1.0 (DLManagedTensorVersioned)
+
+    // Releases the shape/stride arrays and then the descriptor itself. The descriptor must not
+    // be touched once it has been deleted, so there is nothing to reset afterwards.
+    mt->deleter = [](struct DLManagedTensor *mtv) {
+      delete [] mtv->dl_tensor.shape;
+      delete [] mtv->dl_tensor.strides;
+      delete mtv;
+    };
+
+    return mt;
+  }
 
 private:
   Storage storage_;

diff --git a/include/matx/core/tensor_utils.h b/include/matx/core/tensor_utils.h
@@ -36,6 +36,7 @@
 #include <functional>
 
 #include "matx/core/nvtx.h"
+#include "matx/core/dlpack.h"
 #include "matx/core/make_tensor.h"
 
 namespace matx
@@ -448,6 +449,51 @@ namespace detail {
   }
 
 
+  /**
+   * Translate a MatX element type into the matching DLPack data type descriptor.
+   * Every listed T maps to {type code, total bits per element, 1 lane}; any other
+   * type falls through to the opaque-handle descriptor at the bottom.
+   *
+   * @tparam T Element type to describe
+   */
+  template <typename T> constexpr DLDataType TypeToDLPackType()
+  {
+    // Single- and double-precision complex
+    if constexpr (std::is_same_v<T, cuda::std::complex<float>>)  { return {kDLComplex, 64, 1}; }
+    if constexpr (std::is_same_v<T, cuda::std::complex<double>>) { return {kDLComplex, 128, 1}; }
+
+    // matxFp16/matxBf16 scalars (16 bits) and their complex forms (32 bits)
+    if constexpr (std::is_same_v<T, matxFp16>)        { return {kDLFloat, 16, 1}; }
+    if constexpr (std::is_same_v<T, matxBf16>)        { return {kDLBfloat, 16, 1}; }
+    if constexpr (std::is_same_v<T, matxFp16Complex>) { return {kDLComplex, 32, 1}; }
+    if constexpr (std::is_same_v<T, matxBf16Complex>) { return {kDLComplex, 32, 1}; } // Wrong, but no other choice: same descriptor as fp16 complex
+
+    // Native floating point
+    if constexpr (std::is_same_v<T, float>)  { return {kDLFloat, 32, 1}; }
+    if constexpr (std::is_same_v<T, double>) { return {kDLFloat, 64, 1}; }
+
+    // Fixed-width signed and unsigned integers
+    if constexpr (std::is_same_v<T, int8_t>)   { return {kDLInt, 8, 1}; }
+    if constexpr (std::is_same_v<T, int16_t>)  { return {kDLInt, 16, 1}; }
+    if constexpr (std::is_same_v<T, int32_t>)  { return {kDLInt, 32, 1}; }
+    if constexpr (std::is_same_v<T, int64_t>)  { return {kDLInt, 64, 1}; }
+    if constexpr (std::is_same_v<T, uint8_t>)  { return {kDLUInt, 8, 1}; }
+    if constexpr (std::is_same_v<T, uint16_t>) { return {kDLUInt, 16, 1}; }
+    if constexpr (std::is_same_v<T, uint32_t>) { return {kDLUInt, 32, 1}; }
+    if constexpr (std::is_same_v<T, uint64_t>) { return {kDLUInt, 64, 1}; }
+
+    if constexpr (std::is_same_v<T, bool>) {
+#if DLPACK_VERSION >= 80
+      return {kDLBool, 8, 1};
+#else
+      return {kDLUInt, 8, 1};
+#endif
+    }
+
+    return {kDLOpaqueHandle, 1, 1};
+  }
+
+
   /**
    * Print a value
    *