DLPack support
cliffburdick committed Mar 16, 2023
1 parent 015908f commit e6fface
Showing 4 changed files with 393 additions and 0 deletions.
232 changes: 232 additions & 0 deletions include/matx/core/dlpack.h
@@ -0,0 +1,232 @@
/*!
* Copyright (c) 2017 by Contributors
* \file dlpack.h
* \brief The common header of DLPack.
*/
#ifndef DLPACK_DLPACK_H_
#define DLPACK_DLPACK_H_

/**
* \brief Compatibility with C++
*/
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif

/*! \brief The current version of dlpack */
#define DLPACK_VERSION 80

/*! \brief The current ABI version of dlpack */
#define DLPACK_ABI_VERSION 1

/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
#ifdef DLPACK_EXPORTS
#define DLPACK_DLL __declspec(dllexport)
#else
#define DLPACK_DLL __declspec(dllimport)
#endif
#else
#define DLPACK_DLL
#endif

#include <stdint.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif
/*!
* \brief The device type in DLDevice.
*/
#ifdef __cplusplus
typedef enum : int32_t {
#else
typedef enum {
#endif
/*! \brief CPU device */
kDLCPU = 1,
/*! \brief CUDA GPU device */
kDLCUDA = 2,
/*!
* \brief Pinned CUDA CPU memory allocated by cudaMallocHost
*/
kDLCUDAHost = 3,
/*! \brief OpenCL devices. */
kDLOpenCL = 4,
/*! \brief Vulkan buffer for next generation graphics. */
kDLVulkan = 7,
/*! \brief Metal for Apple GPU. */
kDLMetal = 8,
/*! \brief Verilog simulator buffer */
kDLVPI = 9,
/*! \brief ROCm GPUs for AMD GPUs */
kDLROCM = 10,
/*!
* \brief Pinned ROCm CPU memory allocated by hipMallocHost
*/
kDLROCMHost = 11,
/*!
* \brief Reserved extension device type,
* used to quickly test extension devices.
* The semantics can differ depending on the implementation.
*/
kDLExtDev = 12,
/*!
* \brief CUDA managed/unified memory allocated by cudaMallocManaged
*/
kDLCUDAManaged = 13,
/*!
* \brief Unified shared memory allocated on a oneAPI non-partitioned
* device. A call to the oneAPI runtime is required to determine the device
* type, the USM allocation type, and the SYCL context it is bound to.
*
*/
kDLOneAPI = 14,
/*! \brief GPU support for next generation WebGPU standard. */
kDLWebGPU = 15,
/*! \brief Qualcomm Hexagon DSP */
kDLHexagon = 16,
} DLDeviceType;

/*!
* \brief A Device for Tensor and operator.
*/
typedef struct {
/*! \brief The device type used in the device. */
DLDeviceType device_type;
/*!
* \brief The device index.
* For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
*/
int32_t device_id;
} DLDevice;

/*!
* \brief The type code options of DLDataType.
*/
typedef enum {
/*! \brief signed integer */
kDLInt = 0U,
/*! \brief unsigned integer */
kDLUInt = 1U,
/*! \brief IEEE floating point */
kDLFloat = 2U,
/*!
* \brief Opaque handle type, reserved for testing purposes.
* Frameworks need to agree on the handle data type for the exchange to be well-defined.
*/
kDLOpaqueHandle = 3U,
/*! \brief bfloat16 */
kDLBfloat = 4U,
/*!
* \brief complex number
* (C/C++/Python layout: compact struct per complex number)
*/
kDLComplex = 5U,
/*! \brief boolean */
kDLBool = 6U,
} DLDataTypeCode;

/*!
* \brief The data type the tensor can hold. The data type is assumed to follow the
* native endianness. An explicit error message should be raised when attempting to
* export an array with non-native endianness.
*
* Examples
* - float: type_code = 2, bits = 32, lanes = 1
* - float4 (vectorized 4 floats): type_code = 2, bits = 32, lanes = 4
* - int8: type_code = 0, bits = 8, lanes = 1
* - std::complex<float>: type_code = 5, bits = 64, lanes = 1
* - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
*/
typedef struct {
/*!
* \brief Type code of base types.
* We keep it uint8_t instead of DLDataTypeCode for minimal memory
* footprint, but the value should be one of DLDataTypeCode enum values.
*/
uint8_t code;
/*!
* \brief Number of bits, common choices are 8, 16, 32.
*/
uint8_t bits;
/*! \brief Number of lanes in the type, used for vector types. */
uint16_t lanes;
} DLDataType;

/*!
* \brief Plain C Tensor object, does not manage memory.
*/
typedef struct {
/*!
* \brief The data pointer points to the allocated data. This will be a CUDA
* device pointer or a cl_mem handle in OpenCL. It may be opaque on some device
* types. This pointer is always aligned to 256 bytes as in CUDA. The
* `byte_offset` field should be used to point to the beginning of the data.
*
* Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
* TVM, perhaps others) do not adhere to this 256-byte alignment requirement
* on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
* (after which this note will be updated); at the moment it is recommended
* to not rely on the data pointer being correctly aligned.
*
* For a given DLTensor, the size of memory required to store the contents of
* data is calculated as follows:
*
* \code{.c}
* static inline size_t GetDataSize(const DLTensor* t) {
* size_t size = 1;
for (int32_t i = 0; i < t->ndim; ++i) {
* size *= t->shape[i];
* }
* size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
* return size;
* }
* \endcode
*/
void* data;
/*! \brief The device of the tensor */
DLDevice device;
/*! \brief Number of dimensions */
int32_t ndim;
/*! \brief The data type of the tensor's elements */
DLDataType dtype;
/*! \brief The shape of the tensor */
int64_t* shape;
/*!
* \brief strides of the tensor (in number of elements, not bytes);
* can be NULL, indicating the tensor is compact and row-major.
*/
int64_t* strides;
/*! \brief The offset in bytes to the beginning of the data */
uint64_t byte_offset;
} DLTensor;

/*!
* \brief C Tensor object that manages the memory of a DLTensor. This data
* structure is intended to facilitate the borrowing of a DLTensor by another
* framework. It is not meant to transfer the tensor. When the borrowing
* framework no longer needs the tensor, it should call the deleter to notify
* the host that the resource is no longer needed.
*/
typedef struct DLManagedTensor {
/*! \brief DLTensor which is being memory managed */
DLTensor dl_tensor;
/*! \brief the context of the original host framework in which this
* DLManagedTensor is used. It can also be NULL.
*/
void * manager_ctx;
/*! \brief Destructor - this should be called to destruct the manager_ctx
* which holds the DLManagedTensor. It can be NULL if there is no way for
* the caller to provide a reasonable destructor.
* The destructor deletes the argument self as well.
*/
void (*deleter)(struct DLManagedTensor * self);
} DLManagedTensor;
#ifdef __cplusplus
} // DLPACK_EXTERN_C
#endif
#endif // DLPACK_DLPACK_H_
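
For orientation, the sketch below shows how a producer might populate these structures by hand for a plain CPU buffer. This is a minimal illustration, not part of the commit: the helper name is hypothetical, and under the ownership policy assumed here the wrapper owns only the shape array while the caller keeps ownership of the buffer.

#include <cstdint>
#include "matx/core/dlpack.h"

// Hypothetical helper: wrap an existing row-major 2D float buffer in a
// DLManagedTensor so a DLPack consumer can borrow it.
static DLManagedTensor *WrapCpuBuffer(float *buf, int64_t rows, int64_t cols) {
  auto *mt = new DLManagedTensor;
  mt->dl_tensor.data        = buf;
  mt->dl_tensor.device      = {kDLCPU, 0};
  mt->dl_tensor.ndim        = 2;
  mt->dl_tensor.dtype       = {kDLFloat, 32, 1};  // float: code = 2, bits = 32, lanes = 1
  mt->dl_tensor.shape       = new int64_t[2]{rows, cols};
  mt->dl_tensor.strides     = nullptr;            // NULL => compact, row-major
  mt->dl_tensor.byte_offset = 0;
  mt->manager_ctx           = nullptr;
  mt->deleter = [](DLManagedTensor *self) {
    delete [] self->dl_tensor.shape;              // the wrapper allocated only the shape
    delete self;                                  // buffer ownership stays with the caller
  };
  return mt;
}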
86 changes: 86 additions & 0 deletions include/matx/core/tensor.h
@@ -45,6 +45,7 @@
#include "matx/core/storage.h"
#include "matx/core/tensor_impl.h"
#include "matx/core/tensor_utils.h"
#include "matx/core/dlpack.h"
#include "matx/kernels/utility.cuh"

static constexpr int MAX_TENSOR_DIM = 4;
@@ -1744,6 +1745,91 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
}, tup);
}

/**
* @brief Get a DLPack v0.8 structure representing the tensor
*
* DLPack is a commonly-used tensor memory layout format for moving tensors between libraries. This function
* returns a DLPack v0.8 structure based on a tensor_t. The caller is responsible for freeing the memory
* by calling mt->deleter(mt) on the returned pointer.
*
* @returns Pointer to a new DLManagedTensor. The caller must call the deleter function when finished.
*/
DLManagedTensor *GetDLPackTensor() const {
//DLManagedTensorVersioned *GetDLPackTensor() const {
//auto mt = new DLManagedTensorVersioned;
auto mt = new DLManagedTensor;
DLTensor *t = &mt->dl_tensor;
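// Query the CUDA driver for the pointer's memory type and owning device
// ordinal; the results fill in DLDevice below, including for pointers that
// MatX has no allocation record for.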
CUpointer_attribute attr[] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL};
CUmemorytype mem_type;
int dev_ord;
void *data[2] = {&mem_type, &dev_ord};

t->data = static_cast<void*>(this->ldata_);
t->device.device_id = 0;

// Determine where this memory resides
auto kind = GetPointerKind(this->ldata_);
auto mem_res = cuPointerGetAttributes(sizeof(attr)/sizeof(attr[0]), attr, data, reinterpret_cast<CUdeviceptr>(this->ldata_));
MATX_ASSERT_STR_EXP(mem_res, CUDA_SUCCESS, matxCudaError, "Error returned from cuPointerGetAttributes");
if (kind == MATX_INVALID_MEMORY) {
if (mem_type == CU_MEMORYTYPE_DEVICE) {
t->device.device_type = kDLCUDA;
t->device.device_id = dev_ord;
}
else {
t->device.device_type = kDLCPU;
}
}
else {
// We have a record of this pointer and can map it from the record
switch (kind) {
case MATX_MANAGED_MEMORY:
case MATX_DEVICE_MEMORY:
case MATX_ASYNC_DEVICE_MEMORY:
t->device.device_type = kDLCUDA;
t->device.device_id = dev_ord;
break;
case MATX_HOST_MEMORY:
t->device.device_type = kDLCUDAHost;
t->device.device_id = dev_ord;
break;
case MATX_HOST_MALLOC_MEMORY:
t->device.device_type = kDLCPU;
break;
default:
MATX_ASSERT_STR(false, matxCudaError, "Cannot determine kind of memory");
break;
}
}

t->ndim = RANK;
t->dtype = detail::TypeToDLPackType<T>();
t->shape = new int64_t[RANK];
t->strides = new int64_t[RANK];
for (int r = 0; r < RANK; r++) {
t->shape[r] = this->Size(r);
t->strides[r] = this->Stride(r);
}
t->byte_offset = 0;

mt->manager_ctx = nullptr;
//mt->flags = 0; // Only for v1.0

//auto deleter = [](struct DLManagedTensorVersioned *mtv) { // v1.0
auto deleter = [](struct DLManagedTensor *mtv) {
delete [] mtv->dl_tensor.shape;
delete [] mtv->dl_tensor.strides;
mtv->dl_tensor.shape = nullptr;
mtv->dl_tensor.strides = nullptr;
delete mtv; // must be last; touching mtv after this would be a use-after-free
};

mt->deleter = deleter;

return mt;
}

private:
Storage storage_;
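The expected call pattern for the new method, as a hedged sketch (the make_tensor call and shape are illustrative; only GetDLPackTensor and the deleter contract come from this diff):

auto t = matx::make_tensor<float>({16, 16});
DLManagedTensor *dl = t.GetDLPackTensor();

// ... hand dl to a DLPack consumer (CuPy, PyTorch, TVM, ...) ...

// Exactly one final owner calls the deleter, which frees the shape and
// stride arrays and the DLManagedTensor itself (not the tensor's data):
dl->deleter(dl);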
46 changes: 46 additions & 0 deletions include/matx/core/tensor_utils.h
@@ -36,6 +36,7 @@
#include <functional>

#include "matx/core/nvtx.h"
#include "matx/core/dlpack.h"
#include "matx/core/make_tensor.h"

namespace matx
@@ -448,6 +449,51 @@ namespace detail {
}


template <typename T> constexpr DLDataType TypeToDLPackType()
{
if constexpr (std::is_same_v<T, cuda::std::complex<float>>)
return {kDLComplex, 64, 1};
if constexpr (std::is_same_v<T, cuda::std::complex<double>>)
return {kDLComplex, 128, 1};
if constexpr (std::is_same_v<T, matxFp16>)
return {kDLFloat, 16, 1};
if constexpr (std::is_same_v<T, matxBf16>)
return {kDLBfloat, 16, 1};
if constexpr (std::is_same_v<T, matxFp16Complex>)
return {kDLComplex, 32, 1};
if constexpr (std::is_same_v<T, matxBf16Complex>)
return {kDLComplex, 32, 1}; // Wrong, but no other choice
if constexpr (std::is_same_v<T, float>)
return {kDLFloat, 32, 1};
if constexpr (std::is_same_v<T, double>)
return {kDLFloat, 64, 1};
if constexpr (std::is_same_v<T, int8_t>)
return {kDLInt, 8, 1};
if constexpr (std::is_same_v<T, int16_t>)
return {kDLInt, 16, 1};
if constexpr (std::is_same_v<T, int32_t>)
return {kDLInt, 32, 1};
if constexpr (std::is_same_v<T, int64_t>)
return {kDLInt, 64, 1};
if constexpr (std::is_same_v<T, uint8_t>)
return {kDLUInt, 8, 1};
if constexpr (std::is_same_v<T, uint16_t>)
return {kDLUInt, 16, 1};
if constexpr (std::is_same_v<T, uint32_t>)
return {kDLUInt, 32, 1};
if constexpr (std::is_same_v<T, uint64_t>)
return {kDLUInt, 64, 1};
if constexpr (std::is_same_v<T, bool>)
#if DLPACK_VERSION >= 80
return {kDLBool, 8, 1};
#else
return {kDLUInt, 8, 1};
#endif

return {kDLOpaqueHandle, 1, 1};
}


/**
* Print a value
*
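A small compile-time sanity check of this mapping (assumed usage; the matx::detail qualification follows this diff, and the enumerators come from dlpack.h):

static_assert(matx::detail::TypeToDLPackType<float>().code == kDLFloat, "float -> kDLFloat");
static_assert(matx::detail::TypeToDLPackType<float>().bits == 32, "float is 32 bits");
static_assert(matx::detail::TypeToDLPackType<int8_t>().code == kDLInt, "int8_t -> kDLInt");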