Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DLPack support #392

Merged
merged 1 commit into from
Mar 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 232 additions & 0 deletions include/matx/core/dlpack.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
/*!
* Copyright (c) 2017 by Contributors
* \file dlpack.h
* \brief The common header of DLPack.
*/
#ifndef DLPACK_DLPACK_H_
#define DLPACK_DLPACK_H_

/**
* \brief Compatibility with C++
*
* Expands to `extern "C"` when compiled as C++ and to nothing when compiled
* as C, so a single declaration can be given C linkage in both languages.
*/
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif

/*!
* \brief The current version of dlpack
*
* Integer-encoded so it can be compared in preprocessor conditionals
* (for example `#if DLPACK_VERSION >= 80`).
*/
#define DLPACK_VERSION 80

/*! \brief The current ABI version of dlpack */
#define DLPACK_ABI_VERSION 1

/*!
* \brief DLPACK_DLL prefix for windows
*
* On Windows this expands to `__declspec(dllexport)` when DLPACK_EXPORTS is
* defined and to `__declspec(dllimport)` otherwise. On every other
* platform it expands to nothing.
*/
#ifdef _WIN32
#ifdef DLPACK_EXPORTS
#define DLPACK_DLL __declspec(dllexport)
#else
#define DLPACK_DLL __declspec(dllimport)
#endif
#else
#define DLPACK_DLL
#endif

#include <stdint.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif
/*!
* \brief The device type in DLDevice.
*
* When compiled as C++ the enum's underlying type is fixed to int32_t; when
* compiled as C the underlying type is left to the compiler. The numeric
* values are explicit and not contiguous (5 and 6 are not assigned here).
*/
#ifdef __cplusplus
typedef enum : int32_t {
#else
typedef enum {
#endif
/*! \brief CPU device */
kDLCPU = 1,
/*! \brief CUDA GPU device */
kDLCUDA = 2,
/*!
* \brief Pinned CUDA CPU memory allocated by cudaMallocHost
*/
kDLCUDAHost = 3,
/*! \brief OpenCL devices. */
kDLOpenCL = 4,
/*! \brief Vulkan buffer for next generation graphics. */
kDLVulkan = 7,
/*! \brief Metal for Apple GPU. */
kDLMetal = 8,
/*! \brief Verilog simulator buffer */
kDLVPI = 9,
/*! \brief ROCm GPUs for AMD GPUs */
kDLROCM = 10,
/*!
* \brief Pinned ROCm CPU memory allocated by hipMallocHost
*/
kDLROCMHost = 11,
/*!
* \brief Reserved extension device type,
* used to quickly test extension devices.
* The semantics can differ depending on the implementation.
*/
kDLExtDev = 12,
/*!
* \brief CUDA managed/unified memory allocated by cudaMallocManaged
*/
kDLCUDAManaged = 13,
/*!
* \brief Unified shared memory allocated on a oneAPI non-partitioned
* device. A call to the oneAPI runtime is required to determine the device
* type, the USM allocation type and the sycl context it is bound to.
*
*/
kDLOneAPI = 14,
/*! \brief GPU support for next generation WebGPU standard. */
kDLWebGPU = 15,
/*! \brief Qualcomm Hexagon DSP */
kDLHexagon = 16,
} DLDeviceType;

/*!
* \brief A Device for Tensor and operator.
*
* Pairs a device category (DLDeviceType) with an index selecting one
* device of that category.
*/
typedef struct {
/*! \brief The device type used in the device. */
DLDeviceType device_type;
/*!
* \brief The device index.
* For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
*/
int32_t device_id;
} DLDevice;

/*!
* \brief The type code options for DLDataType.
*
* These values are stored in the uint8_t `code` field of DLDataType; the
* element width and vector width are carried separately in `bits` and
* `lanes`.
*/
typedef enum {
/*! \brief signed integer */
kDLInt = 0U,
/*! \brief unsigned integer */
kDLUInt = 1U,
/*! \brief IEEE floating point */
kDLFloat = 2U,
/*!
* \brief Opaque handle type, reserved for testing purposes.
* Frameworks need to agree on the handle data type for the exchange to be well-defined.
*/
kDLOpaqueHandle = 3U,
/*! \brief bfloat16 */
kDLBfloat = 4U,
/*!
* \brief complex number
* (C/C++/Python layout: compact struct per complex number)
*/
kDLComplex = 5U,
/*! \brief boolean */
kDLBool = 6U,
} DLDataTypeCode;

/*!
* \brief The data type the tensor can hold. The data type is assumed to follow the
* native endianness. An explicit error message should be raised when attempting to
* export an array with non-native endianness.
*
* Examples
* - float: type_code = 2, bits = 32, lanes = 1
* - float4 (vectorized 4 float): type_code = 2, bits = 32, lanes = 4
* - int8: type_code = 0, bits = 8, lanes = 1
* - std::complex<float>: type_code = 5, bits = 64, lanes = 1
* - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
*/
typedef struct {
/*!
* \brief Type code of base types.
* We keep it uint8_t instead of DLDataTypeCode for minimal memory
* footprint, but the value should be one of the DLDataTypeCode enum values.
* */
uint8_t code;
/*!
* \brief Number of bits per lane; common choices are 8, 16, 32.
*/
uint8_t bits;
/*! \brief Number of lanes in the type, used for vector types. */
uint16_t lanes;
} DLDataType;

/*!
* \brief Plain C Tensor object, does not manage memory.
*/
typedef struct {
/*!
* \brief The data pointer points to the allocated data. This will be a CUDA
* device pointer or a cl_mem handle in OpenCL. It may be opaque on some device
* types. This pointer is always aligned to 256 bytes as in CUDA. The
* `byte_offset` field should be used to point to the beginning of the data.
*
* Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
* TVM, perhaps others) do not adhere to this 256 byte alignment requirement
* on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
* (after which this note will be updated); at the moment it is recommended
* to not rely on the data pointer being correctly aligned.
*
* For a given DLTensor, the size of memory required to store the contents of
* data is calculated as follows:
*
* \code{.c}
* static inline size_t GetDataSize(const DLTensor* t) {
* size_t size = 1;
* for (tvm_index_t i = 0; i < t->ndim; ++i) {
* size *= t->shape[i];
* }
* size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
* return size;
* }
* \endcode
*/
void* data;
/*! \brief The device of the tensor */
DLDevice device;
/*! \brief Number of dimensions */
int32_t ndim;
/*! \brief The data type of the elements behind the data pointer */
DLDataType dtype;
/*! \brief The shape of the tensor (one entry per dimension) */
int64_t* shape;
/*!
* \brief Strides of the tensor (in number of elements, not bytes).
* Can be NULL, indicating the tensor is compact and row-major.
*/
int64_t* strides;
/*! \brief The offset in bytes from the data pointer to the beginning of the data */
uint64_t byte_offset;
} DLTensor;

/*!
* \brief C Tensor object that manages the memory of a DLTensor. This data
* structure is intended to facilitate the borrowing of a DLTensor by another
* framework. It is not meant to transfer the tensor. When the borrowing
* framework doesn't need the tensor any more, it should call the deleter to
* notify the host that the resource is no longer needed.
*/
typedef struct DLManagedTensor {
/*! \brief DLTensor which is being memory managed */
DLTensor dl_tensor;
/*! \brief The context of the original host framework in which this
* DLManagedTensor is used. It can also be NULL.
*/
void * manager_ctx;
/*! \brief Destructor signature void (*)(void*) - this should be called
* to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
* if there is no way for the caller to provide a reasonable destructor.
* The destructor deletes the argument self as well, so self must not be
* used after the call.
*/
void (*deleter)(struct DLManagedTensor * self);
} DLManagedTensor;
#ifdef __cplusplus
} // DLPACK_EXTERN_C
#endif
#endif // DLPACK_DLPACK_H_
86 changes: 86 additions & 0 deletions include/matx/core/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "matx/core/storage.h"
#include "matx/core/tensor_impl.h"
#include "matx/core/tensor_utils.h"
#include "matx/core/dlpack.h"
#include "matx/kernels/utility.cuh"

static constexpr int MAX_TENSOR_DIM = 4;
Expand Down Expand Up @@ -1744,6 +1745,91 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
}, tup);
}

/**
 * @brief Get a DLPack v0.8 structure representing the tensor
 *
 * DLPack is a commonly-used tensor memory layout format for moving tensors between libraries. This function
 * returns a newly allocated DLManagedTensor describing this tensor_t: data pointer, device, rank, element
 * type, shape and strides. The data pointer refers to this tensor's existing memory; nothing is copied.
 *
 * The returned deleter frees only the DLPack bookkeeping (the shape and strides arrays and the
 * DLManagedTensor itself). It neither frees nor retains the tensor data, and manager_ctx is set to nullptr.
 *
 * @returns Pointer to a new DLManagedTensor. The caller is responsible for releasing it by calling
 * ->deleter(self) when finished, and must not touch the structure afterwards.
 */
DLManagedTensor *GetDLPackTensor() const {
  // Ask the CUDA driver for the memory type and device ordinal of the data pointer.
  // Both outputs are initialized so they never hold indeterminate values.
  CUpointer_attribute attr[] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL};
  CUmemorytype mem_type{};
  int dev_ord = 0;
  void *data[2] = {&mem_type, &dev_ord};

  // Determine where this memory resides. All checks that can fail run before anything is
  // allocated, so a failed assertion cannot leak the DLPack structure.
  auto kind = GetPointerKind(this->ldata_);
  auto mem_res = cuPointerGetAttributes(sizeof(attr)/sizeof(attr[0]), attr, data, reinterpret_cast<CUdeviceptr>(this->ldata_));
  MATX_ASSERT_STR_EXP(mem_res, CUDA_SUCCESS, matxCudaError, "Error returned from cuPointerGetAttributes");

  auto dev_type = kDLCPU;
  int32_t dev_id = 0;

  if (kind == MATX_INVALID_MEMORY) {
    // No record of this pointer: fall back to what the driver reported
    if (mem_type == CU_MEMORYTYPE_DEVICE) {
      dev_type = kDLCUDA;
      dev_id = dev_ord;
    }
  }
  else {
    // We have a record of this pointer and can map it from the record
    switch (kind) {
      case MATX_MANAGED_MEMORY:
        // NOTE(review): managed memory is reported as kDLCUDA rather than kDLCUDAManaged;
        // kept as-is for compatibility with existing consumers -- confirm this is intended.
      case MATX_DEVICE_MEMORY:
      case MATX_ASYNC_DEVICE_MEMORY:
        dev_type = kDLCUDA;
        dev_id = dev_ord;
        break;
      case MATX_HOST_MEMORY:
        // DLDevice documents device_id as 0 for pinned memory, so the ordinal is not used here
        dev_type = kDLCUDAHost;
        break;
      case MATX_HOST_MALLOC_MEMORY:
        dev_type = kDLCPU;
        break;
      default:
        MATX_ASSERT_STR(false, matxCudaError, "Cannot determine kind of memory");
        break;
    }
  }

  // Using DLManagedTensor (DLPack v0.8). DLPack v1.0 would use DLManagedTensorVersioned,
  // which additionally carries a flags field.
  auto mt = new DLManagedTensor;
  DLTensor *t = &mt->dl_tensor;

  t->data = static_cast<void*>(this->ldata_);
  t->device.device_type = dev_type;
  t->device.device_id = dev_id;
  t->ndim = RANK;
  t->dtype = detail::TypeToDLPackType<T>();
  t->shape = new int64_t[RANK];
  t->strides = new int64_t[RANK];
  for (int r = 0; r < RANK; r++) {
    t->shape[r] = this->Size(r);
    t->strides[r] = this->Stride(r);
  }
  t->byte_offset = 0;

  mt->manager_ctx = nullptr;

  // Release everything allocated above. The structure must not be accessed once it has
  // been deleted, so nothing is written back through the pointer afterwards.
  mt->deleter = [](struct DLManagedTensor *self) {
    if (self == nullptr) {
      return;
    }

    delete [] self->dl_tensor.shape;
    delete [] self->dl_tensor.strides;
    delete self;
  };

  return mt;
}

private:
Storage storage_;
Expand Down
46 changes: 46 additions & 0 deletions include/matx/core/tensor_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <functional>

#include "matx/core/nvtx.h"
#include "matx/core/dlpack.h"
#include "matx/core/make_tensor.h"

namespace matx
Expand Down Expand Up @@ -448,6 +449,51 @@ namespace detail {
}


/**
 * @brief Map an element type to its DLPack data type descriptor
 *
 * Each supported type is translated to a {code, bits, lanes} triple with lanes fixed at 1.
 * Both half-precision complex types are reported as 32-bit kDLComplex; for the bfloat16
 * variant this is inexact, but there is no other choice available. bool is reported as
 * 8-bit kDLBool when DLPACK_VERSION >= 80 and as 8-bit kDLUInt otherwise.
 *
 * @tparam T Element type to translate
 * @returns DLDataType for T, or {kDLOpaqueHandle, 1, 1} when T has no mapping
 */
template <typename T> constexpr DLDataType TypeToDLPackType()
{
  // Complex types
  if constexpr (std::is_same_v<T, cuda::std::complex<float>>) {
    return {kDLComplex, 64, 1};
  }
  else if constexpr (std::is_same_v<T, cuda::std::complex<double>>) {
    return {kDLComplex, 128, 1};
  }
  else if constexpr (std::is_same_v<T, matxFp16Complex> || std::is_same_v<T, matxBf16Complex>) {
    return {kDLComplex, 32, 1}; // Wrong for matxBf16Complex, but no other choice
  }
  // Floating point types
  else if constexpr (std::is_same_v<T, matxFp16>) {
    return {kDLFloat, 16, 1};
  }
  else if constexpr (std::is_same_v<T, matxBf16>) {
    return {kDLBfloat, 16, 1};
  }
  else if constexpr (std::is_same_v<T, float>) {
    return {kDLFloat, 32, 1};
  }
  else if constexpr (std::is_same_v<T, double>) {
    return {kDLFloat, 64, 1};
  }
  // Signed integers
  else if constexpr (std::is_same_v<T, int8_t>) {
    return {kDLInt, 8, 1};
  }
  else if constexpr (std::is_same_v<T, int16_t>) {
    return {kDLInt, 16, 1};
  }
  else if constexpr (std::is_same_v<T, int32_t>) {
    return {kDLInt, 32, 1};
  }
  else if constexpr (std::is_same_v<T, int64_t>) {
    return {kDLInt, 64, 1};
  }
  // Unsigned integers
  else if constexpr (std::is_same_v<T, uint8_t>) {
    return {kDLUInt, 8, 1};
  }
  else if constexpr (std::is_same_v<T, uint16_t>) {
    return {kDLUInt, 16, 1};
  }
  else if constexpr (std::is_same_v<T, uint32_t>) {
    return {kDLUInt, 32, 1};
  }
  else if constexpr (std::is_same_v<T, uint64_t>) {
    return {kDLUInt, 64, 1};
  }
  // Boolean: a dedicated type code only exists from DLPack 0.8 onwards
  else if constexpr (std::is_same_v<T, bool>) {
#if DLPACK_VERSION >= 80
    return {kDLBool, 8, 1};
#else
    return {kDLUInt, 8, 1};
#endif
  }

  // No mapping for T
  return {kDLOpaqueHandle, 1, 1};
}


/**
* Print a value
*
Expand Down
Loading