Upgraded NNPA infra for handling dimension specific max sizes #2871

Merged · 6 commits · Jul 11, 2024
2 changes: 1 addition & 1 deletion docs/SupportedONNXOps-NNPA.md
@@ -10,7 +10,7 @@ Onnx-mlir currently supports ONNX operations targeting up to opset 20. Limitatio
* A * indicates onnx-mlir is compatible with the latest version of that operator available as of opset 20.


NNPA has hardware limitations in dimension index size and tensor size, which are described in [NNPALimit.h](../src/Accelerators/NNPA/Support/NNPALimit.h). They are large enough for normal use cases, but if your model exceeds the limitations, CPU is used instead of NNPA.
NNPA has hardware limitations in dimension index size and tensor size, which are described in [NNPALimit.hpp](../src/Accelerators/NNPA/Support/NNPALimit.hpp). They are large enough for normal use cases, but if your model exceeds the limitations, CPU is used instead of NNPA.


| Op |Supported Opsets (inclusive) |Limitations |Notes |
2 changes: 2 additions & 0 deletions src/Accelerators/NNPA/Conversion/ONNXToZHigh/CMakeLists.txt
@@ -37,6 +37,8 @@ add_onnx_mlir_library(OMRewriteONNXForZHigh
OMONNXOps
OMONNXToKrnl
OMZHighOps
OMLayoutHelper


ACCEL_INCLUDE_DIRS PRIVATE
${NNPA_INCLUDE_PATH}
@@ -15,7 +15,7 @@

#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXLegalityCheck.hpp"
#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHighCommon.hpp"
#include "src/Accelerators/NNPA/Support/NNPALimit.h"
#include "src/Accelerators/NNPA/Support/NNPALimit.hpp"
#include "src/Compiler/CompilerOptions.hpp"
#include "src/Conversion/ONNXToKrnl/RNN/RNNBase.hpp"
#include "src/Dialect/ONNX/ONNXDimAnalysis.hpp"
@@ -46,29 +46,6 @@ bool onnxToZHighInCompatibilityReport(Operation *op) {
return onnxToZHighUnsupportedReport(op, message);
}

/// Convert the input NNPA level, ie. "z16", to a floating point value
/// representing the level, ie. "16.0".
float convertNNPALevel(std::string inputNNPALevel) {
float retNNPAFloat = 0;
try {
retNNPAFloat = std::strtof(
inputNNPALevel.substr(1, inputNNPALevel.size()).c_str(), NULL);
} catch (...) {
retNNPAFloat = 0;
}
return retNNPAFloat;
}

/// A function to check whether the input NNPA level, ie. "z16", is compatible
/// with the current NNPA level.
bool isCompatibleWithNNPALevel(std::string inputNNPALevel) {
float inLevel = convertNNPALevel(inputNNPALevel);
float mcpuLevel = convertNNPALevel(mcpu);
if (inLevel == 0 && mcpuLevel == 0)
return false;
return inLevel <= mcpuLevel;
}

/// A function to check whether a value's element type is valid for zAIU or not.
/// zAIU supports only F16, F32 and BFLOAT. Since MLIR does not support BFLOAT,
/// we check F16 and F32 here only. zAIU only supports rank in range of (0, 4].
@@ -4,7 +4,7 @@

//====----- ONNXToZHighCommon.cpp - Common functions to ZHigh lowering ----===//
//
// Copyright 2019-2020 The IBM Research Authors.
// Copyright 2019-2024 The IBM Research Authors.
//
// =============================================================================
//
@@ -40,25 +40,27 @@ Value emitONNXTransposeWithType(Location loc, PatternRewriter &rewriter,
}

/// Split a tensor along an axis in which each chunk has a size of
/// NNPA_MAXIMUM_DIMENSION_INDEX_SIZE and the last chunk can be smaller.
/// NNPAGetMaxForDim and the last chunk can be smaller.
ValueRange splitAlongAxis(
MultiDialectBuilder<OnnxBuilder> &create, Value X, int64_t axis) {
Type xType = X.getType();
ArrayRef<int64_t> xShape = getShape(xType);
int64_t xRank = xShape.size();
Type elementTy = getElementType(xType);

// Compute split sizes.
SmallVector<Type> splitTy;
SmallVector<int64_t> splitSizesI64;
SmallVector<int64_t> splitShape(xShape);
int64_t dimSize = xShape[axis];
// First splits have the same size of NNPA_MAXIMUM_DIMENSION_INDEX_SIZE.
while (dimSize > NNPA_MAXIMUM_DIMENSION_INDEX_SIZE) {
splitShape[axis] = NNPA_MAXIMUM_DIMENSION_INDEX_SIZE;
// First splits have the same size of NNPAGetMaxForDim.
int64_t maxSize = NNPAGetMaxForDim(axis, xRank);
while (dimSize > maxSize) {
splitShape[axis] = maxSize;
auto ty = RankedTensorType::get(splitShape, elementTy);
splitTy.emplace_back(ty);
splitSizesI64.emplace_back(NNPA_MAXIMUM_DIMENSION_INDEX_SIZE);
dimSize -= NNPA_MAXIMUM_DIMENSION_INDEX_SIZE;
splitSizesI64.emplace_back(maxSize);
dimSize -= maxSize;
}
// The last split.
splitShape[axis] = dimSize;
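To make the chunk computation in `splitAlongAxis` above concrete, here is a small standalone sketch of the same loop (plain C++ with illustrative names, not the onnx-mlir API; 32768 is the z16 per-dimension limit that `NNPAGetMaxForDim` returns for ranks of 4 or less):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Standalone sketch of the chunking logic in splitAlongAxis: full chunks of
// maxSize followed by a smaller remainder chunk at the end.
std::vector<int64_t> computeSplitSizes(int64_t dimSize, int64_t maxSize) {
  std::vector<int64_t> sizes;
  while (dimSize > maxSize) {
    sizes.push_back(maxSize);
    dimSize -= maxSize;
  }
  sizes.push_back(dimSize); // the last, possibly smaller, chunk
  return sizes;
}

int main() {
  // A dimension of 50257 (as in the lit tests below) with the z16 limit of
  // 32768 splits into chunks of 32768 and 17489.
  for (int64_t s : computeSplitSizes(50257, 32768))
    std::cout << s << "\n";
  return 0;
}
```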
18 changes: 10 additions & 8 deletions src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHighCommon.hpp
@@ -4,7 +4,7 @@

//===---------- ONNXToZHigh.hpp - Common functions in ONNXToZHigh ---------===//
//
// Copyright 2019-2020 The IBM Research Authors.
// Copyright 2019-2024 The IBM Research Authors.
//
// =============================================================================
//
@@ -18,7 +18,7 @@

#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXLegalityCheck.hpp"
#include "src/Accelerators/NNPA/Support/LayoutHelper.hpp"
#include "src/Accelerators/NNPA/Support/NNPALimit.h"
#include "src/Accelerators/NNPA/Support/NNPALimit.hpp"
#include "src/Dialect/ONNX/ONNXDimAnalysis.hpp"

namespace onnx_mlir {
@@ -68,11 +68,13 @@ void addDynamicallyLegalOpFor(mlir::ConversionTarget *target,
mlir::dyn_cast<mlir::ShapedType>(operand.getType())) {
// Check if static dimension size exceeds zDNN limitations
llvm::ArrayRef<int64_t> valueShape = valueType.getShape();
if (llvm::any_of(valueShape, [](int64_t dim) {
return (!mlir::ShapedType::isDynamic(dim)) &&
(dim > NNPA_MAXIMUM_DIMENSION_INDEX_SIZE);
}))
return true;
int64_t valueRank = valueShape.size();
for (int64_t i = 0; i < valueRank; ++i) {
int64_t dim = valueShape[i];
if (!mlir::ShapedType::isDynamic(dim) &&
dim > NNPAGetMaxForDim(i, valueRank))
return true;
}
}
return false;
});
@@ -97,7 +99,7 @@ mlir::Value emitONNXTransposeWithType(mlir::Location loc,
mlir::ArrayRef<int64_t> perms);

/// Split a tensor along an axis in which each chunk has a size of
/// NNPA_MAXIMUM_DIMENSION_INDEX_SIZE and the last chucnk can be smaller.
/// NNPAGetMaxForDim and the last chunk can be smaller.
mlir::ValueRange splitAlongAxis(
onnx_mlir::MultiDialectBuilder<onnx_mlir::OnnxBuilder> &create,
mlir::Value X, int64_t axis);
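To illustrate the new per-dimension legality check in `addDynamicallyLegalOpFor` above, the following standalone sketch (illustrative names only, not the onnx-mlir API) applies the same rule to a plain shape vector; any static dimension over its limit keeps the op on the CPU path instead of lowering to NNPA:

```cpp
#include <cstdint>
#include <vector>

// Illustrative stand-in for mlir::ShapedType::kDynamic.
constexpr int64_t kDynamic = -1;

// Sketch of the per-dimension limit: 32768 per dimension on z16 for ranks up
// to 4, and 0 (nothing fits) for higher ranks, mirroring NNPAGetMaxForDim.
int64_t maxForDim(int64_t /*dim*/, int64_t rank) {
  return (rank <= 4) ? 32768 : 0;
}

// Returns true when some static dimension exceeds its limit; in the real
// callback this keeps the op dynamically legal, i.e. on the ONNX/CPU path.
bool exceedsNNPALimit(const std::vector<int64_t> &shape) {
  int64_t rank = static_cast<int64_t>(shape.size());
  for (int64_t i = 0; i < rank; ++i)
    if (shape[i] != kDynamic && shape[i] > maxForDim(i, rank))
      return true;
  return false;
}

// Example: exceedsNNPALimit({1, 40000, 768}) is true because dim 1 (40000)
// is over 32768, while exceedsNNPALimit({1, 30000, 768}) is false.
```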
@@ -4,7 +4,7 @@

//===--- RewriteONNXForZHigh.cpp - Rewrite ONNX ops for ZHigh lowering ----===//
//
// Copyright 2019-2023 The IBM Research Authors.
// Copyright 2019-2024 The IBM Research Authors.
//
// =============================================================================
//
@@ -28,7 +28,7 @@
#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHighCommon.hpp"
#include "src/Accelerators/NNPA/Dialect/ZHigh/ZHighOps.hpp"
#include "src/Accelerators/NNPA/Pass/NNPAPasses.hpp"
#include "src/Accelerators/NNPA/Support/NNPALimit.h"
#include "src/Accelerators/NNPA/Support/NNPALimit.hpp"
#include "src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp"
#include "src/Dialect/ONNX/DialectBuilder.hpp"
#include "src/Dialect/ONNX/ElementsAttr/WideNum.hpp"
@@ -292,8 +292,7 @@ Type CreatePaddedXType(Value x, ArrayAttr pads) {

/// This pattern is to split a large MatMul into smaller ones that fit into
/// NNPA. Given (NxK) * (K*M), the pattern considers dimensions N and/or M to
/// split, if N and/or M is greater than NNPA_MAXIMUM_DIMENSION_INDEX_SIZE
/// (MDIS).
/// split, if N and/or M is greater than NNPAGetMaxForDim (MDIS).
/// For example, given A(NxK) * B(KxM), we will split A and B as follows.
// clang-format off
///
@@ -406,8 +405,8 @@ class SplitLargeMatMulPattern : public OpRewritePattern<ONNXMatMulOp> {
// Expect N or M exceeds NNPA limitation.
int64_t N = aShape[aRank - 2];
int64_t M = bShape[bRank - 1];
nExceeded = N > NNPA_MAXIMUM_DIMENSION_INDEX_SIZE;
mExceeded = M > NNPA_MAXIMUM_DIMENSION_INDEX_SIZE;
nExceeded = N > NNPAGetMaxForDim(aRank - 2, aRank);
mExceeded = M > NNPAGetMaxForDim(bRank - 1, bRank);
if (!(nExceeded || mExceeded))
return false;

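For a rough picture of what `SplitLargeMatMulPattern` produces for the shapes used in the lit tests further down, here is a sketch (plain C++, not the pattern's actual rewriter code) of the sub-MatMuls that result when only the N dimension of A(N x K) * B(K x M) exceeds the limit:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Sketch: when N exceeds the per-dimension limit, A is split along N; each
// chunk Ai(Ni x K) is multiplied by the full B and the results are
// concatenated along N. (The real pattern can split M, or both, the same way.)
int main() {
  const int64_t N = 50257, K = 768, M = 1024, limit = 32768;
  std::vector<int64_t> nChunks;
  for (int64_t rest = N; rest > 0; rest -= limit)
    nChunks.push_back(std::min(rest, limit));
  for (int64_t Ni : nChunks)
    std::cout << "MatMul(" << Ni << "x" << K << ", " << K << "x" << M
              << ") -> " << Ni << "x" << M << "\n";
  // Prints two sub-MatMuls (32768x768 and 17489x768, each against 768x1024)
  // whose results concatenate back into the original 50257x1024.
  return 0;
}
```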
2 changes: 1 addition & 1 deletion src/Accelerators/NNPA/NNPAAccelerator.cpp
@@ -25,7 +25,7 @@
#include "src/Accelerators/NNPA/Dialect/ZLow/ZLowOps.hpp"
#include "src/Accelerators/NNPA/NNPAAccelerator.hpp"
#include "src/Accelerators/NNPA/Pass/NNPAPasses.hpp"
#include "src/Accelerators/NNPA/Support/NNPALimit.h"
#include "src/Accelerators/NNPA/Support/NNPALimit.hpp"
#include "src/Compiler/CompilerOptions.hpp"
#include "zdnn.h"

6 changes: 6 additions & 0 deletions src/Accelerators/NNPA/Support/CMakeLists.txt
@@ -2,12 +2,18 @@ add_subdirectory(Stickify)

add_onnx_mlir_library(OMLayoutHelper
LayoutHelper.cpp
NNPALimit.cpp

DEPENDS
libzdnn
OMCompilerOptions

LINK_LIBS PUBLIC
MLIRIR
OMCompilerOptions

INCLUDE_DIRS PUBLIC
${ONNX_MLIR_SRC_ROOT}/include

ACCEL_INCLUDE_DIRS PRIVATE
${NNPA_SRC_ROOT}
61 changes: 61 additions & 0 deletions src/Accelerators/NNPA/Support/NNPALimit.cpp
@@ -0,0 +1,61 @@
/*
* SPDX-License-Identifier: Apache-2.0
*/

//===----------------------- NNPALimit.cpp --------------------------------===//
//
// Copyright 2022-2024 The IBM Research Authors.
//
// =============================================================================
//
// The NNPA constant values.
//
//===----------------------------------------------------------------------===//

#include "src/Accelerators/NNPA/Support/NNPALimit.hpp"
#include "src/Compiler/CompilerOptions.hpp"

#include <assert.h>
#include <string>

//===----------------------------------------------------------------------===//
// Compatibility checks

/// Convert the input NNPA level, ie. "z16", to an integer value representing the
/// level, ie. "16". When unknown / out of bounds, returns 0.
int64_t convertNNPALevel(std::string inputNNPALevel) {
if (inputNNPALevel.size() != 3 || inputNNPALevel[0] != 'z')
return 0;
if (inputNNPALevel[1] == '1') {
if (inputNNPALevel[2] == '6')
return 16;
}
return 0;
}

/// A function to check whether the input NNPA level, ie. "z16", is compatible
/// with the current NNPA level.
bool isCompatibleWithNNPALevel(std::string inputNNPALevel) {
int64_t inLevel = convertNNPALevel(inputNNPALevel);
int64_t mcpuLevel = convertNNPALevel(onnx_mlir::mcpu);
if (inLevel == 0 && mcpuLevel == 0)
return false;
return inLevel <= mcpuLevel;
}

//===----------------------------------------------------------------------===//
// Max dimension checks

// The NNPA maximum supported dimension index size value, as returned by
// zdnn_get_nnpa_max_dim_idx_size(). This value depends on HW.
static constexpr int64_t NNPA_Z16_MAXIMUM_DIMENSION_INDEX_SIZE = 32768;

int64_t NNPAGetMaxForDim(int64_t dim, int64_t rank) {
assert(rank >= 0 && "expected positive rank");
assert(dim >= 0 && dim < rank && "dim outside range [0..rank)");
if (rank > 4)
return 0;
if (isCompatibleWithNNPALevel(NNPA_Z16))
return NNPA_Z16_MAXIMUM_DIMENSION_INDEX_SIZE;
return 0;
}
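A minimal usage sketch for the new helpers follows (an illustrative caller, not code from this PR; it assumes the compiler was invoked with --mcpu=z16 so the z16 limit applies, and uses the include path introduced above):

```cpp
#include "src/Accelerators/NNPA/Support/NNPALimit.hpp"

#include <cassert>

// With --mcpu=z16 (so isCompatibleWithNNPALevel(NNPA_Z16) holds), every
// dimension of a tensor of rank 4 or less gets a limit of 32768; any rank
// above 4 gets a limit of 0, meaning the value cannot be mapped to NNPA.
void checkLimitsExample() {
  int64_t maxDim0Rank3 = NNPAGetMaxForDim(/*dim=*/0, /*rank=*/3);
  int64_t maxDim0Rank5 = NNPAGetMaxForDim(/*dim=*/0, /*rank=*/5);
  assert(maxDim0Rank3 == 0 || maxDim0Rank3 == 32768); // 32768 on z16, else 0
  assert(maxDim0Rank5 == 0); // ranks above 4 are never supported
}
```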
@@ -2,9 +2,9 @@
* SPDX-License-Identifier: Apache-2.0
*/

//===----------------------- NNPALimit.h ----------------------------------===//
//===----------------------- NNPALimit.hpp --------------------------------===//
//
// Copyright 2022-2023 The IBM Research Authors.
// Copyright 2022-2024 The IBM Research Authors.
//
// =============================================================================
//
@@ -16,9 +16,11 @@

#include <stdint.h>

// The NNPA maximum supported dimension index size value by using
// zdnn_get_nnpa_max_dim_idx_size() This value depends on HW.
static constexpr int64_t NNPA_MAXIMUM_DIMENSION_INDEX_SIZE = 32768;
// Get the maximum number of elements for a given NNPA tensor dimension. Dim is
// a tensor/memref index (from 0 to rank-1), with dim=0 being the outermost
// dimension and dim=(rank-1) the innermost dimension. Returns 0 if the
// dimension is invalid. Asserts if dim is outside [0, rank) or rank is non-positive.
int64_t NNPAGetMaxForDim(int64_t dim, int64_t rank);

// The NNPA maximum supported tensor size (in bytes)
// by using zdnn_get_nnpa_max_tensor_size()
1 change: 1 addition & 0 deletions src/Accelerators/NNPA/Support/Stickify/CMakeLists.txt
@@ -7,6 +7,7 @@ add_onnx_mlir_library(OMStickify

LINK_LIBS PUBLIC
LLVMSupport
OMLayoutHelper

ACCEL_INCLUDE_DIRS PRIVATE
${NNPA_INCLUDE_PATH}
2 changes: 1 addition & 1 deletion src/Accelerators/NNPA/Support/Stickify/Convert.cpp
@@ -13,7 +13,7 @@
//===----------------------------------------------------------------------===//

#include "src/Accelerators/NNPA/Support/Stickify/Convert.hpp"
#include "src/Accelerators/NNPA/Support/NNPALimit.h"
#include "src/Accelerators/NNPA/Support/NNPALimit.hpp"
#include "src/Accelerators/NNPA/Support/Stickify/DLF16Conversion.hpp"

/// fp32 -> dlf16 conversion.
6 changes: 3 additions & 3 deletions src/Accelerators/NNPA/Support/Stickify/Stickify.cpp
@@ -4,7 +4,7 @@

//===------- stickify.cpp - Data Stickify ---------------------------------===//
//
// Copyright 2020-2022 The IBM Research Authors.
// Copyright 2020-2024 The IBM Research Authors.
//
// =============================================================================
//
@@ -21,7 +21,7 @@
#include <stdlib.h>
#include <string.h>

#include "src/Accelerators/NNPA/Support/NNPALimit.h"
#include "src/Accelerators/NNPA/Support/NNPALimit.hpp"
#include "src/Accelerators/NNPA/Support/Stickify/Convert.hpp"
#include "src/Accelerators/NNPA/Support/Stickify/Stickify.hpp"

@@ -412,7 +412,7 @@ zdnn_status verify_transformed_descriptor(const zdnn_tensor_desc *tfrmd_desc) {
// is the dimension above the limit or zero?
// transformed layout uses all dim* entries, so we'll check them all
for (int i = 0; i < ZDNN_MAX_DIMS; i++) {
if (!dims_ptr[i] || dims_ptr[i] > NNPA_MAXIMUM_DIMENSION_INDEX_SIZE) {
if (!dims_ptr[i] || dims_ptr[i] > NNPAGetMaxForDim(i, ZDNN_MAX_DIMS)) {
return ZDNN_INVALID_SHAPE;
}
}
@@ -23,7 +23,7 @@
#include "src/Accelerators/NNPA/Dialect/ZHigh/ZHighOps.hpp"
#include "src/Accelerators/NNPA/Dialect/ZHigh/ZHighOps/OpHelper.hpp"
#include "src/Accelerators/NNPA/Pass/NNPAPasses.hpp"
#include "src/Accelerators/NNPA/Support/NNPALimit.h"
#include "src/Accelerators/NNPA/Support/NNPALimit.hpp"
#include "src/Dialect/ONNX/DialectBuilder.hpp"
#include "src/Dialect/ONNX/ONNXOps.hpp"
#include "src/Dialect/ONNX/ONNXOps/OpHelper.hpp"
2 changes: 1 addition & 1 deletion test/accelerators/NNPA/backend/CMakeLists.txt
@@ -104,7 +104,7 @@ endif()
set(NNPA_TEST_LIST

# ==ARCH== NNPA
# ==ADDITIONAL_PARAGRAPH== NNPA has hardware limitations in dimension index size and tensor size, which are described in [NNPALimit.h](../src/Accelerators/NNPA/Support/NNPALimit.h). They are large enough for normal use cases, but if your model exceeds the limitations, CPU is used instead of NNPA.
# ==ADDITIONAL_PARAGRAPH== NNPA has hardware limitations in dimension index size and tensor size, which are described in [NNPALimit.hpp](../src/Accelerators/NNPA/Support/NNPALimit.hpp). They are large enough for normal use cases, but if your model exceeds the limitations, CPU is used instead of NNPA.

# ==OP== Add
# ==MIN== 6
@@ -485,7 +485,7 @@ func.func @test_matmul_unknown_batch_dim(%arg0: tensor<?x?x256x256xf32>) -> (ten

// -----

// Split MatMul because a dimension exceeds NNPA_MAXIMUM_DIMENSION_INDEX_SIZE = 32768.
// Split MatMul because a dimension exceeds NNPAGetMaxForDim = 32768.
func.func @test_matmul_splitting_A(%arg0: tensor<?x50257x768xf32>, %arg1: tensor<768x1024xf32>) -> (tensor<?x50257x1024xf32>) {
%0 = "onnx.MatMul"(%arg0, %arg1) : (tensor<?x50257x768xf32>, tensor<768x1024xf32>) -> tensor<?x50257x1024xf32>
return %0 : tensor<?x50257x1024xf32>
@@ -502,7 +502,7 @@
// CHECK: }
}

// Split MatMul because a dimension exceeds NNPA_MAXIMUM_DIMENSION_INDEX_SIZE = 32768.
// Split MatMul because a dimension exceeds NNPAGetMaxForDim = 32768.
func.func @test_matmul_splitting_B(%arg0: tensor<?x?x768xf32>, %arg1: tensor<768x50257xf32>) -> (tensor<?x?x50257xf32>) {
%0 = "onnx.MatMul"(%arg0, %arg1) : (tensor<?x?x768xf32>, tensor<768x50257xf32>) -> tensor<?x?x50257xf32>
return %0 : tensor<?x?x50257xf32>
Expand All @@ -521,7 +521,7 @@ func.func @test_matmul_splitting_B(%arg0: tensor<?x?x768xf32>, %arg1: tensor<768

// -----

// Split MatMul because a dimension exceeds NNPA_MAXIMUM_DIMENSION_INDEX_SIZE = 32768.
// Split MatMul because a dimension exceeds NNPAGetMaxForDim = 32768.
func.func @test_matmul_splitting_A_B(%arg0: tensor<?x50257x768xf32>, %arg1: tensor<768x50258xf32>) -> (tensor<?x50257x50258xf32>) {
%0 = "onnx.MatMul"(%arg0, %arg1) : (tensor<?x50257x768xf32>, tensor<768x50258xf32>) -> tensor<?x50257x50258xf32>
return %0 : tensor<?x50257x50258xf32>