Skip to content

Commit

Permalink
MVEI support for depthwise conv s4 (#133)
Browse files Browse the repository at this point in the history
  * Adds new file arm_nn_depthwise_conv_nt_t_s4.c
  * Adds MVEI optimization to arm_depthwise_conv_s4_opt.c
  * Updates get buffer size functions for depthwise conv s4

---------

Signed-off-by: Ryan O'Shea <ryan.oshea3@arm.com>
  • Loading branch information
ArmRyan authored Apr 30, 2024
1 parent 15dbe7c commit 429fb5c
Show file tree
Hide file tree
Showing 7 changed files with 482 additions and 25 deletions.
1 change: 1 addition & 0 deletions ARM.CMSIS-NN.pdsc
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@
<file category="source" name="Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nntables.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s4.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s4.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c"/>
Expand Down
69 changes: 66 additions & 3 deletions Include/arm_nnsupportfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 23 April 2024
* $Revision: V.21.0.0
* $Date: 30 April 2024
* $Revision: V.21.1.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -55,14 +55,19 @@ extern "C" {
#define CLAMP(x, h, l) MAX(MIN((x), (h)), (l))
#define REDUCE_MULTIPLIER(_mult) ((_mult < 0x7FFF0000) ? ((_mult + (1 << 15)) >> 16) : 0x7FFF)

// Number of channels processed in a block for DW Conv(MVE)
// Number of channels processed in a block for DW Conv with Int8 weights(MVE)
// Requirement: Greater than 0 & less than 128
// This can be fine tuned to match number of input channels for best performance.
// A layer with lower number of channels than CH_IN_BLOCK_MVE will result in higher
// scratch buffer usage and a layer with higher number of channels than CH_IN_BLOCK_MVE
// will result in lower scratch buffer usage.
#define CH_IN_BLOCK_MVE (124)

// Number of channels processed in a block for DW Conv with Int4 weights(MVE)
// Requirement: See CH_IN_BLOCK_MVE.
// An additional requirement for this signed 4 variant is that it must be an even number.
#define S4_CH_IN_BLOCK_MVE (124)

// For int16 input, when the number of columns is above this limit, int64 accumulation is needed
// to not lose precision.
#define MAX_COL_COUNT (512)
Expand Down Expand Up @@ -178,6 +183,20 @@ void arm_s8_to_s16_unordered_with_offset(const int8_t *src, int16_t *dst, int32_

#endif

/**
* @brief Get the required buffer size for optimized s8 depthwise convolution
* function with constraint that in_channel equals out_channel.
* This is for processors with MVE extension.
* Refer to arm_depthwise_conv_s8_opt_get_buffer_size() for function argument details.
*
* @note Intended for compilation on Host. If compiling for an Arm target, use
* arm_depthwise_conv_s8_opt_get_buffer_size(). Note also this is a support function,
* so not recommended to call directly even on Host.
*
*/
int32_t arm_depthwise_conv_s8_opt_get_buffer_size_mve(const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims);

/**
* @brief Get the required buffer size for optimized s8 depthwise convolution
* function with constraint that in_channel equals out_channel.
Expand Down Expand Up @@ -784,6 +803,50 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const int8_t *lhs,
const int32_t *const output_bias,
int8_t *out);

/**
* @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. rhs
* consists of packed int4 data. Dimensions are the same for lhs and rhs.
*
* @param[in] lhs Input left-hand side matrix
* @param[in] rhs Input right-hand side matrix (transposed). Consists of int4 data packed in an int8
* buffer.
* @param[in] lhs_offset LHS matrix offset(input offset). Range: -127 to 128
* @param[in] active_ch Subset of total_ch processed
* @param[in] total_ch Number of channels in LHS/RHS
* @param[in] out_shift Per channel output shift. Length of vector is equal to number of channels.
* @param[in] out_mult Per channel output multiplier. Length of vector is equal to number of channels.
* @param[in] out_offset Offset to be added to the output values. Range: -127 to 128
* @param[in] activation_min Minimum value to clamp the output to. Range: int8
* @param[in] activation_max Maximum value to clamp the output to. Range: int8
* @param[in] row_x_col (row_dimension * col_dimension) of LHS/RHS matrix
* @param[in] output_bias Per channel output bias. Length of vector is equal to number of channels.
* @param[in] out Output pointer
*
* @return The function returns a status code of type <code>arm_cmsis_nn_status</code> that indicates
* whether an implementation is available.
*
* @note If number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
* out for the following.
* - Output shift
* - Output multiplier
* - Output bias
* - rhs
*/
arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s4(const int8_t *lhs,
const int8_t *rhs,
const int32_t lhs_offset,
const int32_t active_ch,
const int32_t total_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t row_x_col,
const int32_t *const output_bias,
int8_t *out);

/**
* @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases.
* Dimensions are the same for lhs and rhs.
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Examples are Cortex-M55 or Cortex-M85 configured with MVE.
| Operator | C <br> int8 | C<br>int16 | C<br>int4* | DSP<br>int8 | DSP<br>int16 | DSP<br>int4* | MVE<br>int8 | MVE<br>int16 | MVE<br>int4* |
| --------------- | ----------- | ---------- |------------|-------------| -------------|--------------|-------------| -------------|--------------|
| Conv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| DepthwiseConv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No |
| DepthwiseConv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| TransposeConv2D | Yes | No | No | Yes | No | No | Yes | No | No |
| Fully Connected | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No |
| Add | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
* SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
*
* SPDX-License-Identifier: Apache-2.0
*
Expand All @@ -18,11 +18,11 @@

/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_get_buffer_sizes_s8.c
* Description: Collection of get buffer size functions for the various s8 convolution layer functions.
* Title: arm_depthwise_conv_get_buffer_sizes_s4.c
* Description: Collection of get buffer size functions for the various s4 depthwise convolution layer functions.
*
* $Date: 30 October 2023
* $Revision: V.1.0.0
* $Date: 17 April 2024
* $Revision: V.1.1.0
*
* Target : Arm(R) M-Profile Architecture
*
Expand All @@ -42,7 +42,11 @@

/*
 * Scratch buffer size for the optimized s4 depthwise convolution.
 *
 * Forwards both arguments unchanged to one of the s8 size helpers, chosen at
 * compile time: the MVE variant when ARM_MATH_MVEI is defined, the DSP variant
 * otherwise. The helper's result is returned as-is.
 */
int32_t arm_depthwise_conv_s4_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
    int32_t buffer_size;

#if !defined(ARM_MATH_MVEI)
    buffer_size = arm_depthwise_conv_s8_opt_get_buffer_size_dsp(input_dims, filter_dims);
#else
    buffer_size = arm_depthwise_conv_s8_opt_get_buffer_size_mve(input_dims, filter_dims);
#endif

    return buffer_size;
}

int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
Expand Down Expand Up @@ -74,7 +78,15 @@ int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size_mve(const cmsis_nn_dw_conv
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims)
{
return arm_depthwise_conv_wrapper_s4_get_buffer_size(dw_conv_params, input_dims, filter_dims, output_dims);
int32_t size = 0;

if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 &&
dw_conv_params->dilation.h == 1)
{
size = arm_depthwise_conv_s8_opt_get_buffer_size_mve(input_dims, filter_dims);
}

return size;
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
* SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
*
* SPDX-License-Identifier: Apache-2.0
*
Expand All @@ -21,8 +21,8 @@
* Title: arm_depthwise_conv_get_buffer_sizes_s8.c
* Description: Collection of get buffer size functions for the various s8 convolution layer functions.
*
* $Date: 30 October 2023
* $Revision: V.1.1.0
* $Date: 17 April 2024
* $Revision: V.1.2.0
*
* Target : Arm(R) M-Profile Architecture
*
Expand All @@ -40,8 +40,7 @@
* @{
*/

__STATIC_INLINE int32_t arm_depthwise_conv_s8_opt_get_buffer_size_mve(const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims)
int32_t arm_depthwise_conv_s8_opt_get_buffer_size_mve(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
(void)input_dims;
return (4 * CH_IN_BLOCK_MVE * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int8_t);
Expand Down
Loading

0 comments on commit 429fb5c

Please sign in to comment.