From 429fb5c02acf1a0c7f7340e6eb4d94a592fd0ee8 Mon Sep 17 00:00:00 2001 From: Ryan OShea <86965113+ArmRyan@users.noreply.github.com> Date: Tue, 30 Apr 2024 16:18:19 +0200 Subject: [PATCH] MVEI support for depthwise conv s4 (#133) * Adds new file arm_nn_depthwise_conv_nt_t_s4.c * Adds MVEI optimization to arm_depthwise_conv_s4_opt.c * Updates get buffer size functions for depthwise conv s4 --------- Signed-off-by: Ryan O'Shea --- ARM.CMSIS-NN.pdsc | 1 + Include/arm_nnsupportfunctions.h | 69 +++++- README.md | 2 +- .../arm_depthwise_conv_get_buffer_sizes_s4.c | 24 +- .../arm_depthwise_conv_get_buffer_sizes_s8.c | 9 +- .../arm_depthwise_conv_s4_opt.c | 175 +++++++++++++- .../arm_nn_depthwise_conv_nt_t_s4.c | 227 ++++++++++++++++++ 7 files changed, 482 insertions(+), 25 deletions(-) create mode 100644 Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s4.c diff --git a/ARM.CMSIS-NN.pdsc b/ARM.CMSIS-NN.pdsc index c6bb60dd..852b466d 100644 --- a/ARM.CMSIS-NN.pdsc +++ b/ARM.CMSIS-NN.pdsc @@ -109,6 +109,7 @@ + diff --git a/Include/arm_nnsupportfunctions.h b/Include/arm_nnsupportfunctions.h index d739f512..434c7190 100644 --- a/Include/arm_nnsupportfunctions.h +++ b/Include/arm_nnsupportfunctions.h @@ -21,8 +21,8 @@ * Title: arm_nnsupportfunctions.h * Description: Public header file of support functions for CMSIS NN Library * - * $Date: 23 April 2024 - * $Revision: V.21.0.0 + * $Date: 30 April 2024 + * $Revision: V.21.1.0 * * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ @@ -55,7 +55,7 @@ extern "C" { #define CLAMP(x, h, l) MAX(MIN((x), (h)), (l)) #define REDUCE_MULTIPLIER(_mult) ((_mult < 0x7FFF0000) ? ((_mult + (1 << 15)) >> 16) : 0x7FFF) -// Number of channels processed in a block for DW Conv(MVE) +// Number of channels processed in a block for DW Conv with Int8 weights(MVE) // Requirement: Greater than 0 & less than 128 // This can be fine tuned to match number of input channels for best performance. 
// A layer with lower number of channels than CH_IN_BLOCK_MVE will result in higher @@ -63,6 +63,11 @@ extern "C" { // will result in lower scratch buffer usage. #define CH_IN_BLOCK_MVE (124) +// Number of channels processed in a block for DW Conv with Int4 weights(MVE) +// Requirement: See CH_IN_BLOCK_MVE. +// An additional requirement for this signed 4 variant is that it must be an even number. +#define S4_CH_IN_BLOCK_MVE (124) + // For input of int16 when number of columns are above this limit int64 accumulation is needed // to not loose precision. #define MAX_COL_COUNT (512) @@ -178,6 +183,20 @@ void arm_s8_to_s16_unordered_with_offset(const int8_t *src, int16_t *dst, int32_ #endif +/** + * @brief Get the required buffer size for optimized s8 depthwise convolution + * function with constraint that in_channel equals out_channel. + * This is for processors with MVE extension. + * Refer to arm_depthwise_conv_s8_opt_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_depthwise_conv_s8_opt_get_buffer_size(). Note also this is a support function, + * so not recommended to call directly even on Host. + * + */ +int32_t arm_depthwise_conv_s8_opt_get_buffer_size_mve(const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims); + /** * @brief Get the required buffer size for optimized s8 depthwise convolution * function with constraint that in_channel equals out_channel. @@ -784,6 +803,50 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const int8_t *lhs, const int32_t *const output_bias, int8_t *out); +/** + * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. rhs + * consists of packed int4 data. Dimensions are the same for lhs and rhs. + * + * @param[in] lhs Input left-hand side matrix + * @param[in] rhs Input right-hand side matrix (transposed). Consists of int4 data packed in an int8 + * buffer. 
+ * @param[in] lhs_offset LHS matrix offset(input offset). Range: -127 to 128 + * @param[in] active_ch Subset of total_ch processed + * @param[in] total_ch Number of channels in LHS/RHS + * @param[in] out_shift Per channel output shift. Length of vector is equal to number of channels. + * @param[in] out_mult Per channel output multiplier. Length of vector is equal to number of channels. + * @param[in] out_offset Offset to be added to the output values. Range: -127 to 128 + * @param[in] activation_min Minimum value to clamp the output to. Range: int8 + * @param[in] activation_max Maximum value to clamp the output to. Range: int8 + * @param[in] row_x_col (row_dimension * col_dimension) of LHS/RHS matrix + * @param[in] output_bias Per channel output bias. Length of vector is equal to number of channels. + * @param[in] out Output pointer + * + * @return The function returns one of the two + * - ARM_CMSIS_NN_SUCCESS if an implementation is available + * - ARM_CMSIS_NN_NO_IMPL_ERROR if no implementation is available. + * + * @note If number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read + * out for the following. + * - Output shift + * - Output multiplier + * - Output bias + * - rhs + */ +arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s4(const int8_t *lhs, + const int8_t *rhs, + const int32_t lhs_offset, + const int32_t active_ch, + const int32_t total_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t row_x_col, + const int32_t *const output_bias, + int8_t *out); + /** * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. * Dimensions are the same for lhs and rhs. diff --git a/README.md b/README.md index f2ddba61..929ac84e 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Examples are Cortex-M55 or Cortex-M85 configured with MVE. | Operator | C
int8 | C
int16 | C
int4* | DSP
int8 | DSP
int16 | DSP
int4* | MVE
int8 | MVE
int16 | MVE
int4* | | --------------- | ----------- | ---------- |------------|-------------| -------------|--------------|-------------| -------------|--------------| | Conv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | -| DepthwiseConv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | +| DepthwiseConv2D | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | | TransposeConv2D | Yes | No | No | Yes | No | No | Yes | No | No | | Fully Connected | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | | Add | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A | diff --git a/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s4.c b/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s4.c index 452a863f..084ca2eb 100644 --- a/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s4.c +++ b/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s4.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -18,11 +18,11 @@ /* ---------------------------------------------------------------------- * Project: CMSIS NN Library - * Title: arm_depthwise_conv_get_buffer_sizes_s8.c - * Description: Collection of get buffer size functions for the various s8 convolution layer functions. + * Title: arm_depthwise_conv_get_buffer_sizes_s4.c + * Description: Collection of get buffer size functions for the various s4 depthwise convolution layer functions. 
* - * $Date: 30 October 2023 - * $Revision: V.1.0.0 + * $Date: 17 April 2024 + * $Revision: V.1.1.0 * * Target : Arm(R) M-Profile Architecture * @@ -42,7 +42,11 @@ int32_t arm_depthwise_conv_s4_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) { +#if defined(ARM_MATH_MVEI) + return arm_depthwise_conv_s8_opt_get_buffer_size_mve(input_dims, filter_dims); +#else return arm_depthwise_conv_s8_opt_get_buffer_size_dsp(input_dims, filter_dims); +#endif } int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, @@ -74,7 +78,15 @@ int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size_mve(const cmsis_nn_dw_conv const cmsis_nn_dims *filter_dims, const cmsis_nn_dims *output_dims) { - return arm_depthwise_conv_wrapper_s4_get_buffer_size(dw_conv_params, input_dims, filter_dims, output_dims); + int32_t size = 0; + + if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && + dw_conv_params->dilation.h == 1) + { + size = arm_depthwise_conv_s8_opt_get_buffer_size_mve(input_dims, filter_dims); + } + + return size; } /** diff --git a/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c b/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c index 94a8f520..b477982e 100644 --- a/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c +++ b/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_depthwise_conv_get_buffer_sizes_s8.c * Description: Collection of get buffer size functions for the various s8 convolution layer functions. 
* - * $Date: 30 October 2023 - * $Revision: V.1.1.0 + * $Date: 17 April 2024 + * $Revision: V.1.2.0 * * Target : Arm(R) M-Profile Architecture * @@ -40,8 +40,7 @@ * @{ */ -__STATIC_INLINE int32_t arm_depthwise_conv_s8_opt_get_buffer_size_mve(const cmsis_nn_dims *input_dims, - const cmsis_nn_dims *filter_dims) +int32_t arm_depthwise_conv_s8_opt_get_buffer_size_mve(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) { (void)input_dims; return (4 * CH_IN_BLOCK_MVE * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int8_t); diff --git a/Source/ConvolutionFunctions/arm_depthwise_conv_s4_opt.c b/Source/ConvolutionFunctions/arm_depthwise_conv_s4_opt.c index 253e29d0..bd3850d6 100644 --- a/Source/ConvolutionFunctions/arm_depthwise_conv_s4_opt.c +++ b/Source/ConvolutionFunctions/arm_depthwise_conv_s4_opt.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -22,8 +22,8 @@ * Description: Optimized s4 depthwise separable convolution function for * channel multiplier of 1. 
* - * $Date: 31 October 2023 - * $Revision: V.1.0.0 + * $Date: 17 April 2024 + * $Revision: V.1.1.0 * * Target : Arm(R) M-Profile Architecture * @@ -94,6 +94,161 @@ arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx, const int32_t output_activation_max = dw_conv_params->activation.max; int16_t *buffer_a = (int16_t *)ctx->buf; +#ifdef ARM_MATH_MVEI + /* Generate two columns from the input tensor */ + int8_t *lhs_buffer = (int8_t *)buffer_a; + int8_t *out = output; + int buffer_count = 0; + const int32_t kernel_size = kernel_x * kernel_y; + + const int32_t ch_loop = (input_ch + (S4_CH_IN_BLOCK_MVE - 1)) / S4_CH_IN_BLOCK_MVE; + int32_t remaining_ch = output_ch; + int32_t active_ch = MIN(S4_CH_IN_BLOCK_MVE, remaining_ch); + remaining_ch -= S4_CH_IN_BLOCK_MVE; + + for (int i_ch = 0; i_ch < ch_loop; i_ch++) + { + out = output + i_ch * S4_CH_IN_BLOCK_MVE; + const int8_t *input_slice = input + (i_ch * S4_CH_IN_BLOCK_MVE); + + for (int i_out_y = 0, base_idx_y = -pad_y; i_out_y < output_y; base_idx_y += stride_y, i_out_y++) + { + for (int i_out_x = 0, base_idx_x = -pad_x; i_out_x < output_x; base_idx_x += stride_x, i_out_x++) + { + for (int i_ker_y = base_idx_y; i_ker_y < base_idx_y + kernel_y; i_ker_y++) + { + for (int i_ker_x = base_idx_x; i_ker_x < base_idx_x + kernel_x; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x) + { + arm_memset_s8(lhs_buffer, (int8_t)-input_offset, (uint32_t)active_ch); + } + else + { + arm_memcpy_s8(lhs_buffer, + input_slice + (i_ker_y * input_x + i_ker_x) * input_ch, + (uint32_t)active_ch); + } + lhs_buffer += S4_CH_IN_BLOCK_MVE; + } + } + buffer_count++; + + if (buffer_count == 4) + { + const int32_t block_offset = i_ch * S4_CH_IN_BLOCK_MVE; + lhs_buffer = (int8_t *)buffer_a; + arm_nn_depthwise_conv_nt_t_s4(lhs_buffer, + kernel + (block_offset >> 1), + input_offset, + active_ch, + input_ch, + output_shift + block_offset, + output_mult + block_offset, + output_offset, + 
output_activation_min, + output_activation_max, + kernel_size, + bias + block_offset, + out); + + out += (4 * input_ch); + buffer_count = 0; + } + } + } + /* Handle left over buffers */ + lhs_buffer = (int8_t *)buffer_a; + + int8_t *out_base = out; + const uint32x4_t gather_offset = {0, 0, 1, 1}; + const mve_pred16_t lower_nibble_mask = 3855; // 0000111100001111 + for (int i_buf = 0; i_buf < buffer_count; i_buf++) + { + int32_t loop_count = (active_ch + 3) / 4; + int32_t num_ch_to_process = active_ch; + out = out_base + (i_buf * input_ch); + for (int i_loop_cnt = 0, offset = i_ch * S4_CH_IN_BLOCK_MVE; i_loop_cnt < loop_count; + num_ch_to_process -= 4, offset += 4, i_loop_cnt++) + { + const int8_t *col_0 = lhs_buffer + (kernel_size * S4_CH_IN_BLOCK_MVE * i_buf) + (i_loop_cnt * 4); + const int8_t *row_0 = kernel + (offset >> 1); + int32x4_t out_0 = vdupq_n_s32(0); + if (bias) + { + out_0 = vldrwq_s32(&bias[offset]); + } + + if (input_ch % 2) + { + int get_low_nibble = 1; + for (int i_ker = 0; i_ker < kernel_size; i_ker++) + { + int32x4_t ker_0; + if (get_low_nibble) + { + ker_0 = vldrbq_gather_offset_s32(row_0, gather_offset); + ker_0 = vrshlq_m_n_s32(ker_0, 28, lower_nibble_mask); + ker_0 = vshrq_m_n_s32(ker_0, ker_0, 24, lower_nibble_mask); + + ker_0 = vshrq_n_s32(ker_0, 4); + } + else + { + int8_t temp[] = {row_0[0] >> 4, + (int8_t)(row_0[1] << 4) >> 4, + row_0[1] >> 4, + (int8_t)(row_0[2] << 4) >> 4}; + ker_0 = vldrbq_s32(temp); + } + + int32x4_t ip_0 = vldrbq_s32(col_0); + ip_0 = vaddq_n_s32(ip_0, input_offset); + out_0 += vmulq_s32(ip_0, ker_0); + + get_low_nibble = !get_low_nibble; + col_0 += S4_CH_IN_BLOCK_MVE; + row_0 += (input_ch >> 1) + get_low_nibble; + } + } + else + { + for (int i_ker = 0; i_ker < kernel_size; i_ker++) + { + int32x4_t ker_0 = vldrbq_gather_offset_s32(row_0, gather_offset); + ker_0 = vrshlq_m_n_s32(ker_0, 28, lower_nibble_mask); + ker_0 = vshrq_m_n_s32(ker_0, ker_0, 24, lower_nibble_mask); + + ker_0 = vshrq_n_s32(ker_0, 4); + + int32x4_t 
ip_0 = vldrbq_s32(col_0); + ip_0 = vaddq_n_s32(ip_0, input_offset); + out_0 += vmulq_s32(ip_0, ker_0); + + col_0 += S4_CH_IN_BLOCK_MVE; + row_0 += input_ch >> 1; + } + } + + const int32x4_t mult = vldrwq_s32(&output_mult[offset]); + const int32x4_t shift = vldrwq_s32(&output_shift[offset]); + + out_0 = arm_requantize_mve_32x4(out_0, mult, shift); + out_0 = vaddq_n_s32(out_0, output_offset); + out_0 = vmaxq_s32(out_0, vdupq_n_s32(output_activation_min)); + out_0 = vminq_s32(out_0, vdupq_n_s32(output_activation_max)); + mve_pred16_t p = vctp32q((uint32_t)num_ch_to_process); + vstrbq_p_s32(out, out_0, p); + + out += 4; + } + } + buffer_count = 0; + + active_ch = MIN(S4_CH_IN_BLOCK_MVE, remaining_ch); + remaining_ch -= S4_CH_IN_BLOCK_MVE; + } +#else int16_t *const col_buffer_start = buffer_a; int16_t *col_buffer = col_buffer_start; const int32_t *const bias_start_pos = bias; @@ -186,7 +341,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx, while (col_count) { -#ifdef ARM_MATH_DSP + #ifdef ARM_MATH_DSP /* General idea is to read 4 + 4 (input, kernel) pair and re-arrange them in the right order to use in a SMLAD instruction . One run of this loop produces 4 partial outputs with 8 MACs. */ /* Note: variable names can be improved here to align with rows and columns. 
*/ @@ -219,7 +374,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx, op_b = PKHTB(ip_a1, ip_b1, 16); sum_4 = SMLAD(op_a, op_b, sum_4); -#else + #else int8_t ker0, ker1, ker2, ker3, ker00, ker11; ker00 = row_pos[0]; @@ -245,7 +400,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx, sum_3 += ker2 * col_pos[2 + input_ch]; sum_4 += ker3 * col_pos[3 + input_ch]; -#endif + #endif row_pos += (input_ch); col_pos += input_ch << 1; @@ -384,7 +539,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx, row_shift += 2; col_shift += 4; -#ifdef ARM_MATH_DSP + #ifdef ARM_MATH_DSP while (col_count) { /* General idea is to read 4 + 4 (input, kernel) pair and re-arrange them in the right order to @@ -426,9 +581,9 @@ arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx, } col_count = num_cols & 0x1; -#else + #else col_count = num_cols; -#endif + #endif while (col_count) { int8_t ker0, ker1, ker2, ker3, ker00, ker11; @@ -524,7 +679,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx, col_buffer = col_buffer_start; } } - +#endif // ARM_MATH_MVEI /* Return to application */ return ARM_CMSIS_NN_SUCCESS; } diff --git a/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s4.c b/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s4.c new file mode 100644 index 00000000..906d2e19 --- /dev/null +++ b/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s4.c @@ -0,0 +1,227 @@ +/* + * SPDX-FileCopyrightText: Copyright 2010-2020, 2022, 2024 Arm Limited and/or its affiliates + * + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_depthwise_conv_nt_t_s4.c + * Description: Depthwise convolution on matrices with no padding and packed int4 weights. + * + * $Date: 05 April 2024 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M processors with MVE extension. + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup supportConvolution + * @{ + */ + +/* + * Depthwise convolution of rhs matrix with 4 lhs matrices with no padding and packed int4 weights. + * Dimensions are the same for lhs and rhs. + * + * Refer header file for details. 
+ * + */ +arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s4(const int8_t *lhs, + const int8_t *rhs, + const int32_t input_offset, + const int32_t active_ch, + const int32_t total_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t row_x_col, + const int32_t *const output_bias, + int8_t *out) +{ +#if defined(ARM_MATH_MVEI) + const int32_t *bias = output_bias; + int32_t loop_count = (active_ch + 3) / 4; + uint32_t num_ch_to_process = active_ch; + const uint32x4_t gather_offset = {0, 0, 1, 1}; + const mve_pred16_t lower_nibble_mask = 3855; // 0000111100001111 + + for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count; + num_ch_to_process -= 4, offset += 4, out += 4, i_loop_cnt++) + { + int32x4_t out_0 = vdupq_n_s32(0); + if (bias) + { + out_0 = vldrwq_s32(bias); + bias += 4; + } + int32x4_t out_1 = out_0; + int32x4_t out_2 = out_0; + int32x4_t out_3 = out_0; + + const int8_t *rhs_0 = rhs + (offset >> 1); + const int8_t *lhs_0 = lhs + offset; + const int8_t *lhs_1 = lhs + row_x_col * S4_CH_IN_BLOCK_MVE + offset; + const int8_t *lhs_2 = lhs + (row_x_col * S4_CH_IN_BLOCK_MVE * 2) + offset; + const int8_t *lhs_3 = lhs + (row_x_col * S4_CH_IN_BLOCK_MVE * 3) + offset; + int32x4_t ker_sum = vdupq_n_s32(0); + + if (total_ch % 2) + { + int get_low_nibble = 1; + for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++) + { + int32x4_t ker_0; + if (get_low_nibble) + { + ker_0 = vldrbq_gather_offset_s32(rhs_0, gather_offset); + + ker_0 = vrshlq_m_n_s32(ker_0, 28, lower_nibble_mask); + ker_0 = vshrq_m_n_s32(ker_0, ker_0, 24, lower_nibble_mask); + + ker_0 = vshrq_n_s32(ker_0, 4); + } + else + { + int8_t temp[] = { + rhs_0[0] >> 4, (int8_t)(rhs_0[1] << 4) >> 4, rhs_0[1] >> 4, (int8_t)(rhs_0[2] << 4) >> 4}; + ker_0 = vldrbq_s32(temp); + } + + ker_sum = vaddq_s32(ker_sum, ker_0); + + int32x4_t ip_0 = vldrbq_s32(lhs_0); + out_0 += vmulq_s32(ip_0, ker_0); + + 
int32x4_t ip_1 = vldrbq_s32(lhs_1); + out_1 += vmulq_s32(ip_1, ker_0); + + int32x4_t ip_2 = vldrbq_s32(lhs_2); + out_2 += vmulq_s32(ip_2, ker_0); + + int32x4_t ip_3 = vldrbq_s32(lhs_3); + out_3 += vmulq_s32(ip_3, ker_0); + + lhs_0 += S4_CH_IN_BLOCK_MVE; + lhs_1 += S4_CH_IN_BLOCK_MVE; + lhs_2 += S4_CH_IN_BLOCK_MVE; + lhs_3 += S4_CH_IN_BLOCK_MVE; + + get_low_nibble = !get_low_nibble; + rhs_0 += (total_ch >> 1) + get_low_nibble; + } + } + else + { + for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++) + { + int32x4_t ker_0 = vldrbq_gather_offset_s32(rhs_0, gather_offset); + + ker_0 = vrshlq_m_n_s32(ker_0, 28, lower_nibble_mask); + ker_0 = vshrq_m_n_s32(ker_0, ker_0, 24, lower_nibble_mask); + + ker_0 = vshrq_n_s32(ker_0, 4); + + ker_sum = vaddq_s32(ker_sum, ker_0); + + int32x4_t ip_0 = vldrbq_s32(lhs_0); + out_0 += vmulq_s32(ip_0, ker_0); + + int32x4_t ip_1 = vldrbq_s32(lhs_1); + out_1 += vmulq_s32(ip_1, ker_0); + + int32x4_t ip_2 = vldrbq_s32(lhs_2); + out_2 += vmulq_s32(ip_2, ker_0); + + int32x4_t ip_3 = vldrbq_s32(lhs_3); + out_3 += vmulq_s32(ip_3, ker_0); + + lhs_0 += S4_CH_IN_BLOCK_MVE; + lhs_1 += S4_CH_IN_BLOCK_MVE; + lhs_2 += S4_CH_IN_BLOCK_MVE; + lhs_3 += S4_CH_IN_BLOCK_MVE; + + rhs_0 += total_ch >> 1; + } + } + + ker_sum = vmulq_n_s32(ker_sum, input_offset); + out_0 = ker_sum + out_0; + out_1 = ker_sum + out_1; + out_2 = ker_sum + out_2; + out_3 = ker_sum + out_3; + + const int32x4_t mult = vldrwq_s32(out_mult); + const int32x4_t shift = vldrwq_s32(out_shift); + out_mult += 4; + out_shift += 4; + mve_pred16_t p = vctp32q(num_ch_to_process); + + out_0 = arm_requantize_mve_32x4(out_0, mult, shift); + out_0 = vaddq_n_s32(out_0, out_offset); + out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min)); + out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out, out_0, p); + + out_1 = arm_requantize_mve_32x4(out_1, mult, shift); + out_1 = vaddq_n_s32(out_1, out_offset); + out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min)); + out_1 = 
vminq_s32(out_1, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + total_ch, out_1, p); + + out_2 = arm_requantize_mve_32x4(out_2, mult, shift); + out_2 = vaddq_n_s32(out_2, out_offset); + out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min)); + out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + 2 * total_ch, out_2, p); + + out_3 = arm_requantize_mve_32x4(out_3, mult, shift); + out_3 = vaddq_n_s32(out_3, out_offset); + out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min)); + out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + 3 * total_ch, out_3, p); + } + + return ARM_CMSIS_NN_SUCCESS; +#else + (void)lhs; + (void)rhs; + (void)input_offset; + (void)active_ch; + (void)total_ch; + (void)out_shift; + (void)out_mult; + (void)out_offset; + (void)activation_min; + (void)activation_max; + (void)row_x_col; + (void)output_bias; + (void)out; + return ARM_CMSIS_NN_NO_IMPL_ERROR; +#endif +} + +/** + * @} end of Doxygen group + */