diff --git a/Include/arm_nnsupportfunctions.h b/Include/arm_nnsupportfunctions.h
index 5da1cb67..f2b7f29b 100644
--- a/Include/arm_nnsupportfunctions.h
+++ b/Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
 * Title: arm_nnsupportfunctions.h
 * Description: Public header file of support functions for CMSIS NN Library
 *
- * $Date: 31 January 2024
- * $Revision: V.18.1.0
+ * $Date: 14 February 2024
+ * $Revision: V.19.0.0
 *
 * Target : Arm(R) M-Profile Architecture
 * -------------------------------------------------------------------- */
@@ -529,6 +529,8 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s4(const int8_t *lhs,
 * @param[in] activation_max Maximum value to clamp the output to. Range: int8
 * @param[in] address_offset Memory position offset for dst. First output is stored at 'dst', the
 *                           second at 'dst + address_offset' and so on. Default value is typically 1.
+ * @param[in] rhs_offset    Offset to be added to the input values of the right-hand side matrix.
+ *                          Range: -127 to 128
 *
 * @return The function returns ARM_CMSIS_NN_SUCCESS
 *
@@ -546,7 +548,8 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
                                              const int32_t rhs_rows,
                                              const int32_t activation_min,
                                              const int32_t activation_max,
-                                             const int32_t address_offset);
+                                             const int32_t address_offset,
+                                             const int32_t rhs_offset);
 
 /**
 * @brief s16 Vector by Matrix (transposed) multiplication
diff --git a/Source/FullyConnectedFunctions/arm_fully_connected_s8.c b/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
index 1e4d5f6b..00ed201f 100644
--- a/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
+++ b/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates
+ * SPDX-FileCopyrightText: Copyright 2010-2024 Arm Limited and/or its affiliates
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@@ -21,8 +21,8 @@
 * Title: arm_fully_connected_s8
 * Description: Fully connected function compatible with TF Lite.
 *
- * $Date: 23 October 2023
- * $Revision: V.5.2.0
+ * $Date: 6 February 2024
+ * $Revision: V.5.3.0
 *
 * Target : Arm(R) M-Profile Architecture
 *
@@ -60,7 +60,6 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
                                            int8_t *output)
 {
     (void)bias_dims;
-    (void)fc_params->filter_offset;
 
     int32_t batch_cnt = input_dims->n;
 
@@ -71,10 +70,11 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
     }
 #endif
 
-    const int32_t *kernel_sum = (const int32_t *) ctx->buf;
+    const int32_t *kernel_sum = (const int32_t *)ctx->buf;
 
     while (batch_cnt)
     {
+
         arm_nn_vec_mat_mult_t_s8(input,
                                  kernel,
                                  kernel_sum,
@@ -88,7 +88,8 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
                                  output_dims->c, /* row_dim or output_depth */
                                  fc_params->activation.min,
                                  fc_params->activation.max,
-                                 1L);
+                                 1L,
+                                 fc_params->filter_offset);
 
         input += filter_dims->n;
         output += output_dims->c;
diff --git a/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c b/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
index 1287d00a..9575cd16 100644
--- a/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
+++ b/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates
+ * SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 *
 * SPDX-License-Identifier: Apache-2.0
 *
@@ -21,8 +21,8 @@
 * Title: arm_nn_vec_mat_mult_t_s8
 * Description: s8 vector by matrix (transposed) multiplication
 *
- * $Date: 5 May 2023
- * $Revision: V.5.4.1
+ * $Date: 14 Feb 2024
+ * $Revision: V.6.0.0
 *
 * Target : Arm(R) M-Profile Architecture
 *
@@ -68,339 +68,693 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
                                              const int32_t rhs_rows,
                                              const int32_t activation_min,
                                              const int32_t activation_max,
-                                             const int32_t address_offset)
+                                             const int32_t address_offset,
+                                             const int32_t rhs_offset)
 {
+    if (rhs_offset)
+    {
 #if defined(ARM_MATH_MVEI)
-    const int32_t row_loop_cnt = rhs_rows / 4;
-    const uint32x4_t address_offset_array = {0, address_offset, address_offset * 2, address_offset * 3};
+        const int32_t row_loop_cnt = rhs_rows / 4;
+        const uint32x4_t address_offset_array = {0, address_offset, address_offset * 2, address_offset * 3};
 
-    for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++)
-    {
-        int32_t acc_0 = 0;
-        int32_t acc_1 = 0;
-        int32_t acc_2 = 0;
-        int32_t acc_3 = 0;
+        for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++)
+        {
+            int32_t acc_0 = 0;
+            int32_t acc_1 = 0;
+            int32_t acc_2 = 0;
+            int32_t acc_3 = 0;
 
-        const int32_t col_loop_cnt = (rhs_cols + 15) / 16;
+            const int32_t col_loop_cnt = (rhs_cols + 15) / 16;
 
-        const int8_t *lhs_vec = lhs;
-        const int8_t *rhs_0 = rhs;
-        const int8_t *rhs_1 = rhs + rhs_cols;
-        const int8_t *rhs_2 = rhs + 2 * rhs_cols;
-        const int8_t *rhs_3 = rhs + 3 * rhs_cols;
+            const int8_t *lhs_vec = lhs;
+            const int8_t *rhs_0_ptr = rhs;
+            const int8_t *rhs_1_ptr = rhs + rhs_cols;
+            const int8_t *rhs_2_ptr = rhs + 2 * rhs_cols;
+            const int8_t *rhs_3_ptr = rhs + 3 * rhs_cols;
 
-        if (bias)
-        {
-            acc_0 = *bias++;
-            acc_1 = *bias++;
-            acc_2 = *bias++;
-            acc_3 = *bias++;
-        }
+            int32_t lhs_sum = 0;
 
-        uint32_t col_cnt = (uint32_t)rhs_cols;
+            if (bias)
+            {
+                acc_0 = *bias++;
+                acc_1 = *bias++;
+                acc_2 = *bias++;
+                acc_3 = *bias++;
+            }
 
-        for (int i = 0; i < col_loop_cnt; i++)
-        {
-            mve_pred16_t p = vctp8q(col_cnt);
-            col_cnt -= 16;
+            uint32_t col_cnt = (uint32_t)rhs_cols;
 
-            const int8x16_t input = vldrbq_z_s8(lhs_vec, p);
+
for (int32_t i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; - const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); - acc_0 = vmladavaq_s8(acc_0, ker_0, input); + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + lhs_sum = vaddvaq_s8(lhs_sum, input); - const int8x16_t ker_1 = vldrbq_z_s8(rhs_1, p); - acc_1 = vmladavaq_s8(acc_1, ker_1, input); + const int8x16_t ker_0 = vldrbq_z_s8(rhs_0_ptr, p); + acc_0 = vmladavaq_s8(acc_0, ker_0, input); - const int8x16_t ker_2 = vldrbq_z_s8(rhs_2, p); - acc_2 = vmladavaq_s8(acc_2, ker_2, input); + const int8x16_t ker_1 = vldrbq_z_s8(rhs_1_ptr, p); + acc_1 = vmladavaq_s8(acc_1, ker_1, input); - const int8x16_t ker_3 = vldrbq_z_s8(rhs_3, p); - acc_3 = vmladavaq_s8(acc_3, ker_3, input); + const int8x16_t ker_2 = vldrbq_z_s8(rhs_2_ptr, p); + acc_2 = vmladavaq_s8(acc_2, ker_2, input); - lhs_vec += 16; - rhs_0 += 16; - rhs_1 += 16; - rhs_2 += 16; - rhs_3 += 16; - } - rhs += 4 * rhs_cols; + const int8x16_t ker_3 = vldrbq_z_s8(rhs_3_ptr, p); + acc_3 = vmladavaq_s8(acc_3, ker_3, input); - int32x4_t acc = {acc_0, acc_1, acc_2, acc_3}; + lhs_vec += 16; + rhs_0_ptr += 16; + rhs_1_ptr += 16; + rhs_2_ptr += 16; + rhs_3_ptr += 16; + } + rhs += 4 * rhs_cols; - const int32x4_t rhs_sum = {kernel_sum[0], kernel_sum[1], kernel_sum[2], kernel_sum[3]}; - acc += vdupq_n_s32(lhs_offset) * rhs_sum; - kernel_sum += 4; + int32x4_t acc = {acc_0, acc_1, acc_2, acc_3}; - acc = arm_requantize_mve(acc, dst_multiplier, dst_shift); - acc = vaddq_s32(acc, vdupq_n_s32(dst_offset)); - acc = vmaxq_s32(acc, vdupq_n_s32(activation_min)); - acc = vminq_s32(acc, vdupq_n_s32(activation_max)); + const int32x4_t rhs_sum = {kernel_sum[0], kernel_sum[1], kernel_sum[2], kernel_sum[3]}; + acc += vdupq_n_s32(lhs_offset) * rhs_sum; + kernel_sum += 4; - vstrbq_scatter_offset_s32(dst, address_offset_array, acc); + acc += vdupq_n_s32(rhs_offset) * vdupq_n_s32(lhs_sum); + acc += vdupq_n_s32(rhs_offset * lhs_offset) * vdupq_n_s32(rhs_cols); - dst += 4 * address_offset; - } + acc = arm_requantize_mve(acc, dst_multiplier, dst_shift); + acc = vaddq_s32(acc, vdupq_n_s32(dst_offset)); + acc = vmaxq_s32(acc, vdupq_n_s32(activation_min)); + acc = vminq_s32(acc, vdupq_n_s32(activation_max)); - const int loop_cnt = rhs_rows % 4; - for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++) - { - int32_t acc_0 = 0; - const int32_t col_loop_cnt = (rhs_cols + 15) / 16; - const int8_t *lhs_vec = lhs; - const int8_t *rhs_0 = rhs; - uint32_t col_cnt = (uint32_t)rhs_cols; + vstrbq_scatter_offset_s32(dst, address_offset_array, acc); + + dst += 4 * address_offset; + } - for (int i = 0; i < col_loop_cnt; i++) + const int loop_cnt = rhs_rows % 4; + for (int32_t i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++) { - mve_pred16_t p = vctp8q(col_cnt); - col_cnt -= 16; - const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + int32_t acc_0 = 0; + const int32_t col_loop_cnt = (rhs_cols + 15) / 16; + const int8_t *lhs_vec = lhs; + const int8_t *rhs_ptr = rhs; + int32_t lhs_sum = 0; + uint32_t col_cnt = (uint32_t)rhs_cols; + + for (int32_t i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + lhs_sum = vaddvaq_s8(lhs_sum, input); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_ptr, p); + acc_0 = vmladavaq_s8(acc_0, ker_0, input); + + lhs_vec += 16; + rhs_ptr += 16; + } + rhs += rhs_cols; + + if (bias) + { + acc_0 += *bias; + bias++; + } + const int32_t rhs_sum = kernel_sum[i_row_loop_cnt]; + acc_0 
+= rhs_sum * lhs_offset; + acc_0 += lhs_sum * rhs_offset; + acc_0 += rhs_cols * lhs_offset * rhs_offset; + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + acc_0 += dst_offset; + + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + *dst = MIN(acc_0, activation_max); + dst += address_offset; + } - const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); - acc_0 = vmladavaq_s8(acc_0, ker_0, input); +#elif defined(ARM_MATH_DSP) + (void)kernel_sum; + + const int32_t row_loop_cnt = rhs_rows / 2; + const int16_t lhs_offset_s16 = (int16_t)lhs_offset; + const uint32_t lhs_offset_s16x2 = PKHBT(lhs_offset_s16, lhs_offset_s16, 16); - lhs_vec += 16; - rhs_0 += 16; + const int16_t rhs_offset_s16 = (int16_t)rhs_offset; + const uint32_t rhs_offset_s16x2 = PKHBT(rhs_offset_s16, rhs_offset_s16, 16); + + for (int32_t i = 0; i < row_loop_cnt; i++) + { + int32_t acc_0 = 0; + int32_t acc_1 = 0; + if (bias) + { + acc_0 = *bias++; + acc_1 = *bias++; + } + + const int32_t col_loop_cnt = rhs_cols / 4; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0_ptr = rhs; + const int8_t *rhs_1_ptr = rhs + rhs_cols; + rhs += 2 * rhs_cols; + + for (int32_t j = col_loop_cnt; j != 0; j--) + { + int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + int32_t vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + + int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_0_ptr); + int32_t ker_1 = SXTAB16_RORn(rhs_offset_s16x2, (uint32_t)ker_0, 8); + ker_0 = SXTAB16(rhs_offset_s16x2, ker_0); + + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + + ker_0 = arm_nn_read_s8x4_ia(&rhs_1_ptr); + ker_1 = SXTAB16_RORn(rhs_offset_s16x2, (uint32_t)ker_0, 8); + ker_0 = SXTAB16(rhs_offset_s16x2, ker_0); + + acc_1 = SMLAD(ker_1, vec_1, acc_1); + acc_1 = SMLAD(ker_0, vec_0, acc_1); + } + + for (int32_t k = col_loop_cnt * 4; k < rhs_cols; k++) + { + const int32_t lhs_temp = (*lhs_vec + lhs_offset); + lhs_vec++; + acc_0 += lhs_temp * (*rhs_0_ptr + rhs_offset); + rhs_0_ptr++; + acc_1 += lhs_temp * (*rhs_1_ptr + rhs_offset); + rhs_1_ptr++; + } + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + acc_1 = arm_nn_requantize(acc_1, dst_multiplier, dst_shift); + + // Add offset + acc_0 += dst_offset; + acc_1 += dst_offset; + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + acc_1 = MAX(acc_1, activation_min); + acc_1 = MIN(acc_1, activation_max); + *dst = (int8_t)acc_0; + *(dst + address_offset) = (int8_t)acc_1; + dst += 2 * address_offset; } - rhs += rhs_cols; - if (bias) + if (rhs_rows & 0x1) { - acc_0 += *bias; - bias++; + int32_t acc_0 = 0; + if (bias) + { + acc_0 = *bias++; + } + const int32_t col_loop_cnt = rhs_cols / 4; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_ptr = rhs; + + for (int32_t i = col_loop_cnt; i != 0; i--) + { + int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + int32_t vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + + int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_ptr); + int32_t ker_1 = SXTAB16_RORn(rhs_offset_s16x2, (uint32_t)ker_0, 8); + ker_0 = SXTAB16(rhs_offset_s16x2, ker_0); + + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + } + + for (int32_t j = col_loop_cnt * 4; j < rhs_cols; j++) + { + const int32_t lhs_temp = (*lhs_vec + lhs_offset); + lhs_vec++; + acc_0 += lhs_temp * (*rhs_ptr + rhs_offset); + rhs_ptr++; + } + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + + // Add offset + acc_0 
+= dst_offset; + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + *dst = (int8_t)acc_0; + dst += address_offset; } - const int32_t rhs_sum = kernel_sum[i_row_loop_cnt]; - const int32_t offsets = rhs_sum * lhs_offset; - acc_0 += offsets; - acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); - acc_0 += dst_offset; - - // Clamp the result - acc_0 = MAX(acc_0, activation_min); - *dst = MIN(acc_0, activation_max); - dst += address_offset; - } -#elif defined(ARM_MATH_DSP) - (void)kernel_sum; +#else + (void)kernel_sum; - const int32_t row_loop_cnt = rhs_rows / 2; - const int16_t lhs_offset_s16 = (int16_t)lhs_offset; - const uint32_t lhs_offset_s16x2 = PKHBT(lhs_offset_s16, lhs_offset_s16, 16); + const int32_t row_loop_cnt = rhs_rows / 3; - for (int32_t i = 0; i < row_loop_cnt; i++) - { - int32_t acc_0 = 0; - int32_t acc_1 = 0; - if (bias) + for (int32_t i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) { - acc_0 = *bias++; - acc_1 = *bias++; + const int8_t *lhs_ptr = lhs; + const int8_t *rhs_ptr_0 = &rhs[0]; + const int8_t *rhs_ptr_1 = &rhs[rhs_cols]; + const int8_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; + + int32_t res00 = 0; + int32_t res01 = 0; + int32_t res02 = 0; + if (bias) + { + res00 = *bias++; + res01 = *bias++; + res02 = *bias++; + } + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + const int32_t rhs_value0 = (int8_t)*rhs_ptr_0 + rhs_offset; + const int32_t rhs_value1 = (int8_t)*rhs_ptr_1 + rhs_offset; + const int32_t rhs_value2 = (int8_t)*rhs_ptr_2 + rhs_offset; + const int32_t lhs_value = (int8_t)*lhs_ptr + lhs_offset; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + res02 += lhs_value * rhs_value2; + + ++rhs_ptr_0; + ++rhs_ptr_1; + ++rhs_ptr_2; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + res01 = arm_nn_requantize(res01, dst_multiplier, dst_shift); + res02 = arm_nn_requantize(res02, dst_multiplier, dst_shift); + + // Add offset + res00 += dst_offset; + res01 += dst_offset; + res02 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + res01 = MAX(res01, activation_min); + res01 = MIN(res01, activation_max); + res02 = MAX(res02, activation_min); + res02 = MIN(res02, activation_max); + + *dst = (int8_t)res00; + *(dst + address_offset) = (int8_t)res01; + *(dst + 2 * address_offset) = (int8_t)res02; + dst += 3 * address_offset; + + rhs += 3 * rhs_cols; } - const int32_t col_loop_cnt = rhs_cols / 4; + const int loop_cnt = rhs_rows % 3; - const int8_t *lhs_vec = lhs; - const int8_t *rhs_0 = rhs; - const int8_t *rhs_1 = rhs + rhs_cols; - rhs += 2 * rhs_cols; - - for (int j = col_loop_cnt; j != 0; j--) + for (int32_t i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++) { - int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); - int32_t vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + const int8_t *lhs_ptr = &lhs[0]; + const int8_t *rhs_ptr = &rhs[0]; - vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + int32_t res00 = 0; + if (bias) + { + res00 = *bias++; + } - int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_0); - int32_t ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); - ker_0 = SXTB16(ker_0); + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + int32_t rhs_value0 = (int8_t)rhs_ptr[0] + rhs_offset; + int32_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; - acc_0 = SMLAD(ker_1, vec_1, acc_0); - acc_0 = SMLAD(ker_0, vec_0, acc_0); + res00 += 
lhs_value * rhs_value0; - ker_0 = arm_nn_read_s8x4_ia(&rhs_1); - ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); - ker_0 = SXTB16(ker_0); + ++rhs_ptr; + ++lhs_ptr; + } - acc_1 = SMLAD(ker_1, vec_1, acc_1); - acc_1 = SMLAD(ker_0, vec_0, acc_1); - } + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); - for (int k = col_loop_cnt * 4; k < rhs_cols; k++) - { - const int32_t lhs_temp = (*lhs_vec + lhs_offset); - lhs_vec++; - acc_0 += lhs_temp * (*rhs_0); - rhs_0++; - acc_1 += lhs_temp * (*rhs_1); - rhs_1++; - } + // Add offset + res00 += dst_offset; - acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); - acc_1 = arm_nn_requantize(acc_1, dst_multiplier, dst_shift); - - // Add offset - acc_0 += dst_offset; - acc_1 += dst_offset; - // Clamp the result - acc_0 = MAX(acc_0, activation_min); - acc_0 = MIN(acc_0, activation_max); - acc_1 = MAX(acc_1, activation_min); - acc_1 = MIN(acc_1, activation_max); - *dst = (int8_t)acc_0; - *(dst + address_offset) = (int8_t)acc_1; - dst += 2 * address_offset; + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + + *dst = (int8_t)res00; + dst += address_offset; + rhs += rhs_cols; + } +#endif } - if (rhs_rows & 0x1) + else { - int32_t acc_0 = 0; - if (bias) - { - acc_0 = *bias++; - } - const int32_t col_loop_cnt = rhs_cols / 4; - const int8_t *lhs_vec = lhs; - const int8_t *rhs_0 = rhs; +#if defined(ARM_MATH_MVEI) + const int32_t row_loop_cnt = rhs_rows / 4; + const uint32x4_t address_offset_array = {0, address_offset, address_offset * 2, address_offset * 3}; - for (int i = col_loop_cnt; i != 0; i--) + for (int32_t i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) { - int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); - int32_t vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); - vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + int32_t acc_0 = 0; + int32_t acc_1 = 0; + int32_t acc_2 = 0; + int32_t acc_3 = 0; + + const int32_t col_loop_cnt = (rhs_cols + 15) / 16; - int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_0); - int32_t ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); - ker_0 = SXTB16(ker_0); + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0_ptr = rhs; + const int8_t *rhs_1_ptr = rhs + rhs_cols; + const int8_t *rhs_2_ptr = rhs + 2 * rhs_cols; + const int8_t *rhs_3_ptr = rhs + 3 * rhs_cols; - acc_0 = SMLAD(ker_1, vec_1, acc_0); - acc_0 = SMLAD(ker_0, vec_0, acc_0); + if (bias) + { + acc_0 = *bias++; + acc_1 = *bias++; + acc_2 = *bias++; + acc_3 = *bias++; + } + + uint32_t col_cnt = (uint32_t)rhs_cols; + + for (int32_t i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_0_ptr, p); + acc_0 = vmladavaq_s8(acc_0, ker_0, input); + + const int8x16_t ker_1 = vldrbq_z_s8(rhs_1_ptr, p); + acc_1 = vmladavaq_s8(acc_1, ker_1, input); + + const int8x16_t ker_2 = vldrbq_z_s8(rhs_2_ptr, p); + acc_2 = vmladavaq_s8(acc_2, ker_2, input); + + const int8x16_t ker_3 = vldrbq_z_s8(rhs_3_ptr, p); + acc_3 = vmladavaq_s8(acc_3, ker_3, input); + + lhs_vec += 16; + rhs_0_ptr += 16; + rhs_1_ptr += 16; + rhs_2_ptr += 16; + rhs_3_ptr += 16; + } + rhs += 4 * rhs_cols; + + int32x4_t acc = {acc_0, acc_1, acc_2, acc_3}; + + const int32x4_t rhs_sum = {kernel_sum[0], kernel_sum[1], kernel_sum[2], kernel_sum[3]}; + acc += vdupq_n_s32(lhs_offset) * rhs_sum; + kernel_sum += 4; + + acc = arm_requantize_mve(acc, dst_multiplier, dst_shift); + acc = vaddq_s32(acc, vdupq_n_s32(dst_offset)); 
+ acc = vmaxq_s32(acc, vdupq_n_s32(activation_min)); + acc = vminq_s32(acc, vdupq_n_s32(activation_max)); + + vstrbq_scatter_offset_s32(dst, address_offset_array, acc); + + dst += 4 * address_offset; } - for (int j = col_loop_cnt * 4; j < rhs_cols; j++) + const int loop_cnt = rhs_rows % 4; + for (int32_t i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++) { - const int32_t lhs_temp = (*lhs_vec + lhs_offset); - lhs_vec++; - acc_0 += lhs_temp * (*rhs_0); - rhs_0++; + int32_t acc_0 = 0; + const int32_t col_loop_cnt = (rhs_cols + 15) / 16; + const int8_t *lhs_vec = lhs; + const int8_t *rhs_ptr = rhs; + uint32_t col_cnt = (uint32_t)rhs_cols; + + for (int32_t i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_ptr, p); + acc_0 = vmladavaq_s8(acc_0, ker_0, input); + + lhs_vec += 16; + rhs_ptr += 16; + } + rhs += rhs_cols; + + if (bias) + { + acc_0 += *bias; + bias++; + } + const int32_t rhs_sum = kernel_sum[i_row_loop_cnt]; + const int32_t offsets = rhs_sum * lhs_offset; + acc_0 += offsets; + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + acc_0 += dst_offset; + + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + *dst = MIN(acc_0, activation_max); + dst += address_offset; } - acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); - - // Add offset - acc_0 += dst_offset; - // Clamp the result - acc_0 = MAX(acc_0, activation_min); - acc_0 = MIN(acc_0, activation_max); - *dst = (int8_t)acc_0; - dst += address_offset; - } - -#else - (void)kernel_sum; +#elif defined(ARM_MATH_DSP) + (void)kernel_sum; - const int32_t row_loop_cnt = rhs_rows / 3; + const int32_t row_loop_cnt = rhs_rows / 2; + const int16_t lhs_offset_s16 = (int16_t)lhs_offset; + const uint32_t lhs_offset_s16x2 = PKHBT(lhs_offset_s16, lhs_offset_s16, 16); - for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) - { - const int8_t *lhs_ptr = lhs; - const int8_t *rhs_ptr_0 = &rhs[0]; - const int8_t *rhs_ptr_1 = &rhs[rhs_cols]; - const int8_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; - - int32_t res00 = 0; - int32_t res01 = 0; - int32_t res02 = 0; - if (bias) + for (int32_t i = 0; i < row_loop_cnt; i++) { - res00 = *bias++; - res01 = *bias++; - res02 = *bias++; + int32_t acc_0 = 0; + int32_t acc_1 = 0; + if (bias) + { + acc_0 = *bias++; + acc_1 = *bias++; + } + + const int32_t col_loop_cnt = rhs_cols / 4; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0_ptr = rhs; + const int8_t *rhs_1_ptr = rhs + rhs_cols; + rhs += 2 * rhs_cols; + + for (int32_t j = col_loop_cnt; j != 0; j--) + { + int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + int32_t vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + + int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_0_ptr); + int32_t ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + + ker_0 = arm_nn_read_s8x4_ia(&rhs_1_ptr); + ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + + acc_1 = SMLAD(ker_1, vec_1, acc_1); + acc_1 = SMLAD(ker_0, vec_0, acc_1); + } + + for (int32_t k = col_loop_cnt * 4; k < rhs_cols; k++) + { + const int32_t lhs_temp = (*lhs_vec + lhs_offset); + lhs_vec++; + acc_0 += lhs_temp * (*rhs_0_ptr); + rhs_0_ptr++; + acc_1 += lhs_temp * (*rhs_1_ptr); + rhs_1_ptr++; + } + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + acc_1 = 
arm_nn_requantize(acc_1, dst_multiplier, dst_shift); + + // Add offset + acc_0 += dst_offset; + acc_1 += dst_offset; + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + acc_1 = MAX(acc_1, activation_min); + acc_1 = MIN(acc_1, activation_max); + *dst = (int8_t)acc_0; + *(dst + address_offset) = (int8_t)acc_1; + dst += 2 * address_offset; } - for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + + if (rhs_rows & 0x1) { - const int32_t rhs_value0 = (int8_t)*rhs_ptr_0; - const int32_t rhs_value1 = (int8_t)*rhs_ptr_1; - const int32_t rhs_value2 = (int8_t)*rhs_ptr_2; - const int32_t lhs_value = (int8_t)*lhs_ptr + lhs_offset; - - res00 += lhs_value * rhs_value0; - res01 += lhs_value * rhs_value1; - res02 += lhs_value * rhs_value2; - - ++rhs_ptr_0; - ++rhs_ptr_1; - ++rhs_ptr_2; - ++lhs_ptr; + int32_t acc_0 = 0; + if (bias) + { + acc_0 = *bias++; + } + const int32_t col_loop_cnt = rhs_cols / 4; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_ptr = rhs; + + for (int32_t i = col_loop_cnt; i != 0; i--) + { + int32_t vec_0 = arm_nn_read_s8x4_ia(&lhs_vec); + int32_t vec_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = SXTAB16(lhs_offset_s16x2, vec_0); + + int32_t ker_0 = arm_nn_read_s8x4_ia(&rhs_ptr); + int32_t ker_1 = SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = SXTB16(ker_0); + + acc_0 = SMLAD(ker_1, vec_1, acc_0); + acc_0 = SMLAD(ker_0, vec_0, acc_0); + } + + for (int32_t j = col_loop_cnt * 4; j < rhs_cols; j++) + { + const int32_t lhs_temp = (*lhs_vec + lhs_offset); + lhs_vec++; + acc_0 += lhs_temp * (*rhs_ptr); + rhs_ptr++; + } + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + + // Add offset + acc_0 += dst_offset; + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + *dst = (int8_t)acc_0; + dst += address_offset; } - // Quantize down - res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); - res01 = arm_nn_requantize(res01, dst_multiplier, dst_shift); - res02 = arm_nn_requantize(res02, dst_multiplier, dst_shift); - - // Add offset - res00 += dst_offset; - res01 += dst_offset; - res02 += dst_offset; - - // Clamp the result - res00 = MAX(res00, activation_min); - res00 = MIN(res00, activation_max); - res01 = MAX(res01, activation_min); - res01 = MIN(res01, activation_max); - res02 = MAX(res02, activation_min); - res02 = MIN(res02, activation_max); - - *dst = (int8_t)res00; - *(dst + address_offset) = (int8_t)res01; - *(dst + 2 * address_offset) = (int8_t)res02; - dst += 3 * address_offset; - - rhs += 3 * rhs_cols; - } - const int loop_cnt = rhs_rows % 3; +#else + (void)kernel_sum; - for (int i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++) - { - const int8_t *lhs_ptr = &lhs[0]; - const int8_t *rhs_ptr = &rhs[0]; + const int32_t row_loop_cnt = rhs_rows / 3; - int32_t res00 = 0; - if (bias) + for (int32_t i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) { - res00 = *bias++; + const int8_t *lhs_ptr = lhs; + const int8_t *rhs_ptr_0 = &rhs[0]; + const int8_t *rhs_ptr_1 = &rhs[rhs_cols]; + const int8_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; + + int32_t res00 = 0; + int32_t res01 = 0; + int32_t res02 = 0; + if (bias) + { + res00 = *bias++; + res01 = *bias++; + res02 = *bias++; + } + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + const int32_t rhs_value0 = (int8_t)*rhs_ptr_0; + const int32_t rhs_value1 = (int8_t)*rhs_ptr_1; + const int32_t rhs_value2 = (int8_t)*rhs_ptr_2; + const int32_t lhs_value = 
(int8_t)*lhs_ptr + lhs_offset; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + res02 += lhs_value * rhs_value2; + + ++rhs_ptr_0; + ++rhs_ptr_1; + ++rhs_ptr_2; + ++lhs_ptr; + } + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + res01 = arm_nn_requantize(res01, dst_multiplier, dst_shift); + res02 = arm_nn_requantize(res02, dst_multiplier, dst_shift); + + // Add offset + res00 += dst_offset; + res01 += dst_offset; + res02 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + res01 = MAX(res01, activation_min); + res01 = MIN(res01, activation_max); + res02 = MAX(res02, activation_min); + res02 = MIN(res02, activation_max); + + *dst = (int8_t)res00; + *(dst + address_offset) = (int8_t)res01; + *(dst + 2 * address_offset) = (int8_t)res02; + dst += 3 * address_offset; + + rhs += 3 * rhs_cols; } - for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + const int loop_cnt = rhs_rows % 3; + + for (int32_t i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++) { - int32_t rhs_value0 = (int8_t)rhs_ptr[0]; - int32_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; + const int8_t *lhs_ptr = &lhs[0]; + const int8_t *rhs_ptr = &rhs[0]; - res00 += lhs_value * rhs_value0; + int32_t res00 = 0; + if (bias) + { + res00 = *bias++; + } - ++rhs_ptr; - ++lhs_ptr; - } + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + int32_t rhs_value0 = (int8_t)rhs_ptr[0]; + int32_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; - // Quantize down - res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + res00 += lhs_value * rhs_value0; - // Add offset - res00 += dst_offset; + ++rhs_ptr; + ++lhs_ptr; + } - // Clamp the result - res00 = MAX(res00, activation_min); - res00 = MIN(res00, activation_max); + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); - *dst = (int8_t)res00; - dst += address_offset; - rhs += rhs_cols; - } + // Add offset + res00 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + + *dst = (int8_t)res00; + dst += address_offset; + rhs += rhs_cols; + } #endif + } return ARM_CMSIS_NN_SUCCESS; } diff --git a/Source/SVDFunctions/arm_svdf_s8.c b/Source/SVDFunctions/arm_svdf_s8.c index a97ce3ae..d20e09d7 100644 --- a/Source/SVDFunctions/arm_svdf_s8.c +++ b/Source/SVDFunctions/arm_svdf_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2024 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -21,8 +21,8 @@ * Title: arm_svdf_s8.c * Description: S8 basic SVDF layer function * - * $Date: 5 September 2023 - * $Revision: V.6.0.0 + * $Date: 14 Feb 2024 + * $Revision: V.6.1.0 * * Target : Arm(R) M-Profile Architecture * @@ -133,7 +133,8 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *ctx, feature_batches, in_activation_min, in_activation_max, - time_batches); + time_batches, + 0); if (res != ARM_CMSIS_NN_SUCCESS) { diff --git a/Tests/UnitTest/TestCases/Common/fc_s4_weights_template.json b/Tests/UnitTest/TestCases/Common/fc_weights_template.json similarity index 98% rename from Tests/UnitTest/TestCases/Common/fc_s4_weights_template.json rename to Tests/UnitTest/TestCases/Common/fc_weights_template.json index b7f36037..81bcee83 100644 --- a/Tests/UnitTest/TestCases/Common/fc_s4_weights_template.json +++ 
b/Tests/UnitTest/TestCases/Common/fc_weights_template.json @@ -35,7 +35,7 @@ output_size, input_size ], - "type": "INT4", + "type": "w_type", "buffer": 1, "name" : "tensor_weight", "quantization": { diff --git a/Tests/UnitTest/TestCases/Common/fc_s4_weights_template_null_bias.json b/Tests/UnitTest/TestCases/Common/fc_weights_template_null_bias.json similarity index 98% rename from Tests/UnitTest/TestCases/Common/fc_s4_weights_template_null_bias.json rename to Tests/UnitTest/TestCases/Common/fc_weights_template_null_bias.json index 59f027b7..2f69bf62 100644 --- a/Tests/UnitTest/TestCases/Common/fc_s4_weights_template_null_bias.json +++ b/Tests/UnitTest/TestCases/Common/fc_weights_template_null_bias.json @@ -35,7 +35,7 @@ output_size, input_size ], - "type": "INT4", + "type": "w_type", "buffer": 1, "name" : "tensor_weight", "quantization": { diff --git a/Tests/UnitTest/TestCases/Common/fc_s4_weights_template_null_bias_unpacked.json b/Tests/UnitTest/TestCases/Common/fc_weights_template_null_bias_unpacked.json similarity index 100% rename from Tests/UnitTest/TestCases/Common/fc_s4_weights_template_null_bias_unpacked.json rename to Tests/UnitTest/TestCases/Common/fc_weights_template_null_bias_unpacked.json diff --git a/Tests/UnitTest/TestCases/Common/fc_s4_weights_template_unpacked.json b/Tests/UnitTest/TestCases/Common/fc_weights_template_unpacked.json similarity index 100% rename from Tests/UnitTest/TestCases/Common/fc_s4_weights_template_unpacked.json rename to Tests/UnitTest/TestCases/Common/fc_weights_template_unpacked.json diff --git a/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/biases_data.h b/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/biases_data.h new file mode 100644 index 00000000..3a4badd1 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/biases_data.h @@ -0,0 +1,6 @@ +// Generated by test_settings.py using tensorflow version 2.15.0 (Keras version 2.15.0). +// Interpreter from tensorflow version 2.15.0 and revision v2.15.0-2-g0b15fdfcb3f. +#pragma once +#include + +const int32_t *fully_connected_w_zp_biases = NULL; diff --git a/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/config_data.h b/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/config_data.h new file mode 100644 index 00000000..c59f122c --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/config_data.h @@ -0,0 +1,18 @@ +// Generated by test_settings.py using tensorflow version 2.15.0 (Keras version 2.15.0). +// Interpreter from tensorflow version 2.15.0 and revision v2.15.0-2-g0b15fdfcb3f. 
+#pragma once +#define FULLY_CONNECTED_W_ZP_OUT_CH 6 +#define FULLY_CONNECTED_W_ZP_IN_CH 10 +#define FULLY_CONNECTED_W_ZP_INPUT_W 2 +#define FULLY_CONNECTED_W_ZP_INPUT_H 1 +#define FULLY_CONNECTED_W_ZP_DST_SIZE 18 +#define FULLY_CONNECTED_W_ZP_INPUT_SIZE 20 +#define FULLY_CONNECTED_W_ZP_OUT_ACTIVATION_MIN -128 +#define FULLY_CONNECTED_W_ZP_OUT_ACTIVATION_MAX 127 +#define FULLY_CONNECTED_W_ZP_INPUT_BATCHES 3 +#define FULLY_CONNECTED_W_ZP_OUTPUT_MULTIPLIER 1417628845 +#define FULLY_CONNECTED_W_ZP_OUTPUT_SHIFT -7 +#define FULLY_CONNECTED_W_ZP_ACCUMULATION_DEPTH 20 +#define FULLY_CONNECTED_W_ZP_INPUT_OFFSET -2 +#define FULLY_CONNECTED_W_ZP_FILTER_OFFSET -15 +#define FULLY_CONNECTED_W_ZP_OUTPUT_OFFSET 35 diff --git a/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/input_data.h b/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/input_data.h new file mode 100644 index 00000000..29dcba55 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/input_data.h @@ -0,0 +1,9 @@ +// Generated by test_settings.py using tensorflow version 2.15.0 (Keras version 2.15.0). +// Interpreter from tensorflow version 2.15.0 and revision v2.15.0-2-g0b15fdfcb3f. +#pragma once +#include + +const int8_t fully_connected_w_zp_input[60] = { + 24, -17, 19, -78, -113, 35, -125, -40, 77, -59, -46, -56, -128, 25, 59, 79, 122, 59, -46, -37, + 37, -10, -56, -100, -26, -9, -52, -128, -55, -122, 24, 4, 65, 31, 124, 87, -55, 96, 120, 35, + -104, -18, 4, -90, 35, 82, -111, -111, -31, -117, 20, -84, -29, -45, -118, 86, -47, -50, -69, -35}; diff --git a/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/output_ref_data.h b/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/output_ref_data.h new file mode 100644 index 00000000..8a4a8379 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/output_ref_data.h @@ -0,0 +1,7 @@ +// Generated by test_settings.py using tensorflow version 2.15.0 (Keras version 2.15.0). +// Interpreter from tensorflow version 2.15.0 and revision v2.15.0-2-g0b15fdfcb3f. +#pragma once +#include + +const int8_t fully_connected_w_zp_output_ref[18] = + {-3, 127, -87, 86, 127, -21, 86, 0, -74, 94, 127, -92, 127, 36, 127, 127, 127, 34}; diff --git a/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/test_data.h b/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/test_data.h new file mode 100644 index 00000000..79d78626 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/test_data.h @@ -0,0 +1,7 @@ +// Generated by test_settings.py using tensorflow version 2.15.0 (Keras version 2.15.0). +// Interpreter from tensorflow version 2.15.0 and revision v2.15.0-2-g0b15fdfcb3f. +#include "biases_data.h" +#include "config_data.h" +#include "input_data.h" +#include "output_ref_data.h" +#include "weights_data.h" diff --git a/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/weights_data.h b/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/weights_data.h new file mode 100644 index 00000000..3daff602 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/fully_connected_w_zp/weights_data.h @@ -0,0 +1,12 @@ +// Generated by test_settings.py using tensorflow version 2.15.0 (Keras version 2.15.0). +// Interpreter from tensorflow version 2.15.0 and revision v2.15.0-2-g0b15fdfcb3f. 
+#pragma once +#include + +const int8_t fully_connected_w_zp_weights[120] = { + -44, -43, 22, -123, -26, -126, -6, -8, -94, -46, -15, 89, 76, -47, -114, 28, 49, -54, 4, 8, + 124, -96, 81, 46, -99, 95, -107, -58, 48, 116, 32, -32, -128, -84, 58, -45, 39, -40, 111, -56, + -92, -128, 57, -33, -1, 15, 38, 89, 109, 37, -99, 123, 64, -110, -101, 64, -116, -19, 91, -89, + -102, -31, -101, -76, -27, 68, -112, 41, 49, -42, 30, -122, 109, -89, 31, -52, -127, 9, -120, -17, + 64, 45, -2, 51, -97, -29, -128, -93, 55, -77, -11, 34, -16, 0, -78, -81, -115, 96, 64, -96, + -50, 52, -38, 116, 98, 102, 85, -86, 106, 54, -122, -83, -22, 65, -23, -113, -19, 80, 41, -51}; diff --git a/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/Unity/unity_test_arm_fully_connected_s8.c b/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/Unity/unity_test_arm_fully_connected_s8.c index 41339110..324c31d1 100644 --- a/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/Unity/unity_test_arm_fully_connected_s8.c +++ b/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/Unity/unity_test_arm_fully_connected_s8.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * SPDX-FileCopyrightText: Copyright 2010-2021, 2024 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -46,6 +46,8 @@ void tearDown(void) {} void test_fully_connected_arm_fully_connected_s8(void) { fully_connected_arm_fully_connected_s8(); } +void test_fully_connected_w_zp_arm_fully_connected_s8(void) { fully_connected_w_zp_arm_fully_connected_s8(); } + void test_fully_connected_mve_0_arm_fully_connected_s8(void) { fully_connected_mve_0_arm_fully_connected_s8(); } void test_fully_connected_mve_1_arm_fully_connected_s8(void) { fully_connected_mve_1_arm_fully_connected_s8(); } diff --git a/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c b/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c index be5be41d..d6488c9d 100644 --- a/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c +++ b/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c @@ -25,6 +25,7 @@ #include "../TestData/fully_connected_mve_1/test_data.h" #include "../TestData/fully_connected_null_bias_0/test_data.h" #include "../TestData/fully_connected_out_activation/test_data.h" +#include "../TestData/fully_connected_w_zp/test_data.h" #include "../Utils/validate.h" void fully_connected_arm_fully_connected_s8(void) @@ -95,6 +96,74 @@ void fully_connected_arm_fully_connected_s8(void) TEST_ASSERT_TRUE(validate(output, output_ref, output_ref_size)); } +void fully_connected_w_zp_arm_fully_connected_s8(void) +{ + const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS; + int8_t output[FULLY_CONNECTED_W_ZP_DST_SIZE] = {0}; + + cmsis_nn_context ctx; + cmsis_nn_fc_params fc_params; + cmsis_nn_per_tensor_quant_params quant_params; + cmsis_nn_dims input_dims; + cmsis_nn_dims filter_dims; + cmsis_nn_dims bias_dims; + cmsis_nn_dims output_dims; + + const int32_t *bias_data = fully_connected_w_zp_biases; + const int8_t *kernel_data = fully_connected_w_zp_weights; + const int8_t *input_data = fully_connected_w_zp_input; + const int8_t *output_ref = fully_connected_w_zp_output_ref; + const int32_t output_ref_size = FULLY_CONNECTED_W_ZP_DST_SIZE; + + input_dims.n = FULLY_CONNECTED_W_ZP_INPUT_BATCHES; + input_dims.w = FULLY_CONNECTED_W_ZP_INPUT_W; + input_dims.h = FULLY_CONNECTED_W_ZP_INPUT_H; + input_dims.c = 
FULLY_CONNECTED_W_ZP_IN_CH;
+    filter_dims.n = FULLY_CONNECTED_W_ZP_ACCUMULATION_DEPTH;
+    filter_dims.c = FULLY_CONNECTED_W_ZP_OUT_CH;
+    output_dims.n = FULLY_CONNECTED_W_ZP_INPUT_BATCHES;
+    output_dims.c = FULLY_CONNECTED_W_ZP_OUT_CH;
+
+    fc_params.input_offset = FULLY_CONNECTED_W_ZP_INPUT_OFFSET;
+    fc_params.filter_offset = FULLY_CONNECTED_W_ZP_FILTER_OFFSET;
+    fc_params.output_offset = FULLY_CONNECTED_W_ZP_OUTPUT_OFFSET;
+    fc_params.activation.min = FULLY_CONNECTED_W_ZP_OUT_ACTIVATION_MIN;
+    fc_params.activation.max = FULLY_CONNECTED_W_ZP_OUT_ACTIVATION_MAX;
+
+    quant_params.multiplier = FULLY_CONNECTED_W_ZP_OUTPUT_MULTIPLIER;
+    quant_params.shift = FULLY_CONNECTED_W_ZP_OUTPUT_SHIFT;
+
+    const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
+    ctx.buf = malloc(buf_size);
+    ctx.size = buf_size;
+
+#if defined(ARM_MATH_MVEI)
+    int32_t *buf = ctx.buf;
+    TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data, 1, NULL));
+#endif
+
+    arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx,
+                                                        &fc_params,
+                                                        &quant_params,
+                                                        &input_dims,
+                                                        input_data,
+                                                        &filter_dims,
+                                                        kernel_data,
+                                                        &bias_dims,
+                                                        bias_data,
+                                                        &output_dims,
+                                                        output);
+
+    if (ctx.buf)
+    {
+        // The caller is responsible to clear the scratch buffers for security reasons if applicable.
+        memset(ctx.buf, 0, buf_size);
+        free(ctx.buf);
+    }
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate(output, output_ref, output_ref_size));
+}
+
 void fully_connected_mve_0_arm_fully_connected_s8(void)
 {
     const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS;
diff --git a/Tests/UnitTest/conv_settings.py b/Tests/UnitTest/conv_settings.py
index f16c688b..f81d5c92 100644
--- a/Tests/UnitTest/conv_settings.py
+++ b/Tests/UnitTest/conv_settings.py
@@ -93,18 +93,15 @@ def __init__(self,
                raise RuntimeError("out channel ({}) is not multiple of in channel ({})".format(out_ch, in_ch))
            if groups != 1:
                raise RuntimeError("ERROR: Groups cannot be used for depthwise convolution")
+        else:
+            self.channel_multiplier = 0
 
        self.filter_ch = in_ch // groups
        if in_ch % groups != 0:
-            print(in_ch)
-            print(groups)
            raise RuntimeError("ERROR: Number of input channels must be an even multiple of groups")
        if out_ch % groups != 0:
            raise RuntimeError("ERROR: Number of output channels must be an even multiple of groups")
-        else:
-            self.channel_multiplier = 0
-
        if self.int4_weights:
            if self.test_type == 'conv':
                self.json_template = "TestCases/Common/conv2d_s4_weights_template.json"
@@ -149,7 +146,6 @@ def generate_quantize_per_channel_multiplier(self):
 
        return per_channel_multiplier, per_channel_shift
 
-    # TODO
    def quantize_float_data(self, data=None, quantization_bit_range=8, quantization_type="affine", tf_tensor=False):
        if data is not None:
            if tf_tensor:
@@ -162,13 +158,13 @@ def quantize_float_data(self, data=None, quantization_bit_range=8, quantization_
            data_max = max(data_max, 0.0)
 
            scale = (data_max - data_min) / (pow(2, quantization_bit_range) - 1)
-            zero_point = -(round(data_max * scale)) - pow(2, quantization_bit_range-1)
-            zero_point = max(zero_point, pow(quantization_bit_range-1) - 1)
-            zero_point = min(zero_point, -pow(quantization_bit_range-1))
+            zero_point = -(round(data_max * scale)) - pow(2, quantization_bit_range - 1)
+            zero_point = min(zero_point, pow(2, quantization_bit_range - 1) - 1)
+            zero_point = max(zero_point, -pow(2, quantization_bit_range - 1))
 
        elif quantization_type.lower() == "symmetric":
            absolute_max = max(abs(data_min), abs(data_max))
-            scale = absolute_max / (pow(2,
quantization_bit_range-1) - 1) + scale = absolute_max / (pow(2, quantization_bit_range - 1) - 1) zero_point = 0 else: @@ -283,7 +279,8 @@ def generate_data(self, input_data=None, weights=None, biases=None) -> None: generated_json = self.generate_json_from_template( None, weights, int8_time_weights=True, bias_data=biases, bias_buffer=3) else: - generated_json = self.generate_json_from_template(weights, int8_time_weights=False, bias_data=quant_bias, bias_buffer=2) + generated_json = self.generate_json_from_template(weights, int8_time_weights=False, + bias_data=quant_bias, bias_buffer=2) self.flatc_generate_tflite(generated_json, self.schema_file) @@ -317,7 +314,7 @@ def generate_data(self, input_data=None, weights=None, biases=None) -> None: padding=self.padding, input_shape=input_shape[1:], dilation_rate=(self.dilation_y, self.dilation_x), - groups=self.groups) + groups=self.groups) model.add(conv_layer) conv_layer.set_weights([weights, biases]) elif self.test_type == 'depthwise_conv': @@ -335,7 +332,8 @@ def generate_data(self, input_data=None, weights=None, biases=None) -> None: strides=(self.stride_y, self.stride_x), padding=self.padding, input_shape=input_shape[1:], - dilation_rate=(self.dilation_y, self.dilation_x), + dilation_rate=(self.dilation_y, + self.dilation_x), use_bias=self.generate_bias) model.add(transposed_conv_layer) if self.generate_bias: diff --git a/Tests/UnitTest/fully_connected_settings.py b/Tests/UnitTest/fully_connected_settings.py index 74d33485..9a60b37f 100644 --- a/Tests/UnitTest/fully_connected_settings.py +++ b/Tests/UnitTest/fully_connected_settings.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2010-2024 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -85,17 +85,22 @@ def __init__(self, interpreter=interpreter, int4_weights=int4_weights) - if self.int4_weights: + self.filter_zero_point = w_zp + + if self.int4_weights or self.filter_zero_point: if self.generate_bias: - self.json_template = "TestCases/Common/fc_s4_weights_template.json" + self.json_template = "TestCases/Common/fc_weights_template.json" else: - self.json_template = "TestCases/Common/fc_s4_weights_template_null_bias.json" + self.json_template = "TestCases/Common/fc_weights_template_null_bias.json" + + weight_type = "INT4" if self.int4_weights else "INT8" self.json_replacements = { "batches": batches, "input_size": in_ch * x_in * y_in, "input_scale": input_scale, "input_zp": input_zp, + "w_type": weight_type, "w_scale": w_scale, "w_zp": w_zp, "bias_size": out_ch, @@ -118,6 +123,7 @@ def write_c_config_header(self) -> None: f.write("#define {}_OUTPUT_SHIFT {}\n".format(prefix, self.quantized_shift)) f.write("#define {}_ACCUMULATION_DEPTH {}\n".format(prefix, self.input_ch * self.x_input * self.y_input)) f.write("#define {}_INPUT_OFFSET {}\n".format(prefix, -self.input_zero_point)) + f.write("#define {}_FILTER_OFFSET {}\n".format(prefix, -self.filter_zero_point)) f.write("#define {}_OUTPUT_OFFSET {}\n".format(prefix, self.output_zero_point)) def quantize_multiplier(self, weights_scale): @@ -151,7 +157,30 @@ def generate_data(self, input_data=None, weights=None, biases=None) -> None: else: biases = None - if self.int4_weights: + if self.filter_zero_point: + temp1 = self.model_path + temp2 = self.json_template + + fc_weights_format = [self.input_ch * self.y_input * self.x_input * self.output_ch] + if weights is not None: + weights = tf.reshape(weights, fc_weights_format) + else: + 
weights = self.get_randomized_data(fc_weights_format,
+                                                   self.kernel_table_file,
+                                                   minrange=TestSettings.INT8_MIN,
+                                                   maxrange=TestSettings.INT8_MAX,
+                                                   regenerate=self.regenerate_new_weights)
+
+            self.model_path = self.model_path
+            self.json_template = self.json_template
+            generated_json = self.generate_json_from_template(weights, bias_data=biases, bias_buffer=2)
+            self.flatc_generate_tflite(generated_json, self.schema_file)
+
+            weights_size = weights.numpy().size
+            filter_index = 1
+            bias_index = 2
+
+        elif self.int4_weights:
            # Generate weights, both packed and unpacked model from JSON
            temp1 = self.model_path
            temp2 = self.json_template
@@ -226,6 +255,10 @@ def generate_data(self, input_data=None, weights=None, biases=None) -> None:
                (self.generate_bias and biases.numpy().size != interpreter.get_tensor(bias_layer['index']).size):
            raise RuntimeError(f"Dimension mismatch for {self.testdataset}")
 
+        weights_zero_point = filter_layer['quantization_parameters']['zero_points'][0]
+        if weights_zero_point != self.filter_zero_point:
+            raise RuntimeError(f"Filter zero point mismatch for {self.filter_zero_point}")
+
        self.x_output = 1
        self.y_output = 1
 
diff --git a/Tests/UnitTest/generate_test_data.py b/Tests/UnitTest/generate_test_data.py
index c4790136..18d0c1bb 100755
--- a/Tests/UnitTest/generate_test_data.py
+++ b/Tests/UnitTest/generate_test_data.py
@@ -2028,6 +2028,27 @@ def load_testdata_sets(regenerate_input, regenerate_weights, regenerate_biases,
                                                     y_in=1,
                                                     batches=3,
                                                     interpreter=interpreter)
+    dataset = 'fully_connected_w_zp'
+    testdata_sets[dataset] = FullyConnectedSettings(dataset,
+                                                    type_of_test,
+                                                    regenerate_weights,
+                                                    regenerate_input,
+                                                    regenerate_biases,
+                                                    schema_file,
+                                                    in_ch=10,
+                                                    out_ch=6,
+                                                    x_in=2,
+                                                    y_in=1,
+                                                    batches=3,
+                                                    input_scale=0.034,
+                                                    w_scale=0.054,
+                                                    bias_scale=0.00000001,
+                                                    output_scale=0.356,
+                                                    input_zp=2,
+                                                    output_zp=35,
+                                                    w_zp=15,
+                                                    generate_bias=False,
+                                                    interpreter=interpreter)
    dataset = 'fully_connected_mve_0'
    testdata_sets[dataset] = FullyConnectedSettings(dataset,
                                                    type_of_test,
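
Note on the math behind the new rhs_offset path: with a non-zero filter offset the kernel can no longer fold all offset work into the precomputed kernel_sum alone. It relies on the expansion sum_i (lhs_i + lhs_offset) * (rhs_i + rhs_offset) = sum_i lhs_i * rhs_i + lhs_offset * sum_i rhs_i + rhs_offset * sum_i lhs_i + rhs_cols * lhs_offset * rhs_offset, which is why the MVE path now accumulates lhs_sum with vaddvaq_s8 alongside the vmladavaq_s8 dot product and adds two extra correction terms before requantization. Below is a minimal scalar sketch of that identity for a single output row; dot_with_offsets is an illustrative helper, not a CMSIS-NN API, and the offsets and sample values are borrowed from the new fully_connected_w_zp test data.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Hoisted-offset form: raw s8 dot product plus three correction terms,
   mirroring acc += lhs_offset * kernel_sum + rhs_offset * lhs_sum
                  + rhs_cols * lhs_offset * rhs_offset in the kernel. */
static int32_t dot_with_offsets(const int8_t *lhs, const int8_t *rhs, int32_t cols,
                                int32_t lhs_offset, int32_t rhs_offset)
{
    int32_t acc = 0;     /* plays the role of the vmladavaq_s8 accumulation */
    int32_t lhs_sum = 0; /* plays the role of the vaddvaq_s8 accumulation   */
    int32_t rhs_sum = 0; /* normally precomputed per row into kernel_sum    */

    for (int32_t i = 0; i < cols; i++)
    {
        acc += lhs[i] * rhs[i];
        lhs_sum += lhs[i];
        rhs_sum += rhs[i];
    }
    return acc + lhs_offset * rhs_sum + rhs_offset * lhs_sum + cols * lhs_offset * rhs_offset;
}

int main(void)
{
    const int8_t lhs[4] = {24, -17, 19, -78};   /* first inputs of the test vector   */
    const int8_t rhs[4] = {-44, -43, 22, -123}; /* first weights of the test matrix  */
    const int32_t lhs_offset = -2;              /* FULLY_CONNECTED_W_ZP_INPUT_OFFSET  */
    const int32_t rhs_offset = -15;             /* FULLY_CONNECTED_W_ZP_FILTER_OFFSET */

    /* Direct form: apply both offsets per element, as the scalar fallback
       path of arm_nn_vec_mat_mult_t_s8 does when rhs_offset is non-zero. */
    int32_t direct = 0;
    for (int32_t i = 0; i < 4; i++)
    {
        direct += (lhs[i] + lhs_offset) * (rhs[i] + rhs_offset);
    }

    /* Both forms print the same value. */
    printf("hoisted = %" PRId32 ", direct = %" PRId32 "\n",
           dot_with_offsets(lhs, rhs, 4, lhs_offset, rhs_offset), direct);
    return 0;
}

The if (rhs_offset) split at the top of arm_nn_vec_mat_mult_t_s8 keeps the common zero-filter-offset case on the previous code path, so fully connected layers without a filter zero point pay nothing for the new feature.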