From 429fb5c02acf1a0c7f7340e6eb4d94a592fd0ee8 Mon Sep 17 00:00:00 2001
From: Ryan OShea <86965113+ArmRyan@users.noreply.github.com>
Date: Tue, 30 Apr 2024 16:18:19 +0200
Subject: [PATCH] MVEI support for depthwise conv s4 (#133)

  * Adds new file arm_nn_depthwise_conv_nt_t_s4.c
  * Adds MVEI optimization to arm_depthwise_conv_s4_opt.c
  * Updates get buffer size functions for depthwise conv s4

---------

Signed-off-by: Ryan O'Shea <ryan.oshea3@arm.com>
---
 ARM.CMSIS-NN.pdsc                             |   1 +
 Include/arm_nnsupportfunctions.h              |  69 +++++-
 README.md                                     |   2 +-
 .../arm_depthwise_conv_get_buffer_sizes_s4.c  |  24 +-
 .../arm_depthwise_conv_get_buffer_sizes_s8.c  |   9 +-
 .../arm_depthwise_conv_s4_opt.c               | 175 +++++++++++++-
 .../arm_nn_depthwise_conv_nt_t_s4.c           | 227 ++++++++++++++++++
 7 files changed, 482 insertions(+), 25 deletions(-)
 create mode 100644 Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s4.c
diff --git a/ARM.CMSIS-NN.pdsc b/ARM.CMSIS-NN.pdsc
index c6bb60dd..852b466d 100644
--- a/ARM.CMSIS-NN.pdsc
+++ b/ARM.CMSIS-NN.pdsc
@@ -109,6 +109,7 @@
         <file category="source" name="Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c"/>
         <file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c"/>
         <file category="source" name="Source/NNSupportFunctions/arm_nntables.c"/>
+        <file category="source" name="Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s4.c"/>
         <file category="source" name="Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c"/>
         <file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s4.c"/>
         <file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c"/>
diff --git a/Include/arm_nnsupportfunctions.h b/Include/arm_nnsupportfunctions.h
index d739f512..434c7190 100644
--- a/Include/arm_nnsupportfunctions.h
+++ b/Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
  * Title:        arm_nnsupportfunctions.h
  * Description:  Public header file of support functions for CMSIS NN Library
  *
- * $Date:        23 April 2024
- * $Revision:    V.21.0.0
+ * $Date:        30 April 2024
+ * $Revision:    V.21.1.0
  *
  * Target :  Arm(R) M-Profile Architecture
  * -------------------------------------------------------------------- */
@@ -55,7 +55,7 @@ extern "C" {
 #define CLAMP(x, h, l) MAX(MIN((x), (h)), (l))
 #define REDUCE_MULTIPLIER(_mult) ((_mult < 0x7FFF0000) ? ((_mult + (1 << 15)) >> 16) : 0x7FFF)
 
-// Number of channels processed in a block for DW Conv(MVE)
+// Number of channels processed in a block for DW Conv with Int8 weights(MVE)
 // Requirement: Greater than 0 & less than 128
 // This can be fine tuned to match number of input channels for best performance.
 // A layer with lower number of channels than CH_IN_BLOCK_MVE will result in higher
@@ -63,6 +63,11 @@ extern "C" {
 // will result in lower scratch buffer usage.
 #define CH_IN_BLOCK_MVE (124)
 
+// Number of channels processed in a block for DW Conv with Int4 weights(MVE)
+// Requirement: See CH_IN_BLOCK_MVE.
+// Additionally, for this int4 variant the value must be even, as two weights are packed per byte.
+#define S4_CH_IN_BLOCK_MVE (124)
+
 // For input of int16 when number of columns are above this limit int64 accumulation is needed
 // to not loose precision.
 #define MAX_COL_COUNT (512)
@@ -178,6 +183,20 @@ void arm_s8_to_s16_unordered_with_offset(const int8_t *src, int16_t *dst, int32_
 
 #endif
 
+/**
+ * @brief Get the required buffer size for optimized s8 depthwise convolution
+ *        function with constraint that in_channel equals out_channel.
+ *        This is for processors with MVE extension.
+ *        Refer to arm_depthwise_conv_s8_opt_get_buffer_size() for function argument details.
+ *
+ * @note  Intended for compilation on Host. If compiling for an Arm target, use
+ *        arm_depthwise_conv_s8_opt_get_buffer_size(). Note also this is a support function,
+ *        so not recommended to call directly even on Host.
+ *
+ */
+int32_t arm_depthwise_conv_s8_opt_get_buffer_size_mve(const cmsis_nn_dims *input_dims,
+                                                      const cmsis_nn_dims *filter_dims);
+
 /**
  * @brief Get the required buffer size for optimized s8 depthwise convolution
  *        function with constraint that in_channel equals out_channel.
@@ -784,6 +803,50 @@ arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const int8_t *lhs,
                                                   const int32_t *const output_bias,
                                                   int8_t *out);
 
+/**
+ * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. rhs
+ * consists of packed int4 data. Dimensions are the same for lhs and rhs.
+ *
+ * @param[in]      lhs             Input left-hand side matrix
+ * @param[in]      rhs             Input right-hand side matrix (transposed). Consists of int4 data packed in an int8
+ * buffer.
+ * @param[in]      lhs_offset      LHS matrix offset(input offset). Range: -127 to 128
+ * @param[in]      active_ch       Subset of total_ch processed
+ * @param[in]      total_ch        Number of channels in LHS/RHS
+ * @param[in]      out_shift       Per channel output shift. Length of vector is equal to number of channels.
+ * @param[in]      out_mult        Per channel output multiplier. Length of vector is equal to number of channels.
+ * @param[in]      out_offset      Offset to be added to the output values. Range: -127 to 128
+ * @param[in]      activation_min  Minimum value to clamp the output to. Range: int8
+ * @param[in]      activation_max  Maximum value to clamp the output to. Range: int8
+ * @param[in]      row_x_col       (row_dimension * col_dimension) of LHS/RHS matrix
+ * @param[in]      output_bias     Per channel output bias. Length of vector is equal to number of channels.
+ * @param[out]     out             Output pointer
+ *
+ * @return         The function returns one of the two
+ *                  - ARM_CMSIS_NN_SUCCESS if an implementation is available
+ *                  - ARM_CMSIS_NN_NO_IMPL_ERROR if no implementation is available.
+ *
+ * @note           If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be
+ * read for the following.
+ *                  - Output shift
+ *                  - Output multiplier
+ *                  - Output bias
+ *                  - rhs
+ */
+arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s4(const int8_t *lhs,
+                                                  const int8_t *rhs,
+                                                  const int32_t lhs_offset,
+                                                  const int32_t active_ch,
+                                                  const int32_t total_ch,
+                                                  const int32_t *out_shift,
+                                                  const int32_t *out_mult,
+                                                  const int32_t out_offset,
+                                                  const int32_t activation_min,
+                                                  const int32_t activation_max,
+                                                  const uint16_t row_x_col,
+                                                  const int32_t *const output_bias,
+                                                  int8_t *out);
+
 /**
  * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases.
  *        Dimensions are the same for lhs and rhs.
diff --git a/README.md b/README.md
index f2ddba61..929ac84e 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ Examples are Cortex-M55 or Cortex-M85 configured with MVE.
 | Operator        | C <br> int8 | C<br>int16 | C<br>int4* | DSP<br>int8 | DSP<br>int16 | DSP<br>int4* | MVE<br>int8 | MVE<br>int16 | MVE<br>int4* |
 | --------------- | ----------- | ---------- |------------|-------------| -------------|--------------|-------------| -------------|--------------|
 | Conv2D          | Yes         | Yes        | Yes        | Yes         | Yes          | Yes          | Yes         | Yes          | Yes          |
-| DepthwiseConv2D | Yes         | Yes        | Yes        | Yes         | Yes          | Yes          | Yes         | Yes          | No           |
+| DepthwiseConv2D | Yes         | Yes        | Yes        | Yes         | Yes          | Yes          | Yes         | Yes          | Yes          |
 | TransposeConv2D | Yes         | No         | No         | Yes         | No           | No           | Yes         | No           | No           |
 | Fully Connected | Yes         | Yes        | Yes        | Yes         | Yes          | Yes          | Yes         | Yes          | No           |
 | Add             | Yes         | Yes        | N/A        | Yes         | Yes          | N/A          | Yes         | Yes          | N/A          |
diff --git a/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s4.c b/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s4.c
index 452a863f..084ca2eb 100644
--- a/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s4.c
+++ b/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s4.c
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
+ * SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -18,11 +18,11 @@
 
 /* ----------------------------------------------------------------------
  * Project:      CMSIS NN Library
- * Title:        arm_depthwise_conv_get_buffer_sizes_s8.c
- * Description:  Collection of get buffer size functions for the various s8 convolution layer functions.
+ * Title:        arm_depthwise_conv_get_buffer_sizes_s4.c
+ * Description:  Collection of get buffer size functions for the various s4 depthwise convolution layer functions.
  *
- * $Date:        30 October 2023
- * $Revision:    V.1.0.0
+ * $Date:        17 April 2024
+ * $Revision:    V.1.1.0
  *
  * Target :  Arm(R) M-Profile Architecture
  *
@@ -42,7 +42,11 @@
 
 int32_t arm_depthwise_conv_s4_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
 {
+#if defined(ARM_MATH_MVEI)
+    return arm_depthwise_conv_s8_opt_get_buffer_size_mve(input_dims, filter_dims);
+#else
     return arm_depthwise_conv_s8_opt_get_buffer_size_dsp(input_dims, filter_dims);
+#endif
 }
 
 int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
@@ -74,7 +78,15 @@ int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size_mve(const cmsis_nn_dw_conv
                                                           const cmsis_nn_dims *filter_dims,
                                                           const cmsis_nn_dims *output_dims)
 {
-    return arm_depthwise_conv_wrapper_s4_get_buffer_size(dw_conv_params, input_dims, filter_dims, output_dims);
+    int32_t size = 0;
+
+    if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 &&
+        dw_conv_params->dilation.h == 1)
+    {
+        size = arm_depthwise_conv_s8_opt_get_buffer_size_mve(input_dims, filter_dims);
+    }
+
+    return size;
 }
 
 /**
diff --git a/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c b/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c
index 94a8f520..b477982e 100644
--- a/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c
+++ b/Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
+ * SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_depthwise_conv_get_buffer_sizes_s8.c
  * Description:  Collection of get buffer size functions for the various s8 convolution layer functions.
  *
- * $Date:        30 October 2023
- * $Revision:    V.1.1.0
+ * $Date:        17 April 2024
+ * $Revision:    V.1.2.0
  *
  * Target :  Arm(R) M-Profile Architecture
  *
@@ -40,8 +40,7 @@
  * @{
  */
 
-__STATIC_INLINE int32_t arm_depthwise_conv_s8_opt_get_buffer_size_mve(const cmsis_nn_dims *input_dims,
-                                                                      const cmsis_nn_dims *filter_dims)
+int32_t arm_depthwise_conv_s8_opt_get_buffer_size_mve(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
 {
     (void)input_dims;
     return (4 * CH_IN_BLOCK_MVE * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int8_t);
diff --git a/Source/ConvolutionFunctions/arm_depthwise_conv_s4_opt.c b/Source/ConvolutionFunctions/arm_depthwise_conv_s4_opt.c
index 253e29d0..bd3850d6 100644
--- a/Source/ConvolutionFunctions/arm_depthwise_conv_s4_opt.c
+++ b/Source/ConvolutionFunctions/arm_depthwise_conv_s4_opt.c
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
+ * SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -22,8 +22,8 @@
  * Description:  Optimized s4 depthwise separable convolution function for
  *               channel multiplier of 1.
  *
- * $Date:        31 October 2023
- * $Revision:    V.1.0.0
+ * $Date:        17 April 2024
+ * $Revision:    V.1.1.0
  *
  * Target :  Arm(R) M-Profile Architecture
  *
@@ -94,6 +94,161 @@ arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx,
     const int32_t output_activation_max = dw_conv_params->activation.max;
     int16_t *buffer_a = (int16_t *)ctx->buf;
 
+#ifdef ARM_MATH_MVEI
+    /* Generate up to four columns from the input tensor before each kernel call */
+    int8_t *lhs_buffer = (int8_t *)buffer_a;
+    int8_t *out = output;
+    int buffer_count = 0;
+    const int32_t kernel_size = kernel_x * kernel_y;
+
+    const int32_t ch_loop = (input_ch + (S4_CH_IN_BLOCK_MVE - 1)) / S4_CH_IN_BLOCK_MVE;
+    int32_t remaining_ch = output_ch;
+    int32_t active_ch = MIN(S4_CH_IN_BLOCK_MVE, remaining_ch);
+    remaining_ch -= S4_CH_IN_BLOCK_MVE;
+
+    for (int i_ch = 0; i_ch < ch_loop; i_ch++)
+    {
+        out = output + i_ch * S4_CH_IN_BLOCK_MVE;
+        const int8_t *input_slice = input + (i_ch * S4_CH_IN_BLOCK_MVE);
+
+        for (int i_out_y = 0, base_idx_y = -pad_y; i_out_y < output_y; base_idx_y += stride_y, i_out_y++)
+        {
+            for (int i_out_x = 0, base_idx_x = -pad_x; i_out_x < output_x; base_idx_x += stride_x, i_out_x++)
+            {
+                for (int i_ker_y = base_idx_y; i_ker_y < base_idx_y + kernel_y; i_ker_y++)
+                {
+                    for (int i_ker_x = base_idx_x; i_ker_x < base_idx_x + kernel_x; i_ker_x++)
+                    {
+                        if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
+                        {
+                            arm_memset_s8(lhs_buffer, (int8_t)-input_offset, (uint32_t)active_ch);
+                        }
+                        else
+                        {
+                            arm_memcpy_s8(lhs_buffer,
+                                          input_slice + (i_ker_y * input_x + i_ker_x) * input_ch,
+                                          (uint32_t)active_ch);
+                        }
+                        lhs_buffer += S4_CH_IN_BLOCK_MVE;
+                    }
+                }
+                buffer_count++;
+
+                if (buffer_count == 4)
+                {
+                    const int32_t block_offset = i_ch * S4_CH_IN_BLOCK_MVE;
+                    lhs_buffer = (int8_t *)buffer_a;
+                    arm_nn_depthwise_conv_nt_t_s4(lhs_buffer,
+                                                  kernel + (block_offset >> 1),
+                                                  input_offset,
+                                                  active_ch,
+                                                  input_ch,
+                                                  output_shift + block_offset,
+                                                  output_mult + block_offset,
+                                                  output_offset,
+                                                  output_activation_min,
+                                                  output_activation_max,
+                                                  kernel_size,
+                                                  bias + block_offset,
+                                                  out);
+
+                    out += (4 * input_ch);
+                    buffer_count = 0;
+                }
+            }
+        }
+        /* Handle left over buffers */
+        lhs_buffer = (int8_t *)buffer_a;
+
+        int8_t *out_base = out;
+        const uint32x4_t gather_offset = {0, 0, 1, 1};
+        const mve_pred16_t lower_nibble_mask = 3855; // 0000111100001111
+        for (int i_buf = 0; i_buf < buffer_count; i_buf++)
+        {
+            int32_t loop_count = (active_ch + 3) / 4;
+            int32_t num_ch_to_process = active_ch;
+            out = out_base + (i_buf * input_ch);
+            for (int i_loop_cnt = 0, offset = i_ch * S4_CH_IN_BLOCK_MVE; i_loop_cnt < loop_count;
+                 num_ch_to_process -= 4, offset += 4, i_loop_cnt++)
+            {
+                const int8_t *col_0 = lhs_buffer + (kernel_size * S4_CH_IN_BLOCK_MVE * i_buf) + (i_loop_cnt * 4);
+                const int8_t *row_0 = kernel + (offset >> 1);
+                int32x4_t out_0 = vdupq_n_s32(0);
+                if (bias)
+                {
+                    out_0 = vldrwq_s32(&bias[offset]);
+                }
+
+                if (input_ch % 2)
+                {
+                    int get_low_nibble = 1;
+                    for (int i_ker = 0; i_ker < kernel_size; i_ker++)
+                    {
+                        int32x4_t ker_0;
+                        if (get_low_nibble)
+                        {
+                            ker_0 = vldrbq_gather_offset_s32(row_0, gather_offset);
+                            ker_0 = vrshlq_m_n_s32(ker_0, 28, lower_nibble_mask);
+                            ker_0 = vshrq_m_n_s32(ker_0, ker_0, 24, lower_nibble_mask);
+
+                            ker_0 = vshrq_n_s32(ker_0, 4);
+                        }
+                        else
+                        {
+                            int8_t temp[] = {row_0[0] >> 4,
+                                             (int8_t)(row_0[1] << 4) >> 4,
+                                             row_0[1] >> 4,
+                                             (int8_t)(row_0[2] << 4) >> 4};
+                            ker_0 = vldrbq_s32(temp);
+                        }
+
+                        int32x4_t ip_0 = vldrbq_s32(col_0);
+                        ip_0 = vaddq_n_s32(ip_0, input_offset);
+                        out_0 += vmulq_s32(ip_0, ker_0);
+
+                        get_low_nibble = !get_low_nibble;
+                        col_0 += S4_CH_IN_BLOCK_MVE;
+                        row_0 += (input_ch >> 1) + get_low_nibble;
+                    }
+                }
+                else
+                {
+                    for (int i_ker = 0; i_ker < kernel_size; i_ker++)
+                    {
+                        int32x4_t ker_0 = vldrbq_gather_offset_s32(row_0, gather_offset);
+                        ker_0 = vrshlq_m_n_s32(ker_0, 28, lower_nibble_mask);
+                        ker_0 = vshrq_m_n_s32(ker_0, ker_0, 24, lower_nibble_mask);
+
+                        ker_0 = vshrq_n_s32(ker_0, 4);
+
+                        int32x4_t ip_0 = vldrbq_s32(col_0);
+                        ip_0 = vaddq_n_s32(ip_0, input_offset);
+                        out_0 += vmulq_s32(ip_0, ker_0);
+
+                        col_0 += S4_CH_IN_BLOCK_MVE;
+                        row_0 += input_ch >> 1;
+                    }
+                }
+
+                const int32x4_t mult = vldrwq_s32(&output_mult[offset]);
+                const int32x4_t shift = vldrwq_s32(&output_shift[offset]);
+
+                out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
+                out_0 = vaddq_n_s32(out_0, output_offset);
+                out_0 = vmaxq_s32(out_0, vdupq_n_s32(output_activation_min));
+                out_0 = vminq_s32(out_0, vdupq_n_s32(output_activation_max));
+                mve_pred16_t p = vctp32q((uint32_t)num_ch_to_process);
+                vstrbq_p_s32(out, out_0, p);
+
+                out += 4;
+            }
+        }
+        buffer_count = 0;
+
+        active_ch = MIN(S4_CH_IN_BLOCK_MVE, remaining_ch);
+        remaining_ch -= S4_CH_IN_BLOCK_MVE;
+    }
+#else
     int16_t *const col_buffer_start = buffer_a;
     int16_t *col_buffer = col_buffer_start;
     const int32_t *const bias_start_pos = bias;
@@ -186,7 +341,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx,
 
                     while (col_count)
                     {
-#ifdef ARM_MATH_DSP
+    #ifdef ARM_MATH_DSP
                         /* General idea is to read 4 + 4 (input, kernel) pair and re-arrange them in the right order to
                            use in a SMLAD instruction . One run of this loop produces 4 partial outputs with 8 MACs. */
                         /* Note: variable names can be improved here to align with rows and columns. */
@@ -219,7 +374,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx,
                         op_b = PKHTB(ip_a1, ip_b1, 16);
                         sum_4 = SMLAD(op_a, op_b, sum_4);
 
-#else
+    #else
                         int8_t ker0, ker1, ker2, ker3, ker00, ker11;
 
                         ker00 = row_pos[0];
@@ -245,7 +400,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx,
                         sum_3 += ker2 * col_pos[2 + input_ch];
                         sum_4 += ker3 * col_pos[3 + input_ch];
 
-#endif
+    #endif
                         row_pos += (input_ch);
                         col_pos += input_ch << 1;
 
@@ -384,7 +539,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx,
                     row_shift += 2;
                     col_shift += 4;
 
-#ifdef ARM_MATH_DSP
+    #ifdef ARM_MATH_DSP
                     while (col_count)
                     {
                         /* General idea is to read 4 + 4 (input, kernel) pair and re-arrange them in the right order to
@@ -426,9 +581,9 @@ arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx,
                     }
 
                     col_count = num_cols & 0x1;
-#else
+    #else
                     col_count = num_cols;
-#endif
+    #endif
                     while (col_count)
                     {
                         int8_t ker0, ker1, ker2, ker3, ker00, ker11;
@@ -524,7 +679,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx,
             col_buffer = col_buffer_start;
         }
     }
-
+#endif // ARM_MATH_MVEI
     /* Return to application */
     return ARM_CMSIS_NN_SUCCESS;
 }
diff --git a/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s4.c b/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s4.c
new file mode 100644
index 00000000..906d2e19
--- /dev/null
+++ b/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s4.c
@@ -0,0 +1,227 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2010-2020, 2022, 2024 Arm Limited and/or its affiliates
+ * <open-source-office@arm.com>
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_depthwise_conv_nt_t_s4.c
+ * Description:  Depthwise convolution on matrices with no padding and packed int4 weights.
+ *
+ * $Date:        05 April 2024
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor:  Cortex-M processors with MVE extension.
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @addtogroup supportConvolution
+ * @{
+ */
+
+/*
+ * Depthwise convolution of rhs matrix with 4 lhs matrices with no padding and packed int4 weights.
+ * Dimensions are the same for lhs and rhs.
+ *
+ * Refer to the header file for details.
+ *
+ */
+arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s4(const int8_t *lhs,
+                                                  const int8_t *rhs,
+                                                  const int32_t input_offset,
+                                                  const int32_t active_ch,
+                                                  const int32_t total_ch,
+                                                  const int32_t *out_shift,
+                                                  const int32_t *out_mult,
+                                                  const int32_t out_offset,
+                                                  const int32_t activation_min,
+                                                  const int32_t activation_max,
+                                                  const uint16_t row_x_col,
+                                                  const int32_t *const output_bias,
+                                                  int8_t *out)
+{
+#if defined(ARM_MATH_MVEI)
+    const int32_t *bias = output_bias;
+    int32_t loop_count = (active_ch + 3) / 4;
+    uint32_t num_ch_to_process = active_ch;
+    const uint32x4_t gather_offset = {0, 0, 1, 1};
+    const mve_pred16_t lower_nibble_mask = 3855; // 0000111100001111
+
+    for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count;
+         num_ch_to_process -= 4, offset += 4, out += 4, i_loop_cnt++)
+    {
+        int32x4_t out_0 = vdupq_n_s32(0);
+        if (bias)
+        {
+            out_0 = vldrwq_s32(bias);
+            bias += 4;
+        }
+        int32x4_t out_1 = out_0;
+        int32x4_t out_2 = out_0;
+        int32x4_t out_3 = out_0;
+
+        const int8_t *rhs_0 = rhs + (offset >> 1);
+        const int8_t *lhs_0 = lhs + offset;
+        const int8_t *lhs_1 = lhs + row_x_col * S4_CH_IN_BLOCK_MVE + offset;
+        const int8_t *lhs_2 = lhs + (row_x_col * S4_CH_IN_BLOCK_MVE * 2) + offset;
+        const int8_t *lhs_3 = lhs + (row_x_col * S4_CH_IN_BLOCK_MVE * 3) + offset;
+        int32x4_t ker_sum = vdupq_n_s32(0);
+
+        if (total_ch % 2)
+        {
+            int get_low_nibble = 1;
+            for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++)
+            {
+                int32x4_t ker_0;
+                if (get_low_nibble)
+                {
+                    ker_0 = vldrbq_gather_offset_s32(rhs_0, gather_offset);
+
+                    ker_0 = vrshlq_m_n_s32(ker_0, 28, lower_nibble_mask);
+                    ker_0 = vshrq_m_n_s32(ker_0, ker_0, 24, lower_nibble_mask);
+
+                    ker_0 = vshrq_n_s32(ker_0, 4);
+                }
+                else
+                {
+                    int8_t temp[] = {
+                        rhs_0[0] >> 4, (int8_t)(rhs_0[1] << 4) >> 4, rhs_0[1] >> 4, (int8_t)(rhs_0[2] << 4) >> 4};
+                    ker_0 = vldrbq_s32(temp);
+                }
+
+                ker_sum = vaddq_s32(ker_sum, ker_0);
+
+                int32x4_t ip_0 = vldrbq_s32(lhs_0);
+                out_0 += vmulq_s32(ip_0, ker_0);
+
+                int32x4_t ip_1 = vldrbq_s32(lhs_1);
+                out_1 += vmulq_s32(ip_1, ker_0);
+
+                int32x4_t ip_2 = vldrbq_s32(lhs_2);
+                out_2 += vmulq_s32(ip_2, ker_0);
+
+                int32x4_t ip_3 = vldrbq_s32(lhs_3);
+                out_3 += vmulq_s32(ip_3, ker_0);
+
+                lhs_0 += S4_CH_IN_BLOCK_MVE;
+                lhs_1 += S4_CH_IN_BLOCK_MVE;
+                lhs_2 += S4_CH_IN_BLOCK_MVE;
+                lhs_3 += S4_CH_IN_BLOCK_MVE;
+
+                get_low_nibble = !get_low_nibble;
+                rhs_0 += (total_ch >> 1) + get_low_nibble;
+            }
+        }
+        else
+        {
+            for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++)
+            {
+                int32x4_t ker_0 = vldrbq_gather_offset_s32(rhs_0, gather_offset);
+
+                ker_0 = vrshlq_m_n_s32(ker_0, 28, lower_nibble_mask);
+                ker_0 = vshrq_m_n_s32(ker_0, ker_0, 24, lower_nibble_mask);
+
+                ker_0 = vshrq_n_s32(ker_0, 4);
+
+                ker_sum = vaddq_s32(ker_sum, ker_0);
+
+                int32x4_t ip_0 = vldrbq_s32(lhs_0);
+                out_0 += vmulq_s32(ip_0, ker_0);
+
+                int32x4_t ip_1 = vldrbq_s32(lhs_1);
+                out_1 += vmulq_s32(ip_1, ker_0);
+
+                int32x4_t ip_2 = vldrbq_s32(lhs_2);
+                out_2 += vmulq_s32(ip_2, ker_0);
+
+                int32x4_t ip_3 = vldrbq_s32(lhs_3);
+                out_3 += vmulq_s32(ip_3, ker_0);
+
+                lhs_0 += S4_CH_IN_BLOCK_MVE;
+                lhs_1 += S4_CH_IN_BLOCK_MVE;
+                lhs_2 += S4_CH_IN_BLOCK_MVE;
+                lhs_3 += S4_CH_IN_BLOCK_MVE;
+
+                rhs_0 += total_ch >> 1;
+            }
+        }
+
+        ker_sum = vmulq_n_s32(ker_sum, input_offset);
+        out_0 = ker_sum + out_0;
+        out_1 = ker_sum + out_1;
+        out_2 = ker_sum + out_2;
+        out_3 = ker_sum + out_3;
+
+        const int32x4_t mult = vldrwq_s32(out_mult);
+        const int32x4_t shift = vldrwq_s32(out_shift);
+        out_mult += 4;
+        out_shift += 4;
+        mve_pred16_t p = vctp32q(num_ch_to_process);
+
+        out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
+        out_0 = vaddq_n_s32(out_0, out_offset);
+        out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
+        out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
+        vstrbq_p_s32(out, out_0, p);
+
+        out_1 = arm_requantize_mve_32x4(out_1, mult, shift);
+        out_1 = vaddq_n_s32(out_1, out_offset);
+        out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
+        out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
+        vstrbq_p_s32(out + total_ch, out_1, p);
+
+        out_2 = arm_requantize_mve_32x4(out_2, mult, shift);
+        out_2 = vaddq_n_s32(out_2, out_offset);
+        out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min));
+        out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max));
+        vstrbq_p_s32(out + 2 * total_ch, out_2, p);
+
+        out_3 = arm_requantize_mve_32x4(out_3, mult, shift);
+        out_3 = vaddq_n_s32(out_3, out_offset);
+        out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min));
+        out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max));
+        vstrbq_p_s32(out + 3 * total_ch, out_3, p);
+    }
+
+    return ARM_CMSIS_NN_SUCCESS;
+#else
+    (void)lhs;
+    (void)rhs;
+    (void)input_offset;
+    (void)active_ch;
+    (void)total_ch;
+    (void)out_shift;
+    (void)out_mult;
+    (void)out_offset;
+    (void)activation_min;
+    (void)activation_max;
+    (void)row_x_col;
+    (void)output_bias;
+    (void)out;
+    return ARM_CMSIS_NN_NO_IMPL_ERROR;
+#endif
+}
+
+/**
+ * @} end of Doxygen group
+ */