From 37c2f64832120c359411716a4c5f8281ed8937b8 Mon Sep 17 00:00:00 2001 From: "Mihai.Olinovici" Date: Thu, 19 Sep 2024 07:29:05 +0000 Subject: [PATCH] Add RVV qs8/qu8-f32-vcvt kernels and configs. --- cmake/gen/rvv_microkernels.cmake | 4 ++ gen/rvv_microkernels.bzl | 4 ++ scripts/generate-qs8-f32-vcvt.sh | 7 +++ src/configs/unary-elementwise-config.c | 14 +++--- src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u1v.c | 47 +++++++++++++++++++ src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u2v.c | 47 +++++++++++++++++++ src/qs8-f32-vcvt/qs8-f32-vcvt.h | 5 ++ src/qs8-f32-vcvt/rvv.c.in | 52 +++++++++++++++++++++ src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u1v.c | 47 +++++++++++++++++++ src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u2v.c | 47 +++++++++++++++++++ src/qu8-f32-vcvt/qu8-f32-vcvt.h | 5 ++ 11 files changed, 273 insertions(+), 6 deletions(-) create mode 100644 src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u1v.c create mode 100644 src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u2v.c create mode 100644 src/qs8-f32-vcvt/rvv.c.in create mode 100644 src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u1v.c create mode 100644 src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u2v.c diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index cd41d9717bb..7df64dde964 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -54,8 +54,10 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/f32-vrnd/gen/f32-vrndu-rvv-u4v.c src/f32-vrnd/gen/f32-vrndz-rvv-u4v.c src/f32-vrsqrt/gen/f32-vrsqrt-rvv-rsqrt-u4v.c + src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u2v.c src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u2v.c src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u2v.c + src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u2v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c @@ -180,8 +182,10 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x4v-minmax-rvv.c src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x4v-minmax-rvv.c src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x4v-minmax-rvv.c + src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u1v.c src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u1v.c src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u1v.c + src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u1v.c src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u2.c diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index 397b67d9954..2d7222ca9f0 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -50,8 +50,10 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-vrnd/gen/f32-vrndu-rvv-u4v.c", "src/f32-vrnd/gen/f32-vrndz-rvv-u4v.c", "src/f32-vrsqrt/gen/f32-vrsqrt-rvv-rsqrt-u4v.c", + "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u2v.c", "src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u2v.c", "src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u2v.c", + "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u2v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u2v.c", "src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u8.c", @@ -177,8 +179,10 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x4v-minmax-rvv.c", "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x4v-minmax-rvv.c", "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x4v-minmax-rvv.c", + "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u1v.c", "src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u1v.c", "src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u1v.c", + "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u1v.c", "src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u1v.c", "src/qu8-vmulc/gen/qu8-vmulc-minmax-f32-rvv-u1v.c", "src/x32-packw/gen/x32-packw-x1v-gemm-goi-rvv-u2.c", diff --git a/scripts/generate-qs8-f32-vcvt.sh b/scripts/generate-qs8-f32-vcvt.sh index 2dbf4529b33..f307861d254 100755 --- a/scripts/generate-qs8-f32-vcvt.sh +++ b/scripts/generate-qs8-f32-vcvt.sh @@ -15,6 +15,13 @@ tools/xngen src/qs8-f32-vcvt/neon.c.in -D BATCH_TILE=16 -D DATATYPE=QU8 -o src/q tools/xngen src/qs8-f32-vcvt/neon.c.in -D BATCH_TILE=24 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-u24.c & tools/xngen src/qs8-f32-vcvt/neon.c.in -D BATCH_TILE=32 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-u32.c & +################################ RISC-V Vector ################################ +tools/xngen src/qs8-f32-vcvt/rvv.c.in -D LMUL=1 -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u1v.c & +tools/xngen src/qs8-f32-vcvt/rvv.c.in -D LMUL=2 -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u2v.c & + +tools/xngen src/qs8-f32-vcvt/rvv.c.in -D LMUL=1 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u1v.c & +tools/xngen src/qs8-f32-vcvt/rvv.c.in -D LMUL=2 -D DATATYPE=QU8 -o src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u2v.c & + ################################# x86 128-bit ################################# tools/xngen src/qs8-f32-vcvt/sse2.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-u8.c & tools/xngen src/qs8-f32-vcvt/sse2.c.in -D BATCH_TILE=16 -D DATATYPE=QS8 -o src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-u16.c & diff --git a/src/configs/unary-elementwise-config.c b/src/configs/unary-elementwise-config.c index 6e794819768..bfd10be0ba5 100644 --- a/src/configs/unary-elementwise-config.c +++ b/src/configs/unary-elementwise-config.c @@ -2066,10 +2066,11 @@ static void init_qs8_to_f32_cvt_config(void) { qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__scalar_u1; qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params; qs8_to_f32_cvt_config.element_tile = 1; - #elif XNN_ARCH_RISCV - qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__scalar_u4; + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__rvv_u2v; qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params; - qs8_to_f32_cvt_config.element_tile = 4; + qs8_to_f32_cvt_config.element_tile = hardware_config->vlenb / sizeof(int8_t) * 2; // (VLENB/sizeof)*LMUL; #else qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__scalar_u4; qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params; @@ -2288,10 +2289,11 @@ static void init_qu8_to_f32_cvt_config(void) { qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__scalar_u1; qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; qu8_to_f32_cvt_config.element_tile = 1; - #elif XNN_ARCH_RISCV - qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__scalar_u4; + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__rvv_u2v; qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; - qu8_to_f32_cvt_config.element_tile = 4; + qu8_to_f32_cvt_config.element_tile = hardware_config->vlenb / sizeof(uint8_t) * 2; // (VLENB/sizeof)*LMUL; #else qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__scalar_u4; qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; diff --git a/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u1v.c b/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u1v.c new file mode 100644 index 00000000000..6c5c3e2bb1a --- /dev/null +++ b/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u1v.c @@ -0,0 +1,47 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-f32-vcvt/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vcvt.h" + + +void xnn_qs8_f32_vcvt_ukernel__rvv_u1v( + size_t batch, + const int8_t* input, + float* output, + const struct xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_INT8_T; + + const float scale = params->scalar.scale; + const int32_t minus_zero_point = -params->scalar.zero_point; + + for (; batch > 0; ) { + const int32_t n = __riscv_vsetvl_e8m1(batch); batch -= n; + + vint8m1_t x_i8v = __riscv_vle8_v_i8m1(input, n); input += n; + + vint32m4_t wx_i32v = __riscv_vsext_vf4_i32m4(x_i8v, n); + wx_i32v = __riscv_vadd_vx_i32m4(wx_i32v, minus_zero_point, n); + vfloat32m4_t y_f32v = __riscv_vfcvt_f_x_v_f32m4(wx_i32v, n); + y_f32v = __riscv_vfmul_vf_f32m4(y_f32v, scale, n); + + __riscv_vse32_v_f32m4(output, y_f32v, n); output += n; + } +} diff --git a/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u2v.c b/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u2v.c new file mode 100644 index 00000000000..c6961b8451e --- /dev/null +++ b/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u2v.c @@ -0,0 +1,47 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-f32-vcvt/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vcvt.h" + + +void xnn_qs8_f32_vcvt_ukernel__rvv_u2v( + size_t batch, + const int8_t* input, + float* output, + const struct xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_INT8_T; + + const float scale = params->scalar.scale; + const int32_t minus_zero_point = -params->scalar.zero_point; + + for (; batch > 0; ) { + const int32_t n = __riscv_vsetvl_e8m2(batch); batch -= n; + + vint8m2_t x_i8v = __riscv_vle8_v_i8m2(input, n); input += n; + + vint32m8_t wx_i32v = __riscv_vsext_vf4_i32m8(x_i8v, n); + wx_i32v = __riscv_vadd_vx_i32m8(wx_i32v, minus_zero_point, n); + vfloat32m8_t y_f32v = __riscv_vfcvt_f_x_v_f32m8(wx_i32v, n); + y_f32v = __riscv_vfmul_vf_f32m8(y_f32v, scale, n); + + __riscv_vse32_v_f32m8(output, y_f32v, n); output += n; + } +} diff --git a/src/qs8-f32-vcvt/qs8-f32-vcvt.h b/src/qs8-f32-vcvt/qs8-f32-vcvt.h index a5f0b4286ce..e7de4a98713 100644 --- a/src/qs8-f32-vcvt/qs8-f32-vcvt.h +++ b/src/qs8-f32-vcvt/qs8-f32-vcvt.h @@ -53,6 +53,11 @@ XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_f32_vcvt_ukernel__wasmsimd_u24, 24, false XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_f32_vcvt_ukernel__wasmsimd_u32, 32, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_f32_vcvt_ukernel__rvv_u1v, 1, true, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) +XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_f32_vcvt_ukernel__rvv_u2v, 2, true, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_f32_vcvt_ukernel__scalar_u1, 1, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_f32_vcvt_ukernel__scalar_u2, 2, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_f32_vcvt_ukernel__scalar_u3, 3, false, int8_t, float, struct xnn_qs8_f32_cvt_params, xnn_init_qs8_f32_cvt_scalar_params) diff --git a/src/qs8-f32-vcvt/rvv.c.in b/src/qs8-f32-vcvt/rvv.c.in new file mode 100644 index 00000000000..424fa6dfa11 --- /dev/null +++ b/src/qs8-f32-vcvt/rvv.c.in @@ -0,0 +1,52 @@ +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert LMUL in [1, 2] +$VXINT = {"QS8": "vint", "QU8": "vuint"}[DATATYPE] +$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] +$XLOAD = {"QS8": "__riscv_vle8_v_i8", "QU8": "__riscv_vle8_v_u8"}[DATATYPE] +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vcvt.h" + + +void xnn_${DATATYPE.lower()}_f32_vcvt_ukernel__rvv_u${LMUL}v( + size_t batch, + const ${XINT8_T}* input, + float* output, + const struct xnn_${DATATYPE.lower()}_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_INT8_T; + + const float scale = params->scalar.scale; + const int32_t minus_zero_point = -params->scalar.zero_point; + + for (; batch > 0; ) { + const int32_t n = __riscv_vsetvl_e8m${LMUL}(batch); batch -= n; + + $if DATATYPE == "QS8": + vint8m${LMUL}_t x_i8v = __riscv_vle8_v_i8m${LMUL}(input, n); input += n; + + vint32m${LMUL*4}_t wx_i32v = __riscv_vsext_vf4_i32m${LMUL*4}(x_i8v, n); + $else: + vuint8m${LMUL}_t x_u8v = __riscv_vle8_v_u8m${LMUL}(input, n); input += n; + + vint32m${LMUL*4}_t wx_i32v = __riscv_vreinterpret_v_u32m${LMUL*4}_i32m${LMUL*4}(__riscv_vzext_vf4_u32m${LMUL*4}(x_u8v, n)); + wx_i32v = __riscv_vadd_vx_i32m${LMUL*4}(wx_i32v, minus_zero_point, n); + vfloat32m${LMUL*4}_t y_f32v = __riscv_vfcvt_f_x_v_f32m${LMUL*4}(wx_i32v, n); + y_f32v = __riscv_vfmul_vf_f32m${LMUL*4}(y_f32v, scale, n); + + __riscv_vse32_v_f32m${LMUL*4}(output, y_f32v, n); output += n; + } +} diff --git a/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u1v.c b/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u1v.c new file mode 100644 index 00000000000..e4365f09ac3 --- /dev/null +++ b/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u1v.c @@ -0,0 +1,47 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-f32-vcvt/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vcvt.h" + + +void xnn_qu8_f32_vcvt_ukernel__rvv_u1v( + size_t batch, + const uint8_t* input, + float* output, + const struct xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_INT8_T; + + const float scale = params->scalar.scale; + const int32_t minus_zero_point = -params->scalar.zero_point; + + for (; batch > 0; ) { + const int32_t n = __riscv_vsetvl_e8m1(batch); batch -= n; + + vuint8m1_t x_u8v = __riscv_vle8_v_u8m1(input, n); input += n; + + vint32m4_t wx_i32v = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vzext_vf4_u32m4(x_u8v, n)); + wx_i32v = __riscv_vadd_vx_i32m4(wx_i32v, minus_zero_point, n); + vfloat32m4_t y_f32v = __riscv_vfcvt_f_x_v_f32m4(wx_i32v, n); + y_f32v = __riscv_vfmul_vf_f32m4(y_f32v, scale, n); + + __riscv_vse32_v_f32m4(output, y_f32v, n); output += n; + } +} diff --git a/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u2v.c b/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u2v.c new file mode 100644 index 00000000000..4ac23cdc184 --- /dev/null +++ b/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u2v.c @@ -0,0 +1,47 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-f32-vcvt/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Imagination Technologies, inc. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/common.h" +#include "xnnpack/intrinsics-polyfill.h" +#include "xnnpack/vcvt.h" + + +void xnn_qu8_f32_vcvt_ukernel__rvv_u2v( + size_t batch, + const uint8_t* input, + float* output, + const struct xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(int8_t) == 0); + assert(input != NULL); + assert(output != NULL); + + batch >>= XNN_LOG2_SIZEOF_INT8_T; + + const float scale = params->scalar.scale; + const int32_t minus_zero_point = -params->scalar.zero_point; + + for (; batch > 0; ) { + const int32_t n = __riscv_vsetvl_e8m2(batch); batch -= n; + + vuint8m2_t x_u8v = __riscv_vle8_v_u8m2(input, n); input += n; + + vint32m8_t wx_i32v = __riscv_vreinterpret_v_u32m8_i32m8(__riscv_vzext_vf4_u32m8(x_u8v, n)); + wx_i32v = __riscv_vadd_vx_i32m8(wx_i32v, minus_zero_point, n); + vfloat32m8_t y_f32v = __riscv_vfcvt_f_x_v_f32m8(wx_i32v, n); + y_f32v = __riscv_vfmul_vf_f32m8(y_f32v, scale, n); + + __riscv_vse32_v_f32m8(output, y_f32v, n); output += n; + } +} diff --git a/src/qu8-f32-vcvt/qu8-f32-vcvt.h b/src/qu8-f32-vcvt/qu8-f32-vcvt.h index 17cfe586c7d..3bcbc008f05 100644 --- a/src/qu8-f32-vcvt/qu8-f32-vcvt.h +++ b/src/qu8-f32-vcvt/qu8-f32-vcvt.h @@ -53,6 +53,11 @@ XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qu8_f32_vcvt_ukernel__wasmsimd_u24, 24, false XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qu8_f32_vcvt_ukernel__wasmsimd_u32, 32, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qu8_f32_vcvt_ukernel__rvv_u1v, 1, true, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) +XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qu8_f32_vcvt_ukernel__rvv_u2v, 2, true, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qu8_f32_vcvt_ukernel__scalar_u1, 1, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qu8_f32_vcvt_ukernel__scalar_u2, 2, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params) XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qu8_f32_vcvt_ukernel__scalar_u3, 3, false, uint8_t, float, struct xnn_qu8_f32_cvt_params, xnn_init_qu8_f32_cvt_scalar_params)