openzfs · behlendorf · Mar 14, 2023 · Mar 8, 2023 · ryao · Mar 9, 2023
diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h
@@ -126,8 +126,9 @@ typedef struct fletcher_4_func {
 	fletcher_4_fini_f fini_byteswap;
 	fletcher_4_compute_f compute_byteswap;
 	boolean_t (*valid)(void);
+	boolean_t uses_fpu;
 	const char *name;
-} fletcher_4_ops_t;
+} __attribute__((aligned(64))) fletcher_4_ops_t;
 
 _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar_ops;
 _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar4_ops;

diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
@@ -578,13 +578,13 @@
   <elf-variable-symbols>
     <elf-symbol name='efi_debug' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='fletcher_4_abd_ops' size='24' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='fletcher_4_avx2_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='fletcher_4_avx512bw_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='fletcher_4_avx512f_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='fletcher_4_sse2_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='fletcher_4_ssse3_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='fletcher_4_superscalar4_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='fletcher_4_superscalar_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='fletcher_4_avx2_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='fletcher_4_avx512bw_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='fletcher_4_avx512f_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='fletcher_4_sse2_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='fletcher_4_ssse3_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='fletcher_4_superscalar4_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='spa_feature_table' size='2128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -9053,7 +9053,7 @@
     <typedef-decl name='fletcher_4_init_f' type-id='173aa527' id='b9ae1656'/>
     <typedef-decl name='fletcher_4_fini_f' type-id='0ad5b8a8' id='c4c1f4fc'/>
     <typedef-decl name='fletcher_4_compute_f' type-id='38147eff' id='ad1dc4cb'/>
-    <class-decl name='fletcher_4_func' size-in-bits='512' is-struct='yes' visibility='default' id='57f479a0'>
+    <class-decl name='fletcher_4_func' size-in-bits='1024' is-struct='yes' visibility='default' id='57f479a0'>
       <data-member access='public' layout-offset-in-bits='0'>
         <var-decl name='init_native' type-id='b9ae1656' visibility='default'/>
       </data-member>
@@ -9076,6 +9076,9 @@
         <var-decl name='valid' type-id='297d38bc' visibility='default'/>
       </data-member>
       <data-member access='public' layout-offset-in-bits='448'>
+        <var-decl name='uses_fpu' type-id='c19b74c3' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='512'>
         <var-decl name='name' type-id='80f4b756' visibility='default'/>
       </data-member>
     </class-decl>

diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
@@ -160,6 +160,7 @@ static const fletcher_4_ops_t fletcher_4_scalar_ops = {
 	.fini_byteswap = fletcher_4_scalar_fini,
 	.compute_byteswap = fletcher_4_scalar_byteswap,
 	.valid = fletcher_4_scalar_valid,
+	.uses_fpu = B_FALSE,
 	.name = "scalar"
 };
 
@@ -458,9 +459,15 @@ fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
 	fletcher_4_ctx_t ctx;
 	const fletcher_4_ops_t *ops = fletcher_4_impl_get();
 
+	if (ops->uses_fpu == B_TRUE) {
+		kfpu_begin();
+	}
 	ops->init_native(&ctx);
 	ops->compute_native(&ctx, buf, size);
 	ops->fini_native(&ctx, zcp);
+	if (ops->uses_fpu == B_TRUE) {
+		kfpu_end();
+	}
 }
 
 void
@@ -500,9 +507,15 @@ fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
 	fletcher_4_ctx_t ctx;
 	const fletcher_4_ops_t *ops = fletcher_4_impl_get();
 
+	if (ops->uses_fpu == B_TRUE) {
+		kfpu_begin();
+	}
 	ops->init_byteswap(&ctx);
 	ops->compute_byteswap(&ctx, buf, size);
 	ops->fini_byteswap(&ctx, zcp);
+	if (ops->uses_fpu == B_TRUE) {
+		kfpu_end();
+	}
 }
 
 void
@@ -661,6 +674,9 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
 	fletcher_4_fastest_impl.init_ ## type = src->init_ ## type;	  \
 	fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type;	  \
 	fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
+	/* One flag serves both byte orders: set it, never clear it. */	  \
+	if (src->uses_fpu == B_TRUE)					  \
+		fletcher_4_fastest_impl.uses_fpu = B_TRUE;		  \
 }
 
 #define	FLETCHER_4_BENCH_NS	(MSEC2NSEC(1))		/* 1ms */
@@ -816,10 +830,18 @@ abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
 	const fletcher_4_ops_t *ops = fletcher_4_impl_get();
 	cdp->acd_private = (void *) ops;
 
+	/*
+	 * Implementations flagged uses_fpu must run between kfpu_begin()
+	 * and kfpu_end().  The section opened here is deliberately left
+	 * open on return; abd_fletcher_4_fini() issues the kfpu_end().
+	 */
+	if (ops->uses_fpu == B_TRUE) {
+		kfpu_begin();
+	}
 	if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
 		ops->init_native(cdp->acd_ctx);
 	else
 		ops->init_byteswap(cdp->acd_ctx);
 }
 
 static void
@@ -833,8 +851,13 @@ abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
 		ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
 	else
 		ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
+
+	/* Pairs with the kfpu_begin() in abd_fletcher_4_init(). */
+	if (ops->uses_fpu == B_TRUE) {
+		kfpu_end();
+	}
 }
 
 static void
 abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
     zio_abd_checksum_data_t *cdp)

diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c
@@ -52,7 +52,6 @@ ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx)
 {
-	kfpu_begin();
 	memset(ctx->aarch64_neon, 0, 4 * sizeof (zfs_fletcher_aarch64_neon_t));
 }
 
@@ -70,7 +69,6 @@ fletcher_4_aarch64_neon_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
 	    8 * ctx->aarch64_neon[3].v[1] - 8 * ctx->aarch64_neon[2].v[1] +
 	    ctx->aarch64_neon[1].v[1];
 	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
-	kfpu_end();
 }
 
 #define	NEON_INIT_LOOP()			\
@@ -205,6 +203,7 @@ const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
 	.compute_byteswap = fletcher_4_aarch64_neon_byteswap,
 	.fini_byteswap = fletcher_4_aarch64_neon_fini,
 	.valid = fletcher_4_aarch64_neon_valid,
+	.uses_fpu = B_TRUE,
 	.name = "aarch64_neon"
 };
 

diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c
@@ -39,7 +39,6 @@ ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
 {
-	kfpu_begin();
 	memset(ctx->avx512, 0, 4 * sizeof (zfs_fletcher_avx512_t));
 }
 
@@ -73,7 +72,6 @@ fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
 	}
 
 	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
-	kfpu_end();
 }
 
 #define	FLETCHER_4_AVX512_RESTORE_CTX(ctx)				\
@@ -166,6 +164,7 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = {
 	.fini_byteswap = fletcher_4_avx512f_fini,
 	.compute_byteswap = fletcher_4_avx512f_byteswap,
 	.valid = fletcher_4_avx512f_valid,
+	.uses_fpu = B_TRUE,
 	.name = "avx512f"
 };
 
@@ -216,6 +215,7 @@ const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
 	.fini_byteswap = fletcher_4_avx512f_fini,
 	.compute_byteswap = fletcher_4_avx512bw_byteswap,
 	.valid = fletcher_4_avx512bw_valid,
+	.uses_fpu = B_TRUE,
 	.name = "avx512bw"
 };
 #endif

diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c
@@ -51,7 +51,6 @@ ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_avx2_init(fletcher_4_ctx_t *ctx)
 {
-	kfpu_begin();
 	memset(ctx->avx, 0, 4 * sizeof (zfs_fletcher_avx_t));
 }
 
@@ -82,7 +81,6 @@ fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
 	    64 * ctx->avx[3].v[3];
 
 	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
-	kfpu_end();
 }
 
 #define	FLETCHER_4_AVX2_RESTORE_CTX(ctx)				\
@@ -163,6 +161,7 @@ const fletcher_4_ops_t fletcher_4_avx2_ops = {
 	.fini_byteswap = fletcher_4_avx2_fini,
 	.compute_byteswap = fletcher_4_avx2_byteswap,
 	.valid = fletcher_4_avx2_valid,
+	.uses_fpu = B_TRUE,
 	.name = "avx2"
 };
 

diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c
@@ -53,7 +53,6 @@ ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_sse2_init(fletcher_4_ctx_t *ctx)
 {
-	kfpu_begin();
 	memset(ctx->sse, 0, 4 * sizeof (zfs_fletcher_sse_t));
 }
 
@@ -81,7 +80,6 @@ fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
 	    8 * ctx->sse[2].v[1] + ctx->sse[1].v[1];
 
 	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
-	kfpu_end();
 }
 
 #define	FLETCHER_4_SSE_RESTORE_CTX(ctx)					\
@@ -164,6 +162,7 @@ const fletcher_4_ops_t fletcher_4_sse2_ops = {
 	.fini_byteswap = fletcher_4_sse2_fini,
 	.compute_byteswap = fletcher_4_sse2_byteswap,
 	.valid = fletcher_4_sse2_valid,
+	.uses_fpu = B_TRUE,
 	.name = "sse2"
 };
 
@@ -218,6 +217,7 @@ const fletcher_4_ops_t fletcher_4_ssse3_ops = {
 	.fini_byteswap = fletcher_4_sse2_fini,
 	.compute_byteswap = fletcher_4_ssse3_byteswap,
 	.valid = fletcher_4_ssse3_valid,
+	.uses_fpu = B_TRUE,
 	.name = "ssse3"
 };
 

diff --git a/module/zcommon/zfs_fletcher_superscalar.c b/module/zcommon/zfs_fletcher_superscalar.c
@@ -163,5 +163,6 @@ const fletcher_4_ops_t fletcher_4_superscalar_ops = {
 	.compute_byteswap = fletcher_4_superscalar_byteswap,
 	.fini_byteswap = fletcher_4_superscalar_fini,
 	.valid = fletcher_4_superscalar_valid,
+	.uses_fpu = B_FALSE,
 	.name = "superscalar"
 };
diff --git a/module/zcommon/zfs_fletcher_superscalar4.c b/module/zcommon/zfs_fletcher_superscalar4.c
@@ -229,5 +229,6 @@ const fletcher_4_ops_t fletcher_4_superscalar4_ops = {
 	.compute_byteswap = fletcher_4_superscalar4_byteswap,
 	.fini_byteswap = fletcher_4_superscalar4_fini,
 	.valid = fletcher_4_superscalar4_valid,
+	.uses_fpu = B_FALSE,
 	.name = "superscalar4"
 };