From e0db28b9909e206ccb246c19447df443fa2281c6 Mon Sep 17 00:00:00 2001 From: Romain Dolbeau Date: Fri, 8 May 2015 11:36:41 +0200 Subject: [PATCH] RAID-Z1/AVX2, RAID-Z3/AVX2 --- module/zfs/vdev_raidz.c | 422 +++++++++++++++++++++++++++++----------- 1 file changed, 313 insertions(+), 109 deletions(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index cf0f267869d6..5245717d0fe3 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1137,6 +1137,225 @@ vdev_raidz_generate_parity_pqr_sse(raidz_map_t *rm) #undef COMPUTE8_R_SSE #if defined(_KERNEL) && defined(CONFIG_AS_AVX2) +#define MAKE_CST32_AVX2(regx,regy,val) \ + asm volatile("vmovd %0,%%"#regx : : "r"(val));\ + asm volatile("vpbroadcastd %"#regx",%"#regy); + +#define COPY16P_AVX2 asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0)));\ + asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(src+4)));\ + asm volatile("vmovdqa %0,%%ymm2" : : "m" (*(src+8)));\ + asm volatile("vmovdqa %0,%%ymm3" : : "m" (*(src+12)));\ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(p+0)));\ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(p+4)));\ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(p+8)));\ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(p+12))) + +#define COPY16PQ_AVX2 asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0)));\ + asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(src+4)));\ + asm volatile("vmovdqa %0,%%ymm2" : : "m" (*(src+8)));\ + asm volatile("vmovdqa %0,%%ymm3" : : "m" (*(src+12)));\ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(p+0)));\ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(p+4)));\ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(p+8)));\ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(p+12)));\ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(q+0)));\ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(q+4)));\ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(q+8)));\ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(q+12))) + +#define COPY16PQR_AVX2 asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0)));\ + asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(src+4)));\ + asm volatile("vmovdqa %0,%%ymm2" : : "m" (*(src+8)));\ + asm volatile("vmovdqa %0,%%ymm3" : : "m" (*(src+12)));\ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(p+0)));\ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(p+4)));\ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(p+8)));\ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(p+12)));\ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(q+0)));\ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(q+4)));\ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(q+8)));\ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(q+12)));\ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(r+0)));\ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(r+4)));\ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(r+8)));\ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(r+12))) + +#define LOAD16_SRC_AVX2 asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0)));\ + asm volatile("vmovdqa %0,%%ymm4" : : "m" (*(src+4)));\ + asm volatile("vmovdqa %0,%%ymm8" : : "m" (*(src+8)));\ + asm volatile("vmovdqa %0,%%ymm12" : : "m" (*(src+12))) + +#define COMPUTE16_P_AVX2 asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(p+0)));\ + asm volatile("vmovdqa %0,%%ymm5" : : "m" (*(p+4)));\ + asm volatile("vmovdqa %0,%%ymm9" : : "m" (*(p+8)));\ + asm volatile("vmovdqa %0,%%ymm13" : : "m" (*(p+12)));\ + asm volatile("vpxor %ymm0,%ymm1,%ymm1");\ + asm volatile("vpxor %ymm4,%ymm5,%ymm5");\ + asm volatile("vpxor %ymm8,%ymm9,%ymm9");\ + asm volatile("vpxor %ymm12,%ymm13,%ymm13");\ + asm volatile("vmovdqa %%ymm1,%0" : "=m" 
(*(p+0)));\ + asm volatile("vmovdqa %%ymm5,%0" : "=m" (*(p+4)));\ + asm volatile("vmovdqa %%ymm9,%0" : "=m" (*(p+8)));\ + asm volatile("vmovdqa %%ymm13,%0" : "=m" (*(p+12))) + +#define COMPUTE16_Q_AVX2 asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(q+0))); /* ymm1 = q */\ + asm volatile("vmovdqa %0,%%ymm5" : : "m" (*(q+4)));\ + asm volatile("vmovdqa %0,%%ymm9" : : "m" (*(q+8)));\ + asm volatile("vmovdqa %0,%%ymm13" : : "m" (*(q+12)));\ + /* to implement R in RAID-Z3, just copy the whole Q block and repeat from here ... */\ + /* I think the movdqa from the static array should work, but it doesn't.\ + So the constants are synthesized from a 32 bits value in a conventional\ + register */\ + /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[0])); */\ + MAKE_CST32_AVX2(xmm3,ymm3,0x80808080);\ + asm volatile("vpand %ymm3,%ymm1,%ymm2"); /* ymm2 = q & 0x8080808080808080ULL */\ + asm volatile("vpand %ymm3,%ymm5,%ymm6");\ + asm volatile("vpand %ymm3,%ymm9,%ymm10");\ + asm volatile("vpand %ymm3,%ymm13,%ymm14");\ + asm volatile("vpsrlq $7,%ymm2,%ymm3"); /* ymm3 = (q & 0x8080808080808080ULL) >> 7 */\ + asm volatile("vpsrlq $7,%ymm6,%ymm7");\ + asm volatile("vpsrlq $7,%ymm10,%ymm11");\ + asm volatile("vpsrlq $7,%ymm14,%ymm15");\ + asm volatile("vpsllq $1,%ymm2,%ymm2"); /* ymm2 = (q & 0x8080808080808080ULL) << 1 */\ + asm volatile("vpsllq $1,%ymm6,%ymm6");\ + asm volatile("vpsllq $1,%ymm10,%ymm10");\ + asm volatile("vpsllq $1,%ymm14,%ymm14");\ + asm volatile("vpsubq %ymm3,%ymm2,%ymm2"); /* ymm2 = ((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7) */\ + asm volatile("vpsubq %ymm7,%ymm6,%ymm6");\ + asm volatile("vpsubq %ymm11,%ymm10,%ymm10");\ + asm volatile("vpsubq %ymm15,%ymm14,%ymm14");\ + asm volatile("vpsllq $1,%ymm1,%ymm1"); /* ymm1 = q << 1 */\ + asm volatile("vpsllq $1,%ymm5,%ymm5");\ + asm volatile("vpsllq $1,%ymm9,%ymm9");\ + asm volatile("vpsllq $1,%ymm13,%ymm13");\ + /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[1])); */\ + MAKE_CST32_AVX2(xmm3,ymm3,0x1d1d1d1d);\ + asm volatile("vpand %ymm3,%ymm2,%ymm2"); /* ymm2 = (((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */\ + asm volatile("vpand %ymm3,%ymm6,%ymm6");\ + asm volatile("vpand %ymm3,%ymm10,%ymm10");\ + asm volatile("vpand %ymm3,%ymm14,%ymm14");\ + /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[2])); */\ + MAKE_CST32_AVX2(xmm3,ymm3,0xfefefefe);\ + asm volatile("vpand %ymm3,%ymm1,%ymm1"); /* ymm1 = (q << 1) & 0xfefefefefefefefeULL */\ + asm volatile("vpand %ymm3,%ymm5,%ymm5");\ + asm volatile("vpand %ymm3,%ymm9,%ymm9");\ + asm volatile("vpand %ymm3,%ymm13,%ymm13");\ + asm volatile("vpxor %ymm2,%ymm1,%ymm1"); /* ymm1 = ((((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((q << 1) & 0xfefefefefefefefeULL) */\ + asm volatile("vpxor %ymm6,%ymm5,%ymm5");\ + asm volatile("vpxor %ymm10,%ymm9,%ymm9");\ + asm volatile("vpxor %ymm14,%ymm13,%ymm13");\ + /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/\ + asm volatile("vpxor %ymm0,%ymm1,%ymm1"); /* final xor */\ + asm volatile("vpxor %ymm4,%ymm5,%ymm5");\ + asm volatile("vpxor %ymm8,%ymm9,%ymm9");\ + asm volatile("vpxor %ymm12,%ymm13,%ymm13");\ + asm volatile("vmovdqa %%ymm1,%0" : "=m" (*(q+0)));\ + asm volatile("vmovdqa %%ymm5,%0" : "=m" (*(q+4)));\ + asm volatile("vmovdqa %%ymm9,%0" : "=m" (*(q+8)));\ + asm volatile("vmovdqa %%ymm13,%0" : "=m" (*(q+12))) + +#define COMPUTE16_R_AVX2 asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(r+0))); /* ymm1 = r */\ + asm volatile("vmovdqa %0,%%ymm5" : : "m" (*(r+4)));\ + asm volatile("vmovdqa %0,%%ymm9" : : "m" (*(r+8)));\ + asm volatile("vmovdqa %0,%%ymm13" : : "m" (*(r+12)));\ + /* to implement R in RAID-Z3, just copy the whole R block and repeat from here ... */\ + for (j = 0 ; j < 2 ; j++) {\ + /* I think the movdqa from the static array should work, but it doesn't.\ + So the constants are synthesized from a 32 bits value in a conventional\ + register */\ + /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[0])); */\ + MAKE_CST32_AVX2(xmm3,ymm3,0x80808080);\ + asm volatile("vpand %ymm3,%ymm1,%ymm2"); /* ymm2 = r & 0x8080808080808080ULL */\ + asm volatile("vpand %ymm3,%ymm5,%ymm6");\ + asm volatile("vpand %ymm3,%ymm9,%ymm10");\ + asm volatile("vpand %ymm3,%ymm13,%ymm14");\ + asm volatile("vpsrlq $7,%ymm2,%ymm3"); /* ymm3 = (r & 0x8080808080808080ULL) >> 7 */\ + asm volatile("vpsrlq $7,%ymm6,%ymm7");\ + asm volatile("vpsrlq $7,%ymm10,%ymm11");\ + asm volatile("vpsrlq $7,%ymm14,%ymm15");\ + asm volatile("vpsllq $1,%ymm2,%ymm2"); /* ymm2 = (r & 0x8080808080808080ULL) << 1 */\ + asm volatile("vpsllq $1,%ymm6,%ymm6");\ + asm volatile("vpsllq $1,%ymm10,%ymm10");\ + asm volatile("vpsllq $1,%ymm14,%ymm14");\ + asm volatile("vpsubq %ymm3,%ymm2,%ymm2"); /* ymm2 = ((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7) */\ + asm volatile("vpsubq %ymm7,%ymm6,%ymm6");\ + asm volatile("vpsubq %ymm11,%ymm10,%ymm10");\ + asm volatile("vpsubq %ymm15,%ymm14,%ymm14");\ + asm volatile("vpsllq $1,%ymm1,%ymm1"); /* ymm1 = r << 1 */\ + asm volatile("vpsllq $1,%ymm5,%ymm5");\ + asm volatile("vpsllq $1,%ymm9,%ymm9");\ + asm volatile("vpsllq $1,%ymm13,%ymm13");\ + /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[1])); */\ + MAKE_CST32_AVX2(xmm3,ymm3,0x1d1d1d1d);\ + asm volatile("vpand %ymm3,%ymm2,%ymm2"); /* ymm2 = (((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */\ + asm volatile("vpand %ymm3,%ymm6,%ymm6");\ + asm volatile("vpand %ymm3,%ymm10,%ymm10");\ + asm volatile("vpand %ymm3,%ymm14,%ymm14");\ + /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[2])); */\ + MAKE_CST32_AVX2(xmm3,ymm3,0xfefefefe);\ + asm volatile("vpand %ymm3,%ymm1,%ymm1"); /* ymm1 = (r << 1) & 0xfefefefefefefefeULL */\ + asm volatile("vpand %ymm3,%ymm5,%ymm5");\ + asm volatile("vpand %ymm3,%ymm9,%ymm9");\ + asm volatile("vpand %ymm3,%ymm13,%ymm13");\ + asm volatile("vpxor %ymm2,%ymm1,%ymm1"); /* ymm1 = ((((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((r << 1) & 0xfefefefefefefefeULL) */\ + asm volatile("vpxor %ymm6,%ymm5,%ymm5");\ + asm volatile("vpxor %ymm10,%ymm9,%ymm9");\ + asm volatile("vpxor %ymm14,%ymm13,%ymm13");\ + /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/\ + }\ + asm volatile("vpxor %ymm0,%ymm1,%ymm1"); /* final xor */\ + asm volatile("vpxor %ymm4,%ymm5,%ymm5");\ + asm volatile("vpxor %ymm8,%ymm9,%ymm9");\ + asm volatile("vpxor %ymm12,%ymm13,%ymm13");\ + asm volatile("vmovdqa %%ymm1,%0" : "=m" (*(r+0)));\ + asm volatile("vmovdqa %%ymm5,%0" : "=m" (*(r+4)));\ + asm volatile("vmovdqa %%ymm9,%0" : "=m" (*(r+8)));\ + asm volatile("vmovdqa %%ymm13,%0" : "=m" (*(r+12))) + +static void +vdev_raidz_generate_parity_p_avx2(raidz_map_t *rm) +{ + uint64_t *p, *src, pcount, ccount, i; + int c; + + pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); +#if defined(_KERNEL) + kernel_fpu_begin(); +#endif + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccount == pcount); + i = 0; + if (ccount>15) /* ccount is unsigned */ + for (; i < ccount-15; i+=16, src+=16, p+=16) { + COPY16P_AVX2; + } + for (; i < ccount; i++, src++, p++) { + *p = *src; + } + } else { + ASSERT(ccount <= pcount); + i = 0; + if (ccount>15) /* ccount is unsigned */ + for (; i < ccount-15; i+=16, src+=16, p+=16) { + /* raw, unscheduled, unrolled-by-16 (4x256bits) AVX2 implementation */ + LOAD16_SRC_AVX2; + COMPUTE16_P_AVX2; + } + for (; i < ccount; i++, src++, p++) { + *p ^= *src; + } + } + } +#if defined(_KERNEL) + kernel_fpu_end(); +#endif +} + static void vdev_raidz_generate_parity_pq_avx2(raidz_map_t *rm) { @@ -1161,18 +1380,7 @@ vdev_raidz_generate_parity_pq_avx2(raidz_map_t *rm) i = 0; if (ccnt>15) /* ccnt is unsigned */ for (; i < ccnt-15; i+=16, src+=16, p+=16, q+=16) { - asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0))); - asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(src+4))); - asm volatile("vmovdqa %0,%%ymm2" : : "m" (*(src+8))); - asm volatile("vmovdqa %0,%%ymm3" : : "m" (*(src+12))); - asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(p+0))); - asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(p+4))); - asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(p+8))); - asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(p+12))); - asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(q+0))); - asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(q+4))); - asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(q+8))); - asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(q+12))); + COPY16PQ_AVX2; } for (; i < ccnt; i++, src++, p++, q++) { *p = *src; @@ -1193,101 +1401,9 @@ vdev_raidz_generate_parity_pq_avx2(raidz_map_t *rm) if (ccnt>15) /* ccnt is unsigned */ for (; i < ccnt-15; i+=16, src+=16, p+=16, q+=16) { /* raw, unscheduled, unrolled-by-16 (4x256bits) AVX2 implementation */ - - /* compute P */ - /* this would be the same in RAID-Z1 & RAID-Z3 */ - asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0))); - asm volatile("vmovdqa %0,%%ymm4" : : "m" (*(src+4))); - asm volatile("vmovdqa %0,%%ymm8" : : "m" (*(src+8))); - asm volatile("vmovdqa %0,%%ymm12" : : "m" (*(src+12))); - - asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(p+0))); - asm volatile("vmovdqa %0,%%ymm5" : : "m" (*(p+4))); - asm volatile("vmovdqa %0,%%ymm9" : : "m" (*(p+8))); - asm volatile("vmovdqa %0,%%ymm13" : : "m" (*(p+12))); - - asm volatile("vpxor %ymm0,%ymm1,%ymm1"); - asm volatile("vpxor %ymm4,%ymm5,%ymm5"); - asm volatile("vpxor %ymm8,%ymm9,%ymm9"); - asm volatile("vpxor %ymm12,%ymm13,%ymm13"); - - asm volatile("vmovdqa %%ymm1,%0" : "=m" (*(p+0))); - asm volatile("vmovdqa %%ymm5,%0" : "=m" (*(p+4))); - asm volatile("vmovdqa %%ymm9,%0" : "=m" (*(p+8))); - asm volatile("vmovdqa %%ymm13,%0" : "=m" 
(*(p+12))); - - /* compute Q */ - /* this would be the same in RAID-Z3 */ - asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(q+0))); /* ymm1 = q */ - asm volatile("vmovdqa %0,%%ymm5" : : "m" (*(q+4))); - asm volatile("vmovdqa %0,%%ymm9" : : "m" (*(q+8))); - asm volatile("vmovdqa %0,%%ymm13" : : "m" (*(q+12))); - - /* to implement R in RAID-Z3, just copy the whole Q block and repeat from here ... */ - - /* I think the movdqa from the static array should work, but it doesn't. - So the constants are synthesized from a 32 bits value in a conventional - register */ -#define MAKE_CST32_AVX2(regx,regy,val) \ - asm volatile("vmovd %0,%%"#regx : : "r"(val));\ - asm volatile("vpbroadcastd %"#regx",%"#regy); - /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[0])); */ - MAKE_CST32_AVX2(xmm3,ymm3,0x80808080); - asm volatile("vpand %ymm3,%ymm1,%ymm2"); /* ymm2 = q & 0x8080808080808080ULL */ - asm volatile("vpand %ymm3,%ymm5,%ymm6"); - asm volatile("vpand %ymm3,%ymm9,%ymm10"); - asm volatile("vpand %ymm3,%ymm13,%ymm14"); - - asm volatile("vpsrlq $7,%ymm2,%ymm3"); /* ymm3 = (q & 0x8080808080808080ULL) >> 7 */ - asm volatile("vpsrlq $7,%ymm6,%ymm7"); - asm volatile("vpsrlq $7,%ymm10,%ymm11"); - asm volatile("vpsrlq $7,%ymm14,%ymm15"); - - asm volatile("vpsllq $1,%ymm2,%ymm2"); /* ymm2 = (q & 0x8080808080808080ULL) << 1 */ - asm volatile("vpsllq $1,%ymm6,%ymm6"); - asm volatile("vpsllq $1,%ymm10,%ymm10"); - asm volatile("vpsllq $1,%ymm14,%ymm14"); - - asm volatile("vpsubq %ymm3,%ymm2,%ymm2"); /* ymm2 = ((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7) */ - asm volatile("vpsubq %ymm7,%ymm6,%ymm6"); - asm volatile("vpsubq %ymm11,%ymm10,%ymm10"); - asm volatile("vpsubq %ymm15,%ymm14,%ymm14"); - - asm volatile("vpsllq $1,%ymm1,%ymm1"); /* ymm1 = q << 1 */ - asm volatile("vpsllq $1,%ymm5,%ymm5"); - asm volatile("vpsllq $1,%ymm9,%ymm9"); - asm volatile("vpsllq $1,%ymm13,%ymm13"); - - /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[1])); */ - MAKE_CST32_AVX2(xmm3,ymm3,0x1d1d1d1d); - asm volatile("vpand %ymm3,%ymm2,%ymm2"); /* ymm2 = (((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */ - asm volatile("vpand %ymm3,%ymm6,%ymm6"); - asm volatile("vpand %ymm3,%ymm10,%ymm10"); - asm volatile("vpand %ymm3,%ymm14,%ymm14"); - - /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[2])); */ - MAKE_CST32_AVX2(xmm3,ymm3,0xfefefefe); - asm volatile("vpand %ymm3,%ymm1,%ymm1"); /* ymm1 = (q << 1) & 0xfefefefefefefefeULL */ - asm volatile("vpand %ymm3,%ymm5,%ymm5"); - asm volatile("vpand %ymm3,%ymm9,%ymm9"); - asm volatile("vpand %ymm3,%ymm13,%ymm13"); - - asm volatile("vpxor %ymm2,%ymm1,%ymm1"); /* ymm1 = ((((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((q << 1) & 0xfefefefefefefefeULL) */ - asm volatile("vpxor %ymm6,%ymm5,%ymm5"); - asm volatile("vpxor %ymm10,%ymm9,%ymm9"); - asm volatile("vpxor %ymm14,%ymm13,%ymm13"); - /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/ - - asm volatile("vpxor %ymm0,%ymm1,%ymm1"); /* final xor */ - asm volatile("vpxor %ymm4,%ymm5,%ymm5"); - asm volatile("vpxor %ymm8,%ymm9,%ymm9"); - asm volatile("vpxor %ymm12,%ymm13,%ymm13"); - - asm volatile("vmovdqa %%ymm1,%0" : "=m" (*(q+0))); - asm volatile("vmovdqa %%ymm5,%0" : "=m" (*(q+4))); - asm volatile("vmovdqa %%ymm9,%0" : "=m" (*(q+8))); - asm volatile("vmovdqa %%ymm13,%0" : "=m" (*(q+12))); -#undef MAKE_CST32_AVX2 + LOAD16_SRC_AVX2; + COMPUTE16_P_AVX2; + COMPUTE16_Q_AVX2; } for (; i < ccnt; i++, src++, p++, q++) { *p ^= *src; @@ -1309,6 +1425,94 @@ vdev_raidz_generate_parity_pq_avx2(raidz_map_t *rm) kernel_fpu_end(); #endif } + +static void +vdev_raidz_generate_parity_pqr_avx2(raidz_map_t *rm) +{ + uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i, j; + int c; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_R].rc_size); +#if defined(_KERNEL) + kernel_fpu_begin(); +#endif + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + i = 0; + if (ccnt>15) /* ccnt is unsigned */ + for (; i < ccnt-15; i+=16, src+=16, p+=16, q+=16, r+=16) { + COPY16PQR_AVX2; + } + for (; i < ccnt; i++, src++, p++, q++, r++) { + *p = *src; + *q = *src; + *r = *src; + } + for (; i < pcnt; i++, src++, p++, q++, r++) { + *p = 0; + *q = 0; + *r = 0; + } + } else { + ASSERT(ccnt <= pcnt); + + /* + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. + */ + i = 0; + if (ccnt>15) /* ccnt is unsigned */ + for (; i < ccnt-15; i+=16, src+=16, p+=16, q+=16, r+=16) { + /* raw, unscheduled, unrolled-by-16 (4x256bits) AVX2 implementation */ + LOAD16_SRC_AVX2; + COMPUTE16_P_AVX2; + COMPUTE16_Q_AVX2; + COMPUTE16_R_AVX2; + } + for (; i < ccnt; i++, src++, p++, q++, r++) { + *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); + *q ^= *src; + + VDEV_RAIDZ_64MUL_4(*r, mask); + *r ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. + */ + for (; i < pcnt; i++, q++, r++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + VDEV_RAIDZ_64MUL_4(*r, mask); + } + } + } +#if defined(_KERNEL) + kernel_fpu_end(); +#endif +} +#undef MAKE_CST32_AVX2 +#undef COPY16P_AVX2 +#undef COPY16PQ_AVX2 +#undef COPY16PQR_AVX2 +#undef LOAD16_SRC_AVX2 +#undef COMPUTE16_P_AVX2 +#undef COMPUTE16_Q_AVX2 +#undef COMPUTE16_R_AVX2 #endif // _KERNEL && CONFIG_AS_AVX2 #if defined(_KERNEL) && defined(CONFIG_AS_AVX) @@ -1700,9 +1904,9 @@ static void vdev_raidz_pick_parity_functions(void) { #if defined(__x86_64__) #if defined(_KERNEL) && defined(CONFIG_AS_AVX2) if (boot_cpu_has(X86_FEATURE_AVX2)) { - vdev_raidz_generate_parity_p = &vdev_raidz_generate_parity_p_avx128; + vdev_raidz_generate_parity_p = &vdev_raidz_generate_parity_p_avx2; vdev_raidz_generate_parity_pq = &vdev_raidz_generate_parity_pq_avx2; - vdev_raidz_generate_parity_pqr = &vdev_raidz_generate_parity_pqr_avx128; + vdev_raidz_generate_parity_pqr = &vdev_raidz_generate_parity_pqr_avx2; printk("ZFS: using vdev_raidz_generate_parity_*_avx2\n"); } else #endif
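
The Q and R macros above vectorize, four 256-bit registers at a time, the same byte-parallel GF(2^8) multiply-by-2 that the scalar tail loops already perform through VDEV_RAIDZ_64MUL_2 and VDEV_RAIDZ_64MUL_4 (the masks 0x80808080, 0x1d1d1d1d and 0xfefefefe broadcast into ymm3 are the 32-bit forms of those macros' 64-bit constants). For reference, a minimal scalar sketch of that transform is given below; the function names gf256_mul2_64 and gf256_mul4_64 are illustrative only and do not appear in the patch.

#include <stdint.h>

/*
 * Illustrative scalar equivalent of the byte-parallel GF(2^8) multiply-by-2
 * that COMPUTE16_Q_AVX2 applies once and COMPUTE16_R_AVX2 applies twice.
 * Eight field elements are packed, one per byte, in a single 64-bit word.
 */
static inline uint64_t
gf256_mul2_64(uint64_t x)
{
	uint64_t mask;

	/* isolate the high bit of every byte */
	mask = x & 0x8080808080808080ULL;
	/* (mask << 1) - (mask >> 7) expands each set high bit into a full
	 * 0xff byte, flagging the bytes that overflow when doubled */
	mask = (mask << 1) - (mask >> 7);
	/* double every byte (the 0xfe mask keeps a byte's carry out of its
	 * neighbour's bit 0) and reduce the overflowed bytes by the RAID-Z
	 * generator polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x1d) */
	return (((x << 1) & 0xfefefefefefefefeULL) ^
	    (mask & 0x1d1d1d1d1d1d1d1dULL));
}

/* multiply-by-4, needed for the R column, is multiply-by-2 applied twice,
 * mirroring the j < 2 loop inside COMPUTE16_R_AVX2 */
static inline uint64_t
gf256_mul4_64(uint64_t x)
{
	return (gf256_mul2_64(gf256_mul2_64(x)));
}

The AVX2 versions use vpsllq/vpsrlq/vpsubq on 64-bit lanes, so each ymm register holds four independent copies of exactly this 64-bit computation.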