diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 5245717d0fe3..d4cec4d55383 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -734,205 +734,217 @@ vdev_raidz_generate_parity_pqr_c(raidz_map_t *rm) } #if defined(__x86_64__) -/* static const struct raidz2_sse_constants { */ -/* uint64_t cst[6]; */ -/* } raidz2_sse_constants __aligned(16) = { */ -/* { 0x8080808080808080ULL, 0x8080808080808080ULL, */ -/* 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, */ -/* 0xfefefefefefefefeULL, 0xfefefefefefefefeULL, */ -/* }, */ -/* }; */ - -#define MAKE_CST32_SSE(reg,val) \ - asm volatile("movd %0,%%"#reg : : "r"(val));\ - asm volatile("pshufd $0,%"#reg",%"#reg); - -#define COPY8P_SSE asm volatile("movdqa %0,%%xmm0" : : "m" (*(src+0)));\ - asm volatile("movdqa %0,%%xmm1" : : "m" (*(src+2)));\ - asm volatile("movdqa %0,%%xmm2" : : "m" (*(src+4)));\ - asm volatile("movdqa %0,%%xmm3" : : "m" (*(src+6)));\ - asm volatile("movdqa %%xmm0, %0" : "=m" (*(p+0)));\ - asm volatile("movdqa %%xmm1, %0" : "=m" (*(p+2)));\ - asm volatile("movdqa %%xmm2, %0" : "=m" (*(p+4)));\ - asm volatile("movdqa %%xmm3, %0" : "=m" (*(p+6))) - -#define COPY8PQ_SSE asm volatile("movdqa %0,%%xmm0" : : "m" (*(src+0)));\ - asm volatile("movdqa %0,%%xmm1" : : "m" (*(src+2)));\ - asm volatile("movdqa %0,%%xmm2" : : "m" (*(src+4)));\ - asm volatile("movdqa %0,%%xmm3" : : "m" (*(src+6)));\ - asm volatile("movdqa %%xmm0, %0" : "=m" (*(p+0)));\ - asm volatile("movdqa %%xmm1, %0" : "=m" (*(p+2)));\ - asm volatile("movdqa %%xmm2, %0" : "=m" (*(p+4)));\ - asm volatile("movdqa %%xmm3, %0" : "=m" (*(p+6)));\ - asm volatile("movdqa %%xmm0, %0" : "=m" (*(q+0)));\ - asm volatile("movdqa %%xmm1, %0" : "=m" (*(q+2)));\ - asm volatile("movdqa %%xmm2, %0" : "=m" (*(q+4)));\ - asm volatile("movdqa %%xmm3, %0" : "=m" (*(q+6))) - -#define COPY8PQR_SSE asm volatile("movdqa %0,%%xmm0" : : "m" (*(src+0)));\ - asm volatile("movdqa %0,%%xmm1" : : "m" (*(src+2)));\ - asm volatile("movdqa %0,%%xmm2" : : "m" (*(src+4)));\ - asm volatile("movdqa %0,%%xmm3" : : "m" (*(src+6)));\ - asm volatile("movdqa %%xmm0, %0" : "=m" (*(p+0)));\ - asm volatile("movdqa %%xmm1, %0" : "=m" (*(p+2)));\ - asm volatile("movdqa %%xmm2, %0" : "=m" (*(p+4)));\ - asm volatile("movdqa %%xmm3, %0" : "=m" (*(p+6)));\ - asm volatile("movdqa %%xmm0, %0" : "=m" (*(q+0)));\ - asm volatile("movdqa %%xmm1, %0" : "=m" (*(q+2)));\ - asm volatile("movdqa %%xmm2, %0" : "=m" (*(q+4)));\ - asm volatile("movdqa %%xmm3, %0" : "=m" (*(q+6)));\ - asm volatile("movdqa %%xmm0, %0" : "=m" (*(r+0)));\ - asm volatile("movdqa %%xmm1, %0" : "=m" (*(r+2)));\ - asm volatile("movdqa %%xmm2, %0" : "=m" (*(r+4)));\ - asm volatile("movdqa %%xmm3, %0" : "=m" (*(r+6))) - -#define LOAD8_SRC_SSE asm volatile("movdqa %0,%%xmm0" : : "m" (*(src+0)));\ - asm volatile("movdqa %0,%%xmm4" : : "m" (*(src+2)));\ - asm volatile("movdqa %0,%%xmm8" : : "m" (*(src+4)));\ - asm volatile("movdqa %0,%%xmm12" : : "m" (*(src+6))) - -#define COMPUTE8_P_SSE asm volatile("movdqa %0,%%xmm1" : : "m" (*(p+0)));\ - asm volatile("movdqa %0,%%xmm5" : : "m" (*(p+2)));\ - asm volatile("movdqa %0,%%xmm9" : : "m" (*(p+4)));\ - asm volatile("movdqa %0,%%xmm13" : : "m" (*(p+6)));\ - asm volatile("pxor %xmm0,%xmm1");\ - asm volatile("pxor %xmm4,%xmm5");\ - asm volatile("pxor %xmm8,%xmm9");\ - asm volatile("pxor %xmm12,%xmm13");\ - asm volatile("movdqa %%xmm1, %0" : "=m" (*(p+0)));\ - asm volatile("movdqa %%xmm5, %0" : "=m" (*(p+2)));\ - asm volatile("movdqa %%xmm9, %0" : "=m" (*(p+4)));\ - asm volatile("movdqa %%xmm13, %0" 
: "=m" (*(p+6))) - -#define COMPUTE8_Q_SSE asm volatile("movdqa %0,%%xmm1" : : "m" (*(q+0))); /* xmm1 = q */\ - asm volatile("movdqa %0,%%xmm5" : : "m" (*(q+2)));\ - asm volatile("movdqa %0,%%xmm9" : : "m" (*(q+4)));\ - asm volatile("movdqa %0,%%xmm13" : : "m" (*(q+6)));\ - /* to implement R in RAID-Z3, just copy the whole Q block and repeat from here ... */\ - asm volatile("movdqa %xmm1,%xmm2"); /* xmm2 = q */\ - asm volatile("movdqa %xmm5,%xmm6");\ - asm volatile("movdqa %xmm9,%xmm10");\ - asm volatile("movdqa %xmm13,%xmm14");\ - /* I think the movdqa from the static array should work, but it doesn't.\ - So the constants are synthesized from a 32 bits value in a conventional\ - register */\ - /* asm volatile("movdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[0])); */\ - MAKE_CST32_SSE(xmm3,0x80808080);\ - asm volatile("pand %xmm3,%xmm2"); /* xmm2 = q & 0x8080808080808080ULL */\ - asm volatile("pand %xmm3,%xmm6");\ - asm volatile("pand %xmm3,%xmm10");\ - asm volatile("pand %xmm3,%xmm14");\ - asm volatile("movdqa %xmm2,%xmm3"); /* xmm3 = q & 0x8080808080808080ULL */\ - asm volatile("movdqa %xmm6,%xmm7");\ - asm volatile("movdqa %xmm10,%xmm11");\ - asm volatile("movdqa %xmm14,%xmm15");\ - asm volatile("psllq $1,%xmm2"); /* xmm2 = (q & 0x8080808080808080ULL) << 1 */\ - asm volatile("psllq $1,%xmm6");\ - asm volatile("psllq $1,%xmm10");\ - asm volatile("psllq $1,%xmm14");\ - asm volatile("psrlq $7,%xmm3"); /* xmm3 = (q & 0x8080808080808080ULL) >> 7 */\ - asm volatile("psrlq $7,%xmm7");\ - asm volatile("psrlq $7,%xmm11");\ - asm volatile("psrlq $7,%xmm15");\ - asm volatile("psubq %xmm3, %xmm2"); /* xmm2 = ((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7) */\ - asm volatile("psubq %xmm7, %xmm6");\ - asm volatile("psubq %xmm11, %xmm10");\ - asm volatile("psubq %xmm15, %xmm14");\ - asm volatile("psllq $1,%xmm1"); /* xmm1 = q << 1 */\ - asm volatile("psllq $1,%xmm5");\ - asm volatile("psllq $1,%xmm9");\ - asm volatile("psllq $1,%xmm13");\ - /* asm volatile("movdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[1])); */\ - MAKE_CST32_SSE(xmm3,0x1d1d1d1d);\ - asm volatile("pand %xmm3,%xmm2"); /* xmm2 = (((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */\ - asm volatile("pand %xmm3,%xmm6");\ - asm volatile("pand %xmm3,%xmm10");\ - asm volatile("pand %xmm3,%xmm14");\ - /* asm volatile("movdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[2])); */\ - MAKE_CST32_SSE(xmm3,0xfefefefe);\ - asm volatile("pand %xmm3,%xmm1"); /* xmm1 = (q << 1) & 0xfefefefefefefefeULL */\ - asm volatile("pand %xmm3,%xmm5");\ - asm volatile("pand %xmm3,%xmm9");\ - asm volatile("pand %xmm3,%xmm13");\ - asm volatile("pxor %xmm2, %xmm1"); /* xmm1 = ((((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((q << 1) & 0xfefefefefefefefeULL) */\ - asm volatile("pxor %xmm6, %xmm5");\ - asm volatile("pxor %xmm10, %xmm9");\ - asm volatile("pxor %xmm14, %xmm13");\ - /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/\ - asm volatile("pxor %xmm0, %xmm1"); /* final xor */\ - asm volatile("pxor %xmm4, %xmm5");\ - asm volatile("pxor %xmm8, %xmm9");\ - asm volatile("pxor %xmm12, %xmm13");\ - asm volatile("movdqa %%xmm1, %0" : "=m" (*(q+0)));\ - asm volatile("movdqa %%xmm5, %0" : "=m" (*(q+2)));\ - asm volatile("movdqa %%xmm9, %0" : "=m" (*(q+4)));\ - asm volatile("movdqa %%xmm13, %0" : "=m" (*(q+6))) - -#define COMPUTE8_R_SSE asm volatile("movdqa %0,%%xmm1" : : "m" (*(r+0))); /* xmm1 = r */\ - asm volatile("movdqa %0,%%xmm5" : : "m" (*(r+2)));\ - asm volatile("movdqa %0,%%xmm9" : : "m" (*(r+4)));\ - asm volatile("movdqa %0,%%xmm13" : : "m" (*(r+6)));\ - /* to implement R in RAID-Z3, just copy the whole R block and repeat from here ... */\ - for (j = 0 ; j < 2 ; j++) { \ - asm volatile("movdqa %xmm1,%xmm2"); /* xmm2 = r */\ - asm volatile("movdqa %xmm5,%xmm6");\ - asm volatile("movdqa %xmm9,%xmm10");\ - asm volatile("movdqa %xmm13,%xmm14");\ - /* I think the movdqa from the static array should work, but it doesn't.\ - So the constants are synthesized from a 32 bits value in a conventional\ - register */\ - /* asm volatile("movdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[0])); */\ - MAKE_CST32_SSE(xmm3,0x80808080);\ - asm volatile("pand %xmm3,%xmm2"); /* xmm2 = r & 0x8080808080808080ULL */\ - asm volatile("pand %xmm3,%xmm6");\ - asm volatile("pand %xmm3,%xmm10");\ - asm volatile("pand %xmm3,%xmm14");\ - asm volatile("movdqa %xmm2,%xmm3"); /* xmm3 = r & 0x8080808080808080ULL */\ - asm volatile("movdqa %xmm6,%xmm7");\ - asm volatile("movdqa %xmm10,%xmm11");\ - asm volatile("movdqa %xmm14,%xmm15");\ - asm volatile("psllq $1,%xmm2"); /* xmm2 = (r & 0x8080808080808080ULL) << 1 */\ - asm volatile("psllq $1,%xmm6");\ - asm volatile("psllq $1,%xmm10");\ - asm volatile("psllq $1,%xmm14");\ - asm volatile("psrlq $7,%xmm3"); /* xmm3 = (r & 0x8080808080808080ULL) >> 7 */\ - asm volatile("psrlq $7,%xmm7");\ - asm volatile("psrlq $7,%xmm11");\ - asm volatile("psrlq $7,%xmm15");\ - asm volatile("psubq %xmm3, %xmm2"); /* xmm2 = ((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7) */\ - asm volatile("psubq %xmm7, %xmm6");\ - asm volatile("psubq %xmm11, %xmm10");\ - asm volatile("psubq %xmm15, %xmm14");\ - asm volatile("psllq $1,%xmm1"); /* xmm1 = r << 1 */\ - asm volatile("psllq $1,%xmm5");\ - asm volatile("psllq $1,%xmm9");\ - asm volatile("psllq $1,%xmm13");\ - /* asm volatile("movdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[1])); */\ - MAKE_CST32_SSE(xmm3,0x1d1d1d1d);\ - asm volatile("pand %xmm3,%xmm2"); /* xmm2 = (((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */\ - asm volatile("pand %xmm3,%xmm6");\ - asm volatile("pand %xmm3,%xmm10");\ - asm volatile("pand %xmm3,%xmm14");\ - /* asm volatile("movdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[2])); */\ - MAKE_CST32_SSE(xmm3,0xfefefefe);\ - asm volatile("pand %xmm3,%xmm1"); /* xmm1 = (r << 1) & 0xfefefefefefefefeULL */\ - asm volatile("pand %xmm3,%xmm5");\ - asm volatile("pand %xmm3,%xmm9");\ - asm volatile("pand %xmm3,%xmm13");\ - asm volatile("pxor %xmm2, %xmm1"); /* xmm1 = ((((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((r << 1) & 0xfefefefefefefefeULL) */\ - asm volatile("pxor %xmm6, %xmm5");\ - asm volatile("pxor %xmm10, %xmm9");\ - asm volatile("pxor %xmm14, %xmm13");\ - }\ - /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/\ - asm volatile("pxor %xmm0, %xmm1"); /* final xor */\ - asm volatile("pxor %xmm4, %xmm5");\ - asm volatile("pxor %xmm8, %xmm9");\ - asm volatile("pxor %xmm12, %xmm13");\ - asm volatile("movdqa %%xmm1, %0" : "=m" (*(r+0)));\ - asm volatile("movdqa %%xmm5, %0" : "=m" (*(r+2)));\ - asm volatile("movdqa %%xmm9, %0" : "=m" (*(r+4)));\ - asm volatile("movdqa %%xmm13, %0" : "=m" (*(r+6))) +/* + * static const struct raidz2_sse_constants { + * uint64_t cst[6]; + * } raidz2_sse_constants __aligned(16) = { + * { 0x8080808080808080ULL, 0x8080808080808080ULL, + * 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + * 0xfefefefefefefefeULL, 0xfefefefefefefefeULL, + * }, + * }; + */ +#define MAKE_CST32_SSE(reg, val) \ + asm volatile("movd %0,%%"#reg : : "r"(val)); \ + asm volatile("pshufd $0,%"#reg",%"#reg); + +#define COPY8P_SSE \ + asm volatile("movdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("movdqa %0,%%xmm1" : : "m" (*(src+2))); \ + asm volatile("movdqa %0,%%xmm2" : : "m" (*(src+4))); \ + asm volatile("movdqa %0,%%xmm3" : : "m" (*(src+6))); \ + asm volatile("movdqa %%xmm0, %0" : "=m" (*(p+0))); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(p+2))); \ + asm volatile("movdqa %%xmm2, %0" : "=m" (*(p+4))); \ + asm volatile("movdqa %%xmm3, %0" : "=m" (*(p+6))) + +#define COPY8PQ_SSE \ + asm volatile("movdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("movdqa %0,%%xmm1" : : "m" (*(src+2))); \ + asm volatile("movdqa %0,%%xmm2" : : "m" (*(src+4))); \ + asm volatile("movdqa %0,%%xmm3" : : "m" (*(src+6))); \ + asm volatile("movdqa %%xmm0, %0" : "=m" (*(p+0))); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(p+2))); \ + asm volatile("movdqa %%xmm2, %0" : "=m" (*(p+4))); \ + asm volatile("movdqa %%xmm3, %0" : "=m" (*(p+6))); \ + asm volatile("movdqa %%xmm0, %0" : "=m" (*(q+0))); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(q+2))); \ + asm volatile("movdqa %%xmm2, %0" : "=m" (*(q+4))); \ + asm volatile("movdqa %%xmm3, %0" : "=m" (*(q+6))) + +#define COPY8PQR_SSE \ + asm volatile("movdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("movdqa %0,%%xmm1" : : "m" (*(src+2))); \ + asm volatile("movdqa %0,%%xmm2" : : "m" (*(src+4))); \ + asm volatile("movdqa %0,%%xmm3" : : "m" (*(src+6))); \ + asm volatile("movdqa %%xmm0, %0" : "=m" (*(p+0))); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(p+2))); \ + asm volatile("movdqa %%xmm2, %0" : "=m" (*(p+4))); \ + asm volatile("movdqa %%xmm3, %0" : "=m" (*(p+6))); \ + asm volatile("movdqa %%xmm0, %0" : "=m" (*(q+0))); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(q+2))); \ + asm volatile("movdqa %%xmm2, %0" : "=m" (*(q+4))); \ + asm volatile("movdqa %%xmm3, %0" : "=m" (*(q+6))); \ + asm volatile("movdqa %%xmm0, %0" : "=m" (*(r+0))); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(r+2))); \ + asm volatile("movdqa %%xmm2, %0" : "=m" (*(r+4))); \ + asm volatile("movdqa %%xmm3, %0" : "=m" (*(r+6))) + +#define LOAD8_SRC_SSE \ + asm volatile("movdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("movdqa %0,%%xmm4" : : "m" (*(src+2))); \ + asm volatile("movdqa %0,%%xmm8" : : "m" (*(src+4))); \ + asm volatile("movdqa %0,%%xmm12" : : "m" (*(src+6))) + +#define COMPUTE8_P_SSE \ + asm volatile("movdqa %0,%%xmm1" : : "m" (*(p+0))); \ + asm volatile("movdqa %0,%%xmm5" : : "m" (*(p+2))); \ + asm volatile("movdqa %0,%%xmm9" : : "m" (*(p+4))); \ + asm volatile("movdqa %0,%%xmm13" : : "m" (*(p+6))); \ + asm volatile("pxor %xmm0,%xmm1"); \ + asm volatile("pxor %xmm4,%xmm5"); \ + asm volatile("pxor %xmm8,%xmm9"); \ + asm volatile("pxor %xmm12,%xmm13"); \ + asm volatile("movdqa 
%%xmm1, %0" : "=m" (*(p+0))); \ + asm volatile("movdqa %%xmm5, %0" : "=m" (*(p+2))); \ + asm volatile("movdqa %%xmm9, %0" : "=m" (*(p+4))); \ + asm volatile("movdqa %%xmm13, %0" : "=m" (*(p+6))) + +#define COMPUTE8_Q_SSE \ + asm volatile("movdqa %0,%%xmm1" : : "m" (*(q+0))); /* xmm1 = q */ \ + asm volatile("movdqa %0,%%xmm5" : : "m" (*(q+2))); \ + asm volatile("movdqa %0,%%xmm9" : : "m" (*(q+4))); \ + asm volatile("movdqa %0,%%xmm13" : : "m" (*(q+6))); \ + /* to implement R in RAID-Z3, just copy the whole Q block and repeat from here ... */ \ + asm volatile("movdqa %xmm1,%xmm2"); /* xmm2 = q */ \ + asm volatile("movdqa %xmm5,%xmm6"); \ + asm volatile("movdqa %xmm9,%xmm10"); \ + asm volatile("movdqa %xmm13,%xmm14"); \ + /* \ + * I think the movdqa from the static array should work, but it doesn't. \ + * So the constants are synthesized from a 32 bits value in a conventional \ + * register \ + */ \ + /* asm volatile("movdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[0])); */ \ + MAKE_CST32_SSE(xmm3,0x80808080); \ + asm volatile("pand %xmm3,%xmm2"); /* xmm2 = q & 0x8080808080808080ULL */ \ + asm volatile("pand %xmm3,%xmm6"); \ + asm volatile("pand %xmm3,%xmm10"); \ + asm volatile("pand %xmm3,%xmm14"); \ + asm volatile("movdqa %xmm2,%xmm3"); /* xmm3 = q & 0x8080808080808080ULL */ \ + asm volatile("movdqa %xmm6,%xmm7"); \ + asm volatile("movdqa %xmm10,%xmm11"); \ + asm volatile("movdqa %xmm14,%xmm15"); \ + asm volatile("psllq $1,%xmm2"); /* xmm2 = (q & 0x8080808080808080ULL) << 1 */ \ + asm volatile("psllq $1,%xmm6"); \ + asm volatile("psllq $1,%xmm10"); \ + asm volatile("psllq $1,%xmm14"); \ + asm volatile("psrlq $7,%xmm3"); /* xmm3 = (q & 0x8080808080808080ULL) >> 7 */ \ + asm volatile("psrlq $7,%xmm7"); \ + asm volatile("psrlq $7,%xmm11"); \ + asm volatile("psrlq $7,%xmm15"); \ + asm volatile("psubq %xmm3, %xmm2"); /* xmm2 = ((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7) */ \ + asm volatile("psubq %xmm7, %xmm6"); \ + asm volatile("psubq %xmm11, %xmm10"); \ + asm volatile("psubq %xmm15, %xmm14"); \ + asm volatile("psllq $1,%xmm1"); /* xmm1 = q << 1 */ \ + asm volatile("psllq $1,%xmm5"); \ + asm volatile("psllq $1,%xmm9"); \ + asm volatile("psllq $1,%xmm13"); \ + /* asm volatile("movdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[1])); */ \ + MAKE_CST32_SSE(xmm3,0x1d1d1d1d); \ + asm volatile("pand %xmm3,%xmm2"); /* xmm2 = (((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */ \ + asm volatile("pand %xmm3,%xmm6"); \ + asm volatile("pand %xmm3,%xmm10"); \ + asm volatile("pand %xmm3,%xmm14"); \ + /* asm volatile("movdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[2])); */ \ + MAKE_CST32_SSE(xmm3,0xfefefefe); \ + asm volatile("pand %xmm3,%xmm1"); /* xmm1 = (q << 1) & 0xfefefefefefefefeULL */ \ + asm volatile("pand %xmm3,%xmm5"); \ + asm volatile("pand %xmm3,%xmm9"); \ + asm volatile("pand %xmm3,%xmm13"); \ + asm volatile("pxor %xmm2, %xmm1"); /* xmm1 = ((((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((q << 1) & 0xfefefefefefefefeULL) */ \ + asm volatile("pxor %xmm6, %xmm5"); \ + asm volatile("pxor %xmm10, %xmm9"); \ + asm volatile("pxor %xmm14, %xmm13"); \ + /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/ \ + asm volatile("pxor %xmm0, %xmm1"); /* final xor */ \ + asm volatile("pxor %xmm4, %xmm5"); \ + asm volatile("pxor %xmm8, %xmm9"); \ + asm volatile("pxor %xmm12, %xmm13"); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(q+0))); \ + asm volatile("movdqa %%xmm5, %0" : "=m" (*(q+2))); \ + asm volatile("movdqa %%xmm9, %0" : "=m" (*(q+4))); \ + asm volatile("movdqa %%xmm13, %0" : "=m" (*(q+6))) + +#define COMPUTE8_R_SSE \ + asm volatile("movdqa %0,%%xmm1" : : "m" (*(r+0))); /* xmm1 = r */ \ + asm volatile("movdqa %0,%%xmm5" : : "m" (*(r+2))); \ + asm volatile("movdqa %0,%%xmm9" : : "m" (*(r+4))); \ + asm volatile("movdqa %0,%%xmm13" : : "m" (*(r+6))); \ + /* to implement R in RAID-Z3, just copy the whole R block and repeat from here ... */ \ + for (j = 0; j < 2; j++) { \ + asm volatile("movdqa %xmm1,%xmm2"); /* xmm2 = r */ \ + asm volatile("movdqa %xmm5,%xmm6"); \ + asm volatile("movdqa %xmm9,%xmm10"); \ + asm volatile("movdqa %xmm13,%xmm14"); \ + /* \ + * I think the movdqa from the static array should work, but it doesn't. \ + * So the constants are synthesized from a 32 bits value in a conventional \ + * register \ + */ \ + /* asm volatile("movdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[0])); */ \ + MAKE_CST32_SSE(xmm3,0x80808080); \ + asm volatile("pand %xmm3,%xmm2"); /* xmm2 = r & 0x8080808080808080ULL */ \ + asm volatile("pand %xmm3,%xmm6"); \ + asm volatile("pand %xmm3,%xmm10"); \ + asm volatile("pand %xmm3,%xmm14"); \ + asm volatile("movdqa %xmm2,%xmm3"); /* xmm3 = r & 0x8080808080808080ULL */ \ + asm volatile("movdqa %xmm6,%xmm7"); \ + asm volatile("movdqa %xmm10,%xmm11"); \ + asm volatile("movdqa %xmm14,%xmm15"); \ + asm volatile("psllq $1,%xmm2"); /* xmm2 = (r & 0x8080808080808080ULL) << 1 */ \ + asm volatile("psllq $1,%xmm6"); \ + asm volatile("psllq $1,%xmm10"); \ + asm volatile("psllq $1,%xmm14"); \ + asm volatile("psrlq $7,%xmm3"); /* xmm3 = (r & 0x8080808080808080ULL) >> 7 */ \ + asm volatile("psrlq $7,%xmm7"); \ + asm volatile("psrlq $7,%xmm11"); \ + asm volatile("psrlq $7,%xmm15"); \ + asm volatile("psubq %xmm3, %xmm2"); /* xmm2 = ((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7) */ \ + asm volatile("psubq %xmm7, %xmm6"); \ + asm volatile("psubq %xmm11, %xmm10"); \ + asm volatile("psubq %xmm15, %xmm14"); \ + asm volatile("psllq $1,%xmm1"); /* xmm1 = r << 1 */ \ + asm volatile("psllq $1,%xmm5"); \ + asm volatile("psllq $1,%xmm9"); \ + asm volatile("psllq $1,%xmm13"); \ + /* asm volatile("movdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[1])); */ \ + MAKE_CST32_SSE(xmm3,0x1d1d1d1d); \ + asm volatile("pand %xmm3,%xmm2"); /* xmm2 = (((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */ \ + asm volatile("pand %xmm3,%xmm6"); \ + asm volatile("pand %xmm3,%xmm10"); \ + asm volatile("pand %xmm3,%xmm14"); \ + /* asm volatile("movdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[2])); */ \ + MAKE_CST32_SSE(xmm3,0xfefefefe); \ + asm volatile("pand %xmm3,%xmm1"); /* xmm1 = (r << 1) & 0xfefefefefefefefeULL */ \ + asm volatile("pand %xmm3,%xmm5"); \ + asm volatile("pand %xmm3,%xmm9"); \ + asm volatile("pand %xmm3,%xmm13"); \ + asm volatile("pxor %xmm2, %xmm1"); /* xmm1 = ((((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((r << 1) & 0xfefefefefefefefeULL) */ \ + asm volatile("pxor %xmm6, %xmm5"); \ + asm volatile("pxor %xmm10, %xmm9"); \ + asm volatile("pxor %xmm14, %xmm13"); \ + } \ + /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/ \ + asm volatile("pxor %xmm0, %xmm1"); /* final xor */ \ + asm volatile("pxor %xmm4, %xmm5"); \ + asm volatile("pxor %xmm8, %xmm9"); \ + asm volatile("pxor %xmm12, %xmm13"); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(r+0))); \ + asm volatile("movdqa %%xmm5, %0" : "=m" (*(r+2))); \ + asm volatile("movdqa %%xmm9, %0" : "=m" (*(r+4))); \ + asm volatile("movdqa %%xmm13, %0" : "=m" (*(r+6))) static void @@ -952,22 +964,22 @@ vdev_raidz_generate_parity_p_sse(raidz_map_t *rm) if (c == rm->rm_firstdatacol) { ASSERT(ccount == pcount); - i = 0; - if (ccount>7) /* ccount is unsigned */ - for (; i < ccount-7; i+=8, src+=8, p+=8) { - COPY8P_SSE; - } + i = 0; + if (ccount > 7) /* ccount is unsigned */ + for (; i < ccount-7; i += 8, src += 8, p += 8) { + COPY8P_SSE; + } for (; i < ccount; i++, src++, p++) { *p = *src; } } else { ASSERT(ccount <= pcount); - i = 0; - if (ccount>7) /* ccount is unsigned */ - for (; i < ccount-7; i+=8, src+=8, p+=8) { - LOAD8_SRC_SSE; - COMPUTE8_P_SSE; - } + i = 0; + if (ccount > 7) /* ccount is unsigned */ + for (; i < ccount-7; i += 8, src += 8, p += 8) { + LOAD8_SRC_SSE; + COMPUTE8_P_SSE; + } for (; i < ccount; i++, src++, p++) { *p ^= *src; } @@ -999,11 +1011,11 @@ vdev_raidz_generate_parity_pq_sse(raidz_map_t *rm) if (c == rm->rm_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - i = 0; - if (ccnt>7) /* ccnt is unsigned */ - for (; i < ccnt-7; i+=8, src+=8, p+=8, q+=8) { - COPY8PQ_SSE; - } + i = 0; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, q += 8) { + COPY8PQ_SSE; + } for (; i < ccnt; i++, src++, p++, q++) { *p = *src; *q = *src; @@ -1020,12 +1032,12 @@ vdev_raidz_generate_parity_pq_sse(raidz_map_t *rm) * the previous result and adding in the new value. */ i = 0; - if (ccnt>7) /* ccnt is unsigned */ - for (; i < ccnt-7; i+=8, src+=8, p+=8, q+=8) { - /* raw, unscheduled, unrolled-by-8 (4x128bits) SSE implementation */ - LOAD8_SRC_SSE; - COMPUTE8_P_SSE; - COMPUTE8_Q_SSE; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, q += 8) { + /* raw, unscheduled, unrolled-by-8 (4x128bits) SSE implementation */ + LOAD8_SRC_SSE; + COMPUTE8_P_SSE; + COMPUTE8_Q_SSE; } for (; i < ccnt; i++, src++, p++, q++) { *p ^= *src; @@ -1072,11 +1084,11 @@ vdev_raidz_generate_parity_pqr_sse(raidz_map_t *rm) if (c == rm->rm_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - i = 0; - if (ccnt>7) /* ccnt is unsigned */ - for (; i < ccnt-7; i+=8, src+=8, p+=8, q+=8, r+=8) { - COPY8PQR_SSE; - } + i = 0; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, q += 8, r += 8) { + COPY8PQR_SSE; + } for (; i < ccnt; i++, src++, p++, q++, r++) { *p = *src; *q = *src; @@ -1094,15 +1106,15 @@ vdev_raidz_generate_parity_pqr_sse(raidz_map_t *rm) * Apply the algorithm described above by multiplying * the previous result and adding in the new value. 
*/ - i = 0; - if (ccnt>7) /* ccnt is unsigned */ - for (; i < ccnt-7; i+=8, src+=8, p+=8, q+=8, r+=8) { - /* raw, unscheduled, unrolled-by-8 (4x128bits) SSE implementation */ - LOAD8_SRC_SSE; - COMPUTE8_P_SSE; - COMPUTE8_Q_SSE; - COMPUTE8_R_SSE; - } + i = 0; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, q += 8, r += 8) { + /* raw, unscheduled, unrolled-by-8 (4x128bits) SSE implementation */ + LOAD8_SRC_SSE; + COMPUTE8_P_SSE; + COMPUTE8_Q_SSE; + COMPUTE8_R_SSE; + } for (;i < ccnt; i++, src++, p++, q++, r++) { *p ^= *src; @@ -1137,180 +1149,190 @@ vdev_raidz_generate_parity_pqr_sse(raidz_map_t *rm) #undef COMPUTE8_R_SSE #if defined(_KERNEL) && defined(CONFIG_AS_AVX2) -#define MAKE_CST32_AVX2(regx,regy,val) \ - asm volatile("vmovd %0,%%"#regx : : "r"(val));\ - asm volatile("vpbroadcastd %"#regx",%"#regy); - -#define COPY16P_AVX2 asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0)));\ - asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(src+4)));\ - asm volatile("vmovdqa %0,%%ymm2" : : "m" (*(src+8)));\ - asm volatile("vmovdqa %0,%%ymm3" : : "m" (*(src+12)));\ - asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(p+0)));\ - asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(p+4)));\ - asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(p+8)));\ - asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(p+12))) - -#define COPY16PQ_AVX2 asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0)));\ - asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(src+4)));\ - asm volatile("vmovdqa %0,%%ymm2" : : "m" (*(src+8)));\ - asm volatile("vmovdqa %0,%%ymm3" : : "m" (*(src+12)));\ - asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(p+0)));\ - asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(p+4)));\ - asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(p+8)));\ - asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(p+12)));\ - asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(q+0)));\ - asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(q+4)));\ - asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(q+8)));\ - asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(q+12))) - -#define COPY16PQR_AVX2 asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0)));\ - asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(src+4)));\ - asm volatile("vmovdqa %0,%%ymm2" : : "m" (*(src+8)));\ - asm volatile("vmovdqa %0,%%ymm3" : : "m" (*(src+12)));\ - asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(p+0)));\ - asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(p+4)));\ - asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(p+8)));\ - asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(p+12)));\ - asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(q+0)));\ - asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(q+4)));\ - asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(q+8)));\ - asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(q+12)));\ - asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(r+0)));\ - asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(r+4)));\ - asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(r+8)));\ - asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(r+12))) - -#define LOAD16_SRC_AVX2 asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0)));\ - asm volatile("vmovdqa %0,%%ymm4" : : "m" (*(src+4)));\ - asm volatile("vmovdqa %0,%%ymm8" : : "m" (*(src+8)));\ - asm volatile("vmovdqa %0,%%ymm12" : : "m" (*(src+12))) - -#define COMPUTE16_P_AVX2 asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(p+0)));\ - asm volatile("vmovdqa %0,%%ymm5" : : "m" (*(p+4)));\ - asm volatile("vmovdqa %0,%%ymm9" : : "m" (*(p+8)));\ - asm volatile("vmovdqa %0,%%ymm13" : : "m" (*(p+12)));\ - asm volatile("vpxor %ymm0,%ymm1,%ymm1");\ - asm volatile("vpxor %ymm4,%ymm5,%ymm5");\ - asm volatile("vpxor 
%ymm8,%ymm9,%ymm9");\ - asm volatile("vpxor %ymm12,%ymm13,%ymm13");\ - asm volatile("vmovdqa %%ymm1,%0" : "=m" (*(p+0)));\ - asm volatile("vmovdqa %%ymm5,%0" : "=m" (*(p+4)));\ - asm volatile("vmovdqa %%ymm9,%0" : "=m" (*(p+8)));\ - asm volatile("vmovdqa %%ymm13,%0" : "=m" (*(p+12))) - -#define COMPUTE16_Q_AVX2 asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(q+0))); /* ymm1 = q */\ - asm volatile("vmovdqa %0,%%ymm5" : : "m" (*(q+4)));\ - asm volatile("vmovdqa %0,%%ymm9" : : "m" (*(q+8)));\ - asm volatile("vmovdqa %0,%%ymm13" : : "m" (*(q+12)));\ - /* to implement R in RAID-Z3, just copy the whole Q block and repeat from here ... */\ - /* I think the movdqa from the static array should work, but it doesn't.\ - So the constants are synthesized from a 32 bits value in a conventional\ - register */\ - /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[0])); */\ - MAKE_CST32_AVX2(xmm3,ymm3,0x80808080);\ - asm volatile("vpand %ymm3,%ymm1,%ymm2"); /* ymm2 = q & 0x8080808080808080ULL */\ - asm volatile("vpand %ymm3,%ymm5,%ymm6");\ - asm volatile("vpand %ymm3,%ymm9,%ymm10");\ - asm volatile("vpand %ymm3,%ymm13,%ymm14");\ - asm volatile("vpsrlq $7,%ymm2,%ymm3"); /* ymm3 = (q & 0x8080808080808080ULL) >> 7 */\ - asm volatile("vpsrlq $7,%ymm6,%ymm7");\ - asm volatile("vpsrlq $7,%ymm10,%ymm11");\ - asm volatile("vpsrlq $7,%ymm14,%ymm15");\ - asm volatile("vpsllq $1,%ymm2,%ymm2"); /* ymm2 = (q & 0x8080808080808080ULL) << 1 */\ - asm volatile("vpsllq $1,%ymm6,%ymm6");\ - asm volatile("vpsllq $1,%ymm10,%ymm10");\ - asm volatile("vpsllq $1,%ymm14,%ymm14");\ - asm volatile("vpsubq %ymm3,%ymm2,%ymm2"); /* ymm2 = ((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7) */\ - asm volatile("vpsubq %ymm7,%ymm6,%ymm6");\ - asm volatile("vpsubq %ymm11,%ymm10,%ymm10");\ - asm volatile("vpsubq %ymm15,%ymm14,%ymm14");\ - asm volatile("vpsllq $1,%ymm1,%ymm1"); /* ymm1 = q << 1 */\ - asm volatile("vpsllq $1,%ymm5,%ymm5");\ - asm volatile("vpsllq $1,%ymm9,%ymm9");\ - asm volatile("vpsllq $1,%ymm13,%ymm13");\ - /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[1])); */\ - MAKE_CST32_AVX2(xmm3,ymm3,0x1d1d1d1d);\ - asm volatile("vpand %ymm3,%ymm2,%ymm2"); /* ymm2 = (((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */\ - asm volatile("vpand %ymm3,%ymm6,%ymm6");\ - asm volatile("vpand %ymm3,%ymm10,%ymm10");\ - asm volatile("vpand %ymm3,%ymm14,%ymm14");\ - /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[2])); */\ - MAKE_CST32_AVX2(xmm3,ymm3,0xfefefefe);\ - asm volatile("vpand %ymm3,%ymm1,%ymm1"); /* ymm1 = (q << 1) & 0xfefefefefefefefeULL */\ - asm volatile("vpand %ymm3,%ymm5,%ymm5");\ - asm volatile("vpand %ymm3,%ymm9,%ymm9");\ - asm volatile("vpand %ymm3,%ymm13,%ymm13");\ - asm volatile("vpxor %ymm2,%ymm1,%ymm1"); /* ymm1 = ((((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((q << 1) & 0xfefefefefefefefeULL) */\ - asm volatile("vpxor %ymm6,%ymm5,%ymm5");\ - asm volatile("vpxor %ymm10,%ymm9,%ymm9");\ - asm volatile("vpxor %ymm14,%ymm13,%ymm13");\ - /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/\ - asm volatile("vpxor %ymm0,%ymm1,%ymm1"); /* final xor */\ - asm volatile("vpxor %ymm4,%ymm5,%ymm5");\ - asm volatile("vpxor %ymm8,%ymm9,%ymm9");\ - asm volatile("vpxor %ymm12,%ymm13,%ymm13");\ - asm volatile("vmovdqa %%ymm1,%0" : "=m" (*(q+0)));\ - asm volatile("vmovdqa %%ymm5,%0" : "=m" (*(q+4)));\ - asm volatile("vmovdqa %%ymm9,%0" : "=m" (*(q+8)));\ - asm volatile("vmovdqa %%ymm13,%0" : "=m" (*(q+12))) - -#define COMPUTE16_R_AVX2 asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(r+0))); /* ymm1 = r */\ - asm volatile("vmovdqa %0,%%ymm5" : : "m" (*(r+4)));\ - asm volatile("vmovdqa %0,%%ymm9" : : "m" (*(r+8)));\ - asm volatile("vmovdqa %0,%%ymm13" : : "m" (*(r+12)));\ - /* to implement R in RAID-Z3, just copy the whole R block and repeat from here ... */\ - for (j = 0 ; j < 2 ; j++) {\ - /* I think the movdqa from the static array should work, but it doesn't.\ - So the constants are synthesized from a 32 bits value in a conventional\ - register */\ - /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[0])); */\ - MAKE_CST32_AVX2(xmm3,ymm3,0x80808080);\ - asm volatile("vpand %ymm3,%ymm1,%ymm2"); /* ymm2 = r & 0x8080808080808080ULL */\ - asm volatile("vpand %ymm3,%ymm5,%ymm6");\ - asm volatile("vpand %ymm3,%ymm9,%ymm10");\ - asm volatile("vpand %ymm3,%ymm13,%ymm14");\ - asm volatile("vpsrlq $7,%ymm2,%ymm3"); /* ymm3 = (r & 0x8080808080808080ULL) >> 7 */\ - asm volatile("vpsrlq $7,%ymm6,%ymm7");\ - asm volatile("vpsrlq $7,%ymm10,%ymm11");\ - asm volatile("vpsrlq $7,%ymm14,%ymm15");\ - asm volatile("vpsllq $1,%ymm2,%ymm2"); /* ymm2 = (r & 0x8080808080808080ULL) << 1 */\ - asm volatile("vpsllq $1,%ymm6,%ymm6");\ - asm volatile("vpsllq $1,%ymm10,%ymm10");\ - asm volatile("vpsllq $1,%ymm14,%ymm14");\ - asm volatile("vpsubq %ymm3,%ymm2,%ymm2"); /* ymm2 = ((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7) */\ - asm volatile("vpsubq %ymm7,%ymm6,%ymm6");\ - asm volatile("vpsubq %ymm11,%ymm10,%ymm10");\ - asm volatile("vpsubq %ymm15,%ymm14,%ymm14");\ - asm volatile("vpsllq $1,%ymm1,%ymm1"); /* ymm1 = r << 1 */\ - asm volatile("vpsllq $1,%ymm5,%ymm5");\ - asm volatile("vpsllq $1,%ymm9,%ymm9");\ - asm volatile("vpsllq $1,%ymm13,%ymm13");\ - /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[1])); */\ - MAKE_CST32_AVX2(xmm3,ymm3,0x1d1d1d1d);\ - asm volatile("vpand %ymm3,%ymm2,%ymm2"); /* ymm2 = (((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */\ - asm volatile("vpand %ymm3,%ymm6,%ymm6");\ - asm volatile("vpand %ymm3,%ymm10,%ymm10");\ - asm volatile("vpand %ymm3,%ymm14,%ymm14");\ - /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[2])); */\ - MAKE_CST32_AVX2(xmm3,ymm3,0xfefefefe);\ - asm volatile("vpand %ymm3,%ymm1,%ymm1"); /* ymm1 = (r << 1) & 0xfefefefefefefefeULL */\ - asm volatile("vpand %ymm3,%ymm5,%ymm5");\ - asm volatile("vpand %ymm3,%ymm9,%ymm9");\ - asm volatile("vpand %ymm3,%ymm13,%ymm13");\ - asm volatile("vpxor %ymm2,%ymm1,%ymm1"); /* ymm1 = ((((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((r << 1) & 0xfefefefefefefefeULL) */\ - asm volatile("vpxor %ymm6,%ymm5,%ymm5");\ - asm volatile("vpxor %ymm10,%ymm9,%ymm9");\ - asm volatile("vpxor %ymm14,%ymm13,%ymm13");\ - /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/\ - }\ - asm volatile("vpxor %ymm0,%ymm1,%ymm1"); /* final xor */\ - asm volatile("vpxor %ymm4,%ymm5,%ymm5");\ - asm volatile("vpxor %ymm8,%ymm9,%ymm9");\ - asm volatile("vpxor %ymm12,%ymm13,%ymm13");\ - asm volatile("vmovdqa %%ymm1,%0" : "=m" (*(r+0)));\ - asm volatile("vmovdqa %%ymm5,%0" : "=m" (*(r+4)));\ - asm volatile("vmovdqa %%ymm9,%0" : "=m" (*(r+8)));\ - asm volatile("vmovdqa %%ymm13,%0" : "=m" (*(r+12))) +#define MAKE_CST32_AVX2(regx, regy, val) \ + asm volatile("vmovd %0,%%"#regx : : "r"(val)); \ + asm volatile("vpbroadcastd %"#regx",%"#regy); + +#define COPY16P_AVX2 \ + asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%ymm2" : : "m" (*(src+8))); \ + asm volatile("vmovdqa %0,%%ymm3" : : "m" (*(src+12))); \ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(p+8))); \ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(p+12))) + +#define COPY16PQ_AVX2 \ + asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%ymm2" : : "m" (*(src+8))); \ + asm volatile("vmovdqa %0,%%ymm3" : : "m" (*(src+12))); \ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(p+8))); \ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(p+12))); \ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(q+0))); \ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(q+4))); \ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(q+8))); \ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(q+12))) + +#define COPY16PQR_AVX2 \ + asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%ymm2" : : "m" (*(src+8))); \ + asm volatile("vmovdqa %0,%%ymm3" : : "m" (*(src+12))); \ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(p+8))); \ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(p+12))); \ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(q+0))); \ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(q+4))); \ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(q+8))); \ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(q+12))); \ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(r+0))); \ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(r+4))); \ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(r+8))); \ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(r+12))) + +#define LOAD16_SRC_AVX2 \ + asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%ymm4" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%ymm8" : : "m" (*(src+8))); \ + asm volatile("vmovdqa %0,%%ymm12" : : "m" (*(src+12))) + +#define COMPUTE16_P_AVX2 \ + asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(p+0))); \ + asm volatile("vmovdqa %0,%%ymm5" : : "m" (*(p+4))); \ + asm volatile("vmovdqa %0,%%ymm9" : : "m" (*(p+8))); \ + asm volatile("vmovdqa %0,%%ymm13" : : "m" (*(p+12))); \ + asm volatile("vpxor %ymm0,%ymm1,%ymm1"); \ + asm volatile("vpxor %ymm4,%ymm5,%ymm5"); \ + asm volatile("vpxor %ymm8,%ymm9,%ymm9"); \ + asm volatile("vpxor %ymm12,%ymm13,%ymm13"); \ + asm volatile("vmovdqa %%ymm1,%0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%ymm5,%0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa %%ymm9,%0" : "=m" (*(p+8))); \ + asm 
volatile("vmovdqa %%ymm13,%0" : "=m" (*(p+12))) + +#define COMPUTE16_Q_AVX2 \ + asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(q+0))); /* ymm1 = q */ \ + asm volatile("vmovdqa %0,%%ymm5" : : "m" (*(q+4))); \ + asm volatile("vmovdqa %0,%%ymm9" : : "m" (*(q+8))); \ + asm volatile("vmovdqa %0,%%ymm13" : : "m" (*(q+12))); \ + /* to implement R in RAID-Z3, just copy the whole Q block and repeat from here ... */ \ + /* \ + * I think the movdqa from the static array should work, but it doesn't. \ + * So the constants are synthesized from a 32 bits value in a conventional \ + * register \ + */ \ + /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[0])); */ \ + MAKE_CST32_AVX2(xmm3,ymm3,0x80808080); \ + asm volatile("vpand %ymm3,%ymm1,%ymm2"); /* ymm2 = q & 0x8080808080808080ULL */ \ + asm volatile("vpand %ymm3,%ymm5,%ymm6"); \ + asm volatile("vpand %ymm3,%ymm9,%ymm10"); \ + asm volatile("vpand %ymm3,%ymm13,%ymm14"); \ + asm volatile("vpsrlq $7,%ymm2,%ymm3"); /* ymm3 = (q & 0x8080808080808080ULL) >> 7 */ \ + asm volatile("vpsrlq $7,%ymm6,%ymm7"); \ + asm volatile("vpsrlq $7,%ymm10,%ymm11"); \ + asm volatile("vpsrlq $7,%ymm14,%ymm15"); \ + asm volatile("vpsllq $1,%ymm2,%ymm2"); /* ymm2 = (q & 0x8080808080808080ULL) << 1 */ \ + asm volatile("vpsllq $1,%ymm6,%ymm6"); \ + asm volatile("vpsllq $1,%ymm10,%ymm10"); \ + asm volatile("vpsllq $1,%ymm14,%ymm14"); \ + asm volatile("vpsubq %ymm3,%ymm2,%ymm2"); /* ymm2 = ((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7) */ \ + asm volatile("vpsubq %ymm7,%ymm6,%ymm6"); \ + asm volatile("vpsubq %ymm11,%ymm10,%ymm10"); \ + asm volatile("vpsubq %ymm15,%ymm14,%ymm14"); \ + asm volatile("vpsllq $1,%ymm1,%ymm1"); /* ymm1 = q << 1 */ \ + asm volatile("vpsllq $1,%ymm5,%ymm5"); \ + asm volatile("vpsllq $1,%ymm9,%ymm9"); \ + asm volatile("vpsllq $1,%ymm13,%ymm13"); \ + /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[1])); */ \ + MAKE_CST32_AVX2(xmm3,ymm3,0x1d1d1d1d); \ + asm volatile("vpand %ymm3,%ymm2,%ymm2"); /* ymm2 = (((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */ \ + asm volatile("vpand %ymm3,%ymm6,%ymm6"); \ + asm volatile("vpand %ymm3,%ymm10,%ymm10"); \ + asm volatile("vpand %ymm3,%ymm14,%ymm14"); \ + /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[2])); */ \ + MAKE_CST32_AVX2(xmm3,ymm3,0xfefefefe); \ + asm volatile("vpand %ymm3,%ymm1,%ymm1"); /* ymm1 = (q << 1) & 0xfefefefefefefefeULL */ \ + asm volatile("vpand %ymm3,%ymm5,%ymm5"); \ + asm volatile("vpand %ymm3,%ymm9,%ymm9"); \ + asm volatile("vpand %ymm3,%ymm13,%ymm13"); \ + asm volatile("vpxor %ymm2,%ymm1,%ymm1"); /* ymm1 = ((((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((q << 1) & 0xfefefefefefefefeULL) */ \ + asm volatile("vpxor %ymm6,%ymm5,%ymm5"); \ + asm volatile("vpxor %ymm10,%ymm9,%ymm9"); \ + asm volatile("vpxor %ymm14,%ymm13,%ymm13"); \ + /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/ \ + asm volatile("vpxor %ymm0,%ymm1,%ymm1"); /* final xor */ \ + asm volatile("vpxor %ymm4,%ymm5,%ymm5"); \ + asm volatile("vpxor %ymm8,%ymm9,%ymm9"); \ + asm volatile("vpxor %ymm12,%ymm13,%ymm13"); \ + asm volatile("vmovdqa %%ymm1,%0" : "=m" (*(q+0))); \ + asm volatile("vmovdqa %%ymm5,%0" : "=m" (*(q+4))); \ + asm volatile("vmovdqa %%ymm9,%0" : "=m" (*(q+8))); \ + asm volatile("vmovdqa %%ymm13,%0" : "=m" (*(q+12))) + +#define COMPUTE16_R_AVX2 asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(r+0))); /* ymm1 = r */ \ + asm volatile("vmovdqa %0,%%ymm5" : : "m" (*(r+4))); \ + asm volatile("vmovdqa %0,%%ymm9" : : "m" (*(r+8))); \ + asm volatile("vmovdqa %0,%%ymm13" : : "m" (*(r+12))); \ + /* to implement R in RAID-Z3, just copy the whole R block and repeat from here ... */ \ + for (j = 0; j < 2; j++) { \ + /* \ + * I think the movdqa from the static array should work, but it doesn't. \ + * So the constants are synthesized from a 32 bits value in a conventional \ + * register \ + */ \ + /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[0])); */ \ + MAKE_CST32_AVX2(xmm3,ymm3,0x80808080); \ + asm volatile("vpand %ymm3,%ymm1,%ymm2"); /* ymm2 = r & 0x8080808080808080ULL */ \ + asm volatile("vpand %ymm3,%ymm5,%ymm6"); \ + asm volatile("vpand %ymm3,%ymm9,%ymm10"); \ + asm volatile("vpand %ymm3,%ymm13,%ymm14"); \ + asm volatile("vpsrlq $7,%ymm2,%ymm3"); /* ymm3 = (r & 0x8080808080808080ULL) >> 7 */ \ + asm volatile("vpsrlq $7,%ymm6,%ymm7"); \ + asm volatile("vpsrlq $7,%ymm10,%ymm11"); \ + asm volatile("vpsrlq $7,%ymm14,%ymm15"); \ + asm volatile("vpsllq $1,%ymm2,%ymm2"); /* ymm2 = (r & 0x8080808080808080ULL) << 1 */ \ + asm volatile("vpsllq $1,%ymm6,%ymm6"); \ + asm volatile("vpsllq $1,%ymm10,%ymm10"); \ + asm volatile("vpsllq $1,%ymm14,%ymm14"); \ + asm volatile("vpsubq %ymm3,%ymm2,%ymm2"); /* ymm2 = ((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7) */ \ + asm volatile("vpsubq %ymm7,%ymm6,%ymm6"); \ + asm volatile("vpsubq %ymm11,%ymm10,%ymm10"); \ + asm volatile("vpsubq %ymm15,%ymm14,%ymm14"); \ + asm volatile("vpsllq $1,%ymm1,%ymm1"); /* ymm1 = r << 1 */ \ + asm volatile("vpsllq $1,%ymm5,%ymm5"); \ + asm volatile("vpsllq $1,%ymm9,%ymm9"); \ + asm volatile("vpsllq $1,%ymm13,%ymm13"); \ + /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[1])); */ \ + MAKE_CST32_AVX2(xmm3,ymm3,0x1d1d1d1d); \ + asm volatile("vpand %ymm3,%ymm2,%ymm2"); /* ymm2 = (((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */ \ + asm volatile("vpand %ymm3,%ymm6,%ymm6"); \ + asm volatile("vpand %ymm3,%ymm10,%ymm10"); \ + asm volatile("vpand %ymm3,%ymm14,%ymm14"); \ + /* asm volatile("vmovdqa %0,%%ymm3" : : "m" (raidz2_sse_constants.cst[2])); */ \ + MAKE_CST32_AVX2(xmm3,ymm3,0xfefefefe); \ + asm volatile("vpand %ymm3,%ymm1,%ymm1"); /* ymm1 = (r << 1) & 0xfefefefefefefefeULL */ \ + asm volatile("vpand %ymm3,%ymm5,%ymm5"); \ + asm volatile("vpand %ymm3,%ymm9,%ymm9"); \ + asm volatile("vpand %ymm3,%ymm13,%ymm13"); \ + asm volatile("vpxor %ymm2,%ymm1,%ymm1"); /* ymm1 = ((((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((r << 1) & 0xfefefefefefefefeULL) */ \ + asm volatile("vpxor %ymm6,%ymm5,%ymm5"); \ + asm volatile("vpxor %ymm10,%ymm9,%ymm9"); \ + asm volatile("vpxor %ymm14,%ymm13,%ymm13"); \ + /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/ \ + } \ + asm volatile("vpxor %ymm0,%ymm1,%ymm1"); /* final xor */ \ + asm volatile("vpxor %ymm4,%ymm5,%ymm5"); \ + asm volatile("vpxor %ymm8,%ymm9,%ymm9"); \ + asm volatile("vpxor %ymm12,%ymm13,%ymm13"); \ + asm volatile("vmovdqa %%ymm1,%0" : "=m" (*(r+0))); \ + asm volatile("vmovdqa %%ymm5,%0" : "=m" (*(r+4))); \ + asm volatile("vmovdqa %%ymm9,%0" : "=m" (*(r+8))); \ + asm volatile("vmovdqa %%ymm13,%0" : "=m" (*(r+12))) static void vdev_raidz_generate_parity_p_avx2(raidz_map_t *rm) @@ -1329,23 +1351,23 @@ vdev_raidz_generate_parity_p_avx2(raidz_map_t *rm) if (c == rm->rm_firstdatacol) { ASSERT(ccount == pcount); - i = 0; - if (ccount>15) /* ccount is unsigned */ - for (; i < ccount-15; i+=16, src+=16, p+=16) { - COPY16P_AVX2; - } + i = 0; + if (ccount > 15) /* ccount is unsigned */ + for (; i < ccount-15; i += 16, src += 16, p += 16) { + COPY16P_AVX2; + } for (; i < ccount; i++, src++, p++) { *p = *src; } } else { ASSERT(ccount <= pcount); - i = 0; - if (ccount>15) /* ccount is unsigned */ - for (; i < ccount-15; i+=16, src+=16, p+=16) { - /* raw, unscheduled, unrolled-by-16 (4x256bits) AVX2 implementation */ - LOAD16_SRC_AVX2; - COMPUTE16_P_AVX2; - } + i = 0; + if (ccount > 15) /* ccount is unsigned */ + for (; i < ccount-15; i += 16, src += 16, p += 16) { + /* raw, unscheduled, unrolled-by-16 (4x256bits) AVX2 implementation */ + LOAD16_SRC_AVX2; + COMPUTE16_P_AVX2; + } for (; i < ccount; i++, src++, p++) { *p ^= *src; } @@ -1377,11 +1399,11 @@ vdev_raidz_generate_parity_pq_avx2(raidz_map_t *rm) if (c == rm->rm_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - i = 0; - if (ccnt>15) /* ccnt is unsigned */ - for (; i < ccnt-15; i+=16, src+=16, p+=16, q+=16) { - COPY16PQ_AVX2; - } + i = 0; + if (ccnt > 15) /* ccnt is unsigned */ + for (; i < ccnt-15; i += 16, src += 16, p += 16, q += 16) { + COPY16PQ_AVX2; + } for (; i < ccnt; i++, src++, p++, q++) { *p = *src; *q = *src; @@ -1398,12 +1420,12 @@ vdev_raidz_generate_parity_pq_avx2(raidz_map_t *rm) * the previous result and adding in the new value. */ i = 0; - if (ccnt>15) /* ccnt is unsigned */ - for (; i < ccnt-15; i+=16, src+=16, p+=16, q+=16) { - /* raw, unscheduled, unrolled-by-16 (4x256bits) AVX2 implementation */ - LOAD16_SRC_AVX2; - COMPUTE16_P_AVX2; - COMPUTE16_Q_AVX2; + if (ccnt > 15) /* ccnt is unsigned */ + for (; i < ccnt-15; i += 16, src += 16, p += 16, q += 16) { + /* raw, unscheduled, unrolled-by-16 (4x256bits) AVX2 implementation */ + LOAD16_SRC_AVX2; + COMPUTE16_P_AVX2; + COMPUTE16_Q_AVX2; } for (; i < ccnt; i++, src++, p++, q++) { *p ^= *src; @@ -1450,11 +1472,11 @@ vdev_raidz_generate_parity_pqr_avx2(raidz_map_t *rm) if (c == rm->rm_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - i = 0; - if (ccnt>15) /* ccnt is unsigned */ - for (; i < ccnt-15; i+=16, src+=16, p+=16, q+=16, r+=16) { - COPY16PQR_AVX2; - } + i = 0; + if (ccnt > 15) /* ccnt is unsigned */ + for (; i < ccnt-15; i += 16, src += 16, p += 16, q += 16, r += 16) { + COPY16PQR_AVX2; + } for (; i < ccnt; i++, src++, p++, q++, r++) { *p = *src; *q = *src; @@ -1472,15 +1494,15 @@ vdev_raidz_generate_parity_pqr_avx2(raidz_map_t *rm) * Apply the algorithm described above by multiplying * the previous result and adding in the new value. 
*/ - i = 0; - if (ccnt>15) /* ccnt is unsigned */ - for (; i < ccnt-15; i+=16, src+=16, p+=16, q+=16, r+=16) { - /* raw, unscheduled, unrolled-by-16 (4x256bits) AVX2 implementation */ - LOAD16_SRC_AVX2; - COMPUTE16_P_AVX2; - COMPUTE16_Q_AVX2; - COMPUTE16_R_AVX2; - } + i = 0; + if (ccnt > 15) /* ccnt is unsigned */ + for (; i < ccnt-15; i += 16, src += 16, p += 16, q += 16, r += 16) { + /* raw, unscheduled, unrolled-by-16 (4x256bits) AVX2 implementation */ + LOAD16_SRC_AVX2; + COMPUTE16_P_AVX2; + COMPUTE16_Q_AVX2; + COMPUTE16_R_AVX2; + } for (; i < ccnt; i++, src++, p++, q++, r++) { *p ^= *src; @@ -1516,180 +1538,190 @@ vdev_raidz_generate_parity_pqr_avx2(raidz_map_t *rm) #endif // _KERNEL && CONFIG_AS_AVX2 #if defined(_KERNEL) && defined(CONFIG_AS_AVX) -#define MAKE_CST32_AVX128(reg,val) \ - asm volatile("vmovd %0,%%"#reg : : "r"(val));\ - asm volatile("vpshufd $0,%"#reg",%"#reg); - -#define COPY8P_AVX128 asm volatile("vmovdqa %0,%%xmm0" : : "m" (*(src+0)));\ - asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(src+2)));\ - asm volatile("vmovdqa %0,%%xmm2" : : "m" (*(src+4)));\ - asm volatile("vmovdqa %0,%%xmm3" : : "m" (*(src+6)));\ - asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(p+0)));\ - asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(p+2)));\ - asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(p+4)));\ - asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(p+6))) - -#define COPY8PQ_AVX128 asm volatile("vmovdqa %0,%%xmm0" : : "m" (*(src+0)));\ - asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(src+2)));\ - asm volatile("vmovdqa %0,%%xmm2" : : "m" (*(src+4)));\ - asm volatile("vmovdqa %0,%%xmm3" : : "m" (*(src+6)));\ - asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(p+0)));\ - asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(p+2)));\ - asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(p+4)));\ - asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(p+6)));\ - asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(q+0)));\ - asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(q+2)));\ - asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(q+4)));\ - asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(q+6))) - -#define COPY8PQR_AVX128 asm volatile("vmovdqa %0,%%xmm0" : : "m" (*(src+0)));\ - asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(src+2)));\ - asm volatile("vmovdqa %0,%%xmm2" : : "m" (*(src+4)));\ - asm volatile("vmovdqa %0,%%xmm3" : : "m" (*(src+6)));\ - asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(p+0)));\ - asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(p+2)));\ - asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(p+4)));\ - asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(p+6)));\ - asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(q+0)));\ - asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(q+2)));\ - asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(q+4)));\ - asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(q+6)));\ - asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(r+0)));\ - asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(r+2)));\ - asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(r+4)));\ - asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(r+6))) - -#define LOAD8_SRC_AVX128 asm volatile("vmovdqa %0,%%xmm0" : : "m" (*(src+0)));\ - asm volatile("vmovdqa %0,%%xmm4" : : "m" (*(src+2)));\ - asm volatile("vmovdqa %0,%%xmm8" : : "m" (*(src+4)));\ - asm volatile("vmovdqa %0,%%xmm12" : : "m" (*(src+6))) - -#define COMPUTE8_P_AVX128 asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(p+0)));\ - asm volatile("vmovdqa %0,%%xmm5" : : "m" (*(p+2)));\ - asm volatile("vmovdqa %0,%%xmm9" : : "m" (*(p+4)));\ - asm volatile("vmovdqa %0,%%xmm13" : : "m" (*(p+6)));\ - asm volatile("vpxor %xmm0,%xmm1,%xmm1");\ - asm volatile("vpxor %xmm4,%xmm5,%xmm5");\ - 
asm volatile("vpxor %xmm8,%xmm9,%xmm9");\ - asm volatile("vpxor %xmm12,%xmm13,%xmm13");\ - asm volatile("vmovdqa %%xmm1,%0" : "=m" (*(p+0)));\ - asm volatile("vmovdqa %%xmm5,%0" : "=m" (*(p+2)));\ - asm volatile("vmovdqa %%xmm9,%0" : "=m" (*(p+4)));\ - asm volatile("vmovdqa %%xmm13,%0" : "=m" (*(p+6))) - -#define COMPUTE8_Q_AVX128 asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(q+0))); /* xmm1 = q */\ - asm volatile("vmovdqa %0,%%xmm5" : : "m" (*(q+2)));\ - asm volatile("vmovdqa %0,%%xmm9" : : "m" (*(q+4)));\ - asm volatile("vmovdqa %0,%%xmm13" : : "m" (*(q+6)));\ - /* to implement R in RAID-Z3, just copy the whole Q block and repeat from here ... */\ - /* I think the movdqa from the static array should work, but it doesn't.\ - So the constants are synthesized from a 32 bits value in a conventional\ - register */\ - /* asm volatile("vmovdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[0])); */\ - MAKE_CST32_AVX128(xmm3,0x80808080);\ - asm volatile("vpand %xmm3,%xmm1,%xmm2"); /* xmm2 = q & 0x8080808080808080ULL */\ - asm volatile("vpand %xmm3,%xmm5,%xmm6");\ - asm volatile("vpand %xmm3,%xmm9,%xmm10");\ - asm volatile("vpand %xmm3,%xmm13,%xmm14");\ - asm volatile("vpsrlq $7,%xmm2,%xmm3"); /* xmm3 = (q & 0x8080808080808080ULL) >> 7 */\ - asm volatile("vpsrlq $7,%xmm6,%xmm7");\ - asm volatile("vpsrlq $7,%xmm10,%xmm11");\ - asm volatile("vpsrlq $7,%xmm14,%xmm15");\ - asm volatile("vpsllq $1,%xmm2,%xmm2"); /* xmm2 = (q & 0x8080808080808080ULL) << 1 */\ - asm volatile("vpsllq $1,%xmm6,%xmm6");\ - asm volatile("vpsllq $1,%xmm10,%xmm10");\ - asm volatile("vpsllq $1,%xmm14,%xmm14");\ - asm volatile("vpsubq %xmm3,%xmm2,%xmm2"); /* xmm2 = ((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7) */\ - asm volatile("vpsubq %xmm7,%xmm6,%xmm6");\ - asm volatile("vpsubq %xmm11,%xmm10,%xmm10");\ - asm volatile("vpsubq %xmm15,%xmm14,%xmm14");\ - asm volatile("vpsllq $1,%xmm1,%xmm1"); /* xmm1 = q << 1 */\ - asm volatile("vpsllq $1,%xmm5,%xmm5");\ - asm volatile("vpsllq $1,%xmm9,%xmm9");\ - asm volatile("vpsllq $1,%xmm13,%xmm13");\ - /* asm volatile("vmovdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[1])); */\ - MAKE_CST32_AVX128(xmm3,0x1d1d1d1d);\ - asm volatile("vpand %xmm3,%xmm2,%xmm2"); /* xmm2 = (((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */\ - asm volatile("vpand %xmm3,%xmm6,%xmm6");\ - asm volatile("vpand %xmm3,%xmm10,%xmm10");\ - asm volatile("vpand %xmm3,%xmm14,%xmm14");\ - /* asm volatile("vmovdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[2])); */\ - MAKE_CST32_AVX128(xmm3,0xfefefefe);\ - asm volatile("vpand %xmm3,%xmm1,%xmm1"); /* xmm1 = (q << 1) & 0xfefefefefefefefeULL */\ - asm volatile("vpand %xmm3,%xmm5,%xmm5");\ - asm volatile("vpand %xmm3,%xmm9,%xmm9");\ - asm volatile("vpand %xmm3,%xmm13,%xmm13");\ - asm volatile("vpxor %xmm2,%xmm1,%xmm1"); /* xmm1 = ((((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((q << 1) & 0xfefefefefefefefeULL) */\ - asm volatile("vpxor %xmm6,%xmm5,%xmm5");\ - asm volatile("vpxor %xmm10,%xmm9,%xmm9");\ - asm volatile("vpxor %xmm14,%xmm13,%xmm13");\ - /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/\ - asm volatile("vpxor %xmm0,%xmm1,%xmm1"); /* final xor */\ - asm volatile("vpxor %xmm4,%xmm5,%xmm5");\ - asm volatile("vpxor %xmm8,%xmm9,%xmm9");\ - asm volatile("vpxor %xmm12,%xmm13,%xmm13");\ - asm volatile("vmovdqa %%xmm1,%0" : "=m" (*(q+0)));\ - asm volatile("vmovdqa %%xmm5,%0" : "=m" (*(q+2)));\ - asm volatile("vmovdqa %%xmm9,%0" : "=m" (*(q+4)));\ - asm volatile("vmovdqa %%xmm13,%0" : "=m" (*(q+6))) - -#define COMPUTE8_R_AVX128 asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(r+0))); /* xmm1 = r */\ - asm volatile("vmovdqa %0,%%xmm5" : : "m" (*(r+2)));\ - asm volatile("vmovdqa %0,%%xmm9" : : "m" (*(r+4)));\ - asm volatile("vmovdqa %0,%%xmm13" : : "m" (*(r+6)));\ - /* to implement R in RAID-Z3, just copy the whole R block and repeat from here ... */\ - for (j = 0 ; j < 2 ; j++) {\ - /* I think the movdqa from the static array should work, but it doesn't.\ - So the constants are synthesized from a 32 bits value in a conventional\ - register */\ - /* asm volatile("vmovdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[0])); */\ - MAKE_CST32_AVX128(xmm3,0x80808080);\ - asm volatile("vpand %xmm3,%xmm1,%xmm2"); /* xmm2 = r & 0x8080808080808080ULL */\ - asm volatile("vpand %xmm3,%xmm5,%xmm6");\ - asm volatile("vpand %xmm3,%xmm9,%xmm10");\ - asm volatile("vpand %xmm3,%xmm13,%xmm14");\ - asm volatile("vpsrlq $7,%xmm2,%xmm3"); /* xmm3 = (r & 0x8080808080808080ULL) >> 7 */\ - asm volatile("vpsrlq $7,%xmm6,%xmm7");\ - asm volatile("vpsrlq $7,%xmm10,%xmm11");\ - asm volatile("vpsrlq $7,%xmm14,%xmm15");\ - asm volatile("vpsllq $1,%xmm2,%xmm2"); /* xmm2 = (r & 0x8080808080808080ULL) << 1 */\ - asm volatile("vpsllq $1,%xmm6,%xmm6");\ - asm volatile("vpsllq $1,%xmm10,%xmm10");\ - asm volatile("vpsllq $1,%xmm14,%xmm14");\ - asm volatile("vpsubq %xmm3,%xmm2,%xmm2"); /* xmm2 = ((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7) */\ - asm volatile("vpsubq %xmm7,%xmm6,%xmm6");\ - asm volatile("vpsubq %xmm11,%xmm10,%xmm10");\ - asm volatile("vpsubq %xmm15,%xmm14,%xmm14");\ - asm volatile("vpsllq $1,%xmm1,%xmm1"); /* xmm1 = r << 1 */\ - asm volatile("vpsllq $1,%xmm5,%xmm5");\ - asm volatile("vpsllq $1,%xmm9,%xmm9");\ - asm volatile("vpsllq $1,%xmm13,%xmm13");\ - /* asm volatile("vmovdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[1])); */\ - MAKE_CST32_AVX128(xmm3,0x1d1d1d1d);\ - asm volatile("vpand %xmm3,%xmm2,%xmm2"); /* xmm2 = (((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */\ - asm volatile("vpand %xmm3,%xmm6,%xmm6");\ - asm volatile("vpand %xmm3,%xmm10,%xmm10");\ - asm volatile("vpand %xmm3,%xmm14,%xmm14");\ - /* asm volatile("vmovdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[2])); */\ - MAKE_CST32_AVX128(xmm3,0xfefefefe);\ - asm volatile("vpand %xmm3,%xmm1,%xmm1"); /* xmm1 = (r << 1) & 0xfefefefefefefefeULL */\ - asm volatile("vpand %xmm3,%xmm5,%xmm5");\ - asm volatile("vpand %xmm3,%xmm9,%xmm9");\ - asm volatile("vpand %xmm3,%xmm13,%xmm13");\ - asm volatile("vpxor %xmm2,%xmm1,%xmm1"); /* xmm1 = ((((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((r << 1) & 0xfefefefefefefefeULL) */\ - asm volatile("vpxor %xmm6,%xmm5,%xmm5");\ - asm volatile("vpxor %xmm10,%xmm9,%xmm9");\ - asm volatile("vpxor %xmm14,%xmm13,%xmm13");\ - /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/\ - }\ - asm volatile("vpxor %xmm0,%xmm1,%xmm1"); /* final xor */\ - asm volatile("vpxor %xmm4,%xmm5,%xmm5");\ - asm volatile("vpxor %xmm8,%xmm9,%xmm9");\ - asm volatile("vpxor %xmm12,%xmm13,%xmm13");\ - asm volatile("vmovdqa %%xmm1,%0" : "=m" (*(r+0)));\ - asm volatile("vmovdqa %%xmm5,%0" : "=m" (*(r+2)));\ - asm volatile("vmovdqa %%xmm9,%0" : "=m" (*(r+4)));\ - asm volatile("vmovdqa %%xmm13,%0" : "=m" (*(r+6))) +#define MAKE_CST32_AVX128(reg, val) \ + asm volatile("vmovd %0,%%"#reg : : "r"(val)); \ + asm volatile("vpshufd $0,%"#reg",%"#reg); + +#define COPY8P_AVX128 \ + asm volatile("vmovdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(src+2))); \ + asm volatile("vmovdqa %0,%%xmm2" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%xmm3" : : "m" (*(src+6))); \ + asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(p+2))); \ + asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(p+6))) + +#define COPY8PQ_AVX128 \ + asm volatile("vmovdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(src+2))); \ + asm volatile("vmovdqa %0,%%xmm2" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%xmm3" : : "m" (*(src+6))); \ + asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(p+2))); \ + asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(p+6))); \ + asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(q+0))); \ + asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(q+2))); \ + asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(q+4))); \ + asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(q+6))) + +#define COPY8PQR_AVX128 \ + asm volatile("vmovdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(src+2))); \ + asm volatile("vmovdqa %0,%%xmm2" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%xmm3" : : "m" (*(src+6))); \ + asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(p+2))); \ + asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(p+6))); \ + asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(q+0))); \ + asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(q+2))); \ + asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(q+4))); \ + asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(q+6))); \ + asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(r+0))); \ + asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(r+2))); \ + asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(r+4))); \ + asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(r+6))) + +#define LOAD8_SRC_AVX128 \ + asm volatile("vmovdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%xmm4" : : "m" (*(src+2))); \ + asm volatile("vmovdqa %0,%%xmm8" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%xmm12" : : "m" (*(src+6))) + +#define COMPUTE8_P_AVX128 \ + asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(p+0))); \ + asm volatile("vmovdqa %0,%%xmm5" : : "m" (*(p+2))); \ + asm volatile("vmovdqa %0,%%xmm9" : : "m" (*(p+4))); \ + asm volatile("vmovdqa %0,%%xmm13" : : "m" (*(p+6))); \ + asm volatile("vpxor %xmm0,%xmm1,%xmm1"); \ + asm volatile("vpxor %xmm4,%xmm5,%xmm5"); \ + asm volatile("vpxor %xmm8,%xmm9,%xmm9"); \ + asm volatile("vpxor %xmm12,%xmm13,%xmm13"); \ + asm volatile("vmovdqa %%xmm1,%0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%xmm5,%0" : "=m" (*(p+2))); \ + asm volatile("vmovdqa %%xmm9,%0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa 
%%xmm13,%0" : "=m" (*(p+6))) + +#define COMPUTE8_Q_AVX128 \ + asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(q+0))); /* xmm1 = q */ \ + asm volatile("vmovdqa %0,%%xmm5" : : "m" (*(q+2))); \ + asm volatile("vmovdqa %0,%%xmm9" : : "m" (*(q+4))); \ + asm volatile("vmovdqa %0,%%xmm13" : : "m" (*(q+6))); \ + /* to implement R in RAID-Z3, just copy the whole Q block and repeat from here ... */ \ + /* \ + * I think the movdqa from the static array should work, but it doesn't. \ + * So the constants are synthesized from a 32 bits value in a conventional \ + * register \ + */ \ + /* asm volatile("vmovdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[0])); */ \ + MAKE_CST32_AVX128(xmm3,0x80808080); \ + asm volatile("vpand %xmm3,%xmm1,%xmm2"); /* xmm2 = q & 0x8080808080808080ULL */ \ + asm volatile("vpand %xmm3,%xmm5,%xmm6"); \ + asm volatile("vpand %xmm3,%xmm9,%xmm10"); \ + asm volatile("vpand %xmm3,%xmm13,%xmm14"); \ + asm volatile("vpsrlq $7,%xmm2,%xmm3"); /* xmm3 = (q & 0x8080808080808080ULL) >> 7 */ \ + asm volatile("vpsrlq $7,%xmm6,%xmm7"); \ + asm volatile("vpsrlq $7,%xmm10,%xmm11"); \ + asm volatile("vpsrlq $7,%xmm14,%xmm15"); \ + asm volatile("vpsllq $1,%xmm2,%xmm2"); /* xmm2 = (q & 0x8080808080808080ULL) << 1 */ \ + asm volatile("vpsllq $1,%xmm6,%xmm6"); \ + asm volatile("vpsllq $1,%xmm10,%xmm10"); \ + asm volatile("vpsllq $1,%xmm14,%xmm14"); \ + asm volatile("vpsubq %xmm3,%xmm2,%xmm2"); /* xmm2 = ((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7) */ \ + asm volatile("vpsubq %xmm7,%xmm6,%xmm6"); \ + asm volatile("vpsubq %xmm11,%xmm10,%xmm10"); \ + asm volatile("vpsubq %xmm15,%xmm14,%xmm14"); \ + asm volatile("vpsllq $1,%xmm1,%xmm1"); /* xmm1 = q << 1 */ \ + asm volatile("vpsllq $1,%xmm5,%xmm5"); \ + asm volatile("vpsllq $1,%xmm9,%xmm9"); \ + asm volatile("vpsllq $1,%xmm13,%xmm13"); \ + /* asm volatile("vmovdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[1])); */ \ + MAKE_CST32_AVX128(xmm3,0x1d1d1d1d); \ + asm volatile("vpand %xmm3,%xmm2,%xmm2"); /* xmm2 = (((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */ \ + asm volatile("vpand %xmm3,%xmm6,%xmm6"); \ + asm volatile("vpand %xmm3,%xmm10,%xmm10"); \ + asm volatile("vpand %xmm3,%xmm14,%xmm14"); \ + /* asm volatile("vmovdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[2])); */ \ + MAKE_CST32_AVX128(xmm3,0xfefefefe); \ + asm volatile("vpand %xmm3,%xmm1,%xmm1"); /* xmm1 = (q << 1) & 0xfefefefefefefefeULL */ \ + asm volatile("vpand %xmm3,%xmm5,%xmm5"); \ + asm volatile("vpand %xmm3,%xmm9,%xmm9"); \ + asm volatile("vpand %xmm3,%xmm13,%xmm13"); \ + asm volatile("vpxor %xmm2,%xmm1,%xmm1"); /* xmm1 = ((((q & 0x8080808080808080ULL) << 1) - ((q & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((q << 1) & 0xfefefefefefefefeULL) */ \ + asm volatile("vpxor %xmm6,%xmm5,%xmm5"); \ + asm volatile("vpxor %xmm10,%xmm9,%xmm9"); \ + asm volatile("vpxor %xmm14,%xmm13,%xmm13"); \ + /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/ \ + asm volatile("vpxor %xmm0,%xmm1,%xmm1"); /* final xor */ \ + asm volatile("vpxor %xmm4,%xmm5,%xmm5"); \ + asm volatile("vpxor %xmm8,%xmm9,%xmm9"); \ + asm volatile("vpxor %xmm12,%xmm13,%xmm13"); \ + asm volatile("vmovdqa %%xmm1,%0" : "=m" (*(q+0))); \ + asm volatile("vmovdqa %%xmm5,%0" : "=m" (*(q+2))); \ + asm volatile("vmovdqa %%xmm9,%0" : "=m" (*(q+4))); \ + asm volatile("vmovdqa %%xmm13,%0" : "=m" (*(q+6))) + +#define COMPUTE8_R_AVX128 asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(r+0))); /* xmm1 = r */ \ + asm volatile("vmovdqa %0,%%xmm5" : : "m" (*(r+2))); \ + asm volatile("vmovdqa %0,%%xmm9" : : "m" (*(r+4))); \ + asm volatile("vmovdqa %0,%%xmm13" : : "m" (*(r+6))); \ + /* to implement R in RAID-Z3, just copy the whole R block and repeat from here ... */ \ + for (j = 0; j < 2; j++) { \ + /* \ + * I think the movdqa from the static array should work, but it doesn't. \ + * So the constants are synthesized from a 32 bits value in a conventional \ + * register \ + */ \ + /* asm volatile("vmovdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[0])); */ \ + MAKE_CST32_AVX128(xmm3,0x80808080); \ + asm volatile("vpand %xmm3,%xmm1,%xmm2"); /* xmm2 = r & 0x8080808080808080ULL */ \ + asm volatile("vpand %xmm3,%xmm5,%xmm6"); \ + asm volatile("vpand %xmm3,%xmm9,%xmm10"); \ + asm volatile("vpand %xmm3,%xmm13,%xmm14"); \ + asm volatile("vpsrlq $7,%xmm2,%xmm3"); /* xmm3 = (r & 0x8080808080808080ULL) >> 7 */ \ + asm volatile("vpsrlq $7,%xmm6,%xmm7"); \ + asm volatile("vpsrlq $7,%xmm10,%xmm11"); \ + asm volatile("vpsrlq $7,%xmm14,%xmm15"); \ + asm volatile("vpsllq $1,%xmm2,%xmm2"); /* xmm2 = (r & 0x8080808080808080ULL) << 1 */ \ + asm volatile("vpsllq $1,%xmm6,%xmm6"); \ + asm volatile("vpsllq $1,%xmm10,%xmm10"); \ + asm volatile("vpsllq $1,%xmm14,%xmm14"); \ + asm volatile("vpsubq %xmm3,%xmm2,%xmm2"); /* xmm2 = ((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7) */ \ + asm volatile("vpsubq %xmm7,%xmm6,%xmm6"); \ + asm volatile("vpsubq %xmm11,%xmm10,%xmm10"); \ + asm volatile("vpsubq %xmm15,%xmm14,%xmm14"); \ + asm volatile("vpsllq $1,%xmm1,%xmm1"); /* xmm1 = r << 1 */ \ + asm volatile("vpsllq $1,%xmm5,%xmm5"); \ + asm volatile("vpsllq $1,%xmm9,%xmm9"); \ + asm volatile("vpsllq $1,%xmm13,%xmm13"); \ + /* asm volatile("vmovdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[1])); */ \ + MAKE_CST32_AVX128(xmm3,0x1d1d1d1d); \ + asm volatile("vpand %xmm3,%xmm2,%xmm2"); /* xmm2 = (((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL */ \ + asm volatile("vpand %xmm3,%xmm6,%xmm6"); \ + asm volatile("vpand %xmm3,%xmm10,%xmm10"); \ + asm volatile("vpand %xmm3,%xmm14,%xmm14"); \ + /* asm volatile("vmovdqa %0,%%xmm3" : : "m" (raidz2_sse_constants.cst[2])); */ \ + MAKE_CST32_AVX128(xmm3,0xfefefefe); \ + asm volatile("vpand %xmm3,%xmm1,%xmm1"); /* xmm1 = (r << 1) & 0xfefefefefefefefeULL */ \ + asm volatile("vpand %xmm3,%xmm5,%xmm5"); \ + asm volatile("vpand %xmm3,%xmm9,%xmm9"); \ + asm volatile("vpand %xmm3,%xmm13,%xmm13"); \ + asm volatile("vpxor %xmm2,%xmm1,%xmm1"); /* xmm1 = ((((r & 0x8080808080808080ULL) << 1) - ((r & 0x8080808080808080ULL) >> 7)) & 0x1d1d1d1d1d1d1d1dULL) ^ ((r << 1) & 0xfefefefefefefefeULL) */ \ + asm volatile("vpxor %xmm6,%xmm5,%xmm5"); \ + asm volatile("vpxor %xmm10,%xmm9,%xmm9"); \ + asm volatile("vpxor %xmm14,%xmm13,%xmm13"); \ + /* to implement R in RAID-Z3, ... repeat until here, at this point. 
*/ \ + } \ + asm volatile("vpxor %xmm0,%xmm1,%xmm1"); /* final xor */ \ + asm volatile("vpxor %xmm4,%xmm5,%xmm5"); \ + asm volatile("vpxor %xmm8,%xmm9,%xmm9"); \ + asm volatile("vpxor %xmm12,%xmm13,%xmm13"); \ + asm volatile("vmovdqa %%xmm1,%0" : "=m" (*(r+0))); \ + asm volatile("vmovdqa %%xmm5,%0" : "=m" (*(r+2))); \ + asm volatile("vmovdqa %%xmm9,%0" : "=m" (*(r+4))); \ + asm volatile("vmovdqa %%xmm13,%0" : "=m" (*(r+6))) static void @@ -1709,22 +1741,22 @@ vdev_raidz_generate_parity_p_avx128(raidz_map_t *rm) if (c == rm->rm_firstdatacol) { ASSERT(ccount == pcount); - i = 0; - if (ccount>7) /* ccount is unsigned */ - for (; i < ccount-7; i+=8, src+=8, p+=8) { - COPY8P_AVX128; - } + i = 0; + if (ccount > 7) /* ccount is unsigned */ + for (; i < ccount-7; i += 8, src += 8, p += 8) { + COPY8P_AVX128; + } for (; i < ccount; i++, src++, p++) { *p = *src; } } else { ASSERT(ccount <= pcount); - i = 0; - if (ccount>7) /* ccount is unsigned */ - for (; i < ccount-7; i+=8, src+=8, p+=8) { - LOAD8_SRC_AVX128; - COMPUTE8_P_AVX128; - } + i = 0; + if (ccount > 7) /* ccount is unsigned */ + for (; i < ccount-7; i += 8, src += 8, p += 8) { + LOAD8_SRC_AVX128; + COMPUTE8_P_AVX128; + } for (; i < ccount; i++, src++, p++) { *p ^= *src; } @@ -1756,11 +1788,11 @@ vdev_raidz_generate_parity_pq_avx128(raidz_map_t *rm) if (c == rm->rm_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - i = 0; - if (ccnt>7) /* ccnt is unsigned */ - for (; i < ccnt-7; i+=8, src+=8, p+=8, q+=8) { - COPY8PQ_AVX128; - } + i = 0; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, q += 8) { + COPY8PQ_AVX128; + } for (; i < ccnt; i++, src++, p++, q++) { *p = *src; *q = *src; @@ -1777,12 +1809,12 @@ vdev_raidz_generate_parity_pq_avx128(raidz_map_t *rm) * the previous result and adding in the new value. */ i = 0; - if (ccnt>7) /* ccnt is unsigned */ - for (; i < ccnt-7; i+=8, src+=8, p+=8, q+=8) { - /* raw, unscheduled, unrolled-by-8 (4x128bits) AVX-128 implementation */ - LOAD8_SRC_AVX128; - COMPUTE8_P_AVX128; - COMPUTE8_Q_AVX128; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, q += 8) { + /* raw, unscheduled, unrolled-by-8 (4x128bits) AVX-128 implementation */ + LOAD8_SRC_AVX128; + COMPUTE8_P_AVX128; + COMPUTE8_Q_AVX128; } for (; i < ccnt; i++, src++, p++, q++) { *p ^= *src; @@ -1829,11 +1861,11 @@ vdev_raidz_generate_parity_pqr_avx128(raidz_map_t *rm) if (c == rm->rm_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - i = 0; - if (ccnt>7) /* ccnt is unsigned */ - for (; i < ccnt-7; i+=8, src+=8, p+=8, q+=8, r+=8) { - COPY8PQR_AVX128; - } + i = 0; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, q += 8, r += 8) { + COPY8PQR_AVX128; + } for (; i < ccnt; i++, src++, p++, q++, r++) { *p = *src; *q = *src; @@ -1852,13 +1884,13 @@ vdev_raidz_generate_parity_pqr_avx128(raidz_map_t *rm) * the previous result and adding in the new value. 
*/ i = 0; - if (ccnt>7) /* ccnt is unsigned */ - for (; i < ccnt-7; i+=8, src+=8, p+=8, q+=8, r+=8) { - /* raw, unscheduled, unrolled-by-8 (4x128bits) AVX-128 implementation */ - LOAD8_SRC_AVX128; - COMPUTE8_P_AVX128; - COMPUTE8_Q_AVX128; - COMPUTE8_R_AVX128; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, q += 8, r += 8) { + /* raw, unscheduled, unrolled-by-8 (4x128bits) AVX-128 implementation */ + LOAD8_SRC_AVX128; + COMPUTE8_P_AVX128; + COMPUTE8_Q_AVX128; + COMPUTE8_R_AVX128; } for (; i < ccnt; i++, src++, p++, q++, r++) { *p ^= *src; @@ -1898,35 +1930,44 @@ vdev_raidz_generate_parity_pqr_avx128(raidz_map_t *rm) #endif // __x86_64__ static void vdev_raidz_pick_parity_functions(void) { - vdev_raidz_generate_parity_p = &vdev_raidz_generate_parity_p_c; - vdev_raidz_generate_parity_pq = &vdev_raidz_generate_parity_pq_c; - vdev_raidz_generate_parity_pqr = &vdev_raidz_generate_parity_pqr_c; + vdev_raidz_generate_parity_p = &vdev_raidz_generate_parity_p_c; + vdev_raidz_generate_parity_pq = &vdev_raidz_generate_parity_pq_c; + vdev_raidz_generate_parity_pqr = &vdev_raidz_generate_parity_pqr_c; #if defined(__x86_64__) #if defined(_KERNEL) && defined(CONFIG_AS_AVX2) - if (boot_cpu_has(X86_FEATURE_AVX2)) { - vdev_raidz_generate_parity_p = &vdev_raidz_generate_parity_p_avx2; - vdev_raidz_generate_parity_pq = &vdev_raidz_generate_parity_pq_avx2; - vdev_raidz_generate_parity_pqr = &vdev_raidz_generate_parity_pqr_avx2; - printk("ZFS: using vdev_raidz_generate_parity_*_avx2\n"); - } else + if (boot_cpu_has(X86_FEATURE_AVX2)) { + vdev_raidz_generate_parity_p = + &vdev_raidz_generate_parity_p_avx2; + vdev_raidz_generate_parity_pq = + &vdev_raidz_generate_parity_pq_avx2; + vdev_raidz_generate_parity_pqr = + &vdev_raidz_generate_parity_pqr_avx2; + printk("ZFS: using vdev_raidz_generate_parity_*_avx2\n"); + } else #endif #if defined(_KERNEL) && defined(CONFIG_AS_AVX) - if (boot_cpu_has(X86_FEATURE_AVX)) { - vdev_raidz_generate_parity_p = &vdev_raidz_generate_parity_p_avx128; - vdev_raidz_generate_parity_pq = &vdev_raidz_generate_parity_pq_avx128; - vdev_raidz_generate_parity_pqr = &vdev_raidz_generate_parity_pqr_avx128; - printk("ZFS: using vdev_raidz_generate_parity_*_avx128\n"); - } else + if (boot_cpu_has(X86_FEATURE_AVX)) { + vdev_raidz_generate_parity_p = + &vdev_raidz_generate_parity_p_avx128; + vdev_raidz_generate_parity_pq = + &vdev_raidz_generate_parity_pq_avx128; + vdev_raidz_generate_parity_pqr = + &vdev_raidz_generate_parity_pqr_avx128; + printk("ZFS: using vdev_raidz_generate_parity_*_avx128\n"); + } else #endif - { - /* x86-64 always has SSE2 */ - vdev_raidz_generate_parity_p = &vdev_raidz_generate_parity_p_sse; - vdev_raidz_generate_parity_pq = &vdev_raidz_generate_parity_pq_sse; - vdev_raidz_generate_parity_pqr = &vdev_raidz_generate_parity_pqr_sse; + { + /* x86-64 always has SSE2 */ + vdev_raidz_generate_parity_p = + &vdev_raidz_generate_parity_p_sse; + vdev_raidz_generate_parity_pq = + &vdev_raidz_generate_parity_pq_sse; + vdev_raidz_generate_parity_pqr = + &vdev_raidz_generate_parity_pqr_sse; #if defined(_KERNEL) - printk("ZFS: using vdev_raidz_generate_parity_*_sse\n"); + printk("ZFS: using vdev_raidz_generate_parity_*_sse\n"); #endif - } + } #endif } @@ -2688,13 +2729,14 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, int c; int lasterror = 0; int numerrors = 0; - - /* should probably be done elsewhere, - * to be done once per module load. - * This could cause a race condition - * on which function is used. 
- */ - vdev_raidz_pick_parity_functions(); + + /* + * This should probably be done elsewhere, once per module load. + * As written, it could race on which parity function is selected. + */ + vdev_raidz_pick_parity_functions(); ASSERT(nparity > 0);
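For reference, the COMPUTE8_Q_AVX128 and COMPUTE8_R_AVX128 blocks above perform, four 128-bit lanes at a time, the same per-word update the scalar generators do: P accumulates a plain XOR, Q multiplies the running parity by 2 in GF(2^8) before XOR-ing in the source word, and R multiplies by 4 (the j loop applies the multiply-by-2 step twice). The sketch below is illustrative only and not part of the patch; raidz_gf_mul2_64() and raidz_pqr_update_word() are names invented here, while the in-tree scalar code expresses the same arithmetic through the VDEV_RAIDZ_64MUL_2()/VDEV_RAIDZ_64MUL_4() macros.

#include <stdint.h>

/*
 * Multiply each of the eight packed GF(2^8) bytes in x by 2, reducing by the
 * RAID-Z generator polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d).
 */
static inline uint64_t
raidz_gf_mul2_64(uint64_t x)
{
	uint64_t mask = x & 0x8080808080808080ULL;	/* high bit of every byte */

	/* 0xff in every byte that overflows, 0x00 elsewhere */
	mask = (mask << 1) - (mask >> 7);
	/* shift each byte left, then fold the overflow back in with 0x1d */
	return (((x << 1) & 0xfefefefefefefefeULL) ^
	    (mask & 0x1d1d1d1d1d1d1d1dULL));
}

/* Fold one 64-bit source word into the running P, Q and R parity words. */
static inline void
raidz_pqr_update_word(uint64_t src, uint64_t *p, uint64_t *q, uint64_t *r)
{
	*p ^= src;					/* P: plain XOR */
	*q = raidz_gf_mul2_64(*q) ^ src;		/* Q: q = 2*q + src */
	*r = raidz_gf_mul2_64(raidz_gf_mul2_64(*r)) ^ src;	/* R: r = 4*r + src */
}

The unrolled asm does exactly this for eight uint64_t words (four 16-byte vectors) per iteration, which is why the main loops only run while ccount/ccnt is at least 8 and the scalar tail loops pick up the remainder.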
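The comments in COMPUTE8_Q_AVX128/COMPUTE8_R_AVX128 note that loading the 0x80/0x1d/0xfe constants with movdqa from a static array did not work, so MAKE_CST32_AVX128 rebuilds them from a 32-bit immediate with vmovd + vpshufd. As an aside, a compiler-intrinsics formulation would sidestep hand-loading constants altogether, since the compiler materializes them itself. The sketch below shows the same multiply-by-2 step written with SSE2 intrinsics; raidz_gf_mul2_128() is an invented name, and this is not what the patch uses.

#include <emmintrin.h>	/* SSE2 intrinsics; VEX-encoded (AVX-128) forms are emitted when built with -mavx */

/* Multiply the sixteen packed GF(2^8) bytes in x by 2, as in COMPUTE8_Q_AVX128. */
static inline __m128i
raidz_gf_mul2_128(__m128i x)
{
	const __m128i m80 = _mm_set1_epi8((char)0x80);
	const __m128i m1d = _mm_set1_epi8(0x1d);
	const __m128i mfe = _mm_set1_epi8((char)0xfe);
	__m128i mask = _mm_and_si128(x, m80);

	/* (mask << 1) - (mask >> 7): 0xff per overflowing byte, 0x00 elsewhere */
	mask = _mm_sub_epi64(_mm_slli_epi64(mask, 1), _mm_srli_epi64(mask, 7));
	return (_mm_xor_si128(_mm_and_si128(_mm_slli_epi64(x, 1), mfe),
	    _mm_and_si128(mask, m1d)));
}

Whether intrinsics headers are usable in this kernel build environment is a separate question; the patch's inline-asm approach avoids that dependency.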