Skip to content

Commit

Permalink
SSE, AVX-128 & AVX2 vectorized RAID-Z1/2/3 parity computation.
Browse files Browse the repository at this point in the history
This is just a proof of concept to gauge interest in doing the parity computation in vector registers.
It has received very little testing. Don't use on production systems, obviously :-)
SSE should work on any x86-64 CPU. AVX-128 should work on AVX-enabled CPUs, i.e. Sandy Bridge and later. AVX2 should work on Haswell and later. The exact variant is picked at runtime.
  • Loading branch information
rdolbeau committed Jul 12, 2015
1 parent 72540ea commit fa494ef
Show file tree
Hide file tree
Showing 8 changed files with 1,253 additions and 57 deletions.
1 change: 1 addition & 0 deletions include/sys/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/vdev_file.h \
$(top_srcdir)/include/sys/vdev.h \
$(top_srcdir)/include/sys/vdev_impl.h \
$(top_srcdir)/include/sys/vdev_raidz.h \
$(top_srcdir)/include/sys/xvattr.h \
$(top_srcdir)/include/sys/zap.h \
$(top_srcdir)/include/sys/zap_impl.h \
Expand Down
80 changes: 80 additions & 0 deletions include/sys/vdev_raidz.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/

#ifndef	_SYS_VDEV_RAIDZ_H
#define	_SYS_VDEV_RAIDZ_H

/*
 * RAID-Z map structures and parity arithmetic helpers, shared between the
 * generic RAID-Z code and the vectorized parity generators.
 *
 * This header includes nothing itself: uint8_t, uint64_t and uintptr_t must
 * already be declared by the including file.  The include guard makes a
 * repeated inclusion harmless (the structures below would otherwise be
 * redefined).
 */

/*
 * One column of a RAID-Z map: the child device, offset, size and buffers
 * of a single per-device I/O, plus its outcome.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Layout of one logical I/O across the columns of a RAID-Z vdev.
 *
 * rm_col is declared with a single element but is documented as a
 * flexible array; presumably the allocator sizes the structure for the
 * real column count -- confirm in vdev_raidz_map_alloc().
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

/*
 * Numbering of the three parities.  Presumably these index rm_col[] for
 * the P, Q and R parity columns -- confirm against the users.
 */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Byte-wise multiply by 2: shift x left by one bit and XOR in 0x1d when
 * bit 7 (0x80) of x was set.  MUL_4 applies MUL_2 twice.
 *
 * The result is NOT truncated to 8 bits (MUL_2(0x80) is 0x11d), so callers
 * that need a byte must store it into, or cast it to, a uint8_t.  x is
 * evaluated more than once; do not pass an expression with side effects.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * x must be a modifiable 64-bit lvalue and is updated in place; mask is a
 * 64-bit scratch lvalue supplied by the caller.  On exit mask holds 0xff in
 * every byte of x whose top bit was set and 0x00 in the others.  Both
 * arguments are evaluated several times.
 *
 * The macros expand to a bare { } block rather than do { } while (0), so
 * "MACRO(x, m);" must not be used as the unbraced body of an if that has
 * an else branch.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

#endif	/* _SYS_VDEV_RAIDZ_H */
3 changes: 3 additions & 0 deletions lib/libzpool/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ libzpool_la_SOURCES = \
$(top_srcdir)/module/zfs/vdev_missing.c \
$(top_srcdir)/module/zfs/vdev_queue.c \
$(top_srcdir)/module/zfs/vdev_raidz.c \
$(top_srcdir)/module/zfs/vdev_raidz_sse.c \
$(top_srcdir)/module/zfs/vdev_raidz_avx128.c \
$(top_srcdir)/module/zfs/vdev_raidz_avx2.c \
$(top_srcdir)/module/zfs/vdev_root.c \
$(top_srcdir)/module/zfs/zap.c \
$(top_srcdir)/module/zfs/zap_leaf.c \
Expand Down
3 changes: 3 additions & 0 deletions module/zfs/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/vdev_mirror.o
$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_missing.o
$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_queue.o
$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_raidz.o
$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_raidz_sse.o
$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_raidz_avx128.o
$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_raidz_avx2.o
$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_root.o
$(MODULE)-objs += @top_srcdir@/module/zfs/zap.o
$(MODULE)-objs += @top_srcdir@/module/zfs/zap_leaf.o
Expand Down
135 changes: 78 additions & 57 deletions module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/

#if defined(_KERNEL)
#include <linux/kernel.h>
#endif
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
Expand Down Expand Up @@ -99,60 +102,7 @@
* or in concert to recover missing data columns.
*/

/*
 * One column of a RAID-Z map: the child device, offset, size and buffers
 * of a single per-device I/O, plus its outcome.
 */
typedef struct raidz_col {
uint64_t rc_devidx; /* child device index for I/O */
uint64_t rc_offset; /* device offset */
uint64_t rc_size; /* I/O size */
void *rc_data; /* I/O data */
void *rc_gdata; /* used to store the "good" version */
int rc_error; /* I/O error for this device */
uint8_t rc_tried; /* Did we attempt this I/O column? */
uint8_t rc_skipped; /* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Layout of one logical I/O across the columns of a RAID-Z vdev.
 *
 * rm_col is declared with a single element but is documented as a
 * flexible array; presumably the allocator sizes the structure for the
 * real column count -- confirm in vdev_raidz_map_alloc().
 */
typedef struct raidz_map {
uint64_t rm_cols; /* Regular column count */
uint64_t rm_scols; /* Count including skipped columns */
uint64_t rm_bigcols; /* Number of oversized columns */
uint64_t rm_asize; /* Actual total I/O size */
uint64_t rm_missingdata; /* Count of missing data devices */
uint64_t rm_missingparity; /* Count of missing parity devices */
uint64_t rm_firstdatacol; /* First data column/parity count */
uint64_t rm_nskip; /* Skipped sectors for padding */
uint64_t rm_skipstart; /* Column index of padding start */
void *rm_datacopy; /* rm_asize-buffer of copied data */
uintptr_t rm_reports; /* # of referencing checksum reports */
uint8_t rm_freed; /* map no longer has referencing ZIO */
uint8_t rm_ecksuminjected; /* checksum error was injected */
raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
} raidz_map_t;

/*
 * Numbering of the three parities.  Presumably these index rm_col[] for
 * the P, Q and R parity columns -- confirm against the users.
 */
#define VDEV_RAIDZ_P 0
#define VDEV_RAIDZ_Q 1
#define VDEV_RAIDZ_R 2

/*
 * Byte-wise multiply by 2: shift x left by one bit and XOR in 0x1d when
 * bit 7 (0x80) of x was set.  MUL_4 applies MUL_2 twice.
 *
 * The result is NOT truncated to 8 bits (MUL_2(0x80) is 0x11d), so callers
 * that need a byte must store it into, or cast it to, a uint8_t.  x is
 * evaluated more than once; do not pass an expression with side effects.
 */
#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * x must be a modifiable 64-bit lvalue and is updated in place; mask is a
 * 64-bit scratch lvalue supplied by the caller.  On exit mask holds 0xff in
 * every byte of x whose top bit was set and 0x00 in the others.  Both
 * arguments are evaluated several times.
 *
 * The macro expands to a bare { } block rather than do { } while (0), so
 * "MACRO(x, m);" must not be used as the unbraced body of an if that has
 * an else branch.
 */
#define VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
(mask) = (x) & 0x8080808080808080ULL; \
(mask) = ((mask) << 1) - ((mask) >> 7); \
(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

/*
 * Multiply every byte of the 64-bit lvalue x by 4 by applying
 * VDEV_RAIDZ_64MUL_2 twice.  mask is the caller's 64-bit scratch lvalue and
 * is left holding the mask computed by the second application.  Like
 * VDEV_RAIDZ_64MUL_2 this expands to a bare { } block.
 */
#define VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
VDEV_RAIDZ_64MUL_2((x), mask); \
VDEV_RAIDZ_64MUL_2((x), mask); \
}
#include <sys/vdev_raidz.h>

/*
* Force reconstruction to use the general purpose method.
Expand Down Expand Up @@ -582,8 +532,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
return (rm);
}

/*
 * Hook through which vdev_raidz_generate_parity_p is dispatched.  It is
 * pointed at the _c, _sse, _avx128 or _avx2 implementation by
 * vdev_raidz_pick_parity_functions(); as a file-scope object without an
 * initializer it is NULL until that has run.  Note that it has external
 * linkage (it is not static).
 */
void (*vdev_raidz_generate_parity_p)(raidz_map_t *rm);

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
vdev_raidz_generate_parity_p_c(raidz_map_t *rm)
{
uint64_t *p, *src, pcount, ccount, i;
int c;
Expand All @@ -609,8 +561,10 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
}
}

/*
 * Hook through which vdev_raidz_generate_parity_pq is dispatched.  It is
 * pointed at the _c, _sse, _avx128 or _avx2 implementation by
 * vdev_raidz_pick_parity_functions(); as a file-scope object without an
 * initializer it is NULL until that has run.  Note that it has external
 * linkage (it is not static).
 */
void (*vdev_raidz_generate_parity_pq)(raidz_map_t *rm);

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
vdev_raidz_generate_parity_pq_c(raidz_map_t *rm)
{
uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
int c;
Expand Down Expand Up @@ -661,8 +615,10 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm)
}
}

/*
 * Hook through which vdev_raidz_generate_parity_pqr is dispatched.  It is
 * pointed at the _c, _sse, _avx128 or _avx2 implementation by
 * vdev_raidz_pick_parity_functions(); as a file-scope object without an
 * initializer it is NULL until that has run.  Note that it has external
 * linkage (it is not static).
 */
void (*vdev_raidz_generate_parity_pqr)(raidz_map_t *rm);

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
vdev_raidz_generate_parity_pqr_c(raidz_map_t *rm)
{
uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
int c;
Expand Down Expand Up @@ -722,6 +678,63 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
}
}

#ifdef __x86_64__
/*
 * Vectorized parity generators, only declared on x86-64.  They are not
 * defined in this file; presumably they live in vdev_raidz_sse.c,
 * vdev_raidz_avx128.c and vdev_raidz_avx2.c -- confirm.
 */

/* SSE variants */
void vdev_raidz_generate_parity_p_sse(raidz_map_t *rm);
void vdev_raidz_generate_parity_pq_sse(raidz_map_t *rm);
void vdev_raidz_generate_parity_pqr_sse(raidz_map_t *rm);

/* AVX (128-bit) variants */
void vdev_raidz_generate_parity_p_avx128(raidz_map_t *rm);
void vdev_raidz_generate_parity_pq_avx128(raidz_map_t *rm);
void vdev_raidz_generate_parity_pqr_avx128(raidz_map_t *rm);

/* AVX2 variants */
void vdev_raidz_generate_parity_p_avx2(raidz_map_t *rm);
void vdev_raidz_generate_parity_pq_avx2(raidz_map_t *rm);
void vdev_raidz_generate_parity_pqr_avx2(raidz_map_t *rm);
#endif /* __x86_64__ */

static void vdev_raidz_pick_parity_functions(void) {
vdev_raidz_generate_parity_p = &vdev_raidz_generate_parity_p_c;
vdev_raidz_generate_parity_pq = &vdev_raidz_generate_parity_pq_c;
vdev_raidz_generate_parity_pqr = &vdev_raidz_generate_parity_pqr_c;
#if defined(__x86_64__)
#if defined(_KERNEL) && defined(CONFIG_AS_AVX2)
if (boot_cpu_has(X86_FEATURE_AVX2)) {
vdev_raidz_generate_parity_p =
&vdev_raidz_generate_parity_p_avx2;
vdev_raidz_generate_parity_pq =
&vdev_raidz_generate_parity_pq_avx2;
vdev_raidz_generate_parity_pqr =
&vdev_raidz_generate_parity_pqr_avx2;
printk(KERN_INFO \
"ZFS: using vdev_raidz_generate_parity_*_avx2\n");
} else
#endif
#if defined(_KERNEL) && defined(CONFIG_AS_AVX)
if (boot_cpu_has(X86_FEATURE_AVX)) {
vdev_raidz_generate_parity_p =
&vdev_raidz_generate_parity_p_avx128;
vdev_raidz_generate_parity_pq =
&vdev_raidz_generate_parity_pq_avx128;
vdev_raidz_generate_parity_pqr =
&vdev_raidz_generate_parity_pqr_avx128;
printk(KERN_INFO \
"ZFS: using vdev_raidz_generate_parity_*_avx128\n");
} else
#endif
{
/* x86-64 always has SSE2 */
vdev_raidz_generate_parity_p =
&vdev_raidz_generate_parity_p_sse;
vdev_raidz_generate_parity_pq =
&vdev_raidz_generate_parity_pq_sse;
vdev_raidz_generate_parity_pqr =
&vdev_raidz_generate_parity_pqr_sse;
#if defined(_KERNEL)
printk(KERN_INFO \
"ZFS: using vdev_raidz_generate_parity_*_sse\n");
#endif
}
#endif
}

/*
* Generate RAID parity in the first virtual columns according to the number of
* parity columns available.
Expand Down Expand Up @@ -1481,6 +1494,14 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
int lasterror = 0;
int numerrors = 0;

/*
* Should probably be done elsewhere,
* to be done once per module load.
* This could cause a race condition
* on which function is used.
*/
vdev_raidz_pick_parity_functions();

ASSERT(nparity > 0);

if (nparity > VDEV_RAIDZ_MAXPARITY ||
Expand Down
Loading

0 comments on commit fa494ef

Please sign in to comment.