From 6a7c44f23c2101f3e766270f2d2c287d38023105 Mon Sep 17 00:00:00 2001
From: Chunwei Chen
Date: Wed, 11 Feb 2015 16:45:53 +0800
Subject: [PATCH 01/16] Add compatibility layer for {kmap,kunmap}_atomic

Starting from linux-2.6.37, {kmap,kunmap}_atomic takes 1 argument instead
of 2.

Signed-off-by: Chunwei Chen
---
 config/kernel-kmap-atomic-args.m4 | 24 +++++++++++++++++++
 config/kernel.m4                  |  1 +
 include/linux/Makefile.am         |  3 ++-
 include/linux/kmap_compat.h       | 40 +++++++++++++++++++++++++++++++
 4 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100644 config/kernel-kmap-atomic-args.m4
 create mode 100644 include/linux/kmap_compat.h

diff --git a/config/kernel-kmap-atomic-args.m4 b/config/kernel-kmap-atomic-args.m4
new file mode 100644
index 000000000000..f7228907af49
--- /dev/null
+++ b/config/kernel-kmap-atomic-args.m4
@@ -0,0 +1,24 @@
+dnl #
+dnl # 2.6.37 API change
+dnl # kmap_atomic changed from assigning a hard-coded named slot to using
+dnl # push/pop based dynamic allocation.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS], [
+	AC_MSG_CHECKING([whether kmap_atomic wants 1 arg])
+	ZFS_LINUX_TRY_COMPILE([
+		#include <linux/pagemap.h>
+
+		void test_kmap_atomic(void)
+		{
+			struct page page;
+			kmap_atomic(&page);
+		}
+	],[
+	],[
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_1ARG_KMAP_ATOMIC, 1,
+		    [kmap_atomic wants 1 arg])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel.m4 b/config/kernel.m4
index a4e43db442b5..3d5a90d80c5f 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -99,6 +99,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
 	ZFS_AC_KERNEL_LSEEK_EXECUTE
 	ZFS_AC_KERNEL_VFS_ITERATE
 	ZFS_AC_KERNEL_VFS_RW_ITERATE
+	ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS

 	AS_IF([test "$LINUX_OBJ" != "$LINUX"], [
 		KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$LINUX_OBJ"
diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am
index d00b1c8ad798..595d1db01128 100644
--- a/include/linux/Makefile.am
+++ b/include/linux/Makefile.am
@@ -5,7 +5,8 @@ KERNEL_H = \
 	$(top_srcdir)/include/linux/xattr_compat.h \
 	$(top_srcdir)/include/linux/vfs_compat.h \
 	$(top_srcdir)/include/linux/blkdev_compat.h \
-	$(top_srcdir)/include/linux/utsname_compat.h
+	$(top_srcdir)/include/linux/utsname_compat.h \
+	$(top_srcdir)/include/linux/kmap_compat.h

 USER_H =
diff --git a/include/linux/kmap_compat.h b/include/linux/kmap_compat.h
new file mode 100644
index 000000000000..f581fb20965a
--- /dev/null
+++ b/include/linux/kmap_compat.h
@@ -0,0 +1,40 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ */
+
+#ifndef _ZFS_KMAP_H
+#define _ZFS_KMAP_H
+
+#include <linux/highmem.h>
+
+#ifdef HAVE_1ARG_KMAP_ATOMIC
+/* 2.6.37 API change */
+#define zfs_kmap_atomic(page, km_type)		kmap_atomic(page)
+#define zfs_kunmap_atomic(addr, km_type)	kunmap_atomic(addr)
+#else
+#define zfs_kmap_atomic(page, km_type)		kmap_atomic(page, km_type)
+#define zfs_kunmap_atomic(addr, km_type)	kunmap_atomic(addr, km_type)
+#endif
+
+#endif	/* _ZFS_KMAP_H */

From 47e5fe3257625d4d7c958904e49b59a856679cf4 Mon Sep 17 00:00:00 2001
From: Chunwei Chen
Date: Wed, 11 Feb 2015 16:45:53 +0800
Subject: [PATCH 02/16] Introduce ABD: linear/scatter dual typed buffer for
 ARC

ZFS on Linux currently uses vmalloc-backed slabs for ARC buffers. There
are some major problems with this approach. One is that 32-bit systems
have only a small amount of vmalloc space. Another is that fragmentation
in the slabs can easily trigger OOM on a busy system.

With ABD, we use a scatterlist to allocate data buffers. In this approach
we can allocate in HIGHMEM, which alleviates the vmalloc space pressure on
32-bit systems. Also, we no longer rely on the slab, so there is no
fragmentation issue.

For metadata buffers, however, we still use linear buffers from the slab.
The reason is that a lot of *_phys pointers point directly into metadata
buffers, so it is impractical to change all of that code.

Currently, ABD is not enabled and its API treats every ABD as a normal
buffer. We will enable it once all relevant code is modified to use the
API.

Signed-off-by: Chunwei Chen
---
 include/sys/Makefile.am  |    1 +
 include/sys/abd.h        |  303 +++++++++++
 lib/libzpool/Makefile.am |    1 +
 module/zfs/abd.c         | 1116 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 1421 insertions(+)
 create mode 100644 include/sys/abd.h
 create mode 100644 module/zfs/abd.c

diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index 5211e656456d..8b3fd524c555 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -3,6 +3,7 @@ SUBDIRS = fm fs
 COMMON_H = \
 	$(top_srcdir)/include/sys/arc.h \
 	$(top_srcdir)/include/sys/arc_impl.h \
+	$(top_srcdir)/include/sys/abd.h \
 	$(top_srcdir)/include/sys/avl.h \
 	$(top_srcdir)/include/sys/avl_impl.h \
 	$(top_srcdir)/include/sys/blkptr.h \
diff --git a/include/sys/abd.h b/include/sys/abd.h
new file mode 100644
index 000000000000..561b43e18188
--- /dev/null
+++ b/include/sys/abd.h
@@ -0,0 +1,303 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ */
+
+/*
+ * ABD - ARC buffer data
+ * ABD is an abstract data structure for ARC. There are two types of ABD:
+ * linear for metadata and scatter for data.
+ * Their type is recorded in the abd_flags field (ABD_F_LINEAR).
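+ *
+ * A rough usage sketch (illustrative only; "buf" and "size" stand in for
+ * a hypothetical caller's linear buffer and length, they are not part of
+ * this patch):
+ *
+ *	abd_t *abd = abd_alloc_scatter(size);
+ *	abd_copy_from_buf(abd, buf, size);	(fill from linear memory)
+ *	abd_copy_to_buf(buf, abd, size);	(copy back out)
+ *	abd_free(abd, size);
+ *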
+ * The public API will automatically determine the type + */ + +#ifndef _ABD_H +#define _ABD_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif +#if 0 +#define ARC_BUF_DATA_MAGIC 0xa7cb0fda + +#if defined(ZFS_DEBUG) && !defined(_KERNEL) +#define DEBUG_ABD +#endif + +typedef struct arc_buf_data { +#ifdef DEBUG_ABD + char pad[PAGE_SIZE]; /* debug, coredumps when accessed */ +#endif + uint32_t abd_magic; /* ARC_BUF_DATA_MAGIC */ + uint32_t abd_flags; + size_t abd_size; /* buffer size, excluding offset */ + size_t abd_offset; /* offset in the first segment */ + int abd_nents; /* num of sgl entries */ + union { + struct scatterlist *abd_sgl; + void *abd_buf; + }; + uint64_t __abd_sgl[0]; +} abd_t; + +#define ABD_F_SCATTER (0x0) +#define ABD_F_LINEAR (0x1) +#define ABD_F_OWNER (0x2) + +/* + * Convert an linear ABD to normal buffer + */ +#define ABD_TO_BUF(abd) \ +( \ +{ \ + ASSERT((abd)->abd_magic == ARC_BUF_DATA_MAGIC); \ + ASSERT_ABD_LINEAR(abd); \ + abd->abd_buf; \ +} \ +) + +#define ABD_IS_SCATTER(abd) (!((abd)->abd_flags & ABD_F_LINEAR)) +#define ABD_IS_LINEAR(abd) (!ABD_IS_SCATTER(abd)) +#define ASSERT_ABD_SCATTER(abd) ASSERT(ABD_IS_SCATTER(abd)) +#define ASSERT_ABD_LINEAR(abd) ASSERT(ABD_IS_LINEAR(abd)) + +/* + * Allocations and deallocations + */ +abd_t *abd_alloc_scatter(size_t); +abd_t *abd_alloc_linear(size_t); +void abd_free(abd_t *, size_t); +abd_t *abd_get_offset(abd_t *, size_t); +abd_t *abd_get_from_buf(void *, size_t); +void abd_put(abd_t *); + +/* + * ABD operations + */ +void abd_iterate_rfunc(abd_t *, size_t, + int (*)(const void *, uint64_t, void *), void *); +void abd_iterate_wfunc(abd_t *, size_t, + int (*)(void *, uint64_t, void *), void *); +void abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, + int (*)(void *, void *, uint64_t, uint64_t, void *), void *); +void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); +void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); +void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); +int abd_cmp(abd_t *, abd_t *, size_t); +int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); +void abd_zero_off(abd_t *, size_t, size_t); +#ifdef _KERNEL +int abd_copy_to_user_off(void __user *, abd_t *, size_t, size_t); +int abd_copy_from_user_off(abd_t *, const void __user *, size_t, size_t); +int abd_uiomove_off(abd_t *, size_t, enum uio_rw, uio_t *, size_t); +int abd_uiocopy_off(abd_t *, size_t, enum uio_rw, uio_t *, size_t *, + size_t); +unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int, + size_t); +unsigned long abd_bio_nr_pages_off(abd_t *, unsigned int, size_t); + +#define abd_bio_map_off(bio, abd, size, off) \ +( \ +{ \ + unsigned int ___ret; \ + if (ABD_IS_LINEAR(abd)) \ + ___ret = bio_map(bio, ABD_TO_BUF(abd) + (off), size); \ + else \ + ___ret = abd_scatter_bio_map_off(bio, abd, size, off); \ + ___ret; \ +} \ +) +#endif /* _KERNEL */ + +/* + * Borrow a linear buffer for an ABD + * Will allocate if ABD is scatter + */ +#define abd_borrow_buf(a, n) \ +( \ +{ \ + void *___b; \ + if (ABD_IS_LINEAR(a)) { \ + ___b = ABD_TO_BUF(a); \ + } else { \ + ___b = zio_buf_alloc(n); \ + } \ + ___b; \ +} \ +) + +/* + * Borrow a linear buffer for an ABD + * Will allocate and copy if ABD is scatter + */ +#define abd_borrow_buf_copy(a, n) \ +( \ +{ \ + void *___b = abd_borrow_buf(a, n); \ + if (!ABD_IS_LINEAR(a)) \ + abd_copy_to_buf(___b, a, n); \ + ___b; \ +} \ +) + +/* + * Return the borrowed linear buffer + */ +#define abd_return_buf(a, b, n) \ +do { \ + if (ABD_IS_LINEAR(a)) \ + ASSERT((b) == 
ABD_TO_BUF(a)); \ + else \ + zio_buf_free(b, n); \ +} while (0) + +/* + * Copy back to ABD and return the borrowed linear buffer + */ +#define abd_return_buf_copy(a, b, n) \ +do { \ + if (!ABD_IS_LINEAR(a)) \ + abd_copy_from_buf(a, b, n); \ + abd_return_buf(a, b, n); \ +} while (0) +#else /* 0 */ +typedef void abd_t; +#define ABD_TO_BUF(abd) ((void *)abd) +#define ABD_IS_SCATTER(abd) (0) +#define ABD_IS_LINEAR(abd) (1) +#define ASSERT_ABD_SCATTER(abd) ((void)0) +#define ASSERT_ABD_LINEAR(abd) ((void)0) +void *zio_buf_alloc(size_t); +void zio_buf_free(void *, size_t); +static inline abd_t *abd_alloc_linear(size_t size) +{ + return ((abd_t *)zio_buf_alloc(size)); +} +static inline void abd_free(abd_t *abd, size_t size) +{ + zio_buf_free((void *)abd, size); +} +#define abd_alloc_scatter abd_alloc_linear +#define abd_get_offset(abd, off) ((void *)(abd)+(off)) +#define abd_get_from_buf(buf, size) (buf) +#define abd_put(abd) do { } while (0) + +#define abd_iterate_rfunc(a, n, f, p) \ + (void) f(a, n, p) + +#define abd_iterate_wfunc(a, n, f, p) \ + (void) f(a, n, p) + +#define abd_iterate_func2(a, b, an, bn, f, p) \ + (void) f(a, b, an, bn, p) + +#define abd_copy_off(a, b, n, aoff, boff) \ + (void) memcpy((void *)(a)+(aoff), (void *)(b)+(boff), n) + +#define abd_copy_from_buf_off(a, b, n, off) \ + (void) memcpy((void *)(a)+(off), b, n) + +#define abd_copy_to_buf_off(a, b, n, off) \ + (void) memcpy(a, (void *)(b)+(off), n) + +#define abd_cmp(a, b, n) \ + memcmp(a, b, n) + +#define abd_cmp_buf_off(a, b, n, off) \ + memcmp((void *)(a)+(off), b, n) + +#define abd_zero_off(a, n, off) \ + (void) memset((void *)(a)+(off), 0, n) + +#ifdef _KERNEL +#define abd_copy_to_user_off(a, b, n, off) \ + copy_to_user(a, (void *)(b)+(off), n) + +#define abd_copy_from_user_off(a, b, n, off) \ + copy_from_user((void *)(a)+(off), b, n) + +#define abd_uiomove_off(p, n, rw, uio, off) \ + uiomove((void *)(p)+(off), n, rw, uio) + +#define abd_uiocopy_off(p, n, rw, uio, c, off) \ + uiocopy((void *)(p)+(off), n, rw, uio, c) + +#define abd_bio_map_off(bio, a, n, off) \ + bio_map(bio, (void *)(a)+(off), n) + +#define abd_bio_nr_pages_off(a, n, off) \ + bio_nr_pages((void *)(a)+(off), n) +#endif /* _KERNEL */ + +#define abd_borrow_buf(a, n) \ + ((void *)a) + +#define abd_borrow_buf_copy(a, n) \ + ((void *)a) + +#define abd_return_buf(a, b, n) \ + do { } while (0) + +#define abd_return_buf_copy(a, b, n) \ + do { } while (0) +#endif /* 0 */ + +/* + * Wrappers for zero off functions + */ +#define abd_copy(dabd, sabd, size) \ + abd_copy_off(dabd, sabd, size, 0, 0) + +#define abd_copy_from_buf(abd, buf, size) \ + abd_copy_from_buf_off(abd, buf, size, 0) + +#define abd_copy_to_buf(buf, abd, size) \ + abd_copy_to_buf_off(buf, abd, size, 0) + +#define abd_cmp_buf(abd, buf, size) \ + abd_cmp_buf_off(abd, buf, size, 0) + +#define abd_zero(abd, size) \ + abd_zero_off(abd, size, 0) + +#ifdef _KERNEL +#define abd_copy_to_user(buf, abd, size) \ + abd_copy_to_user_off(buf, abd, size, 0) + +#define abd_copy_from_user(abd, buf, size) \ + abd_copy_from_user_off(abd, buf, size, 0) + +#define abd_uiomove(abd, n, rw, uio) \ + abd_uiomove_off(abd, n, rw, uio, 0) + +#define abd_uiocopy(abd, n, rw, uio, c) \ + abd_uiocopy_off(abd, n, rw, uio, c, 0) +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_H */ diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 85bc0510a81d..caa64787a95b 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -110,6 +110,7 @@ libzpool_la_LIBADD += $(ZLIB) 
libzpool_la_LDFLAGS = -version-info 2:0:0 EXTRA_DIST = \ + $(top_srcdir)/module/zfs/abd.c \ $(top_srcdir)/module/zfs/vdev_disk.c \ $(top_srcdir)/module/zfs/zfs_acl.c \ $(top_srcdir)/module/zfs/zfs_ctldir.c \ diff --git a/module/zfs/abd.c b/module/zfs/abd.c new file mode 100644 index 000000000000..8d599e8d1b07 --- /dev/null +++ b/module/zfs/abd.c @@ -0,0 +1,1116 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + */ + +#include +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include + +#else /* _KERNEL */ + +/* + * Userspace compatibility layer + */ + +/* + * page + */ +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +struct page; + +#define alloc_page(gfp) \ + ((struct page *)umem_alloc_aligned(PAGE_SIZE, PAGE_SIZE, UMEM_DEFAULT)) + +#define __free_page(page) \ + umem_free(page, PAGE_SIZE) + +/* + * scatterlist + */ +struct scatterlist { + struct page *page; + int length; + int end; +}; + +static void +sg_init_table(struct scatterlist *sg, int nr) { + memset(sg, 0, nr * sizeof (struct scatterlist)); + sg[nr - 1].end = 1; +} + +static inline void +sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, + unsigned int offset) { + /* currently we don't use offset */ + ASSERT(offset == 0); + sg->page = page; + sg->length = len; +} + +static inline struct page * +sg_page(struct scatterlist *sg) { + return (sg->page); +} + +static inline struct scatterlist * +sg_next(struct scatterlist *sg) +{ + if (sg->end) + return (NULL); + return (sg + 1); +} + +/* + * misc + */ +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +#ifndef unlikely +#define unlikely(x) (x) +#endif + +#define kmap(page) ((void *)page) +#define kunmap(page) do { } while (0) +#define zfs_kmap_atomic(page, type) ((void *)page) +#define zfs_kunmap_atomic(addr, type) do { } while (0) +#define pagefault_disable() do { } while (0) +#define pagefault_enable() do { } while (0) +#define flush_kernel_dcache_page(page) do { } while (0) +#define set_current_state(state) do { } while (0) +static inline long +schedule_timeout(long timeout) +{ + sleep(timeout); + return (0); +} + +#endif /* _KERNEL */ + + +struct abd_miter { + void *addr; /* mapped addr, adjusted by offset */ + int length; /* current segment length, adjusted by offset */ + int offset; /* offset in current segment */ + int is_linear; /* the type of the abd */ + union { + struct scatterlist *sg; + void *buf; + }; + int nents; /* num of sg entries */ + int rw; /* r/w access, whether to flush cache */ +#ifndef HAVE_1ARG_KMAP_ATOMIC + int km_type; /* KM_USER0 or KM_USER1 */ +#endif +}; + +#define ABD_MITER_W (1) +#define ABD_MITER_R (0) + +/* + * 
Initialize the abd_miter.
+ * Pass ABD_MITER_W to rw if you will write to the abd buffer.
+ * Please use abd_miter_init or abd_miter_init2 for one or two iterators
+ * respectively; they will set up KM_USERx accordingly.
+ */
+static void
+abd_miter_init_km(struct abd_miter *aiter, abd_t *abd, int rw, int km)
+{
+	ASSERT(abd->abd_nents != 0);
+	aiter->addr = NULL;
+	if (ABD_IS_LINEAR(abd)) {
+		ASSERT(abd->abd_nents == 1);
+		aiter->is_linear = 1;
+		aiter->buf = abd->abd_buf;
+		aiter->length = abd->abd_size;
+	} else {
+		aiter->is_linear = 0;
+		aiter->sg = abd->abd_sgl;
+		aiter->length = aiter->sg->length - abd->abd_offset;
+	}
+	aiter->offset = abd->abd_offset;
+	aiter->nents = abd->abd_nents;
+	aiter->rw = rw;
+#ifndef HAVE_1ARG_KMAP_ATOMIC
+	aiter->km_type = km;
+#endif
+}
+
+
+#define	abd_miter_init(a, abd, rw)	abd_miter_init_km(a, abd, rw, 0)
+#define	abd_miter_init2(a, aabd, arw, b, babd, brw)	\
+do {							\
+	abd_miter_init_km(a, aabd, arw, 0);		\
+	abd_miter_init_km(b, babd, brw, 1);		\
+} while (0)
+
+/*
+ * Map the current page in abd_miter.
+ * Pass 1 to atomic if you want to use kmap_atomic.
+ * This can be safely called when the aiter has already been exhausted,
+ * in which case this does nothing.
+ * The mapped address and length will be aiter->addr and aiter->length.
+ */
+static void
+abd_miter_map_x(struct abd_miter *aiter, int atomic)
+{
+	void *paddr;
+
+	ASSERT(!aiter->addr);
+
+	if (!aiter->nents)
+		return;
+
+	if (aiter->is_linear) {
+		paddr = aiter->buf;
+		/*
+		 * Turn off page faults to keep the context the same as
+		 * kmap_atomic.
+		 */
+		if (atomic)
+			pagefault_disable();
+	} else {
+		ASSERT(aiter->length == aiter->sg->length - aiter->offset);
+
+		if (atomic)
+			paddr = zfs_kmap_atomic(sg_page(aiter->sg),
+			    (aiter->km_type ? KM_USER1 : KM_USER0));
+		else
+			paddr = kmap(sg_page(aiter->sg));
+	}
+	aiter->addr = paddr + aiter->offset;
+}
+
+/*
+ * Unmap the current page in abd_miter.
+ * Pass 1 to atomic if you want to use kmap_atomic.
+ * This can be safely called when the aiter has already been exhausted,
+ * in which case this does nothing.
+ */
+static void
+abd_miter_unmap_x(struct abd_miter *aiter, int atomic)
+{
+	void *paddr;
+
+	if (!aiter->nents)
+		return;
+
+	ASSERT(aiter->addr);
+
+	if (aiter->is_linear) {
+		/* only undo what abd_miter_map_x actually did */
+		if (atomic)
+			pagefault_enable();
+	} else {
+		paddr = aiter->addr - aiter->offset;
+		if (atomic) {
+			if (aiter->rw == ABD_MITER_W)
+				flush_kernel_dcache_page(sg_page(aiter->sg));
+			zfs_kunmap_atomic(paddr,
+			    (aiter->km_type ? KM_USER1 : KM_USER0));
+		} else {
+			kunmap(sg_page(aiter->sg));
+		}
+	}
+	aiter->addr = NULL;
+}
+
+#define	abd_miter_map_atomic(a)		abd_miter_map_x(a, 1)
+#define	abd_miter_map(a)		abd_miter_map_x(a, 0)
+#define	abd_miter_unmap_atomic(a)	abd_miter_unmap_x(a, 1)
+#define	abd_miter_unmap(a)		abd_miter_unmap_x(a, 0)
+
+/*
+ * Use abd_miter_{,un}map_atomic2 if you want to map 2 abd_miters.
+ * You need to pass the arguments in the same order for these two.
+ */
+#define	abd_miter_map_atomic2(a, b)	\
+do {					\
+	abd_miter_map_atomic(a);	\
+	abd_miter_map_atomic(b);	\
+} while (0)
+
+#define	abd_miter_unmap_atomic2(a, b)	\
+do {					\
+	abd_miter_unmap_atomic(b);	\
+	abd_miter_unmap_atomic(a);	\
+} while (0)
+
+/*
+ * Advance the iterator by offset.
+ * Cannot be called when a page is mapped.
+ * Returns 0 if exhausted.
+ * This can be safely called when the aiter has already been exhausted,
+ * in which case this does nothing.
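+ *
+ * The helpers below all follow the same mapping loop; a sketch ("abd" and
+ * "size" are the caller's):
+ *
+ *	struct abd_miter aiter;
+ *
+ *	abd_miter_init(&aiter, abd, ABD_MITER_R);
+ *	while (size > 0) {
+ *		size_t len = MIN(aiter.length, size);
+ *
+ *		abd_miter_map_atomic(&aiter);
+ *		(consume len bytes at aiter.addr)
+ *		abd_miter_unmap_atomic(&aiter);
+ *
+ *		size -= len;
+ *		abd_miter_advance(&aiter, len);
+ *	}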
+ */
+static int
+abd_miter_advance(struct abd_miter *aiter, int offset)
+{
+	ASSERT(!aiter->addr);
+
+	if (!aiter->nents)
+		return (0);
+
+	aiter->offset += offset;
+	if (aiter->is_linear) {
+		aiter->length -= offset;
+		if (aiter->length <= 0) {
+			aiter->nents--;
+			aiter->length = 0;
+			return (0);
+		}
+	} else {
+		while (aiter->offset >= aiter->sg->length) {
+			aiter->offset -= aiter->sg->length;
+			aiter->nents--;
+			aiter->sg = sg_next(aiter->sg);
+			if (!aiter->nents) {
+				aiter->length = 0;
+				return (0);
+			}
+		}
+		aiter->length = aiter->sg->length - aiter->offset;
+	}
+	return (1);
+}
+
+#define	ABD_CHECK(abd)						\
+(								\
+{								\
+	ASSERT((abd)->abd_magic == ARC_BUF_DATA_MAGIC);		\
+	ASSERT((abd)->abd_size > 0);				\
+	if (ABD_IS_LINEAR(abd)) {				\
+		ASSERT((abd)->abd_offset == 0);			\
+		ASSERT((abd)->abd_nents == 1);			\
+	} else {						\
+		ASSERT((abd)->abd_offset < PAGE_SIZE);		\
+		ASSERT((abd)->abd_nents > 0);			\
+	}							\
+}								\
+)
+
+static void
+abd_iterate_func(abd_t *abd, size_t size,
+    int (*func)(void *, uint64_t, void *), void *private, int rw)
+{
+	size_t len;
+	int stop;
+	struct abd_miter aiter;
+
+	ABD_CHECK(abd);
+	ASSERT(size <= abd->abd_size);
+
+	abd_miter_init(&aiter, abd, rw);
+
+	while (size > 0) {
+		len = MIN(aiter.length, size);
+		ASSERT(len > 0);
+		/*
+		 * The iterated function likely will not do well if each
+		 * segment except the last one is not a multiple of 16.
+		 */
+		ASSERT(size == len || (len & 15) == 0);
+
+		abd_miter_map_atomic(&aiter);
+
+		stop = func(aiter.addr, len, private);
+
+		abd_miter_unmap_atomic(&aiter);
+
+		if (stop)
+			break;
+		size -= len;
+		abd_miter_advance(&aiter, len);
+	}
+}
+
+/*
+ * Iterate over the ABD and call the read function @func.
+ * @func should be implemented so that it behaves the same whether it is
+ * given linear or scatter segments.
+ */
+void
+abd_iterate_rfunc(abd_t *abd, size_t size,
+    int (*func)(const void *, uint64_t, void *), void *private)
+{
+	/* skip type checking on func */
+	abd_iterate_func(abd, size, (void *)func, private, ABD_MITER_R);
+}
+
+/*
+ * Iterate over the ABD and call the write function @func.
+ * @func should be implemented so that it behaves the same whether it is
+ * given linear or scatter segments.
+ */
+void
+abd_iterate_wfunc(abd_t *abd, size_t size,
+    int (*func)(void *, uint64_t, void *), void *private)
+{
+	abd_iterate_func(abd, size, func, private, ABD_MITER_W);
+}
+
+/*
+ * Iterate over two ABDs and call @func2.
+ * @func2 should be implemented so that it behaves the same whether it is
+ * given linear or scatter segments.
+ */
+void
+abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t dsize, size_t ssize,
+    int (*func2)(void *, void *, uint64_t, uint64_t, void *), void *private)
+{
+	size_t dlen, slen;
+	int stop;
+	struct abd_miter daiter, saiter;
+
+	ABD_CHECK(dabd);
+	ABD_CHECK(sabd);
+
+	ASSERT(dsize <= dabd->abd_size);
+	ASSERT(ssize <= sabd->abd_size);
+
+	abd_miter_init2(&daiter, dabd, ABD_MITER_W,
+	    &saiter, sabd, ABD_MITER_W);
+
+	while (dsize > 0 || ssize > 0) {
+		dlen = MIN(daiter.length, dsize);
+		slen = MIN(saiter.length, ssize);
+
+		/* if data remains after this run, use equal lengths */
+		if (dsize > dlen || ssize > slen) {
+			if (MIN(dlen, slen) > 0)
+				slen = dlen = MIN(dlen, slen);
+		}
+
+		/* must be progressive */
+		ASSERT(dlen > 0 || slen > 0);
+		/*
+		 * The iterated function likely will not do well if each
+		 * segment except the last one is not a multiple of 16.
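+		 * (For instance, the incremental fletcher_2 callback
+		 * consumes two uint64_t words, i.e. 16 bytes, per loop
+		 * iteration, so a middle segment that is not a multiple
+		 * of 16 would split a checksum word across segments.)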
+ */ + ASSERT(dsize == dlen || (dlen & 15) == 0); + ASSERT(ssize == slen || (slen & 15) == 0); + + abd_miter_map_atomic2(&daiter, &saiter); + + stop = func2(daiter.addr, saiter.addr, dlen, slen, private); + + abd_miter_unmap_atomic2(&daiter, &saiter); + + if (stop) + break; + + dsize -= dlen; + ssize -= slen; + abd_miter_advance(&daiter, dlen); + abd_miter_advance(&saiter, slen); + } +} + +/* + * Copy from @sabd to @dabd + * @doff is offset in dabd + * @soff is offset in sabd + */ +void +abd_copy_off(abd_t *dabd, abd_t *sabd, size_t size, size_t doff, + size_t soff) +{ + size_t len; + struct abd_miter daiter, saiter; + + ABD_CHECK(dabd); + ABD_CHECK(sabd); + + ASSERT(size <= dabd->abd_size); + ASSERT(size <= sabd->abd_size); + + abd_miter_init2(&daiter, dabd, ABD_MITER_W, + &saiter, sabd, ABD_MITER_R); + abd_miter_advance(&daiter, doff); + abd_miter_advance(&saiter, soff); + + while (size > 0) { + len = MIN(daiter.length, size); + len = MIN(len, saiter.length); + ASSERT(len > 0); + + abd_miter_map_atomic2(&daiter, &saiter); + + memcpy(daiter.addr, saiter.addr, len); + + abd_miter_unmap_atomic2(&daiter, &saiter); + + size -= len; + abd_miter_advance(&daiter, len); + abd_miter_advance(&saiter, len); + } +} + +/* + * Copy from @buf to @abd + * @off is the offset in @abd + */ +void +abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t size, + size_t off) +{ + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_W); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + memcpy(aiter.addr, buf, len); + + abd_miter_unmap_atomic(&aiter); + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } +} + +/* + * Copy from @abd to @buf + * @off is the offset in @abd + */ +void +abd_copy_to_buf_off(void *buf, abd_t *abd, size_t size, size_t off) +{ + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_R); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + memcpy(buf, aiter.addr, len); + + abd_miter_unmap_atomic(&aiter); + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } +} + +/* + * Compare between @dabd and @sabd. + */ +int +abd_cmp(abd_t *dabd, abd_t *sabd, size_t size) +{ + size_t len; + int ret = 0; + struct abd_miter daiter, saiter; + + ABD_CHECK(dabd); + ABD_CHECK(sabd); + ASSERT(size <= dabd->abd_size); + ASSERT(size <= sabd->abd_size); + + abd_miter_init2(&daiter, dabd, ABD_MITER_R, + &saiter, sabd, ABD_MITER_R); + + while (size > 0) { + len = MIN(daiter.length, size); + len = MIN(len, saiter.length); + ASSERT(len > 0); + + abd_miter_map_atomic2(&daiter, &saiter); + + ret = memcmp(daiter.addr, saiter.addr, len); + + abd_miter_unmap_atomic2(&daiter, &saiter); + + if (ret) + break; + + size -= len; + abd_miter_advance(&daiter, len); + abd_miter_advance(&saiter, len); + } + return (ret); +} + +/* + * Compare between @abd and @buf. 
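+ * Returns the memcmp() result of the first segment that differs, or 0 if
+ * the first @size bytes are equal.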
+ * @off is the offset in @abd + */ +int +abd_cmp_buf_off(abd_t *abd, const void *buf, size_t size, size_t off) +{ + size_t len; + int ret = 0; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_R); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + ret = memcmp(aiter.addr, buf, len); + + abd_miter_unmap_atomic(&aiter); + + if (ret) + break; + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } + return (ret); +} + +/* + * Zero out @abd. + * @off is the offset in @abd + */ +void +abd_zero_off(abd_t *abd, size_t size, size_t off) +{ + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_W); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + memset(aiter.addr, 0, len); + + abd_miter_unmap_atomic(&aiter); + + size -= len; + abd_miter_advance(&aiter, len); + } +} + +#ifdef _KERNEL +/* + * Copy from @abd to user buffer @buf. + * @off is the offset in @abd + */ +int +abd_copy_to_user_off(void __user *buf, abd_t *abd, size_t size, + size_t off) +{ + int ret = 0; + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_R); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + ret = __copy_to_user_inatomic(buf, aiter.addr, len); + + abd_miter_unmap_atomic(&aiter); + if (ret) { + abd_miter_map(&aiter); + ret = copy_to_user(buf, aiter.addr, len); + abd_miter_unmap(&aiter); + if (ret) + break; + } + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } + return (ret ? EFAULT : 0); +} + +/* + * Copy from user buffer @buf to @abd. + * @off is the offset in @abd + */ +int +abd_copy_from_user_off(abd_t *abd, const void __user *buf, size_t size, + size_t off) +{ + int ret = 0; + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_W); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + ret = __copy_from_user_inatomic(aiter.addr, buf, len); + + abd_miter_unmap_atomic(&aiter); + if (ret) { + abd_miter_map(&aiter); + ret = copy_from_user(aiter.addr, buf, len); + abd_miter_unmap(&aiter); + if (ret) + break; + } + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } + return (ret ? EFAULT : 0); +} + +/* + * uiomove for ABD. 
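+ * Semantics follow the traditional uiomove(): data is moved between the
+ * ABD and the iovecs of @uio, and iov_base/iov_len, uio_resid and
+ * uio_loffset are advanced by the number of bytes moved.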
+ * @off is the offset in @abd + */ +int +abd_uiomove_off(abd_t *abd, size_t n, enum uio_rw rw, uio_t *uio, + size_t off) +{ + struct iovec *iov; + ulong_t cnt; + + while (n && uio->uio_resid) { + iov = uio->uio_iov; + cnt = MIN(iov->iov_len, n); + if (cnt == 0l) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + switch (uio->uio_segflg) { + case UIO_USERSPACE: + case UIO_USERISPACE: + /* + * p = kernel data pointer + * iov->iov_base = user data pointer + */ + if (rw == UIO_READ) { + if (abd_copy_to_user_off(iov->iov_base, + abd, cnt, off)) + return (EFAULT); + } else { + if (abd_copy_from_user_off(abd, + iov->iov_base, cnt, off)) + return (EFAULT); + } + break; + case UIO_SYSSPACE: + if (rw == UIO_READ) + abd_copy_to_buf_off(iov->iov_base, abd, + cnt, off); + else + abd_copy_from_buf_off(abd, iov->iov_base, + cnt, off); + break; + } + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_loffset += cnt; + off += cnt; + n -= cnt; + } + return (0); +} + +/* + * uiocopy for ABD. + * @off is the offset in @abd + */ +int +abd_uiocopy_off(abd_t *abd, size_t n, enum uio_rw rw, uio_t *uio, + size_t *cbytes, size_t off) +{ + struct iovec *iov; + ulong_t cnt; + int iovcnt; + + iovcnt = uio->uio_iovcnt; + *cbytes = 0; + + for (iov = uio->uio_iov; n && iovcnt; iov++, iovcnt--) { + cnt = MIN(iov->iov_len, n); + if (cnt == 0) + continue; + + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + case UIO_USERISPACE: + /* + * p = kernel data pointer + * iov->iov_base = user data pointer + */ + if (rw == UIO_READ) { + /* UIO_READ = copy data from kernel to user */ + if (abd_copy_to_user_off(iov->iov_base, + abd, cnt, off)) + return (EFAULT); + } else { + /* UIO_WRITE = copy data from user to kernel */ + if (abd_copy_from_user_off(abd, + iov->iov_base, cnt, off)) + return (EFAULT); + } + break; + + case UIO_SYSSPACE: + if (rw == UIO_READ) + abd_copy_to_buf_off(iov->iov_base, abd, + cnt, off); + else + abd_copy_from_buf_off(abd, iov->iov_base, + cnt, off); + break; + } + off += cnt; + n -= cnt; + *cbytes += cnt; + } + return (0); +} + +/* + * bio_map for scatter ABD. + * @off is the offset in @abd + * You should use abd_bio_map_off, it will choose the right function according + * to the ABD type. + */ +unsigned int +abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, unsigned int bio_size, + size_t off) +{ + int i; + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT_ABD_SCATTER(abd); + ASSERT(bio_size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_R); + abd_miter_advance(&aiter, off); + + for (i = 0; i < bio->bi_max_vecs; i++) { + if (bio_size <= 0) + break; + + len = MIN(bio_size, aiter.length); + ASSERT(len > 0); + + if (bio_add_page(bio, sg_page(aiter.sg), len, + aiter.offset) != len) + break; + + bio_size -= len; + abd_miter_advance(&aiter, len); + } + return (bio_size); +} + +/* + * bio_nr_pages for ABD. + * @off is the offset in @abd + */ +unsigned long +abd_bio_nr_pages_off(abd_t *abd, unsigned int bio_size, size_t off) +{ + unsigned long pos = 0; + ABD_CHECK(abd); + + if (ABD_IS_LINEAR(abd)) + pos = (unsigned long)abd->abd_buf + off; + else + pos = abd->abd_offset + off; + return ((pos + bio_size + PAGE_SIZE-1)>>PAGE_SHIFT)-(pos>>PAGE_SHIFT); +} +#endif /* _KERNEL */ + +static inline abd_t * +abd_alloc_struct(int nr_pages) +{ + abd_t *abd; + size_t asize = sizeof (abd_t) + nr_pages*sizeof (struct scatterlist); + /* + * If the maximum block size increases, inline sgl might not fit into + * a single page. 
We might want to consider using chained sgl if + * that's the case. + */ + ASSERT(nr_pages * sizeof (struct scatterlist) <= PAGE_SIZE); +#ifndef DEBUG_ABD + abd = kmem_alloc(asize, KM_PUSHPAGE); +#else + abd = umem_alloc_aligned(asize, PAGE_SIZE, UMEM_DEFAULT); + /* deny access to padding */ + if (mprotect(abd, PAGE_SIZE, PROT_NONE) != 0) { + perror("mprotect failed"); + ASSERT(0); + } +#endif + ASSERT(abd); + + return (abd); +} + +static inline void +abd_free_struct(abd_t *abd, int nr_pages) +{ +#ifndef DEBUG_ABD + kmem_free(abd, sizeof (abd_t) + nr_pages*sizeof (struct scatterlist)); +#else + if (mprotect(abd, PAGE_SIZE, PROT_READ|PROT_WRITE) != 0) { + perror("mprotect failed"); + ASSERT(0); + } + umem_free(abd, sizeof (abd_t) + nr_pages*sizeof (struct scatterlist)); +#endif +} + +/* + * Allocate a new ABD to point to offset @off of the original ABD. + * It shares the underlying buffer with the original ABD. + * Use abd_put to free. The original ABD(allocated from abd_alloc) must + * not be freed before any of its derived ABD. + */ +abd_t * +abd_get_offset(abd_t *sabd, size_t off) +{ + size_t offset; + abd_t *abd; + + ABD_CHECK(sabd); + ASSERT(off <= sabd->abd_size); + + abd = abd_alloc_struct(0); + + abd->abd_magic = ARC_BUF_DATA_MAGIC; + abd->abd_size = sabd->abd_size - off; + + if (ABD_IS_LINEAR(sabd)) { + abd->abd_flags = ABD_F_LINEAR; + abd->abd_offset = 0; + abd->abd_nents = 1; + abd->abd_buf = sabd->abd_buf + off; + } else { + abd->abd_flags = ABD_F_SCATTER; + offset = sabd->abd_offset + off; + abd->abd_offset = offset & (PAGE_SIZE - 1); + /* make sure the new abd start as sgl[0] */ + abd->abd_sgl = &sabd->abd_sgl[offset >> PAGE_SHIFT]; + abd->abd_nents = sabd->abd_nents - (offset >> PAGE_SHIFT); + } + + return (abd); +} + +/* + * Allocate a linear ABD structure for @buf + * Use abd_put to free. + */ +abd_t * +abd_get_from_buf(void *buf, size_t size) +{ + abd_t *abd; + + abd = abd_alloc_struct(0); + + abd->abd_magic = ARC_BUF_DATA_MAGIC; + abd->abd_flags = ABD_F_LINEAR; + abd->abd_size = size; + abd->abd_offset = 0; + abd->abd_nents = 1; + abd->abd_buf = buf; + + return (abd); +} + +/* + * Free an ABD allocated from abd_get_{offset,from_buf}. + * Must not be used on ABD from elsewhere. + * Will not free the underlying scatterlist or buffer. + */ +void +abd_put(abd_t *abd) +{ + ABD_CHECK(abd); + ASSERT(!(abd->abd_flags & ABD_F_OWNER)); + + abd->abd_magic = 0; + abd_free_struct(abd, 0); +} + +/* + * Allocate a scatter ABD + */ +abd_t * +abd_alloc_scatter(size_t size) +{ + abd_t *abd; + struct page *page; + int i, n = DIV_ROUND_UP(size, PAGE_SIZE); + size_t last_size = size - ((n-1) << PAGE_SHIFT); + + abd = abd_alloc_struct(n); + + abd->abd_magic = ARC_BUF_DATA_MAGIC; + abd->abd_flags = ABD_F_SCATTER|ABD_F_OWNER; + abd->abd_size = size; + abd->abd_offset = 0; + abd->abd_nents = n; + abd->abd_sgl = (struct scatterlist *)&abd->__abd_sgl[0]; + sg_init_table(abd->abd_sgl, n); + + for (i = 0; i < n; i++) { +retry: + page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); + if (unlikely(page == NULL)) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + goto retry; + } + sg_set_page(&abd->abd_sgl[i], page, + (i == n-1 ? 
last_size : PAGE_SIZE), 0); + } + + return (abd); +} + +/* + * Allocate a linear ABD + */ +abd_t * +abd_alloc_linear(size_t size) +{ + abd_t *abd; + + abd = abd_alloc_struct(0); + + abd->abd_magic = ARC_BUF_DATA_MAGIC; + abd->abd_flags = ABD_F_LINEAR|ABD_F_OWNER; + abd->abd_size = size; + abd->abd_offset = 0; + abd->abd_nents = 1; + + abd->abd_buf = zio_buf_alloc(size); + + return (abd); +} + +static void +abd_free_scatter(abd_t *abd, size_t size) +{ + int i, n; + struct page *page; + + ASSERT(abd->abd_sgl == (struct scatterlist *)&abd->__abd_sgl[0]); + ASSERT(abd->abd_size == size); + ASSERT(abd->abd_nents == DIV_ROUND_UP(abd->abd_size, PAGE_SIZE)); + + n = abd->abd_nents; + abd->abd_magic = 0; + for (i = 0; i < n; i++) { + page = sg_page(&abd->abd_sgl[i]); + if (page) + __free_page(page); + } + abd_free_struct(abd, n); +} + +static void +abd_free_linear(abd_t *abd, size_t size) +{ + abd->abd_magic = 0; + zio_buf_free(abd->abd_buf, size); + abd_free_struct(abd, 0); +} + +/* + * Free a ABD. + * Only use this on ABD allocated with abd_alloc_{scatter,linear}. + */ +void +abd_free(abd_t *abd, size_t size) +{ + ABD_CHECK(abd); + ASSERT(abd->abd_flags & ABD_F_OWNER); + if (ABD_IS_LINEAR(abd)) + abd_free_linear(abd, size); + else + abd_free_scatter(abd, size); +} From b22370c6d247ee694744791ef076579660ca43c0 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 03/16] Modify/Add incremental checksum function for abd_iterate_rfunc Modify/Add incremental fletcher function prototype to match abd_iterate_rfunc callback type. Also, reduce duplicated code a bit in zfs_fletcher.c. Signed-off-by: Chunwei Chen --- include/zfs_fletcher.h | 13 +++-- module/zcommon/zfs_fletcher.c | 93 ++++++++++++++++++++--------------- 2 files changed, 62 insertions(+), 44 deletions(-) diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h index b49df0cf4f0f..8f2e7bb09fcd 100644 --- a/include/zfs_fletcher.h +++ b/include/zfs_fletcher.h @@ -37,14 +37,19 @@ extern "C" { * fletcher checksum functions */ +#define fletcher_init(zcp) ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0) +#define fletcher_2_native_init fletcher_init +#define fletcher_2_byteswap_init fletcher_init +#define fletcher_4_native_init fletcher_init +#define fletcher_4_byteswap_init fletcher_init void fletcher_2_native(const void *, uint64_t, zio_cksum_t *); void fletcher_2_byteswap(const void *, uint64_t, zio_cksum_t *); +int fletcher_2_incremental_native(const void *, uint64_t, void *); +int fletcher_2_incremental_byteswap(const void *, uint64_t, void *); void fletcher_4_native(const void *, uint64_t, zio_cksum_t *); void fletcher_4_byteswap(const void *, uint64_t, zio_cksum_t *); -void fletcher_4_incremental_native(const void *, uint64_t, - zio_cksum_t *); -void fletcher_4_incremental_byteswap(const void *, uint64_t, - zio_cksum_t *); +int fletcher_4_incremental_native(const void *, uint64_t, void *); +int fletcher_4_incremental_byteswap(const void *, uint64_t, void *); #ifdef __cplusplus } diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index edd0cbe6c611..0a7082894866 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -130,15 +130,22 @@ #include #include #include +#include -void -fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +int +fletcher_2_incremental_native(const void *buf, uint64_t size, void *private) { + zio_cksum_t *zcp = private; const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, 
b1; - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { a0 += ip[0]; a1 += ip[1]; b0 += a0; @@ -146,16 +153,30 @@ fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); + return (0); } void -fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_2_native_init(zcp); + fletcher_2_incremental_native(buf, size, zcp); +} + +int +fletcher_2_incremental_byteswap(const void *buf, uint64_t size, void *private) { + zio_cksum_t *zcp = private; const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { a0 += BSWAP_64(ip[0]); a1 += BSWAP_64(ip[1]); b0 += a0; @@ -163,46 +184,20 @@ fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); + return (0); } void -fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) -{ - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - for (a = b = c = d = 0; ip < ipend; ip++) { - a += ip[0]; - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); -} - -void -fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) { - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - for (a = b = c = d = 0; ip < ipend; ip++) { - a += BSWAP_32(ip[0]); - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); + fletcher_2_byteswap_init(zcp); + fletcher_2_incremental_byteswap(buf, size, zcp); } -void -fletcher_4_incremental_native(const void *buf, uint64_t size, - zio_cksum_t *zcp) +int +fletcher_4_incremental_native(const void *buf, uint64_t size, void *private) { + zio_cksum_t *zcp = private; const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; @@ -220,12 +215,20 @@ fletcher_4_incremental_native(const void *buf, uint64_t size, } ZIO_SET_CHECKSUM(zcp, a, b, c, d); + return (0); } void -fletcher_4_incremental_byteswap(const void *buf, uint64_t size, - zio_cksum_t *zcp) +fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_4_native_init(zcp); + fletcher_4_incremental_native(buf, size, zcp); +} + +int +fletcher_4_incremental_byteswap(const void *buf, uint64_t size, void *private) { + zio_cksum_t *zcp = private; const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; @@ -243,11 +246,21 @@ fletcher_4_incremental_byteswap(const void *buf, uint64_t size, } ZIO_SET_CHECKSUM(zcp, a, b, c, d); + return (0); +} + +void +fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_4_byteswap_init(zcp); + fletcher_4_incremental_byteswap(buf, size, zcp); } #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(fletcher_2_native); EXPORT_SYMBOL(fletcher_2_byteswap); +EXPORT_SYMBOL(fletcher_2_incremental_native); +EXPORT_SYMBOL(fletcher_2_incremental_byteswap); EXPORT_SYMBOL(fletcher_4_native); EXPORT_SYMBOL(fletcher_4_byteswap); 
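+/*
+ * A sketch of the new incremental calling convention (illustrative only;
+ * buf1/len1 and buf2/len2 stand for a hypothetical caller's segments, and
+ * every length but the last should be a multiple of the checksum word
+ * size, e.g. sizeof (uint32_t) for fletcher_4):
+ *
+ *	zio_cksum_t zc;
+ *
+ *	fletcher_4_native_init(&zc);
+ *	(void) fletcher_4_incremental_native(buf1, len1, &zc);
+ *	(void) fletcher_4_incremental_native(buf2, len2, &zc);
+ *
+ * zc now holds the same result fletcher_4_native() would produce over the
+ * concatenated buffer.
+ */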
EXPORT_SYMBOL(fletcher_4_incremental_native); From d8b85eae14e698decabf5aa0746279267e5a4087 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 04/16] Use abd_t in arc.h, ddt.h, dmu.h and zio.h 1. Use abd_t in arc_buf_t->b_data, dmu_buf_t->db_data, zio_t->io_data and zio_transform_t->zt_orig_data 2. zio_* function take abd_t for data Signed-off-by: Chunwei Chen --- include/sys/arc.h | 3 ++- include/sys/ddt.h | 2 +- include/sys/dmu.h | 3 ++- include/sys/zio.h | 23 ++++++++++++----------- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/include/sys/arc.h b/include/sys/arc.h index 215c75b6dfa3..8ffc893e0d28 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -28,6 +28,7 @@ #define _SYS_ARC_H #include +#include #ifdef __cplusplus extern "C" { @@ -61,7 +62,7 @@ struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; kmutex_t b_evict_lock; - void *b_data; + abd_t *b_data; arc_evict_func_t *b_efunc; void *b_private; }; diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 3befcb84427c..b7977aae0357 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -108,7 +108,7 @@ struct ddt_entry { ddt_key_t dde_key; ddt_phys_t dde_phys[DDT_PHYS_TYPES]; zio_t *dde_lead_zio[DDT_PHYS_TYPES]; - void *dde_repair_data; + abd_t *dde_repair_data; enum ddt_type dde_type; enum ddt_class dde_class; uint8_t dde_loading; diff --git a/include/sys/dmu.h b/include/sys/dmu.h index c9c687b5aa62..3996b07cbaea 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -46,6 +46,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -285,7 +286,7 @@ typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ uint64_t db_offset; /* byte offset in this object */ uint64_t db_size; /* size of buffer in bytes */ - void *db_data; /* data in buffer */ + abd_t *db_data; /* data in buffer */ } dmu_buf_t; typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); diff --git a/include/sys/zio.h b/include/sys/zio.h index 18e7a40a3080..7bed94a6036c 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -35,6 +35,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -353,10 +354,10 @@ typedef struct zio_gang_node { typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, zio_gang_node_t *gn, void *data); -typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); +typedef void zio_transform_func_t(zio_t *zio, abd_t *data, uint64_t size); typedef struct zio_transform { - void *zt_orig_data; + abd_t *zt_orig_data; uint64_t zt_orig_size; uint64_t zt_bufsize; zio_transform_func_t *zt_transform; @@ -412,8 +413,8 @@ struct zio { blkptr_t io_bp_orig; /* Data represented by this I/O */ - void *io_data; - void *io_orig_data; + abd_t *io_data; + abd_t *io_orig_data; uint64_t io_size; uint64_t io_orig_size; @@ -463,18 +464,18 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags); -extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, +extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, const zio_prop_t *zp, + abd_t *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, 
zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, @@ -490,12 +491,12 @@ extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, + uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, + uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); @@ -526,12 +527,12 @@ extern void zio_data_buf_free(void *buf, size_t size); extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, - uint64_t offset, void *data, uint64_t size, int type, + uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, + abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern void zio_vdev_io_bypass(zio_t *zio); From fa766949b922d328b574284aa44933da99214bb3 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 05/16] Convert zio_checksum to ABD version 1. Add checksum function for abd_t 2. Use abd_t version checksum function in zio_checksum_table 3. Make zio_checksum_compute and zio_checksum_error handle abd_t Signed-off-by: Chunwei Chen --- include/sys/zio_checksum.h | 12 +++++-- module/zfs/zio_checksum.c | 73 +++++++++++++++++++++++++++++--------- 2 files changed, 65 insertions(+), 20 deletions(-) diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index de89bc9a7967..dc01673e653a 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -34,7 +34,7 @@ extern "C" { /* * Signature for checksum functions. */ -typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); +typedef void zio_checksum_t(abd_t *data, uint64_t size, zio_cksum_t *zcp); /* * Information about each checksum function. @@ -61,10 +61,16 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* * Checksum routines. 
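+ *
+ * The abd_* variants take an abd_t: the SHA-256 wrapper borrows a linear
+ * view of the buffer, while the fletcher wrappers walk the ABD piecewise
+ * via abd_iterate_rfunc() and the incremental fletcher callbacks (see
+ * zio_checksum.c below).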
*/ -extern zio_checksum_t zio_checksum_SHA256; +extern void abd_checksum_SHA256(abd_t *, uint64_t, zio_cksum_t *); +extern void abd_fletcher_2_native(abd_t *, uint64_t, zio_cksum_t *); +extern void abd_fletcher_2_byteswap(abd_t *, uint64_t, zio_cksum_t *); +extern void abd_fletcher_4_native(abd_t *, uint64_t, zio_cksum_t *); +extern void abd_fletcher_4_byteswap(abd_t *, uint64_t, zio_cksum_t *); + +extern void zio_checksum_SHA256(const void *, uint64_t, zio_cksum_t *); extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, - void *data, uint64_t size); + abd_t *data, uint64_t size); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index 3a5c73a6a1e9..b96149f96a00 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -28,6 +28,7 @@ #include #include #include +#include #include /* @@ -62,22 +63,58 @@ /*ARGSUSED*/ static void -zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) +abd_checksum_off(abd_t *abd, uint64_t size, zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } +void +abd_checksum_SHA256(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + void *buf = abd_borrow_buf_copy(abd, size); + zio_checksum_SHA256(buf, size, zcp); + abd_return_buf(abd, buf, size); +} + +void +abd_fletcher_2_native(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_2_native_init(zcp); + abd_iterate_rfunc(abd, size, fletcher_2_incremental_native, zcp); +} + +void +abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_2_byteswap_init(zcp); + abd_iterate_rfunc(abd, size, fletcher_2_incremental_byteswap, zcp); +} + +void +abd_fletcher_4_native(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_4_native_init(zcp); + abd_iterate_rfunc(abd, size, fletcher_4_incremental_native, zcp); +} + +void +abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_4_byteswap_init(zcp); + abd_iterate_rfunc(abd, size, fletcher_4_incremental_byteswap, zcp); +} + zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { - {{NULL, NULL}, 0, 0, 0, "inherit"}, - {{NULL, NULL}, 0, 0, 0, "on"}, - {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, - {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"}, - {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, +{{NULL, NULL}, 0, 0, 0, "inherit"}, +{{NULL, NULL}, 0, 0, 0, "on"}, +{{abd_checksum_off, abd_checksum_off}, 0, 0, 0, "off"}, +{{abd_checksum_SHA256, abd_checksum_SHA256}, 1, 1, 0, "label"}, +{{abd_checksum_SHA256, abd_checksum_SHA256}, 1, 1, 0, "gang_header"}, +{{abd_fletcher_2_native, abd_fletcher_2_byteswap}, 0, 1, 0, "zilog"}, +{{abd_fletcher_2_native, abd_fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, +{{abd_fletcher_4_native, abd_fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, +{{abd_checksum_SHA256, abd_checksum_SHA256}, 1, 0, 1, "sha256"}, +{{abd_fletcher_4_native, abd_fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, }; enum zio_checksum @@ -150,7 +187,7 @@ zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) */ void zio_checksum_compute(zio_t *zio, enum 
zio_checksum checksum, - void *data, uint64_t size) + abd_t *data, uint64_t size) { blkptr_t *bp = zio->io_bp; uint64_t offset = zio->io_offset; @@ -162,15 +199,16 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, if (ci->ci_eck) { zio_eck_t *eck; + void *buf = ABD_TO_BUF(data); if (checksum == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = data; + zil_chain_t *zilc = buf; size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ, uint64_t); eck = &zilc->zc_eck; } else { - eck = (zio_eck_t *)((char *)data + size) - 1; + eck = (zio_eck_t *)((char *)buf + size) - 1; } if (checksum == ZIO_CHECKSUM_GANG_HEADER) zio_checksum_gang_verifier(&eck->zec_cksum, bp); @@ -197,7 +235,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) uint64_t size = (bp == NULL ? zio->io_size : (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); uint64_t offset = zio->io_offset; - void *data = zio->io_data; + abd_t *data = zio->io_data; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum, verifier; @@ -206,9 +244,10 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) if (ci->ci_eck) { zio_eck_t *eck; + void *buf = ABD_TO_BUF(data); if (checksum == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = data; + zil_chain_t *zilc = buf; uint64_t nused; eck = &zilc->zc_eck; @@ -224,7 +263,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); } else { - eck = (zio_eck_t *)((char *)data + size) - 1; + eck = (zio_eck_t *)((char *)buf + size) - 1; } if (checksum == ZIO_CHECKSUM_GANG_HEADER) From f1aba6b2b20105e3f5a60ed4391ca20d07853192 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 06/16] Handle abd_t in arc.c, bpobj.c and bptree.c Signed-off-by: Chunwei Chen --- module/zfs/arc.c | 105 ++++++++++++++++++++++++++++---------------- module/zfs/bpobj.c | 13 +++--- module/zfs/bptree.c | 10 ++--- 3 files changed, 79 insertions(+), 49 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 421c81e1cfe9..4c12cb6de002 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -135,6 +135,7 @@ #include #include #include +#include #ifdef _KERNEL #include #include @@ -643,14 +644,14 @@ struct l2arc_buf_hdr { uint32_t b_hits; uint64_t b_asize; /* temporary buffer holder for in-flight compressed data */ - void *b_tmp_cdata; + abd_t *b_tmp_cdata; }; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ - void *l2df_data; + abd_t *l2df_data; size_t l2df_size; - void (*l2df_func)(void *, size_t); + void (*l2df_func)(abd_t *, size_t); list_node_t l2df_list_node; } l2arc_data_free_t; @@ -953,7 +954,7 @@ arc_cksum_verify(arc_buf_t *buf) mutex_exit(&buf->b_hdr->b_freeze_lock); return; } - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + abd_fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&buf->b_hdr->b_freeze_lock); @@ -966,7 +967,7 @@ arc_cksum_equal(arc_buf_t *buf) int equal; mutex_enter(&buf->b_hdr->b_freeze_lock); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + abd_fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); mutex_exit(&buf->b_hdr->b_freeze_lock); @@ -986,7 +987,7 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) } buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - fletcher_2_native(buf->b_data, 
buf->b_hdr->b_size, + abd_fletcher_2_native(buf->b_data, buf->b_hdr->b_size, buf->b_hdr->b_freeze_cksum); mutex_exit(&buf->b_hdr->b_freeze_lock); arc_buf_watch(buf); @@ -998,6 +999,13 @@ arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr); } + +static int +arc_abd_watch(const void *buf, uint64_t len, void *private) +{ + ASSERT0(mprotect((void *)buf, len, *(int *)private)); + return (0); +} #endif /* ARGSUSED */ @@ -1006,8 +1014,9 @@ arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { - ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, - PROT_READ | PROT_WRITE)); + int prot = PROT_READ | PROT_WRITE; + abd_iterate_rfunc(buf->b_data, buf->b_hdr->b_size, + arc_abd_watch, &prot); } #endif } @@ -1017,8 +1026,11 @@ static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL - if (arc_watch) - ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ)); + if (arc_watch) { + int prot = PROT_READ; + abd_iterate_rfunc(buf->b_data, buf->b_hdr->b_size, + arc_abd_watch, &prot); + } #endif } @@ -1425,7 +1437,7 @@ arc_buf_clone(arc_buf_t *from) buf->b_next = hdr->b_buf; hdr->b_buf = buf; arc_get_data_buf(buf); - bcopy(from->b_data, buf->b_data, size); + abd_copy(buf->b_data, from->b_data, size); /* * This buffer already exists in the arc so create a duplicate @@ -1475,8 +1487,8 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) } static void -arc_buf_free_on_write(void *data, size_t size, - void (*free_func)(void *, size_t)) +arc_buf_free_on_write(abd_t *data, size_t size, + void (*free_func)(abd_t *, size_t)) { l2arc_data_free_t *df; @@ -1494,7 +1506,7 @@ arc_buf_free_on_write(void *data, size_t size, * the buffer is placed on l2arc_free_on_write to be freed later. */ static void -arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) +arc_buf_data_free(arc_buf_t *buf, void (*free_func)(abd_t *, size_t)) { arc_buf_hdr_t *hdr = buf->b_hdr; @@ -1522,7 +1534,7 @@ arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) ASSERT(HDR_L2_WRITING(hdr)); arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size, - zio_data_buf_free); + abd_free); ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); l2hdr->b_tmp_cdata = NULL; } @@ -1543,11 +1555,11 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) if (!recycle) { if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf, zio_buf_free); + arc_buf_data_free(buf, abd_free); arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf, zio_data_buf_free); + arc_buf_data_free(buf, abd_free); arc_space_return(size, ARC_SPACE_DATA); } } @@ -1814,7 +1826,7 @@ arc_buf_eviction_needed(arc_buf_t *buf) * it can't get a hash_lock on, and so may not catch all candidates. * It may also return without evicting as much space as requested. 
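 *
 * With ABD the recycled buffer travels as an abd_t rather than a raw
 * pointer, so a caller reusing an evicted buffer looks roughly like
 * this (a sketch based on the arc_get_data_buf() hunks below; the
 * scatter/linear choice depends on the buffer type):
 *
 *	abd_t *data = arc_evict(state, 0, size, B_TRUE, type);
 *	if (data == NULL)
 *		data = abd_alloc_scatter(size);
 *	buf->b_data = data;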
*/ -static void * +static abd_t * arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { @@ -1824,7 +1836,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, list_t *list = &state->arcs_list[type]; kmutex_t *hash_lock; boolean_t have_lock; - void *stolen = NULL; + abd_t *stolen = NULL; arc_buf_hdr_t marker = {{{ 0 }}}; int count = 0; @@ -2755,11 +2767,11 @@ arc_get_data_buf(arc_buf_t *buf) */ if (!arc_evict_needed(type)) { if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); + buf->b_data = abd_alloc_linear(size); arc_space_consume(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); + buf->b_data = abd_alloc_scatter(size); arc_space_consume(size, ARC_SPACE_DATA); } goto out; @@ -2804,7 +2816,7 @@ arc_get_data_buf(arc_buf_t *buf) if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) { if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); + buf->b_data = abd_alloc_linear(size); arc_space_consume(size, ARC_SPACE_META); /* @@ -2819,7 +2831,7 @@ arc_get_data_buf(arc_buf_t *buf) cv_signal(&arc_reclaim_thr_cv); } else { ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); + buf->b_data = abd_alloc_scatter(size); arc_space_consume(size, ARC_SPACE_DATA); } @@ -2997,7 +3009,7 @@ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, buf->b_hdr->b_size); + abd_copy_to_buf(arg, buf->b_data, buf->b_hdr->b_size); VERIFY(arc_buf_remove_ref(buf, arg)); } @@ -3065,10 +3077,15 @@ arc_read_done(zio_t *zio) if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); + + void *ziobuf = abd_borrow_buf_copy(zio->io_data, zio->io_size); + if (BP_GET_LEVEL(zio->io_bp) > 0) - byteswap_uint64_array(buf->b_data, hdr->b_size); + byteswap_uint64_array(ziobuf, hdr->b_size); else - dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size); + dmu_ot_byteswap[bswap].ob_func(ziobuf, hdr->b_size); + + abd_return_buf_copy(zio->io_data, ziobuf, zio->io_size); } arc_cksum_compute(buf, B_FALSE); @@ -4959,7 +4976,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, list_t *list; uint64_t write_asize, write_psize, write_sz, headroom, buf_compress_minsz; - void *buf_data; + abd_t *buf_data; kmutex_t *list_lock = NULL; boolean_t full; l2arc_write_callback_t *cb; @@ -5233,26 +5250,32 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) { - void *cdata; + abd_t *cdata; + void *ddata; size_t csize, len, rounded; ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); ASSERT(l2hdr->b_tmp_cdata != NULL); len = l2hdr->b_asize; - cdata = zio_data_buf_alloc(len); - csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, - cdata, l2hdr->b_asize); + cdata = abd_alloc_linear(len); + + ddata = abd_borrow_buf_copy(l2hdr->b_tmp_cdata, l2hdr->b_asize); + + csize = zio_compress_data(ZIO_COMPRESS_LZ4, ddata, + ABD_TO_BUF(cdata), l2hdr->b_asize); + + abd_return_buf(l2hdr->b_tmp_cdata, ddata, l2hdr->b_asize); rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); if (rounded > csize) { - bzero((char *)cdata + csize, rounded - csize); + abd_zero_off(cdata, rounded - csize, csize); csize = rounded; } if (csize == 0) { /* zero block, indicate that there's nothing to write */ - zio_data_buf_free(cdata, len); + 
abd_free(cdata, len); l2hdr->b_compress = ZIO_COMPRESS_EMPTY; l2hdr->b_asize = 0; l2hdr->b_tmp_cdata = NULL; @@ -5273,7 +5296,7 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) * Compression failed, release the compressed buffer. * l2hdr will be left unmodified. */ - zio_data_buf_free(cdata, len); + abd_free(cdata, len); ARCSTAT_BUMP(arcstat_l2_compress_failures); return (B_FALSE); } @@ -5314,9 +5337,10 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) * buffer's contents. */ ASSERT(hdr->b_buf != NULL); - bzero(hdr->b_buf->b_data, hdr->b_size); + abd_zero(hdr->b_buf->b_data, hdr->b_size); zio->io_data = zio->io_orig_data = hdr->b_buf->b_data; } else { + void *ddata; ASSERT(zio->io_data != NULL); /* * We copy the compressed data from the start of the arc buffer @@ -5330,10 +5354,15 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) */ csize = zio->io_size; cdata = zio_data_buf_alloc(csize); - bcopy(zio->io_data, cdata, csize); - if (zio_decompress_data(c, cdata, zio->io_data, csize, + + abd_copy_to_buf(cdata, zio->io_data, csize); + ddata = abd_borrow_buf(zio->io_data, hdr->b_size); + + if (zio_decompress_data(c, cdata, ddata, csize, hdr->b_size) != 0) zio->io_error = SET_ERROR(EIO); + + abd_return_buf_copy(zio->io_data, ddata, hdr->b_size); zio_data_buf_free(cdata, csize); } @@ -5359,7 +5388,7 @@ l2arc_release_cdata_buf(arc_buf_hdr_t *ab) * temporary buffer for it, so now we need to release it. */ ASSERT(l2hdr->b_tmp_cdata != NULL); - zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size); + abd_free(l2hdr->b_tmp_cdata, ab->b_size); l2hdr->b_tmp_cdata = NULL; } else { ASSERT(l2hdr->b_tmp_cdata == NULL); diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 7c8f932f5cf3..166eb33e4d07 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -126,7 +126,7 @@ bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) ASSERT3U(offset, >=, dbuf->db_offset); ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - objarray = dbuf->db_data; + objarray = ABD_TO_BUF(dbuf->db_data); bpobj_free(os, objarray[blkoff], tx); } if (dbuf) { @@ -170,7 +170,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); - bpo->bpo_phys = bpo->bpo_dbuf->db_data; + bpo->bpo_phys = ABD_TO_BUF(bpo->bpo_dbuf->db_data); return (0); } @@ -234,7 +234,7 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, ASSERT3U(offset, >=, dbuf->db_offset); ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - bparray = dbuf->db_data; + bparray = ABD_TO_BUF(dbuf->db_data); bp = &bparray[blkoff]; err = func(arg, bp, tx); if (err) @@ -294,7 +294,7 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, ASSERT3U(offset, >=, dbuf->db_offset); ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - objarray = dbuf->db_data; + objarray = ABD_TO_BUF(dbuf->db_data); err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); if (err) break; @@ -433,7 +433,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) numsubsub * sizeof (subobj)); dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - numsubsub * sizeof (subobj), subdb->db_data, tx); + numsubsub * sizeof (subobj), + ABD_TO_BUF(subdb->db_data), tx); dmu_buf_rele(subdb, FTAG); bpo->bpo_phys->bpo_num_subobjs += numsubsub; @@ -501,7 +502,7 @@ 
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) } dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); - bparray = bpo->bpo_cached_dbuf->db_data; + bparray = ABD_TO_BUF(bpo->bpo_cached_dbuf->db_data); bparray[blkoff] = stored_bp; dmu_buf_will_dirty(bpo->bpo_dbuf, tx); diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c index d6ea9d7c6451..acd22f7d8f70 100644 --- a/module/zfs/bptree.c +++ b/module/zfs/bptree.c @@ -74,7 +74,7 @@ bptree_alloc(objset_t *os, dmu_tx_t *tx) */ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); - bt = db->db_data; + bt = ABD_TO_BUF(db->db_data); bt->bt_begin = 0; bt->bt_end = 0; bt->bt_bytes = 0; @@ -92,7 +92,7 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) bptree_phys_t *bt; VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; + bt = ABD_TO_BUF(db->db_data); ASSERT3U(bt->bt_begin, ==, bt->bt_end); ASSERT0(bt->bt_bytes); ASSERT0(bt->bt_comp); @@ -110,7 +110,7 @@ bptree_is_empty(objset_t *os, uint64_t obj) boolean_t rv; VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; + bt = ABD_TO_BUF(db->db_data); rv = (bt->bt_begin == bt->bt_end); dmu_buf_rele(db, FTAG); return (rv); @@ -132,7 +132,7 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, ASSERT(dmu_tx_is_syncing(tx)); VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; + bt = ABD_TO_BUF(db->db_data); bte = kmem_zalloc(sizeof (*bte), KM_SLEEP); bte->be_birth_txg = birth_txg; @@ -203,7 +203,7 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, if (free) dmu_buf_will_dirty(db, tx); - ba.ba_phys = db->db_data; + ba.ba_phys = ABD_TO_BUF(db->db_data); ba.ba_free = free; ba.ba_func = func; ba.ba_arg = arg; From 81de98822b4e81a96b8939fa35ad5c9eb3e90162 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 07/16] Handle abd_t in dbuf.c, ddt.c, dmu*.c Signed-off-by: Chunwei Chen --- module/zfs/dbuf.c | 90 ++++++++++++++++++++++++--------------- module/zfs/ddt.c | 3 +- module/zfs/dmu.c | 35 +++++++++------ module/zfs/dmu_diff.c | 2 +- module/zfs/dmu_objset.c | 14 +++--- module/zfs/dmu_send.c | 49 ++++++++++++++------- module/zfs/dmu_traverse.c | 10 ++--- module/zfs/dmu_tx.c | 2 +- 8 files changed, 127 insertions(+), 78 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index ed6a8fd2a4dc..1c380e86f3ec 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -255,8 +255,9 @@ dbuf_evict_user(dmu_buf_impl_t *db) if (db->db_level != 0 || db->db_evict_func == NULL) return; - if (db->db_user_data_ptr_ptr) - *db->db_user_data_ptr_ptr = db->db.db_data; + /* XXX ABD: db_data might be freed */ +// if (db->db_user_data_ptr_ptr) +// *db->db_user_data_ptr_ptr = ABD_TO_BUF(db->db.db_data); db->db_evict_func(&db->db, db->db_user_ptr); db->db_user_ptr = NULL; db->db_user_data_ptr_ptr = NULL; @@ -363,6 +364,21 @@ dbuf_fini(void) */ #ifdef ZFS_DEBUG +static int +checkzero(const void *p, uint64_t size, void *private) +{ + int i; + const uint64_t *x = p; + uint64_t *t = private; + for (i = 0; i < size >> 3; i++) { + if (x[i] != 0) { + *t = x[i]; + return (1); + } + } + return (0); +} + static void dbuf_verify(dmu_buf_impl_t *db) { @@ -446,7 +462,8 @@ dbuf_verify(dmu_buf_impl_t *db) */ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ASSERT3P(db->db_blkptr, ==, - ((blkptr_t *)db->db_parent->db.db_data + + ((blkptr_t *) + ABD_TO_BUF(db->db_parent->db.db_data) + db->db_blkid % epb)); } } @@ -461,12 +478,10 @@ dbuf_verify(dmu_buf_impl_t 
*db) * data when we evict this buffer. */ if (db->db_dirtycnt == 0) { - ASSERTV(uint64_t *buf = db->db.db_data); - int i; - - for (i = 0; i < db->db.db_size >> 3; i++) { - ASSERT(buf[i] == 0); - } + uint64_t tmp = 0; + abd_iterate_rfunc(db->db.db_data, db->db.db_size, + checkzero, &tmp); + ASSERT(tmp == 0); } } DB_DNODE_EXIT(db); @@ -479,7 +494,7 @@ dbuf_update_data(dmu_buf_impl_t *db) ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_level == 0 && db->db_user_data_ptr_ptr) { ASSERT(!refcount_is_zero(&db->db_holds)); - *db->db_user_data_ptr_ptr = db->db.db_data; + *db->db_user_data_ptr_ptr = ABD_TO_BUF(db->db.db_data); } } @@ -517,7 +532,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) mutex_exit(&db->db_mtx); abuf = arc_loan_buf(spa, blksz); - bcopy(db->db.db_data, abuf->b_data, blksz); + abd_copy(abuf->b_data, db->db.db_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); @@ -554,7 +569,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_level == 0 && db->db_freed_in_flight) { /* we were freed in flight; disregard any error */ arc_release(buf, db); - bzero(buf->b_data, db->db.db_size); + abd_zero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); @@ -593,12 +608,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); ASSERT3U(bonuslen, <=, db->db.db_size); - db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); + db->db.db_data = abd_alloc_linear(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) - bzero(db->db.db_data, DN_MAX_BONUSLEN); + abd_zero(db->db.db_data, DN_MAX_BONUSLEN); if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + abd_copy_from_buf(db->db.db_data, + DN_BONUS(dn->dn_phys), bonuslen); DB_DNODE_EXIT(db); dbuf_update_data(db); db->db_state = DB_CACHED; @@ -619,7 +635,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) DB_DNODE_EXIT(db); dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, db->db.db_size, db, type)); - bzero(db->db.db_data, db->db.db_size); + abd_zero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; *flags |= DB_RF_CACHED; mutex_exit(&db->db_mtx); @@ -794,7 +810,8 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) if (dr == NULL || (dr->dt.dl.dr_data != - ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) + ((db->db_blkid == DMU_BONUS_BLKID) ? 
ABD_TO_BUF(db->db.db_data) : + db->db_buf))) return; /* @@ -809,14 +826,15 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); + abd_copy_to_buf(dr->dt.dl.dr_data, db->db.db_data, + DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); - bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); + abd_copy(dr->dt.dl.dr_data->b_data, db->db.db_data, size); } else { dbuf_set_data(db, NULL); } @@ -962,7 +980,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); - bzero(db->db.db_data, db->db.db_size); + abd_zero(db->db.db_data, db->db.db_size); arc_buf_freeze(db->db_buf); } @@ -1039,10 +1057,10 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) /* copy old block data to the new block */ obuf = db->db_buf; - bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); + abd_copy(buf->b_data, obuf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) - bzero((uint8_t *)buf->b_data + osize, size - osize); + abd_zero_off(buf->b_data, size - osize, osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); @@ -1211,7 +1229,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db.db_data; + data_old = ABD_TO_BUF(db->db.db_data); } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* * Release the data buffer from the cache so @@ -1498,7 +1516,7 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? 
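 * Either way the buffer must come back zeroed, and since db_data may
 * now be a scatter abd_t the flat bzero() becomes abd_zero(), which
 * walks each chunk in turn, roughly (a sketch matching the hunk below):
 *
 *	abd_zero(db->db.db_data, db->db.db_size);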
*/ - bzero(db->db.db_data, db->db.db_size); + abd_zero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; } db->db_state = DB_CACHED; @@ -1568,7 +1586,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); - bcopy(buf->b_data, db->db.db_data, db->db.db_size); + abd_copy(db->db.db_data, buf->b_data, db->db.db_size); VERIFY(arc_buf_remove_ref(buf, db)); xuio_stat_wbuf_copied(); return; @@ -1634,7 +1652,7 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); if (db->db_blkid == DMU_BONUS_BLKID) { - zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + abd_free(db->db.db_data, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db.db_data = NULL; @@ -1742,7 +1760,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, *parentp = NULL; return (err); } - *bpp = ((blkptr_t *)(*parentp)->db.db_data) + + *bpp = ((blkptr_t *)ABD_TO_BUF((*parentp)->db.db_data)) + (blkid & ((1ULL << epbs) - 1)); return (0); } else { @@ -2028,8 +2046,9 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh) dbuf_set_data(dh->dh_db, arc_buf_alloc(dh->dh_dn->dn_objset->os_spa, dh->dh_db->db.db_size, dh->dh_db, dh->dh_type)); - bcopy(dh->dh_dr->dt.dl.dr_data->b_data, - dh->dh_db->db.db_data, dh->dh_db->db.db_size); + abd_copy(dh->dh_db->db.db_data, + dh->dh_dr->dt.dl.dr_data->b_data, + dh->dh_db->db.db_size); } } @@ -2435,7 +2454,7 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) mutex_enter(&db->db_mtx); db->db_parent = parent; } - db->db_blkptr = (blkptr_t *)parent->db.db_data + + db->db_blkptr = (blkptr_t *)ABD_TO_BUF(parent->db.db_data) + (db->db_blkid & ((1ULL << epbs) - 1)); DBUF_VERIFY(db); } @@ -2520,7 +2539,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ - ASSERT(db->db.db_data != dr->dt.dl.dr_data); + ASSERT(!ABD_IS_LINEAR(db->db.db_data) || + ABD_TO_BUF(db->db.db_data) != dr->dt.dl.dr_data); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } @@ -2550,7 +2570,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); DB_DNODE_EXIT(db); - if (*datap != db->db.db_data) { + if (*datap != ABD_TO_BUF(db->db.db_data)) { zio_buf_free(*datap, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } @@ -2611,7 +2631,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) int blksz = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); *datap = arc_buf_alloc(os->os_spa, blksz, db, type); - bcopy(db->db.db_data, (*datap)->b_data, blksz); + abd_copy((*datap)->b_data, db->db.db_data, blksz); } db->db_data_pending = dr; @@ -2710,7 +2730,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { - dnode_phys_t *dnp = db->db.db_data; + dnode_phys_t *dnp = ABD_TO_BUF(db->db.db_data); for (i = db->db.db_size >> DNODE_SHIFT; i > 0; i--, dnp++) { if (dnp->dn_type != DMU_OT_NONE) @@ -2724,7 +2744,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) } } } else { - blkptr_t *ibp = db->db.db_data; + blkptr_t *ibp = ABD_TO_BUF(db->db.db_data); ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) diff 
--git a/module/zfs/ddt.c b/module/zfs/ddt.c index 18557ffb5c1f..4d9fab6392b8 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -705,8 +705,7 @@ ddt_free(ddt_entry_t *dde) ASSERT(dde->dde_lead_zio[p] == NULL); if (dde->dde_repair_data != NULL) - zio_buf_free(dde->dde_repair_data, - DDK_GET_PSIZE(&dde->dde_key)); + abd_free(dde->dde_repair_data, DDK_GET_PSIZE(&dde->dde_key)); cv_destroy(&dde->dde_cv); kmem_cache_free(ddt_entry_cache, dde); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 3b7bbefc2f73..3973b7aef1b9 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -793,7 +793,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - bcopy((char *)db->db_data + bufoff, buf, tocpy); + abd_copy_to_buf_off(buf, db->db_data, tocpy, bufoff); offset += tocpy; size -= tocpy; @@ -835,7 +835,7 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, else dmu_buf_will_dirty(db, tx); - (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); + abd_copy_from_buf_off(db->db_data, buf, tocpy, bufoff); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); @@ -960,6 +960,7 @@ dmu_xuio_fini(xuio_t *xuio) * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } * and increase priv->next by 1. */ +/* TODO: abd handle xuio */ int dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) { @@ -1044,7 +1045,8 @@ xuio_stat_wbuf_nocopy() * return value is the number of bytes successfully copied to arg_buf. */ static int -dmu_req_copy(void *arg_buf, int size, struct request *req, size_t req_offset) +dmu_req_copy(abd_t *db_data, int db_offset, int size, struct request *req, + size_t req_offset) { struct bio_vec bv, *bvp; struct req_iterator iter; @@ -1078,9 +1080,11 @@ dmu_req_copy(void *arg_buf, int size, struct request *req, size_t req_offset) ASSERT3P(bv_buf, !=, NULL); if (rq_data_dir(req) == WRITE) - memcpy(arg_buf + offset, bv_buf, tocpy); + abd_copy_from_buf_off(db_data, bv_buf, tocpy, + db_offset + offset); else - memcpy(bv_buf, arg_buf + offset, tocpy); + abd_copy_to_buf_off(bv_buf, db_data, tocpy, + db_offset + offset); offset += tocpy; } @@ -1118,7 +1122,7 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req) if (tocpy == 0) break; - didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req, + didcpy = dmu_req_copy(db->db_data, bufoff, tocpy, req, req_offset); if (didcpy < tocpy) @@ -1173,7 +1177,7 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) else dmu_buf_will_dirty(db, tx); - didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req, + didcpy = dmu_req_copy(db->db_data, bufoff, tocpy, req, req_offset); if (tocpy == db->db_size) @@ -1236,8 +1240,8 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) else XUIOSTAT_BUMP(xuiostat_rbuf_copied); } else { - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); + err = abd_uiomove_off(db->db_data, tocpy, UIO_READ, + uio, bufoff); } if (err) break; @@ -1285,8 +1289,8 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) * to lock the pages in memory, so that uiomove won't * block. 
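 *
 * Note that (char *)db->db_data + bufoff style pointer arithmetic is
 * not possible on an abd_t, so the byte offset is instead pushed down
 * into the *_off helpers, roughly (a sketch matching the hunk below):
 *
 *	err = abd_uiomove_off(db->db_data, tocpy, UIO_WRITE, uio,
 *	    bufoff);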
*/ - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_WRITE, uio); + err = abd_uiomove_off(db->db_data, tocpy, UIO_WRITE, uio, + bufoff); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); @@ -1399,6 +1403,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, } else { objset_t *os; uint64_t object; + void *tmp_buf; DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); @@ -1407,7 +1412,13 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, DB_DNODE_EXIT(dbuf); dbuf_rele(db, FTAG); - dmu_write(os, object, offset, blksz, buf->b_data, tx); + + tmp_buf = abd_borrow_buf_copy(buf->b_data, blksz); + + dmu_write(os, object, offset, blksz, tmp_buf, tx); + + abd_return_buf(buf->b_data, tmp_buf, blksz); + dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index 30fabbb07957..c7cbf55bdde6 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -138,7 +138,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, &aflags, zb) != 0) return (SET_ERROR(EIO)); - blk = abuf->b_data; + blk = ABD_TO_BUF(abuf->b_data); for (i = 0; i < blksz >> DNODE_SHIFT; i++) { uint64_t dnobj = (zb->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index f438ca62a11f..ad8082d3dfef 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -312,22 +312,22 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, arc_buf_t *buf = arc_buf_alloc(spa, sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); - bzero(buf->b_data, sizeof (objset_phys_t)); - bcopy(os->os_phys_buf->b_data, buf->b_data, + abd_zero(buf->b_data, sizeof (objset_phys_t)); + abd_copy(buf->b_data, os->os_phys_buf->b_data, arc_buf_size(os->os_phys_buf)); (void) arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } - os->os_phys = os->os_phys_buf->b_data; + os->os_phys = ABD_TO_BUF(os->os_phys_buf->b_data); os->os_flags = os->os_phys->os_flags; } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; os->os_phys_buf = arc_buf_alloc(spa, size, &os->os_phys_buf, ARC_BUFC_METADATA); - os->os_phys = os->os_phys_buf->b_data; + os->os_phys = ABD_TO_BUF(os->os_phys_buf->b_data); bzero(os->os_phys, size); } @@ -1219,7 +1219,7 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) void *data; if (db->db_dirtycnt == 0) - return (db->db.db_data); /* Nothing is changing */ + return (ABD_TO_BUF(db->db.db_data)); /* Nothing is changing */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg == tx->tx_txg) @@ -1235,7 +1235,7 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dn->dn_bonuslen == 0 && dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) - data = dr->dt.dl.dr_data->b_data; + data = ABD_TO_BUF(dr->dt.dl.dr_data->b_data); else data = dr->dt.dl.dr_data; @@ -1284,7 +1284,7 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) FTAG, (dmu_buf_t **)&db); ASSERT(error == 0); mutex_enter(&db->db_mtx); - data = (before) ? db->db.db_data : + data = (before) ? 
ABD_TO_BUF(db->db.db_data) : dmu_objset_userquota_find_data(db, tx); have_spill = B_TRUE; } else { diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 1f61368c5d65..4f3c07fd5ac2 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -446,6 +446,16 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) +static int +fillbadblock(void *p, uint64_t size, void *private) +{ + int i; + uint64_t *x = p; + for (i = 0; i < size >> 3; i++) + x[i] = 0x2f5baddb10cULL; + return (0); +} + /* ARGSUSED */ static int backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, @@ -489,7 +499,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, &aflags, zb) != 0) return (SET_ERROR(EIO)); - blk = abuf->b_data; + blk = ABD_TO_BUF(abuf->b_data); for (i = 0; i < blksz >> DNODE_SHIFT; i++) { uint64_t dnobj = (zb->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; @@ -508,7 +518,8 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, &aflags, zb) != 0) return (SET_ERROR(EIO)); - err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); + err = dump_spill(dsp, zb->zb_object, blksz, + ABD_TO_BUF(abuf->b_data)); (void) arc_buf_remove_ref(abuf, &abuf); } else if (backup_do_embed(dsp, bp)) { /* it's an embedded level-0 block of a regular object */ @@ -519,6 +530,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, uint32_t aflags = ARC_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); + void *buf; ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT0(zb->zb_level); @@ -526,21 +538,21 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { if (zfs_send_corrupt_data) { - uint64_t *ptr; /* Send a block filled with 0x"zfs badd bloc" */ abuf = arc_buf_alloc(spa, blksz, &abuf, ARC_BUFC_DATA); - for (ptr = abuf->b_data; - (char *)ptr < (char *)abuf->b_data + blksz; - ptr++) - *ptr = 0x2f5baddb10cULL; + + abd_iterate_wfunc(abuf->b_data, blksz, + fillbadblock, NULL); } else { return (SET_ERROR(EIO)); } } + buf = abd_borrow_buf_copy(abuf->b_data, blksz); err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz, - blksz, bp, abuf->b_data); + blksz, bp, buf); + abd_return_buf(abuf->b_data, buf, blksz); (void) arc_buf_remove_ref(abuf, &abuf); } @@ -1431,12 +1443,12 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); - bcopy(data, db->db_data, drro->drr_bonuslen); + bcopy(data, ABD_TO_BUF(db->db_data), drro->drr_bonuslen); if (ra->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drro->drr_bonustype); - dmu_ot_byteswap[byteswap].ob_func(db->db_data, - drro->drr_bonuslen); + dmu_ot_byteswap[byteswap].ob_func( + ABD_TO_BUF(db->db_data), drro->drr_bonuslen); } dmu_buf_rele(db, FTAG); } @@ -1476,7 +1488,7 @@ restore_write(struct restorearg *ra, objset_t *os, dmu_tx_t *tx; dmu_buf_t *bonus; arc_buf_t *abuf; - void *data; + void *data, *lbuf; int err; if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || @@ -1490,9 +1502,11 @@ restore_write(struct restorearg *ra, objset_t *os, return (SET_ERROR(EINVAL)); abuf = dmu_request_arcbuf(bonus, drrw->drr_length); + lbuf = abd_borrow_buf(abuf->b_data, drrw->drr_length); - data = restore_read(ra, drrw->drr_length, abuf->b_data); + data = restore_read(ra, drrw->drr_length, lbuf); if (data == NULL) { + 
abd_return_buf(abuf->b_data, lbuf, drrw->drr_length); dmu_return_arcbuf(abuf); dmu_buf_rele(bonus, FTAG); return (ra->err); @@ -1504,6 +1518,7 @@ restore_write(struct restorearg *ra, objset_t *os, drrw->drr_offset, drrw->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { + abd_return_buf(abuf->b_data, lbuf, drrw->drr_length); dmu_return_arcbuf(abuf); dmu_buf_rele(bonus, FTAG); dmu_tx_abort(tx); @@ -1514,6 +1529,7 @@ restore_write(struct restorearg *ra, objset_t *os, DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); } + abd_return_buf_copy(abuf->b_data, lbuf, drrw->drr_length); dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); dmu_tx_commit(tx); dmu_buf_rele(bonus, FTAG); @@ -1538,6 +1554,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os, avl_index_t where; objset_t *ref_os = NULL; dmu_buf_t *dbp; + void *buf; if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) return (SET_ERROR(EINVAL)); @@ -1572,8 +1589,10 @@ restore_write_byref(struct restorearg *ra, objset_t *os, dmu_tx_abort(tx); return (err); } + buf = abd_borrow_buf_copy(dbp->db_data, drrwbr->drr_length); dmu_write(os, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); + drrwbr->drr_offset, drrwbr->drr_length, buf, tx); + abd_return_buf(dbp->db_data, buf, drrwbr->drr_length); dmu_buf_rele(dbp, FTAG); dmu_tx_commit(tx); return (0); @@ -1662,7 +1681,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) if (db_spill->db_size < drrs->drr_length) VERIFY(0 == dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); - bcopy(data, db_spill->db_data, drrs->drr_length); + abd_copy_from_buf(db_spill->db_data, data, drrs->drr_length); dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 7cabc8a6ef3c..c4c0c56cd715 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -294,7 +294,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, zb->zb_level - 1, zb->zb_blkid * epb + i); traverse_prefetch_metadata(td, - &((blkptr_t *)buf->b_data)[i], czb); + &((blkptr_t *)ABD_TO_BUF(buf->b_data))[i], czb); } /* recursively visitbp() blocks below this */ @@ -303,7 +303,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, zb->zb_level - 1, zb->zb_blkid * epb + i); err = traverse_visitbp(td, dnp, - &((blkptr_t *)buf->b_data)[i], czb); + &((blkptr_t *)ABD_TO_BUF(buf->b_data))[i], czb); if (err != 0) break; } @@ -319,7 +319,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; - dnp = buf->b_data; + dnp = ABD_TO_BUF(buf->b_data); for (i = 0; i < epb; i++) { prefetch_dnode_metadata(td, &dnp[i], zb->zb_objset, @@ -343,7 +343,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, if (err != 0) goto post; - osp = buf->b_data; + osp = ABD_TO_BUF(buf->b_data); dnp = &osp->os_meta_dnode; prefetch_dnode_metadata(td, dnp, zb->zb_objset, DMU_META_DNODE_OBJECT); @@ -550,7 +550,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, if (err != 0) return (err); - osp = buf->b_data; + osp = ABD_TO_BUF(buf->b_data); traverse_zil(td, &osp->os_zil_header); (void) arc_buf_remove_ref(buf, &buf); } diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 3d6dcc70f305..f186a4b56985 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -548,7 +548,7 @@ dmu_tx_count_free(dmu_tx_hold_t 
*txh, uint64_t off, uint64_t len) break; } - bp = dbuf->db.db_data; + bp = ABD_TO_BUF(dbuf->db.db_data); bp += blkoff; for (i = 0; i < tochk; i++) { From ee9408eeb918341d9abae2f75320db9f5a6b35f8 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 08/16] Handle abd_t in dnode*.c, dsl_*.c Signed-off-by: Chunwei Chen --- module/zfs/dnode.c | 14 +++++----- module/zfs/dnode_sync.c | 56 ++++++++++++++++++++++++++------------- module/zfs/dsl_dataset.c | 10 +++---- module/zfs/dsl_deadlist.c | 4 +-- module/zfs/dsl_dir.c | 6 ++--- module/zfs/dsl_scan.c | 18 ++++++++----- 6 files changed, 64 insertions(+), 44 deletions(-) diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index ef74621a0f6c..eb8de04470d4 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -235,7 +235,7 @@ dnode_verify(dnode_t *dn) ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, - (dnode_phys_t *)dn->dn_dbuf->db.db_data + + (dnode_phys_t *)ABD_TO_BUF(dn->dn_dbuf->db.db_data) + (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); } if (drop_struct_lock) @@ -1089,7 +1089,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, dnh = &children_dnodes->dnc_children[idx]; zrl_add(&dnh->dnh_zrlock); if ((dn = dnh->dnh_dnode) == NULL) { - dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; + dnode_phys_t *phys = + (dnode_phys_t *)ABD_TO_BUF(db->db.db_data) + idx; dnode_t *winner; dn = dnode_create(os, phys, db, object, dnh); @@ -1509,16 +1510,13 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) head = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, FTAG, &db) == 0) { - caddr_t data; - /* don't dirty if it isn't on disk and isn't dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - data = db->db.db_data; - bzero(data + blkoff, head); + abd_zero_off(db->db.db_data, head, blkoff); } dbuf_rele(db, FTAG); } @@ -1553,7 +1551,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) rw_exit(&dn->dn_struct_rwlock); dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - bzero(db->db.db_data, tail); + abd_zero(db->db.db_data, tail); } dbuf_rele(db, FTAG); } @@ -1780,7 +1778,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, dbuf_rele(db, FTAG); return (error); } - data = db->db.db_data; + data = ABD_TO_BUF(db->db.db_data); } diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 1825e983551c..efd52c8f6f1f 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -69,7 +69,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) ASSERT(db->db.db_data); ASSERT(arc_released(db->db_buf)); ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); - bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, + bcopy(dn->dn_phys->dn_blkptr, ABD_TO_BUF(db->db.db_data), sizeof (blkptr_t) * nblkptr); arc_buf_freeze(db->db_buf); } @@ -98,7 +98,8 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) child->db_parent = db; dbuf_add_ref(db, child); if (db->db.db_data) - child->db_blkptr = (blkptr_t *)db->db.db_data + i; + child->db_blkptr = + (blkptr_t *)ABD_TO_BUF(db->db.db_data) + i; else child->db_blkptr = NULL; dprintf_dbuf_bp(child, child->db_blkptr, @@ -159,6 +160,21 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) } #ifdef ZFS_DEBUG +static 
int +checkzero(const void *p, uint64_t size, void *private) +{ + int i; + const uint64_t *x = p; + uint64_t *t = private; + for (i = 0; i < size >> 3; i++) { + if (x[i] != 0) { + *t = x[i]; + return (1); + } + } + return (0); +} + static void free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) { @@ -181,10 +197,9 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) ASSERT(db->db_blkptr != NULL); for (i = off; i < off+num; i++) { - uint64_t *buf; + abd_t *buf; dmu_buf_impl_t *child; dbuf_dirty_record_t *dr; - int j; ASSERT(db->db_level == 1); @@ -203,13 +218,14 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) /* data_old better be zeroed */ if (dr) { + uint64_t tmp = 0; buf = dr->dt.dl.dr_data->b_data; - for (j = 0; j < child->db.db_size >> 3; j++) { - if (buf[j] != 0) { - panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - (void *)child, i, off, num); - } + abd_iterate_rfunc(buf, child->db.db_size, + checkzero, &tmp); + if (tmp) { + panic("freed data not zero: " + "child=%p i=%d off=%d num=%d\n", + (void *)child, i, off, num); } } @@ -221,12 +237,13 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) buf = child->db.db_data; if (buf != NULL && child->db_state != DB_FILL && child->db_last_dirty == NULL) { - for (j = 0; j < child->db.db_size >> 3; j++) { - if (buf[j] != 0) { - panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - (void *)child, i, off, num); - } + uint64_t tmp = 0; + abd_iterate_rfunc(buf, child->db.db_size, + checkzero, &tmp); + if (tmp) { + panic("freed data not zero: " + "child=%p i=%d off=%d num=%d\n", + (void *)child, i, off, num); } } mutex_exit(&child->db_mtx); @@ -257,7 +274,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); dbuf_release_bp(db); - bp = db->db.db_data; + bp = (blkptr_t *)ABD_TO_BUF(db->db.db_data); DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -296,13 +313,14 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, } /* If this whole block is free, free ourself too. 
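 * The indirect block scanned here is metadata, which this series keeps
 * in a linear abd_t, so ABD_TO_BUF() can legally hand back the flat
 * blkptr_t array, roughly (a sketch matching the hunk below):
 *
 *	blkptr_t *bp = ABD_TO_BUF(db->db.db_data);
 *	for (i = 0; i < 1 << epbs; i++, bp++)
 *		if (!BP_IS_HOLE(bp))
 *			break;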
*/ - for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { + bp = ABD_TO_BUF(db->db.db_data); + for (i = 0; i < 1 << epbs; i++, bp++) { if (!BP_IS_HOLE(bp)) break; } if (i == 1 << epbs) { /* didn't find any non-holes */ - bzero(db->db.db_data, db->db.db_size); + bzero(ABD_TO_BUF(db->db.db_data), db->db.db_size); free_blocks(dn, db->db_blkptr, 1, tx); } else { /* diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 79cb6a3a25e5..322809e8ecd3 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -290,7 +290,7 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds) FTAG, &headdbuf); if (err != 0) return (err); - headphys = headdbuf->db_data; + headphys = ABD_TO_BUF(headdbuf->db_data); err = zap_value_search(dp->dp_meta_objset, headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); dmu_buf_rele(headdbuf, FTAG); @@ -368,7 +368,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ds->ds_dbuf = dbuf; ds->ds_object = dsobj; - ds->ds_phys = dbuf->db_data; + ds->ds_phys = ABD_TO_BUF(dbuf->db_data); list_link_init(&ds->ds_synced_link); mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); @@ -459,7 +459,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, } } ASSERT3P(ds->ds_dbuf, ==, dbuf); - ASSERT3P(ds->ds_phys, ==, dbuf->db_data); + ASSERT3P(ds->ds_phys, ==, ABD_TO_BUF(dbuf->db_data)); ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); @@ -658,7 +658,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; + dsphys = ABD_TO_BUF(dbuf->db_data); bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = dd->dd_object; dsphys->ds_flags = flags; @@ -1064,7 +1064,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; + dsphys = ABD_TO_BUF(dbuf->db_data); bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = ds->ds_dir->dd_object; dsphys->ds_fsid_guid = unique_create(); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 8a4362ff9c1a..6a34f4e7a01e 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -111,7 +111,7 @@ dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) } dl->dl_oldfmt = B_FALSE; - dl->dl_phys = dl->dl_dbuf->db_data; + dl->dl_phys = ABD_TO_BUF(dl->dl_dbuf->db_data); dl->dl_havetree = B_FALSE; } @@ -482,7 +482,7 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) zap_cursor_fini(&zc); VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); - dlp = bonus->db_data; + dlp = ABD_TO_BUF(bonus->db_data); dmu_buf_will_dirty(bonus, tx); bzero(dlp, sizeof (*dlp)); dmu_buf_rele(bonus, FTAG); diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index b94b68e15774..b61b3a67ff41 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -101,7 +101,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_object = ddobj; dd->dd_dbuf = dbuf; dd->dd_pool = dp; - dd->dd_phys = dbuf->db_data; + dd->dd_phys = ABD_TO_BUF(dbuf->db_data); mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&dd->dd_prop_cbs, sizeof 
(dsl_prop_cb_record_t), @@ -148,7 +148,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus); if (err != 0) goto errout; - origin_phys = origin_bonus->db_data; + origin_phys = ABD_TO_BUF(origin_bonus->db_data); dd->dd_origin_txg = origin_phys->ds_creation_txg; dmu_buf_rele(origin_bonus, FTAG); @@ -405,7 +405,7 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, } VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); - ddphys = dbuf->db_data; + ddphys = ABD_TO_BUF(dbuf->db_data); ddphys->dd_creation_time = gethrestime_sec(); if (pds) diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 8b166bcc68eb..34233b801b01 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -648,11 +648,13 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, scn->scn_phys.scn_errors++; return (err); } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + cbp = ABD_TO_BUF(buf->b_data); + for (i = 0; i < epb; i++, cbp++) { dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, zb->zb_object, zb->zb_blkid * epb + i); } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + cbp = ABD_TO_BUF(buf->b_data); + for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, @@ -675,14 +677,16 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, scn->scn_phys.scn_errors++; return (err); } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + cdnp = ABD_TO_BUF(buf->b_data); + for (i = 0; i < epb; i++, cdnp++) { for (j = 0; j < cdnp->dn_nblkptr; j++) { blkptr_t *cbp = &cdnp->dn_blkptr[j]; dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, zb->zb_blkid * epb + i, j); } } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + cdnp = ABD_TO_BUF(buf->b_data); + for (i = 0; i < epb; i++, cdnp++) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, zb->zb_blkid * epb + i, tx); } @@ -700,7 +704,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, return (err); } - osp = buf->b_data; + osp = ABD_TO_BUF(buf->b_data); dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); @@ -1717,7 +1721,7 @@ dsl_scan_scrub_done(zio_t *zio) { spa_t *spa = zio->io_spa; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -1801,7 +1805,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (needs_io && !zfs_no_scrub_io) { vdev_t *rvd = spa->spa_root_vdev; uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; - void *data = zio_data_buf_alloc(size); + abd_t *data = abd_alloc_linear(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= maxinflight) From 28cdc597822288d3dda993b77af7dbef6e744d2d Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 09/16] Handle abd_t in sa_impl.h, sa.c, space_map.c, spa.c, spa_history.c and zil.c Signed-off-by: Chunwei Chen --- include/sys/sa_impl.h | 3 +-- module/zfs/sa.c | 13 ++++++++----- module/zfs/spa.c | 10 +++++----- module/zfs/spa_history.c | 6 +++--- module/zfs/space_map.c | 2 +- module/zfs/zil.c | 17 +++++++++++------ 6 files changed, 29 insertions(+), 22 deletions(-) diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h index fcbd8eb34e91..9dbe8f105d91 100644 --- a/include/sys/sa_impl.h +++ b/include/sys/sa_impl.h @@ -221,8 +221,7 @@ struct sa_handle { 
(dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill) #define SA_GET_HDR(hdl, type) \ - ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \ - type))->db.db_data)) + ((sa_hdr_phys_t *)ABD_TO_BUF((SA_GET_DB(hdl, type))->db.db_data)) #define SA_IDX_TAB_GET(hdl, type) \ (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab) diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 9063d1dae449..60a4c586fb9f 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -724,8 +724,9 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, } /* setup starting pointers to lay down data */ - data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); - sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; + data_start = + (void *)((uintptr_t)ABD_TO_BUF(hdl->sa_bonus->db_data) + hdrsize); + sahdr = (sa_hdr_phys_t *)ABD_TO_BUF(hdl->sa_bonus->db_data); buftype = SA_BONUS; attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, @@ -753,7 +754,9 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, hash = -1ULL; len_idx = 0; - sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; + sahdr = (sa_hdr_phys_t *) + ABD_TO_BUF(hdl->sa_spill->db_data); + sahdr->sa_magic = SA_MAGIC; data_start = (void *)((uintptr_t)sahdr + spillhdrsize); @@ -1676,7 +1679,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, if (dn->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); - bcopy(hdl->sa_bonus->db_data, old_data[0], + abd_copy_to_buf(old_data[0], hdl->sa_bonus->db_data, hdl->sa_bonus->db_size); bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; } else { @@ -1689,7 +1692,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, if ((error = sa_get_spill(hdl)) == 0) { spill_data_size = hdl->sa_spill->db_size; old_data[1] = zio_buf_alloc(spill_data_size); - bcopy(hdl->sa_spill->db_data, old_data[1], + abd_copy_to_buf(old_data[1], hdl->sa_spill->db_data, hdl->sa_spill->db_size); spill_attr_count = hdl->sa_spill_tab->sa_layout->lot_attr_count; diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 998ec3e543f9..617146052f2e 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1583,7 +1583,7 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) if (error) return (error); - nvsize = *(uint64_t *)db->db_data; + nvsize = *(uint64_t *)ABD_TO_BUF(db->db_data); dmu_buf_rele(db, FTAG); packed = vmem_alloc(nvsize, KM_SLEEP); @@ -1866,7 +1866,7 @@ spa_load_verify_done(zio_t *zio) else atomic_add_64(&sle->sle_data_count, 1); } - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -1889,7 +1889,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, { zio_t *rio; size_t size; - void *data; + abd_t *data; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); @@ -1905,7 +1905,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, rio = arg; size = BP_GET_PSIZE(bp); - data = zio_data_buf_alloc(size); + data = abd_alloc_linear(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) @@ -5966,7 +5966,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = nvsize; + *(uint64_t *)ABD_TO_BUF(db->db_data) = nvsize; dmu_buf_rele(db, FTAG); } diff --git 
a/module/zfs/spa_history.c b/module/zfs/spa_history.c index 14e681e77d8b..875c671e5bd8 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -99,7 +99,7 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); ASSERT(dbp->db_size >= sizeof (spa_history_phys_t)); - shpp = dbp->db_data; + shpp = ABD_TO_BUF(dbp->db_data); dmu_buf_will_dirty(dbp, tx); /* @@ -222,7 +222,7 @@ spa_history_log_sync(void *arg, dmu_tx_t *tx) * Update the offset when the write completes. */ VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); - shpp = dbp->db_data; + shpp = ABD_TO_BUF(dbp->db_data); dmu_buf_will_dirty(dbp, tx); @@ -361,7 +361,7 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) return (err); - shpp = dbp->db_data; + shpp = ABD_TO_BUF(dbp->db_data); #ifdef ZFS_DEBUG { diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index b3aa469bf45b..d13f55ee3f18 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -349,7 +349,7 @@ space_map_open_impl(space_map_t *sm) return (error); dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks); - sm->sm_phys = sm->sm_dbuf->db_data; + sm->sm_phys = ABD_TO_BUF(sm->sm_dbuf->db_data); return (0); } diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 15897b363de6..d2ae8336c536 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -235,7 +235,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, cksum.zc_word[ZIL_ZC_SEQ]++; if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = abuf->b_data; + zil_chain_t *zilc = ABD_TO_BUF(abuf->b_data); char *lr = (char *)(zilc + 1); uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); @@ -243,12 +243,12 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { error = SET_ERROR(ECKSUM); } else { - bcopy(lr, dst, len); + memcpy(dst, lr, len); *end = (char *)dst + len; *nbp = zilc->zc_next_blk; } } else { - char *lr = abuf->b_data; + char *lr = ABD_TO_BUF(abuf->b_data); uint64_t size = BP_GET_LSIZE(bp); zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; @@ -257,7 +257,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { - bcopy(lr, dst, zilc->zc_nused); + memcpy(dst, lr, zilc->zc_nused); *end = (char *)dst + zilc->zc_nused; *nbp = zilc->zc_next_blk; } @@ -299,7 +299,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (error == 0) { if (wbuf != NULL) - bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); + abd_copy_to_buf(wbuf, abuf->b_data, arc_buf_size(abuf)); (void) arc_buf_remove_ref(abuf, &abuf); } @@ -887,6 +887,7 @@ zil_lwb_write_done(zio_t *zio) * one in zil_commit_writer(). zil_sync() will only remove * the lwb if lwb_buf is null. 
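 *
 * The lwb data itself remains a plain zio_buf; zil_lwb_write_init()
 * below wraps it with abd_get_from_buf() purely for the rewrite zio,
 * and this done callback drops that wrapper with abd_put() before
 * freeing the underlying buffer, roughly (a sketch of the pairing;
 * lsize stands for BP_GET_LSIZE(&lwb->lwb_blk)):
 *
 *	abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lsize);
 *	zio_rewrite(..., lwb_abd, lsize, zil_lwb_write_done, ...);
 *	abd_put(zio->io_data);	(then zio_buf_free(lwb->lwb_buf, ...))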
*/ + abd_put(zio->io_data); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_zio = NULL; @@ -923,12 +924,16 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ mutex_enter(&zilog->zl_lock); if (lwb->lwb_zio == NULL) { + abd_t *lwb_abd; if (!lwb->lwb_fastwrite) { metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); lwb->lwb_fastwrite = 1; } + + lwb_abd = abd_get_from_buf(lwb->lwb_buf, + BP_GET_LSIZE(&lwb->lwb_blk)); lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), + 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_FASTWRITE, &zb); From 33d1f213aaadd89788bc4fdcdd4f110c57ed8020 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 10/16] Handle abd_t in zap*.c, zfs_fuid.c, zfs_sa.c and zfs_vnops.c Signed-off-by: Chunwei Chen --- module/zfs/zap.c | 29 +++++++++++++++-------------- module/zfs/zap_micro.c | 6 +++--- module/zfs/zfs_fuid.c | 4 ++-- module/zfs/zfs_sa.c | 28 ++++++++++++++-------------- module/zfs/zfs_vnops.c | 5 +++-- 5 files changed, 37 insertions(+), 35 deletions(-) diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 5ffa138a6b4b..5ebd3b3d3988 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -91,7 +91,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) * explicitly zero it since it might be coming from an * initialized microzap */ - bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); + bzero(ABD_TO_BUF(zap->zap_dbuf->db_data), zap->zap_dbuf->db_size); zp->zap_block_type = ZBT_HEADER; zp->zap_magic = ZAP_MAGIC; @@ -117,7 +117,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); l->l_dbuf = db; - l->l_phys = db->db_data; + l->l_phys = ABD_TO_BUF(db->db_data); zap_leaf_init(l, zp->zap_normflags != 0); @@ -181,15 +181,16 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); - transfer_func(db_old->db_data, db_new->db_data, hepb); + transfer_func(ABD_TO_BUF(db_old->db_data), + ABD_TO_BUF(db_new->db_data), hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); - transfer_func((uint64_t *)db_old->db_data + hepb, - db_new->db_data, hepb); + transfer_func((uint64_t *)ABD_TO_BUF(db_old->db_data) + hepb, + ABD_TO_BUF(db_new->db_data), hepb); dmu_buf_rele(db_new, FTAG); dmu_buf_rele(db_old, FTAG); @@ -253,12 +254,11 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, return (err); } dmu_buf_will_dirty(db2, tx); - ((uint64_t *)db2->db_data)[off2] = val; - ((uint64_t *)db2->db_data)[off2+1] = val; + ((uint64_t *)ABD_TO_BUF(db2->db_data))[off2] = val; + ((uint64_t *)ABD_TO_BUF(db2->db_data))[off2+1] = val; dmu_buf_rele(db2, FTAG); } - - ((uint64_t *)db->db_data)[off] = val; + ((uint64_t *)ABD_TO_BUF(db->db_data))[off] = val; dmu_buf_rele(db, FTAG); return (0); @@ -281,7 +281,7 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if 
(err) return (err); - *valp = ((uint64_t *)db->db_data)[off]; + *valp = ((uint64_t *)ABD_TO_BUF(db->db_data))[off]; dmu_buf_rele(db, FTAG); if (tbl->zt_nextblk != 0) { @@ -350,7 +350,8 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) return (err); dmu_buf_will_dirty(db_new, tx); zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), - db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + ABD_TO_BUF(db_new->db_data), + 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); dmu_buf_rele(db_new, FTAG); zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk; @@ -530,7 +531,7 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, dmu_buf_will_dirty(db, tx); ASSERT3U(l->l_blkid, ==, blkid); ASSERT3P(l->l_dbuf, ==, db); - ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data); + ASSERT3P(l->l_phys, ==, ABD_TO_BUF(l->l_dbuf->db_data)); ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF); ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); @@ -576,7 +577,7 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) int err; ASSERT(zap->zap_dbuf == NULL || - zap->zap_f.zap_phys == zap->zap_dbuf->db_data); + zap->zap_f.zap_phys == ABD_TO_BUF(zap->zap_dbuf->db_data)); ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC); idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); err = zap_idx_to_blk(zap, idx, &blk); @@ -1324,7 +1325,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { - zap_stats_ptrtbl(zap, db->db_data, + zap_stats_ptrtbl(zap, ABD_TO_BUF(db->db_data), 1<<(bs-3), zs); dmu_buf_rele(db, FTAG); } diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index dfa7c6615659..b7f1654f7a0a 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -372,7 +372,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap->zap_object = obj; zap->zap_dbuf = db; - if (*(uint64_t *)db->db_data != ZBT_MICRO) { + if (*(uint64_t *)ABD_TO_BUF(db->db_data) != ZBT_MICRO) { mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; } else { @@ -531,7 +531,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) sz = zap->zap_dbuf->db_size; mzp = zio_buf_alloc(sz); - bcopy(zap->zap_dbuf->db_data, mzp, sz); + bcopy(ABD_TO_BUF(zap->zap_dbuf->db_data), mzp, sz); nchunks = zap->zap_m.zap_num_chunks; if (!flags) { @@ -587,7 +587,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, #endif dmu_buf_will_dirty(db, tx); - zp = db->db_data; + zp = ABD_TO_BUF(db->db_data); zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; zp->mz_normflags = normflags; diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c index 6ca61b87242f..67154cf93dab 100644 --- a/module/zfs/zfs_fuid.c +++ b/module/zfs/zfs_fuid.c @@ -120,7 +120,7 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, ASSERT(fuid_obj != 0); VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db)); - fuid_size = *(uint64_t *)db->db_data; + fuid_size = *(uint64_t *)ABD_TO_BUF(db->db_data); dmu_buf_rele(db, FTAG); if (fuid_size) { @@ -281,7 +281,7 @@ zfs_fuid_sync(zfs_sb_t *zsb, dmu_tx_t *tx) VERIFY(0 == dmu_bonus_hold(zsb->z_os, zsb->z_fuid_obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = zsb->z_fuid_size; + *(uint64_t *)ABD_TO_BUF(db->db_data) = zsb->z_fuid_size; dmu_buf_rele(db, FTAG); zsb->z_fuid_dirty = B_FALSE; diff --git a/module/zfs/zfs_sa.c 
b/module/zfs/zfs_sa.c index 257ab4254bbd..c15098373c9b 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -77,14 +77,14 @@ zfs_sa_readlink(znode_t *zp, uio_t *uio) bufsz = zp->z_size; if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) { - error = uiomove((caddr_t)db->db_data + - ZFS_OLD_ZNODE_PHYS_SIZE, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + error = abd_uiomove_off(db->db_data, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio, + ZFS_OLD_ZNODE_PHYS_SIZE); } else { dmu_buf_t *dbp; if ((error = dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) { - error = uiomove(dbp->db_data, + error = abd_uiomove(dbp->db_data, MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); dmu_buf_rele(dbp, FTAG); } @@ -101,8 +101,8 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) VERIFY(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0); if (len) { - bcopy(link, (caddr_t)db->db_data + - ZFS_OLD_ZNODE_PHYS_SIZE, len); + abd_copy_from_buf_off(db->db_data, link, len, + ZFS_OLD_ZNODE_PHYS_SIZE); } } else { dmu_buf_t *dbp; @@ -114,7 +114,7 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) dmu_buf_will_dirty(dbp, tx); ASSERT3U(len, <=, dbp->db_size); - bcopy(link, dbp->db_data, len); + abd_copy_from_buf(dbp->db_data, link, len); dmu_buf_rele(dbp, FTAG); } } @@ -145,9 +145,9 @@ zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap) ZFS_OLD_ZNODE_PHYS_SIZE; if (len <= doi.doi_bonus_size) { - (void) memcpy(xoap->xoa_av_scanstamp, - (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - sizeof (xoap->xoa_av_scanstamp)); + abd_copy_to_buf_off(xoap->xoa_av_scanstamp, + db->db_data, sizeof (xoap->xoa_av_scanstamp), + ZFS_OLD_ZNODE_PHYS_SIZE); } } XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); @@ -175,8 +175,8 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) ZFS_OLD_ZNODE_PHYS_SIZE; if (len > doi.doi_bonus_size) VERIFY(dmu_set_bonus(db, len, tx) == 0); - (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp)); + abd_copy_from_buf_off(db->db_data, xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp), ZFS_OLD_ZNODE_PHYS_SIZE); zp->z_pflags |= ZFS_BONUS_SCANSTAMP; VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zsb), @@ -375,8 +375,8 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) /* if scanstamp then add scanstamp */ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { - bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - scanstamp, AV_SCANSTAMP_SZ); + abd_copy_to_buf_off(scanstamp, db->db_data, + AV_SCANSTAMP_SZ, ZFS_OLD_ZNODE_PHYS_SIZE); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zsb), NULL, scanstamp, AV_SCANSTAMP_SZ); zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 723d6210f26f..f14387aaa36d 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -711,7 +711,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) error = SET_ERROR(EDQUOT); break; } - + /* TODO: abd can't handle xuio */ if (xuio && abuf == NULL) { ASSERT(i_iov < iovcnt); aiov = &iovp[i_iov]; @@ -738,7 +738,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = uiocopy(abuf->b_data, max_blksz, + if ((error = abd_uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes))) { dmu_return_arcbuf(abuf); break; @@ -800,6 +800,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) * block-aligned, 
use assign_arcbuf(). Otherwise, * write via dmu_write(). */ + /* TODO: abd can't handle xuio */ if (tx_bytes < max_blksz && (!write_eof || aiov->iov_base != abuf->b_data)) { ASSERT(xuio); From 41165d1fa901b575303e3f442b37b40ea6a33b10 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 11/16] Handle abd_t in zio.c Signed-off-by: Chunwei Chen --- include/sys/zio.h | 2 +- module/zfs/zio.c | 240 +++++++++++++++++++++++++++++++--------------- 2 files changed, 166 insertions(+), 76 deletions(-) diff --git a/include/sys/zio.h b/include/sys/zio.h index 7bed94a6036c..e1b1b8c705cc 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -352,7 +352,7 @@ typedef struct zio_gang_node { } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, - zio_gang_node_t *gn, void *data); + zio_gang_node_t *gn, abd_t *data, uint64_t offset); typedef void zio_transform_func_t(zio_t *zio, abd_t *data, uint64_t size); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 066f04f1864c..6dc8e263b30d 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -256,11 +256,16 @@ zio_data_buf_free(void *buf, size_t size) * ========================================================================== */ static void -zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, +zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); + if (ABD_IS_LINEAR(zio->io_data)) + ASSERT_ABD_LINEAR(data); + else + ASSERT_ABD_SCATTER(data); + zt->zt_orig_data = zio->io_data; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; @@ -284,7 +289,7 @@ zio_pop_transforms(zio_t *zio) zt->zt_orig_data, zt->zt_orig_size); if (zt->zt_bufsize != 0) - zio_buf_free(zio->io_data, zt->zt_bufsize); + abd_free(zio->io_data, zt->zt_bufsize); zio->io_data = zt->zt_orig_data; zio->io_size = zt->zt_orig_size; @@ -300,21 +305,29 @@ zio_pop_transforms(zio_t *zio) * ========================================================================== */ static void -zio_subblock(zio_t *zio, void *data, uint64_t size) +zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) - bcopy(zio->io_data, data, size); + abd_copy(data, zio->io_data, size); } static void -zio_decompress(zio_t *zio, void *data, uint64_t size) +zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { - if (zio->io_error == 0 && - zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_data, data, zio->io_size, size) != 0) - zio->io_error = SET_ERROR(EIO); + void *buf1, *buf2; + if (zio->io_error == 0) { + buf1 = abd_borrow_buf_copy(zio->io_data, zio->io_size); + buf2 = abd_borrow_buf(data, size); + + if (zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), + buf1, buf2, zio->io_size, size) != 0) + zio->io_error = SET_ERROR(EIO); + + abd_return_buf_copy(data, buf2, size); + abd_return_buf(zio->io_data, buf1, zio->io_size); + } } /* @@ -483,7 +496,7 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c) */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) @@ -595,7 +608,7 @@ zio_root(spa_t *spa, 
zio_done_func_t *done, void *private, enum zio_flag flags) zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; @@ -611,7 +624,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, const zio_prop_t *zp, + abd_t *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) @@ -650,7 +663,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, } zio_t * -zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, +zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { @@ -802,7 +815,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, + abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -823,7 +836,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, + abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -846,8 +859,13 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ - void *wbuf = zio_buf_alloc(size); - bcopy(data, wbuf, size); + abd_t *wbuf; + if (ABD_IS_LINEAR(data)) + wbuf = abd_alloc_linear(size); + else + wbuf = abd_alloc_scatter(size); + abd_copy(wbuf, data, size); + zio_push_transform(zio, wbuf, size, size, NULL); } @@ -859,7 +877,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, + abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; @@ -903,7 +921,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, } zio_t * -zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, +zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { @@ -961,14 +979,24 @@ zio_read_bp_init(zio_t *zio) !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t psize = BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); - void *cbuf = zio_buf_alloc(psize); + abd_t *cbuf; + if (ABD_IS_LINEAR(zio->io_data)) + cbuf = abd_alloc_linear(psize); + else + cbuf = abd_alloc_scatter(psize); zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { + void *data; + int psize; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - decode_embedded_bp_compressed(bp, zio->io_data); + + psize = BPE_GET_PSIZE(bp); + data = abd_borrow_buf(zio->io_data, psize); + decode_embedded_bp_compressed(bp, data); + abd_return_buf_copy(zio->io_data, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } @@ -1070,11 +1098,25 @@ zio_write_bp_init(zio_t *zio) } if (compress != ZIO_COMPRESS_OFF) { - void *cbuf = zio_buf_alloc(lsize); - psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + void *cbuf; + void *dbuf; + abd_t *cdata; + + if (ABD_IS_LINEAR(zio->io_data)) + cdata = abd_alloc_linear(lsize); + else + cdata = abd_alloc_scatter(lsize); + + cbuf = abd_borrow_buf(cdata, lsize); + + dbuf = abd_borrow_buf_copy(zio->io_data, lsize); + psize = zio_compress_data(compress, dbuf, cbuf, lsize); + abd_return_buf(zio->io_data, dbuf, lsize); + if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); + abd_return_buf(cdata, cbuf, lsize); + abd_free(cdata, lsize); } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { @@ -1083,7 +1125,8 @@ zio_write_bp_init(zio_t *zio) BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); - zio_buf_free(cbuf, lsize); + abd_return_buf(cdata, cbuf, lsize); + abd_free(cdata, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, @@ -1096,15 +1139,16 @@ zio_write_bp_init(zio_t *zio) */ size_t rounded = P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE); + abd_return_buf_copy(cdata, cbuf, lsize); if (rounded > psize) { - bzero((char *)cbuf + psize, rounded - psize); + abd_zero_off(cdata, rounded - psize, psize); psize = rounded; } if (psize == lsize) { compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); + abd_free(cdata, lsize); } else { - zio_push_transform(zio, cbuf, + zio_push_transform(zio, cdata, psize, lsize, NULL); } } @@ -1598,26 +1642,38 @@ zio_resume_wait(spa_t *spa) * ========================================================================== */ +static void +zio_gang_issue_func_done(zio_t *zio) +{ + abd_put(zio->io_data); +} + static zio_t * -zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { if (gn != NULL) return (pio); - return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), - NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), + BP_GET_PSIZE(bp), zio_gang_issue_func_done, + NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } -zio_t * -zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { zio_t *zio; + abd_t *gbh_abd; if (gn != NULL) { + gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - gn->gn_gbh, 
SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will @@ -1628,8 +1684,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { + abd_t *buf = abd_get_offset(data, offset); + zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), - data, BP_GET_PSIZE(bp)); + buf, BP_GET_PSIZE(bp)); + + abd_put(buf); } /* * If we are here to damage data for testing purposes, @@ -1639,7 +1699,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, + abd_get_offset(data, offset), BP_GET_PSIZE(bp), + zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } @@ -1647,16 +1708,18 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) } /* ARGSUSED */ -zio_t * -zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ -zio_t * -zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); @@ -1720,13 +1783,14 @@ static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); + abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, - SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, - gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); + zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_gang_tree_assemble_done, gn, gio->io_priority, + ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void @@ -1744,12 +1808,14 @@ zio_gang_tree_assemble_done(zio_t *zio) return; if (BP_SHOULD_BYTESWAP(bp)) - byteswap_uint64_array(zio->io_data, zio->io_size); + byteswap_uint64_array(ABD_TO_BUF(zio->io_data), zio->io_size); - ASSERT(zio->io_data == gn->gn_gbh); + ASSERT(ABD_TO_BUF(zio->io_data) == gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + abd_put(zio->io_data); + for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) @@ -1759,7 +1825,8 @@ zio_gang_tree_assemble_done(zio_t *zio) } static void -zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) +zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, + uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; @@ -1773,7 +1840,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) * If you're a gang header, your data is in gn->gn_gbh. 
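
A note on the pattern above: the gang code now checksums a temporary window
into the caller's buffer rather than a raw pointer. A minimal user-space
sketch of the get-offset/put discipline (the toy_* names are illustrative
stand-ins; the real implementation lives in module/zfs/abd.c):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

/* Toy stand-in for abd_t; the real type also hides linear vs. scatter. */
typedef struct toy_abd {
	uint8_t		*ta_buf;	/* backing memory */
	uint64_t	ta_size;	/* bytes visible through this handle */
	int		ta_owner;	/* does this handle own ta_buf? */
} toy_abd_t;

/* Like abd_get_offset(): borrow a window, no data is copied. */
static toy_abd_t *
toy_get_offset(toy_abd_t *src, uint64_t off)
{
	toy_abd_t *abd = malloc(sizeof (*abd));

	assert(abd != NULL && off <= src->ta_size);
	abd->ta_buf = src->ta_buf + off;
	abd->ta_size = src->ta_size - off;
	abd->ta_owner = 0;
	return (abd);
}

/* Like abd_put(): drop the handle, never the parent's memory. */
static void
toy_put(toy_abd_t *abd)
{
	assert(abd->ta_owner == 0);
	free(abd);
}
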
* If you're a gang member, your data is in 'data' and gn == NULL. */ - zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); + zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); @@ -1782,13 +1849,14 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; - zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); - data = (char *)data + BP_GET_PSIZE(gbp); + zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, + offset); + offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) - ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); + ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); @@ -1821,7 +1889,8 @@ zio_gang_issue(zio_t *zio) ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) - zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); + zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data, + 0); else zio_gang_tree_free(&zio->io_gang_tree); @@ -1861,6 +1930,12 @@ zio_write_gang_member_ready(zio_t *zio) mutex_exit(&pio->io_lock); } +static void +zio_write_gang_done(zio_t *zio) +{ + abd_put(zio->io_data); +} + static int zio_write_gang_block(zio_t *pio) { @@ -1870,6 +1945,7 @@ zio_write_gang_block(zio_t *pio) zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; + abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; @@ -1896,12 +1972,14 @@ zio_write_gang_block(zio_t *pio) gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); + gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ - zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_write_gang_done, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. 
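
The tree walk in zio_gang_tree_issue() above replaces pointer arithmetic
(data = (char *)data + BP_GET_PSIZE(gbp)) with an offset carried alongside a
single parent abd. A self-contained sketch of that accounting, with made-up
child sizes:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Hypothetical psizes of three gang children. */
	uint64_t psize[3] = { 4096, 8192, 4096 };
	uint64_t io_size = 4096 + 8192 + 4096;	/* gang leader's io_size */
	uint64_t offset = 0;
	int g;

	for (g = 0; g < 3; g++) {
		/*
		 * The real code issues the child I/O against
		 * abd_get_offset(io_data, offset); here we only
		 * advance the cursor.
		 */
		printf("child %d -> [%llu, %llu)\n", g,
		    (unsigned long long)offset,
		    (unsigned long long)(offset + psize[g]));
		offset += psize[g];
	}

	/* Mirrors ASSERT3U(gio->io_size, ==, offset) above. */
	assert(offset == io_size);
	return (0);
}
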
@@ -1921,10 +1999,10 @@ zio_write_gang_block(zio_t *pio) zp.zp_nopwrite = B_FALSE; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], - (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, - zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), - &pio->io_bookmark)); + abd_get_offset(pio->io_data, pio->io_size - resid), lsize, + &zp, zio_write_gang_member_ready, NULL, + zio_write_gang_done, &gn->gn_child[g], pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } /* @@ -2015,10 +2093,11 @@ zio_ddt_child_read_done(zio_t *zio) ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ + if (zio->io_error == 0 && dde->dde_repair_data == NULL) dde->dde_repair_data = zio->io_data; else - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mutex_exit(&pio->io_lock); } @@ -2051,10 +2130,10 @@ zio_ddt_read_start(zio_t *zio) ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, - zio_buf_alloc(zio->io_size), zio->io_size, - zio_ddt_child_read_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, - &zio->io_bookmark)); + abd_alloc_linear(zio->io_size), + zio->io_size, zio_ddt_child_read_done, dde, + zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | + ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (ZIO_PIPELINE_CONTINUE); } @@ -2091,7 +2170,8 @@ zio_ddt_read_done(zio_t *zio) return (ZIO_PIPELINE_STOP); } if (dde->dde_repair_data != NULL) { - bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); + abd_copy(zio->io_data, dde->dde_repair_data, + zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); @@ -2120,7 +2200,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || - bcmp(zio->io_orig_data, lio->io_orig_data, + abd_cmp(zio->io_orig_data, lio->io_orig_data, zio->io_orig_size) != 0); } } @@ -2145,7 +2225,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (error == 0) { if (arc_buf_size(abuf) != zio->io_orig_size || - bcmp(abuf->b_data, zio->io_orig_data, + abd_cmp(abuf->b_data, zio->io_orig_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); VERIFY(arc_buf_remove_ref(abuf, &abuf)); @@ -2572,11 +2652,15 @@ zio_vdev_io_start(zio_t *zio) P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. 
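
The hunk below pads a sub-block write out to the device's physical block
size before issuing it. A user-space sketch of the same round-up, copy,
zero-the-tail sequence (plain buffers standing in for abds; TOY_* names are
illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Same shape as the kernel's P2ROUNDUP(); align must be a power of two. */
#define	TOY_P2ROUNDUP(x, align)	(-(-(x) & -(align)))

static uint8_t *
toy_pad_write(const uint8_t *data, uint64_t size, uint64_t align,
    uint64_t *asizep)
{
	uint64_t asize = TOY_P2ROUNDUP(size, align);
	uint8_t *abuf = malloc(asize);

	assert(abuf != NULL);
	memcpy(abuf, data, size);		/* abd_copy() in the patch */
	memset(abuf + size, 0, asize - size);	/* abd_zero_off() */
	*asizep = asize;
	return (abuf);
}

The checksum-report path in zio_done() and the vdev queue aggregation later
in this series reuse the same copy-then-zero-tail idiom.
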
*/ uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio_buf_alloc(asize); + abd_t *abuf; + if (ABD_IS_LINEAR(zio->io_data)) + abuf = abd_alloc_linear(asize); + else + abuf = abd_alloc_scatter(asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { - bcopy(zio->io_data, abuf, zio->io_size); - bzero(abuf + zio->io_size, asize - zio->io_size); + abd_copy(abuf, zio->io_data, zio->io_size); + abd_zero_off(abuf, asize - zio->io_size, zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } @@ -2701,7 +2785,7 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) { void *buf = zio_buf_alloc(zio->io_size); - bcopy(zio->io_data, buf, zio->io_size); + abd_copy_to_buf(buf, zio->io_data, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = buf; @@ -3026,21 +3110,27 @@ zio_done(zio_t *zio) zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio->io_data; + char *abuf; + abd_t *adata = zio->io_data; if (asize != zio->io_size) { - abuf = zio_buf_alloc(asize); - bcopy(zio->io_data, abuf, zio->io_size); - bzero(abuf+zio->io_size, asize-zio->io_size); + adata = abd_alloc_linear(asize); + abd_copy(adata, zio->io_data, zio->io_size); + abd_zero_off(adata, asize-zio->io_size, + zio->io_size); } + abuf = abd_borrow_buf_copy(adata, asize); + zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, abuf); zfs_ereport_free_checksum(zcr); + abd_return_buf(adata, abuf, asize); + if (asize != zio->io_size) - zio_buf_free(abuf, asize); + abd_free(adata, asize); } } From f8f0bdd30bf289c85dd3d3c534ccd0b7d4d034d4 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 12/16] Handle abd_t in vdev*.c sans vdev_raidz.c Signed-off-by: Chunwei Chen --- include/sys/vdev_impl.h | 2 +- module/zfs/vdev.c | 6 ++--- module/zfs/vdev_cache.c | 11 ++++---- module/zfs/vdev_disk.c | 28 ++++++++++++++----- module/zfs/vdev_file.c | 13 ++++++++- module/zfs/vdev_label.c | 58 +++++++++++++++++++++++----------------- module/zfs/vdev_mirror.c | 13 ++++++--- module/zfs/vdev_queue.c | 17 ++++++------ 8 files changed, 94 insertions(+), 54 deletions(-) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index a8dc9510e3e9..6dae2955b847 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -83,7 +83,7 @@ typedef const struct vdev_ops { * Virtual device properties */ struct vdev_cache_entry { - char *ve_data; + abd_t *ve_data; uint64_t ve_offset; clock_t ve_lastused; avl_node_t ve_offset_node; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 52198261e434..e53af316f3dc 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -945,12 +945,12 @@ vdev_probe_done(zio_t *zio) ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; @@ -1067,7 +1067,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_pad2)), - VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), + VDEV_PAD_SIZE, abd_alloc_linear(VDEV_PAD_SIZE), ZIO_CHECKSUM_OFF, 
vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c index 389fa6fd9d07..2d0a878dc653 100644 --- a/module/zfs/vdev_cache.c +++ b/module/zfs/vdev_cache.c @@ -146,7 +146,7 @@ vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) avl_remove(&vc->vc_lastused_tree, ve); avl_remove(&vc->vc_offset_tree, ve); - zio_buf_free(ve->ve_data, VCBS); + abd_free(ve->ve_data, VCBS); kmem_free(ve, sizeof (vdev_cache_entry_t)); } @@ -183,7 +183,7 @@ vdev_cache_allocate(zio_t *zio) ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); ve->ve_offset = offset; ve->ve_lastused = ddi_get_lbolt(); - ve->ve_data = zio_buf_alloc(VCBS); + ve->ve_data = abd_alloc_scatter(VCBS); avl_add(&vc->vc_offset_tree, ve); avl_add(&vc->vc_lastused_tree, ve); @@ -206,7 +206,8 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) } ve->ve_hits++; - bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); + abd_copy_off(zio->io_data, ve->ve_data, zio->io_size, + 0, cache_phase); } /* @@ -357,8 +358,8 @@ vdev_cache_write(zio_t *zio) if (ve->ve_fill_io != NULL) { ve->ve_missed_update = 1; } else { - bcopy((char *)zio->io_data + start - io_start, - ve->ve_data + start - ve->ve_offset, end - start); + abd_copy_off(ve->ve_data, zio->io_data, end - start, + start - ve->ve_offset, start - io_start); } ve = AVL_NEXT(&vc->vc_offset_tree, ve); } diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 7f2263457177..6dfaf2578daa 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -507,12 +507,15 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, size_t kbuf_size, uint64_t kbuf_offset, int flags) { dio_request_t *dr; - caddr_t bio_ptr; + uint64_t zio_offset; uint64_t bio_offset; int bio_size, bio_count = 16; int i = 0, error = 0; + abd_t *zio_data = NULL; ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size); + /* we take either zio or kbuf_ptr, not both */ + ASSERT((!zio || !kbuf_ptr) && (zio || kbuf_ptr)); retry: dr = vdev_disk_dio_alloc(bio_count); @@ -532,9 +535,14 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, * their volume block size to match the maximum request size and * the common case will be one bio per vdev IO request. 
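
Each bio below is sized by how many pages its slice of the abd touches; the
series adds abd_bio_nr_pages_off() for that. A sketch of the calculation,
assuming 4 KiB pages (toy_* names are illustrative):

#include <stdint.h>

#define	TOY_PAGE_SIZE	4096ULL

/*
 * Pages spanned by `size` bytes starting `off` bytes into a page-aligned
 * region: round the end up, the start down, divide.
 */
static uint64_t
toy_nr_pages_off(uint64_t off, uint64_t size)
{
	uint64_t start = off & ~(TOY_PAGE_SIZE - 1);
	uint64_t end = (off + size + TOY_PAGE_SIZE - 1) &
	    ~(TOY_PAGE_SIZE - 1);

	return ((end - start) / TOY_PAGE_SIZE);
}

int
main(void)
{
	/* 4096 bytes starting 100 bytes into a page touch two pages. */
	return (toy_nr_pages_off(100, 4096) == 2 ? 0 : 1);
}
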
*/ - bio_ptr = kbuf_ptr; + if (zio) + zio_data = zio->io_data; + else + zio_data = abd_get_from_buf(kbuf_ptr, kbuf_size); + + zio_offset = 0; bio_offset = kbuf_offset; - bio_size = kbuf_size; + bio_size = kbuf_size; for (i = 0; i <= dr->dr_bio_count; i++) { /* Finished constructing bio's for given buffer */ @@ -553,9 +561,11 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, } dr->dr_bio[i] = bio_alloc(GFP_NOIO, - bio_nr_pages(bio_ptr, bio_size)); + abd_bio_nr_pages_off(zio_data, bio_size, zio_offset)); /* bio_alloc() with __GFP_WAIT never returns NULL */ if (unlikely(dr->dr_bio[i] == NULL)) { + if (kbuf_ptr) + abd_put(zio_data); vdev_disk_dio_free(dr); return (ENOMEM); } @@ -570,10 +580,11 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, dr->dr_bio[i]->bi_private = dr; /* Remaining size is returned to become the new size */ - bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size); + bio_size = abd_bio_map_off(dr->dr_bio[i], zio_data, + bio_size, zio_offset); /* Advance in buffer and construct another bio if needed */ - bio_ptr += BIO_BI_SIZE(dr->dr_bio[i]); + zio_offset += BIO_BI_SIZE(dr->dr_bio[i]); bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); } @@ -601,6 +612,9 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, ASSERT3S(atomic_read(&dr->dr_ref), ==, 1); } + if (kbuf_ptr) + abd_put(zio_data); + (void) vdev_disk_dio_put(dr); return (error); @@ -712,7 +726,7 @@ vdev_disk_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } - error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data, + error = __vdev_disk_physio(vd->vd_bdev, zio, NULL, zio->io_size, zio->io_offset, flags); if (error) { zio->io_error = error; diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index 7f43ad8001f4..296e27ebd889 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -149,12 +149,23 @@ vdev_file_io_strategy(void *arg) vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; ssize_t resid; + void *buf; + + if (zio->io_type == ZIO_TYPE_READ) + buf = abd_borrow_buf(zio->io_data, zio->io_size); + else + buf = abd_borrow_buf_copy(zio->io_data, zio->io_size); zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? 
- UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, + UIO_READ : UIO_WRITE, vf->vf_vnode, buf, zio->io_size, zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_data, buf, zio->io_size); + else + abd_return_buf(zio->io_data, buf, zio->io_size); + if (resid != 0 && zio->io_error == 0) zio->io_error = SET_ERROR(ENOSPC); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 7f588ed6b0b5..7dcfb95436bd 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -178,7 +178,7 @@ vdev_label_number(uint64_t psize, uint64_t offset) } static void -vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, +vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == @@ -192,7 +192,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, } static void -vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, +vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || @@ -430,6 +430,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp; + abd_t *vp_abd; zio_t *zio; uint64_t best_txg = 0; int error = 0; @@ -442,7 +443,8 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (!vdev_readable(vd)) return (NULL); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t)); + vp = ABD_TO_BUF(vp_abd); retry: for (l = 0; l < VDEV_LABELS; l++) { @@ -450,7 +452,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) zio = zio_root(spa, NULL, NULL, flags); - vdev_label_read(zio, vd, l, vp, + vdev_label_read(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -489,7 +491,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) goto retry; } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd, sizeof (vdev_phys_t)); return (config); } @@ -627,6 +629,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) vdev_phys_t *vp; char *pad2; uberblock_t *ub; + abd_t *vp_abd, *pad2_abd, *ub_abd; zio_t *zio; char *buf; size_t buflen; @@ -710,7 +713,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize its label. */ - vp = zio_buf_alloc(sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t)); + vp = ABD_TO_BUF(vp_abd); bzero(vp, sizeof (vdev_phys_t)); /* @@ -771,7 +775,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); if (error != 0) { nvlist_free(label); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd, sizeof (vdev_phys_t)); /* EFAULT means nvlist_pack ran out of room */ return (error == EFAULT ? ENAMETOOLONG : EINVAL); } @@ -779,13 +783,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize uberblock template. */ - ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING); + ub = ABD_TO_BUF(ub_abd); bzero(ub, VDEV_UBERBLOCK_RING); *ub = spa->spa_uberblock; ub->ub_txg = 0; /* Initialize the 2nd padding area. 
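
The vdev_file change above shows the general borrow/return contract: a read
borrows scratch memory cheaply and copies back into the abd on return, while
a write copies in on borrow and returns without a copy-back. A toy model of
the four calls (illustrative names, not the kernel API; a linear buffer
stands in for the abd):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef struct toy_abd {
	uint8_t		*ta_buf;
	uint64_t	ta_size;
} toy_abd_t;

/* borrow_buf: hand out scratch memory, contents undefined. */
static void *
toy_borrow_buf(toy_abd_t *abd, uint64_t n)
{
	(void) abd;
	return (malloc(n));
}

/* borrow_buf_copy: scratch memory preloaded with the abd's data. */
static void *
toy_borrow_buf_copy(toy_abd_t *abd, uint64_t n)
{
	void *buf = malloc(n);

	memcpy(buf, abd->ta_buf, n);	/* copy in */
	return (buf);
}

/* return_buf: the caller only read from buf; nothing to write back. */
static void
toy_return_buf(toy_abd_t *abd, void *buf, uint64_t n)
{
	(void) abd; (void) n;
	free(buf);
}

/* return_buf_copy: the caller filled buf, so write it back first. */
static void
toy_return_buf_copy(toy_abd_t *abd, void *buf, uint64_t n)
{
	memcpy(abd->ta_buf, buf, n);	/* copy back */
	free(buf);
}

int
main(void)
{
	uint8_t backing[8] = { 0 };
	toy_abd_t abd = { backing, sizeof (backing) };
	void *buf;

	/* Read path, as in vdev_file_io_strategy() above. */
	buf = toy_borrow_buf(&abd, abd.ta_size);
	memset(buf, 0xab, abd.ta_size);		/* the "device read" */
	toy_return_buf_copy(&abd, buf, abd.ta_size);

	/* Write path: copy in, return cheap. */
	buf = toy_borrow_buf_copy(&abd, abd.ta_size);
	toy_return_buf(&abd, buf, abd.ta_size);
	return (backing[0] == 0xab ? 0 : 1);
}
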
*/ - pad2 = zio_buf_alloc(VDEV_PAD_SIZE); + pad2_abd = abd_alloc_linear(VDEV_PAD_SIZE); + pad2 = ABD_TO_BUF(pad2_abd); bzero(pad2, VDEV_PAD_SIZE); /* @@ -796,7 +802,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) for (l = 0; l < VDEV_LABELS; l++) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -805,11 +811,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * Zero out the 2nd padding area where it might have * left over data from previous filesystem format. */ - vdev_label_write(zio, vd, l, pad2, + vdev_label_write(zio, vd, l, pad2_abd, offsetof(vdev_label_t, vl_pad2), VDEV_PAD_SIZE, NULL, NULL, flags); - vdev_label_write(zio, vd, l, ub, + vdev_label_write(zio, vd, l, ub_abd, offsetof(vdev_label_t, vl_uberblock), VDEV_UBERBLOCK_RING, NULL, NULL, flags); } @@ -822,9 +828,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) } nvlist_free(label); - zio_buf_free(pad2, VDEV_PAD_SIZE); - zio_buf_free(ub, VDEV_UBERBLOCK_RING); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(pad2_abd, VDEV_PAD_SIZE); + abd_free(ub_abd, VDEV_UBERBLOCK_RING); + abd_free(vp_abd, sizeof (vdev_phys_t)); /* * If this vdev hasn't been previously identified as a spare, then we @@ -888,7 +894,7 @@ vdev_uberblock_load_done(zio_t *zio) vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; - uberblock_t *ub = zio->io_data; + uberblock_t *ub = ABD_TO_BUF(zio->io_data); struct ubl_cbdata *cbp = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); @@ -909,7 +915,7 @@ vdev_uberblock_load_done(zio_t *zio) mutex_exit(&rio->io_lock); } - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); } static void @@ -925,7 +931,7 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (l = 0; l < VDEV_LABELS; l++) { for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, - zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), + abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd)), VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_load_done, zio, flags); @@ -993,6 +999,7 @@ vdev_uberblock_sync_done(zio_t *zio) static void vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) { + abd_t *ub_abd; uberblock_t *ubbuf; int c, l, n; @@ -1007,17 +1014,18 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); - ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd)); + ubbuf = ABD_TO_BUF(ub_abd); bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); *ubbuf = *ub; for (l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ubbuf, + vdev_label_write(zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_sync_done, zio->io_private, flags | ZIO_FLAG_DONT_PROPAGATE); - zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); + abd_free(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); } /* Sync the uberblocks to all vdevs in svd[] */ @@ -1094,6 +1102,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) { nvlist_t *label; vdev_phys_t *vp; + abd_t *vp_abd; char *buf; size_t buflen; int c; @@ -1112,7 +1121,8 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) */ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof 
(vdev_phys_t)); + vp = ABD_TO_BUF(vp_abd); bzero(vp, sizeof (vdev_phys_t)); buf = vp->vp_nvlist; @@ -1120,7 +1130,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) { for (; l < VDEV_LABELS; l += 2) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), vdev_label_sync_done, zio->io_private, @@ -1128,7 +1138,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) } } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd, sizeof (vdev_phys_t)); nvlist_free(label); } diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 77c3d8d385e9..bbdf923f1692 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -262,13 +262,12 @@ vdev_mirror_scrub_done(zio_t *zio) while ((pio = zio_walk_parents(zio)) != NULL) { mutex_enter(&pio->io_lock); ASSERT3U(zio->io_size, >=, pio->io_size); - bcopy(zio->io_data, pio->io_data, pio->io_size); + abd_copy(pio->io_data, zio->io_data, pio->io_size); mutex_exit(&pio->io_lock); } mutex_exit(&zio->io_lock); } - - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mc->mc_error = zio->io_error; mc->mc_tried = 1; @@ -345,10 +344,16 @@ vdev_mirror_io_start(zio_t *zio) * data into zio->io_data in vdev_mirror_scrub_done. */ for (c = 0; c < mm->mm_children; c++) { + abd_t *tmp; mc = &mm->mm_child[c]; + if (ABD_IS_LINEAR(zio->io_data)) + tmp = abd_alloc_linear(zio->io_size); + else + tmp = abd_alloc_scatter(zio->io_size); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio_buf_alloc(zio->io_size), zio->io_size, + tmp, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_scrub_done, mc)); } diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 3fa4219f260e..6984cce610d1 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -448,12 +448,12 @@ vdev_queue_agg_io_done(zio_t *aio) if (aio->io_type == ZIO_TYPE_READ) { zio_t *pio; while ((pio = zio_walk_parents(aio)) != NULL) { - bcopy((char *)aio->io_data + (pio->io_offset - - aio->io_offset), pio->io_data, pio->io_size); + abd_copy_off(pio->io_data, aio->io_data, pio->io_size, + 0, pio->io_offset - aio->io_offset); } } - zio_buf_free(aio->io_data, aio->io_size); + abd_free(aio->io_data, aio->io_size); } /* @@ -591,7 +591,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) ASSERT3U(size, <=, zfs_vdev_aggregation_limit); aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, - zio_buf_alloc(size), size, first->io_type, zio->io_priority, + abd_alloc_scatter(size), size, first->io_type, zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; @@ -604,12 +604,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) if (dio->io_flags & ZIO_FLAG_NODATA) { ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); - bzero((char *)aio->io_data + (dio->io_offset - - aio->io_offset), dio->io_size); + abd_zero_off(aio->io_data, dio->io_size, + dio->io_offset - aio->io_offset); } else if (dio->io_type == ZIO_TYPE_WRITE) { - bcopy(dio->io_data, (char *)aio->io_data + - (dio->io_offset - aio->io_offset), - dio->io_size); + abd_copy_off(aio->io_data, dio->io_data, dio->io_size, + dio->io_offset - aio->io_offset, 0); } zio_add_child(dio, aio); From b7dfe0d8c737269f278de5abcfaeb6b74cbc5122 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 
2015 16:45:53 +0800 Subject: [PATCH 13/16] Handle abd_t in vdev_raidz.c Signed-off-by: Chunwei Chen --- module/zfs/vdev_raidz.c | 500 ++++++++++++++++++++++++++-------------- 1 file changed, 330 insertions(+), 170 deletions(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 493b332c4405..bdf4171445cd 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -103,7 +103,7 @@ typedef struct raidz_col { uint64_t rc_devidx; /* child device index for I/O */ uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ - void *rc_data; /* I/O data */ + abd_t *rc_data; /* I/O data */ void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ @@ -120,7 +120,7 @@ typedef struct raidz_map { uint64_t rm_firstdatacol; /* First data column/parity count */ uint64_t rm_nskip; /* Skipped sectors for padding */ uint64_t rm_skipstart; /* Column index of padding start */ - void *rm_datacopy; /* rm_asize-buffer of copied data */ + abd_t *rm_datacopy; /* rm_asize-buffer of copied data */ uintptr_t rm_reports; /* # of referencing checksum reports */ uint8_t rm_freed; /* map no longer has referencing ZIO */ uint8_t rm_ecksuminjected; /* checksum error was injected */ @@ -258,7 +258,7 @@ vdev_raidz_map_free(raidz_map_t *rm) size_t size; for (c = 0; c < rm->rm_firstdatacol; c++) { - zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + abd_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); if (rm->rm_col[c].rc_gdata != NULL) zio_buf_free(rm->rm_col[c].rc_gdata, @@ -266,11 +266,13 @@ vdev_raidz_map_free(raidz_map_t *rm) } size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + abd_put(rm->rm_col[c].rc_data); size += rm->rm_col[c].rc_size; + } if (rm->rm_datacopy != NULL) - zio_buf_free(rm->rm_datacopy, size); + abd_free(rm->rm_datacopy, size); kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } @@ -307,7 +309,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) size_t x; const char *good = NULL; - const char *bad = rm->rm_col[c].rc_data; + char *bad; if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); @@ -321,8 +323,9 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) * data never changes for a given logical ZIO) */ if (rm->rm_col[0].rc_gdata == NULL) { - char *bad_parity[VDEV_RAIDZ_MAXPARITY]; + abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; char *buf; + int offset; /* * Set up the rm_col[]s to generate the parity for @@ -331,14 +334,19 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) */ for (x = 0; x < rm->rm_firstdatacol; x++) { bad_parity[x] = rm->rm_col[x].rc_data; - rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = + rm->rm_col[x].rc_gdata = zio_buf_alloc(rm->rm_col[x].rc_size); + rm->rm_col[x].rc_data = + abd_get_from_buf(rm->rm_col[x].rc_gdata, + rm->rm_col[x].rc_size); } /* fill in the data columns from good_data */ buf = (char *)good_data; for (; x < rm->rm_cols; x++) { - rm->rm_col[x].rc_data = buf; + abd_put(rm->rm_col[x].rc_data); + rm->rm_col[x].rc_data = abd_get_from_buf(buf, + rm->rm_col[x].rc_size); buf += rm->rm_col[x].rc_size; } @@ -348,13 +356,17 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) vdev_raidz_generate_parity(rm); /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) + for (x = 0; x < 
rm->rm_firstdatacol; x++) { + abd_put(rm->rm_col[x].rc_data); rm->rm_col[x].rc_data = bad_parity[x]; + } - buf = rm->rm_datacopy; + offset = 0; for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - rm->rm_col[x].rc_data = buf; - buf += rm->rm_col[x].rc_size; + abd_put(rm->rm_col[x].rc_data); + rm->rm_col[x].rc_data = abd_get_offset( + rm->rm_datacopy, offset); + offset += rm->rm_col[x].rc_size; } } @@ -368,8 +380,10 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) good += rm->rm_col[x].rc_size; } + bad = abd_borrow_buf_copy(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); /* we drop the ereport if it ends up that the data was good */ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); + abd_return_buf(rm->rm_col[c].rc_data, bad, rm->rm_col[c].rc_size); } /* @@ -382,7 +396,7 @@ static void vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; - caddr_t buf; + size_t offset; raidz_map_t *rm = zio->io_vsd; size_t size; @@ -412,17 +426,22 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) size += rm->rm_col[c].rc_size; - buf = rm->rm_datacopy = zio_buf_alloc(size); + if (ABD_IS_LINEAR(rm->rm_col[rm->rm_firstdatacol].rc_data)) + rm->rm_datacopy = abd_alloc_linear(size); + else + rm->rm_datacopy = abd_alloc_scatter(size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { raidz_col_t *col = &rm->rm_col[c]; + abd_t *tmp = abd_get_offset(rm->rm_datacopy, offset); - bcopy(col->rc_data, buf, col->rc_size); - col->rc_data = buf; + abd_copy(tmp, col->rc_data, col->rc_size); + abd_put(col->rc_data); + col->rc_data = tmp; - buf += col->rc_size; + offset += col->rc_size; } - ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); + ASSERT3U(offset, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { @@ -451,6 +470,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, /* The starting byte offset on each child vdev. 
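
The rm_datacopy change above consolidates the scattered data columns into
one allocation and repoints each column at its slice. A compilable model
with plain memory standing in for abds and two made-up columns:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	uint8_t col0[8] = "AAAAAAA", col1[4] = "BBB";
	uint8_t *cols[2] = { col0, col1 };
	uint64_t sizes[2] = { sizeof (col0), sizeof (col1) };
	uint64_t size = sizeof (col0) + sizeof (col1);
	uint64_t off = 0;
	uint8_t *datacopy = malloc(size);
	int c;

	assert(datacopy != NULL);
	for (c = 0; c < 2; c++) {
		/*
		 * In the patch: tmp = abd_get_offset(rm_datacopy, off);
		 * abd_copy(tmp, col->rc_data, ...); col->rc_data = tmp;
		 */
		memcpy(datacopy + off, cols[c], sizes[c]);
		cols[c] = datacopy + off;
		off += sizes[c];
	}
	assert(off == size);	/* mirrors ASSERT3U(offset, ==, size) */
	free(datacopy);
	return (0);
}
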
*/ uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t off = 0; /* * "Quotient": The number of data sectors for this stripe on all but @@ -534,13 +554,16 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); + rm->rm_col[c].rc_data = + abd_alloc_linear(rm->rm_col[c].rc_size); - rm->rm_col[c].rc_data = zio->io_data; + rm->rm_col[c].rc_data = abd_get_offset(zio->io_data, 0); + off = rm->rm_col[c].rc_size; - for (c = c + 1; c < acols; c++) - rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + - rm->rm_col[c - 1].rc_size; + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_data = abd_get_offset(zio->io_data, off); + off += rm->rm_col[c].rc_size; + } /* * If all data stored spans all columns, there's a danger that parity @@ -582,29 +605,81 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, return (rm); } +struct pqr_struct { + uint64_t *p; + uint64_t *q; + uint64_t *r; +}; + +static int +vdev_raidz_p_func(const void *buf, uint64_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && !pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++) + *pqr->p ^= *src; + return (0); +} + +static int +vdev_raidz_pq_func(const void *buf, uint64_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + } + return (0); +} + +static int +vdev_raidz_pqr_func(const void *buf, uint64_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + VDEV_RAIDZ_64MUL_4(*pqr->r, mask); + *pqr->r ^= *src; + } + return (0); +} + static void vdev_raidz_generate_parity_p(raidz_map_t *rm) { - uint64_t *p, *src, pcount, ccount, i; + uint64_t *p; int c; - - pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + abd_t *src; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + p = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_P].rc_data); if (c == rm->rm_firstdatacol) { - ASSERT(ccount == pcount); - for (i = 0; i < ccount; i++, src++, p++) { - *p = *src; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); } else { - ASSERT(ccount <= pcount); - for (i = 0; i < ccount; i++, src++, p++) { - *p ^= *src; - } + struct pqr_struct pqr = { p, NULL, NULL }; + abd_iterate_rfunc(src, rm->rm_col[c].rc_size, + vdev_raidz_p_func, &pqr); } } } @@ -612,50 +687,43 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { - uint64_t *p, *q, *src, pcnt, ccnt, mask, i; + uint64_t *p, *q, pcnt, ccnt, mask, i; int c; + abd_t *src; - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 
rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + p = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_P].rc_data); + q = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_Q].rc_data); + + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccnt == pcnt || ccnt == 0); - for (i = 0; i < ccnt; i++, src++, p++, q++) { - *p = *src; - *q = *src; - } - for (; i < pcnt; i++, src++, p++, q++) { - *p = 0; - *q = 0; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + memcpy(q, p, rm->rm_col[c].rc_size); } else { - ASSERT(ccnt <= pcnt); - - /* - * Apply the algorithm described above by multiplying - * the previous result and adding in the new value. - */ - for (i = 0; i < ccnt; i++, src++, p++, q++) { - *p ^= *src; + struct pqr_struct pqr = { p, q, NULL }; + abd_iterate_rfunc(src, rm->rm_col[c].rc_size, + vdev_raidz_pq_func, &pqr); + } - VDEV_RAIDZ_64MUL_2(*q, mask); - *q ^= *src; + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; } - + } else { /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ - for (; i < pcnt; i++, q++) { - VDEV_RAIDZ_64MUL_2(*q, mask); + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); } } } @@ -664,10 +732,11 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) static void vdev_raidz_generate_parity_pqr(raidz_map_t *rm) { - uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; + uint64_t *p, *q, *r, pcnt, ccnt, mask, i; int c; + abd_t *src; - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == @@ -675,48 +744,36 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + p = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_P].rc_data); + q = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_Q].rc_data); + r = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_R].rc_data); - ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccnt == pcnt || ccnt == 0); - for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { - *p = *src; - *q = *src; - *r = *src; - } - for (; i < pcnt; i++, src++, p++, q++, r++) { - *p = 0; - *q = 0; - *r = 0; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + memcpy(q, p, rm->rm_col[c].rc_size); + memcpy(r, p, rm->rm_col[c].rc_size); } else { - ASSERT(ccnt <= pcnt); - - /* - * Apply the algorithm described above by multiplying - * the previous result and adding in the new value. - */ - for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { - *p ^= *src; - - VDEV_RAIDZ_64MUL_2(*q, mask); - *q ^= *src; + struct pqr_struct pqr = { p, q, r }; + abd_iterate_rfunc(src, rm->rm_col[c].rc_size, + vdev_raidz_pqr_func, &pqr); + } - VDEV_RAIDZ_64MUL_4(*r, mask); - *r ^= *src; + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; + r[i] = 0; } - + } else { /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. 
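
The parity generators above stop walking flat pointers and instead hand a
callback to an iterator that maps one chunk of the abd at a time, so scatter
buffers never need a contiguous mapping. A compilable toy of that shape,
with the P-parity XOR as the callback (toy_* names and the 16-byte chunk
size are illustrative):

#include <stdint.h>

typedef int (*toy_iter_func_t)(const void *buf, uint64_t size,
    void *private);

/* Toy abd_iterate_rfunc(): feed func one fixed-size chunk at a time. */
static int
toy_iterate_rfunc(const uint8_t *data, uint64_t size, toy_iter_func_t func,
    void *private)
{
	const uint64_t chunk = 16;	/* stands in for one mapped page */
	uint64_t off, n;
	int err;

	for (off = 0; off < size; off += chunk) {
		n = (size - off < chunk) ? size - off : chunk;
		if ((err = func(data + off, n, private)) != 0)
			return (err);
	}
	return (0);
}

/* Same shape as vdev_raidz_p_func(): XOR the chunk into the P column. */
static int
toy_p_func(const void *buf, uint64_t size, void *private)
{
	const uint64_t *src = buf;
	uint64_t **pp = private;	/* cursor into the parity column */
	uint64_t i, cnt = size / sizeof (src[0]);

	for (i = 0; i < cnt; i++)
		*(*pp)++ ^= src[i];
	return (0);
}

int
main(void)
{
	uint64_t col[4] = { 1, 2, 3, 4 };
	uint64_t parity[4] = { 0, 0, 0, 0 };
	uint64_t *cursor = parity;

	toy_iterate_rfunc((const uint8_t *)col, sizeof (col),
	    toy_p_func, &cursor);
	return (parity[2] == 3 ? 0 : 1);	/* 0 ^ 3 == 3 */
}
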
*/ - for (; i < pcnt; i++, q++, r++) { - VDEV_RAIDZ_64MUL_2(*q, mask); - VDEV_RAIDZ_64MUL_4(*r, mask); + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + VDEV_RAIDZ_64MUL_4(r[i], mask); } } } @@ -744,40 +801,126 @@ vdev_raidz_generate_parity(raidz_map_t *rm) } } +static int +vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, uint64_t dsize, + uint64_t ssize, void *private) +{ + uint64_t *dst = dbuf, *src = sbuf; + int i, cnt = dsize / sizeof (src[0]); + + ASSERT(dsize == ssize); + + for (i = 0; i < cnt; i++) { + dst[i] ^= src[i]; + } + + return (0); +} + +static int +vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, uint64_t dsize, + uint64_t ssize, void *private) +{ + uint64_t *dst = dbuf, *src = sbuf; + uint64_t mask; + int i, dcnt = dsize / sizeof (src[0]), scnt = ssize / sizeof (src[0]); + + ASSERT(dsize >= ssize); + + for (i = 0; i < scnt; i++, dst++, src++) { + VDEV_RAIDZ_64MUL_2(*dst, mask); + *dst ^= *src; + } + + for (; i < dcnt; i++, dst++) { + VDEV_RAIDZ_64MUL_2(*dst, mask); + } + return (0); +} + +struct reconst_q_struct { + uint64_t *q; + int exp; +}; + +static int +vdev_raidz_reconst_q_post_func(void *buf, uint64_t size, void *private) +{ + struct reconst_q_struct *rq = private; + uint64_t *dst = buf; + uint8_t *b; + int i, j, cnt = size / sizeof (dst[0]); + + for (i = 0; i < cnt; i++, dst++, rq->q++) { + *dst ^= *rq->q; + for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { + *b = vdev_raidz_exp2(*b, rq->exp); + } + } + return (0); +} + +struct reconst_pq_struct { + uint8_t *p; + uint8_t *q; + uint8_t *pxy; + uint8_t *qxy; + int aexp; + int bexp; +}; + +static int +vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, uint64_t xsize, + uint64_t ysize, void *private) +{ + struct reconst_pq_struct *rpq = private; + uint8_t *xd = xbuf, *yd = ybuf; + int i; + + ASSERT(xsize >= ysize); + + for (i = 0; i < xsize; i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, + xd++, yd++) { + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + + if (i < ysize) + *yd = *rpq->p ^ *rpq->pxy ^ *xd; + } + return (0); +} + static int vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) { - uint64_t *dst, *src, xcount, ccount, count, i; int x = tgts[0]; int c; + abd_t *dst, *src; ASSERT(ntgts == 1); ASSERT(x >= rm->rm_firstdatacol); ASSERT(x < rm->rm_cols); - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); - ASSERT(xcount > 0); + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); + ASSERT(rm->rm_col[x].rc_size > 0); src = rm->rm_col[VDEV_RAIDZ_P].rc_data; dst = rm->rm_col[x].rc_data; - for (i = 0; i < xcount; i++, dst++, src++) { - *dst = *src; - } + + abd_copy_from_buf(dst, ABD_TO_BUF(src), rm->rm_col[x].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + uint64_t size = MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); + src = rm->rm_col[c].rc_data; dst = rm->rm_col[x].rc_data; if (c == x) continue; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); - count = MIN(ccount, xcount); - - for (i = 0; i < count; i++, dst++, src++) { - *dst ^= *src; - } + abd_iterate_func2(dst, src, size, size, + vdev_raidz_reconst_p_func, NULL); } return (1 << VDEV_RAIDZ_P); @@ -786,44 +929,30 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) static int vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) { - uint64_t *dst, *src, xcount, ccount, count, mask, i; - uint8_t *b; int x = tgts[0]; - int c, j, 
exp; + int c, exp; + abd_t *dst, *src; + struct reconst_q_struct rq; ASSERT(ntgts == 1); - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); + src = rm->rm_col[c].rc_data; dst = rm->rm_col[x].rc_data; - if (c == x) - ccount = 0; - else - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); - - count = MIN(ccount, xcount); - if (c == rm->rm_firstdatacol) { - for (i = 0; i < count; i++, dst++, src++) { - *dst = *src; - } - for (; i < xcount; i++, dst++) { - *dst = 0; - } - + abd_copy(dst, src, size); + if (rm->rm_col[x].rc_size > size) + abd_zero_off(dst, rm->rm_col[x].rc_size - size, + size); } else { - for (i = 0; i < count; i++, dst++, src++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - *dst ^= *src; - } - - for (; i < xcount; i++, dst++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - } + abd_iterate_func2(dst, src, rm->rm_col[x].rc_size, + size, vdev_raidz_reconst_q_pre_func, NULL); } } @@ -831,12 +960,9 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) dst = rm->rm_col[x].rc_data; exp = 255 - (rm->rm_cols - 1 - x); - for (i = 0; i < xcount; i++, dst++, src++) { - *dst ^= *src; - for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { - *b = vdev_raidz_exp2(*b, exp); - } - } + rq = (struct reconst_q_struct) { ABD_TO_BUF(src), exp }; + abd_iterate_wfunc(dst, rm->rm_col[x].rc_size, + vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } @@ -844,11 +970,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) static int vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) { - uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; - void *pdata, *qdata; - uint64_t xsize, ysize, i; + uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; + abd_t *pdata, *qdata; + uint64_t xsize, ysize; int x = tgts[0]; int y = tgts[1]; + abd_t *xd, *yd; + struct reconst_pq_struct rpq; ASSERT(ntgts == 2); ASSERT(x < y); @@ -870,9 +998,9 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) ysize = rm->rm_col[y].rc_size; rm->rm_col[VDEV_RAIDZ_P].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size); rm->rm_col[VDEV_RAIDZ_Q].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size); rm->rm_col[x].rc_size = 0; rm->rm_col[y].rc_size = 0; @@ -881,10 +1009,10 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) rm->rm_col[x].rc_size = xsize; rm->rm_col[y].rc_size = ysize; - p = pdata; - q = qdata; - pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; - qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + p = ABD_TO_BUF(pdata); + q = ABD_TO_BUF(qdata); + pxy = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_P].rc_data); + qxy = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_Q].rc_data); xd = rm->rm_col[x].rc_data; yd = rm->rm_col[y].rc_data; @@ -910,17 +1038,13 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; - for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { - *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ - vdev_raidz_exp2(*q ^ *qxy, bexp); + rpq = (struct reconst_pq_struct) { p, q, pxy, qxy, aexp, bexp }; + abd_iterate_func2(xd, yd, xsize, ysize, + vdev_raidz_reconst_pq_func, &rpq); - if 
(i < ysize) - *yd = *p ^ *pxy ^ *xd; - } - - zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, + abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, rm->rm_col[VDEV_RAIDZ_P].rc_size); - zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, + abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, rm->rm_col[VDEV_RAIDZ_Q].rc_size); /* @@ -1245,7 +1369,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, c = used[i]; ASSERT3U(c, <, rm->rm_cols); - src = rm->rm_col[c].rc_data; + src = ABD_TO_BUF(rm->rm_col[c].rc_data); ccount = rm->rm_col[c].rc_size; for (j = 0; j < nmissing; j++) { cc = missing[j] + rm->rm_firstdatacol; @@ -1253,7 +1377,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, ASSERT3U(cc, <, rm->rm_cols); ASSERT3U(cc, !=, c); - dst[j] = rm->rm_col[cc].rc_data; + dst[j] = ABD_TO_BUF(rm->rm_col[cc].rc_data); dcount[j] = rm->rm_col[cc].rc_size; } @@ -1301,8 +1425,25 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; + abd_t **bufs = NULL; + int code = 0; + /* + * matrix reconstruction can use scatter buffer yet, so we allocate + * temporary linear abds. + */ + if (!ABD_IS_LINEAR(rm->rm_col[rm->rm_firstdatacol].rc_data)) { + bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + bufs[c] = col->rc_data; + col->rc_data = abd_alloc_linear(col->rc_size); + abd_copy(col->rc_data, bufs[c], col->rc_size); + } + } n = rm->rm_cols - rm->rm_firstdatacol; @@ -1389,6 +1530,20 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) kmem_free(p, psize); + /* + * copy back from temporary linear abds and free them + */ + if (bufs) { + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + abd_copy(bufs[c], col->rc_data, col->rc_size); + abd_free(col->rc_data, col->rc_size); + col->rc_data = bufs[c]; + } + kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + } + return (code); } @@ -1661,6 +1816,7 @@ vdev_raidz_io_start(zio_t *zio) static void raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) { + void *buf; vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -1674,9 +1830,11 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; + buf = abd_borrow_buf_copy(rc->rc_data, rc->rc_size); zfs_ereport_post_checksum(zio->io_spa, vd, zio, - rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, + rc->rc_offset, rc->rc_size, buf, bad_data, &zbc); + abd_return_buf(rc->rc_data, buf, rc->rc_size); } } @@ -1718,7 +1876,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) if (!rc->rc_tried || rc->rc_error != 0) continue; orig[c] = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig[c], rc->rc_size); + abd_copy_to_buf(orig[c], rc->rc_data, rc->rc_size); } vdev_raidz_generate_parity(rm); @@ -1727,7 +1885,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; - if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { + if (bcmp(orig[c], ABD_TO_BUF(rc->rc_data), rc->rc_size) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -1835,7 +1993,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) ASSERT3S(c, >=, 0); ASSERT3S(c, <, rm->rm_cols); rc = &rm->rm_col[c]; - bcopy(rc->rc_data, orig[i], rc->rc_size); + 
abd_copy_to_buf(orig[i], rc->rc_data, + rc->rc_size); } /* @@ -1866,7 +2025,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; - bcopy(orig[i], rc->rc_data, rc->rc_size); + abd_copy_from_buf(rc->rc_data, orig[i], + rc->rc_size); } do { From c740a3cbfb4356443d04dccf95e71952672c2197 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 14/16] Handle ABD in ztest and zdb Use the ABD API on related pointers and functions (b_data, db_data, zio_*(), etc.). Suggested-by: DHE Signed-off-by: Chunwei Chen --- cmd/zdb/zdb.c | 21 ++++++++++++--------- cmd/zdb/zdb_il.c | 5 ++++- cmd/ztest/ztest.c | 35 ++++++++++++++++++++++------------- 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 1d76f2a7da3e..5e940050ba5c 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1185,7 +1185,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, ASSERT(buf->b_data); /* recursively visit blocks below this */ - cbp = buf->b_data; + cbp = ABD_TO_BUF(buf->b_data); for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; @@ -1357,7 +1357,7 @@ dump_bptree(objset_t *os, uint64_t obj, char *name) return; VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; + bt = ABD_TO_BUF(db->db_data); zdb_nicenum(bt->bt_bytes, bytes); (void) printf("\n %s: %llu datasets, %s\n", name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); @@ -1807,7 +1807,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) if (error) fatal("dmu_bonus_hold(%llu) failed, errno %u", object, error); - bonus = db->db_data; + bonus = ABD_TO_BUF(db->db_data); bsize = db->db_size; dn = DB_DNODE((dmu_buf_impl_t *)db); } @@ -2030,7 +2030,7 @@ dump_config(spa_t *spa) spa->spa_config_object, FTAG, &db); if (error == 0) { - nvsize = *(uint64_t *)db->db_data; + nvsize = *(uint64_t *)ABD_TO_BUF(db->db_data); dmu_buf_rele(db, FTAG); (void) printf("\nMOS Configuration:\n"); @@ -2317,7 +2317,7 @@ zdb_blkptr_done(zio_t *zio) zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -2380,7 +2380,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); - void *data = zio_data_buf_alloc(size); + abd_t *data = abd_alloc_linear(size); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ @@ -3143,6 +3143,7 @@ zdb_read_block(char *thing, spa_t *spa) zio_t *zio; vdev_t *vd; void *pbuf, *lbuf, *buf; + abd_t *pbuf_abd; char *s, *p, *dup, *vdev, *flagstr; int i, error; @@ -3214,6 +3215,7 @@ zdb_read_block(char *thing, spa_t *spa) lsize = size; pbuf = umem_alloc_aligned(SPA_MAXBLOCKSIZE, 512, UMEM_NOFAIL); + pbuf_abd = abd_get_from_buf(pbuf, SPA_MAXBLOCKSIZE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); @@ -3241,15 +3243,15 @@ zdb_read_block(char *thing, spa_t *spa) /* * Treat this as a normal block read. */ - zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL, + zio_nowait(zio_read(zio, spa, bp, pbuf_abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. 
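 * i.e. read the raw bytes from the specified vdev at the given offset
 * rather than resolving a block pointer through the normal pipeline.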
*/ - zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize, - ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, + zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf_abd, + psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL)); @@ -3323,6 +3325,7 @@ zdb_read_block(char *thing, spa_t *spa) zdb_dump_block(thing, buf, size, flags); out: + abd_put(pbuf_abd); umem_free(pbuf, SPA_MAXBLOCKSIZE); umem_free(lbuf, SPA_MAXBLOCKSIZE); free(dup); diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index b85ef7ddd97e..02e772c0c8e4 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -125,6 +125,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; char buf[SPA_MAXBLOCKSIZE]; + abd_t *abd; int verbose = MAX(dump_opt['d'], dump_opt['i']); int error; @@ -158,9 +159,11 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); + abd = abd_get_from_buf(buf, BP_GET_LSIZE(bp)); error = zio_wait(zio_read(NULL, zilog->zl_spa, - bp, buf, BP_GET_LSIZE(bp), NULL, NULL, + bp, abd, BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); + abd_put(abd); if (error) return; data = buf; diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 0602a7ec54bf..2a0e15765622 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -1305,19 +1305,23 @@ ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) } static void -ztest_pattern_set(void *buf, uint64_t size, uint64_t value) +ztest_pattern_set(abd_t *abd, uint64_t size, uint64_t value) { + void *buf = abd_borrow_buf(abd, size); uint64_t *ip = buf; uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); while (ip < ip_end) *ip++ = value; + + abd_return_buf_copy(abd, buf, size); } #ifndef NDEBUG static boolean_t -ztest_pattern_match(void *buf, uint64_t size, uint64_t value) +ztest_pattern_match(abd_t *abd, uint64_t size, uint64_t value) { + void *buf = abd_borrow_buf_copy(abd, size); uint64_t *ip = buf; uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); uint64_t diff = 0; @@ -1325,6 +1329,7 @@ ztest_pattern_match(void *buf, uint64_t size, uint64_t value) while (ip < ip_end) diff |= (value - *ip++); + abd_return_buf(abd, buf, size); return (diff == 0); } #endif @@ -1364,7 +1369,8 @@ ztest_bt_bonus(dmu_buf_t *db) dmu_object_info_from_db(db, &doi); ASSERT3U(doi.doi_bonus_size, <=, db->db_size); ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); - bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); + bt = (void *)((char *)ABD_TO_BUF(db->db_data) + doi.doi_bonus_size - + sizeof (*bt)); return (bt); } @@ -1726,7 +1732,7 @@ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap) if (abuf == NULL) { dmu_write(os, lr->lr_foid, offset, length, data, tx); } else { - bcopy(data, abuf->b_data, length); + abd_copy_from_buf(abuf->b_data, data, length); dmu_assign_arcbuf(db, offset, abuf, tx); } @@ -4121,16 +4127,19 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; if (i != 5) { - bcopy((caddr_t)bigbuf + (off - bigoff), - bigbuf_arcbufs[j]->b_data, chunksize); + abd_copy_from_buf(bigbuf_arcbufs[j]->b_data, + (caddr_t)bigbuf + (off - bigoff), + chunksize); } else { - bcopy((caddr_t)bigbuf + (off - bigoff), + abd_copy_from_buf( bigbuf_arcbufs[2 * j]->b_data, + 
(caddr_t)bigbuf + (off - bigoff), chunksize / 2); - bcopy((caddr_t)bigbuf + (off - bigoff) + - chunksize / 2, + + abd_copy_from_buf( bigbuf_arcbufs[2 * j + 1]->b_data, - chunksize / 2); + (caddr_t)bigbuf + (off - bigoff) + + chunksize / 2, chunksize / 2); } if (i == 1) { @@ -5181,7 +5190,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) enum zio_checksum checksum = spa_dedup_checksum(spa); dmu_buf_t *db; dmu_tx_t *tx; - void *buf; + abd_t *buf; blkptr_t blk; int copies = 2 * ZIO_DEDUPDITTO_MIN; int i; @@ -5262,14 +5271,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) * Damage the block. Dedup-ditto will save us when we read it later. */ psize = BP_GET_PSIZE(&blk); - buf = zio_buf_alloc(psize); + buf = abd_alloc_linear(psize); ztest_pattern_set(buf, psize, ~pattern); (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); - zio_buf_free(buf, psize); + abd_free(buf, psize); (void) rw_unlock(&ztest_name_lock); umem_free(od, sizeof (ztest_od_t)); From 8a8a3e7a666de636db50e5c6b5799d30fad54207 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 15/16] Enable ABD Signed-off-by: Chunwei Chen --- include/sys/abd.h | 83 +--------------------------------------- lib/libzpool/Makefile.am | 2 +- module/zfs/Makefile.in | 1 + 3 files changed, 3 insertions(+), 83 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index 561b43e18188..64b34510828e 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -39,7 +39,7 @@ #ifdef __cplusplus extern "C" { #endif -#if 0 + #define ARC_BUF_DATA_MAGIC 0xa7cb0fda #if defined(ZFS_DEBUG) && !defined(_KERNEL) @@ -182,87 +182,6 @@ do { \ abd_copy_from_buf(a, b, n); \ abd_return_buf(a, b, n); \ } while (0) -#else /* 0 */ -typedef void abd_t; -#define ABD_TO_BUF(abd) ((void *)abd) -#define ABD_IS_SCATTER(abd) (0) -#define ABD_IS_LINEAR(abd) (1) -#define ASSERT_ABD_SCATTER(abd) ((void)0) -#define ASSERT_ABD_LINEAR(abd) ((void)0) -void *zio_buf_alloc(size_t); -void zio_buf_free(void *, size_t); -static inline abd_t *abd_alloc_linear(size_t size) -{ - return ((abd_t *)zio_buf_alloc(size)); -} -static inline void abd_free(abd_t *abd, size_t size) -{ - zio_buf_free((void *)abd, size); -} -#define abd_alloc_scatter abd_alloc_linear -#define abd_get_offset(abd, off) ((void *)(abd)+(off)) -#define abd_get_from_buf(buf, size) (buf) -#define abd_put(abd) do { } while (0) - -#define abd_iterate_rfunc(a, n, f, p) \ - (void) f(a, n, p) - -#define abd_iterate_wfunc(a, n, f, p) \ - (void) f(a, n, p) - -#define abd_iterate_func2(a, b, an, bn, f, p) \ - (void) f(a, b, an, bn, p) - -#define abd_copy_off(a, b, n, aoff, boff) \ - (void) memcpy((void *)(a)+(aoff), (void *)(b)+(boff), n) - -#define abd_copy_from_buf_off(a, b, n, off) \ - (void) memcpy((void *)(a)+(off), b, n) - -#define abd_copy_to_buf_off(a, b, n, off) \ - (void) memcpy(a, (void *)(b)+(off), n) - -#define abd_cmp(a, b, n) \ - memcmp(a, b, n) - -#define abd_cmp_buf_off(a, b, n, off) \ - memcmp((void *)(a)+(off), b, n) - -#define abd_zero_off(a, n, off) \ - (void) memset((void *)(a)+(off), 0, n) - -#ifdef _KERNEL -#define abd_copy_to_user_off(a, b, n, off) \ - copy_to_user(a, (void *)(b)+(off), n) - -#define abd_copy_from_user_off(a, b, n, off) \ - copy_from_user((void *)(a)+(off), b, n) - -#define abd_uiomove_off(p, n, rw, uio, off) \ - uiomove((void *)(p)+(off), n, rw, uio) - -#define abd_uiocopy_off(p, n, rw, uio, c, off) \ - uiocopy((void *)(p)+(off), n, rw, uio, c) - 
-#define abd_bio_map_off(bio, a, n, off) \ - bio_map(bio, (void *)(a)+(off), n) - -#define abd_bio_nr_pages_off(a, n, off) \ - bio_nr_pages((void *)(a)+(off), n) -#endif /* _KERNEL */ - -#define abd_borrow_buf(a, n) \ - ((void *)a) - -#define abd_borrow_buf_copy(a, n) \ - ((void *)a) - -#define abd_return_buf(a, b, n) \ - do { } while (0) - -#define abd_return_buf_copy(a, b, n) \ - do { } while (0) -#endif /* 0 */ /* * Wrappers for zero off functions diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index caa64787a95b..60bd06b7d07b 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -20,6 +20,7 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zcommon/zfs_uio.c \ $(top_srcdir)/module/zcommon/zpool_prop.c \ $(top_srcdir)/module/zcommon/zprop_common.c \ + $(top_srcdir)/module/zfs/abd.c \ $(top_srcdir)/module/zfs/arc.c \ $(top_srcdir)/module/zfs/blkptr.c \ $(top_srcdir)/module/zfs/bplist.c \ @@ -110,7 +111,6 @@ libzpool_la_LIBADD += $(ZLIB) libzpool_la_LDFLAGS = -version-info 2:0:0 EXTRA_DIST = \ - $(top_srcdir)/module/zfs/abd.c \ $(top_srcdir)/module/zfs/vdev_disk.c \ $(top_srcdir)/module/zfs/zfs_acl.c \ $(top_srcdir)/module/zfs/zfs_ctldir.c \ diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 954841f33137..e24a53d5eaf6 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -4,6 +4,7 @@ EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@ obj-$(CONFIG_ZFS) := $(MODULE).o +$(MODULE)-objs += @top_srcdir@/module/zfs/abd.o $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o $(MODULE)-objs += @top_srcdir@/module/zfs/blkptr.o $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o From c96951878a4e6682ffc48655d65bd07867223eea Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Mon, 17 Aug 2015 15:37:56 +0800 Subject: [PATCH 16/16] Sync with ABD main branch Signed-off-by: Chunwei Chen --- include/sys/abd.h | 141 ++++++++++++---------- module/zfs/abd.c | 250 +++++++++++++++++++++++++++------------- module/zfs/dbuf.c | 2 +- module/zfs/spa_misc.c | 2 + module/zfs/vdev_disk.c | 10 +- module/zfs/vdev_raidz.c | 2 +- module/zfs/zio.c | 2 - 7 files changed, 263 insertions(+), 146 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index 64b34510828e..38a3dbc50f01 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -20,15 +20,14 @@ */ /* - * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. */ /* * ABD - ARC buffer data * ABD is an abstract data structure for ARC. There are two types of ABD: * linear for metadata and scatter for data. - * Their type is determined by the lowest bit of abd_t pointer. - * The public API will automatically determine the type + * The public API will automatically determine the type. 
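+ * Scatter ABDs are backed by individually allocated pages tracked in a
+ * scatterlist, so they do not require a large virtually contiguous
+ * allocation.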
*/ #ifndef _ABD_H @@ -42,14 +41,8 @@ extern "C" { #define ARC_BUF_DATA_MAGIC 0xa7cb0fda -#if defined(ZFS_DEBUG) && !defined(_KERNEL) -#define DEBUG_ABD -#endif typedef struct arc_buf_data { -#ifdef DEBUG_ABD - char pad[PAGE_SIZE]; /* debug, coredumps when accessed */ -#endif uint32_t abd_magic; /* ARC_BUF_DATA_MAGIC */ uint32_t abd_flags; size_t abd_size; /* buffer size, excluding offset */ @@ -59,34 +52,40 @@ typedef struct arc_buf_data { struct scatterlist *abd_sgl; void *abd_buf; }; - uint64_t __abd_sgl[0]; } abd_t; -#define ABD_F_SCATTER (0x0) -#define ABD_F_LINEAR (0x1) -#define ABD_F_OWNER (0x2) - -/* - * Convert an linear ABD to normal buffer - */ -#define ABD_TO_BUF(abd) \ -( \ -{ \ - ASSERT((abd)->abd_magic == ARC_BUF_DATA_MAGIC); \ - ASSERT_ABD_LINEAR(abd); \ - abd->abd_buf; \ -} \ -) +#define ABD_F_SCATTER (0) /* abd is scatter */ +#define ABD_F_LINEAR (1) /* abd is linear */ +#define ABD_F_OWNER (1<<1) /* abd owns the buffer */ +#define ABD_F_HIGHMEM (1<<2) /* abd uses highmem */ +#define ABD_F_SG_CHAIN (1<<3) /* scatterlist is chained */ #define ABD_IS_SCATTER(abd) (!((abd)->abd_flags & ABD_F_LINEAR)) #define ABD_IS_LINEAR(abd) (!ABD_IS_SCATTER(abd)) #define ASSERT_ABD_SCATTER(abd) ASSERT(ABD_IS_SCATTER(abd)) #define ASSERT_ABD_LINEAR(abd) ASSERT(ABD_IS_LINEAR(abd)) +/* + * Convert a linear ABD to a normal buffer + */ +static inline void * +abd_to_buf(abd_t *abd) +{ + ASSERT(abd->abd_magic == ARC_BUF_DATA_MAGIC); + ASSERT_ABD_LINEAR(abd); + return (abd->abd_buf); +} +#define ABD_TO_BUF(abd) abd_to_buf(abd) + +void abd_init(void); +void abd_fini(void); + /* * Allocations and deallocations */ -abd_t *abd_alloc_scatter(size_t); +abd_t *_abd_alloc_scatter(size_t, int); +#define abd_alloc_scatter(s) _abd_alloc_scatter(s, 1) +#define abd_alloc_meta_scatter(s) _abd_alloc_scatter(s, 0) abd_t *abd_alloc_linear(size_t); void abd_free(abd_t *, size_t); abd_t *abd_get_offset(abd_t *, size_t); @@ -108,6 +107,21 @@ void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); int abd_cmp(abd_t *, abd_t *, size_t); int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); void abd_zero_off(abd_t *, size_t, size_t); +void *abd_buf_segment(abd_t *, size_t, size_t); +/* + * abd_array_off - returns an object in an array contained in @abd + * + * What this function does is essentially: + * &((type *)(abd + off))[index] + * except that @abd is an ABD buffer, not a normal buffer. + * This function is implemented using abd_buf_segment, so all of its + * restrictions also apply. + * Use abd_array if off is 0. + */ +#define abd_array_off(abd, index, type, off) \ + ((type *)abd_buf_segment(abd, (off) + (index)*sizeof (type), \ + sizeof (type))) + #ifdef _KERNEL int abd_copy_to_user_off(void __user *, abd_t *, size_t, size_t); int abd_copy_from_user_off(abd_t *, const void __user *, size_t, size_t); @@ -131,57 +145,61 @@ unsigned long abd_bio_nr_pages_off(abd_t *, unsigned int, size_t); ) #endif /* _KERNEL */ +/* forward declaration for abd_borrow_buf, etc. 
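+ * (zio_buf_alloc/zio_buf_free are defined in zio.c; declaring them here
+ * means this header need not pull in zio.h)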
*/ +void *zio_buf_alloc(size_t size); +void zio_buf_free(void *buf, size_t size); + /* * Borrow a linear buffer for an ABD * Will allocate if ABD is scatter */ -#define abd_borrow_buf(a, n) \ -( \ -{ \ - void *___b; \ - if (ABD_IS_LINEAR(a)) { \ - ___b = ABD_TO_BUF(a); \ - } else { \ - ___b = zio_buf_alloc(n); \ - } \ - ___b; \ -} \ -) +static inline void* +abd_borrow_buf(abd_t *abd, size_t size) +{ + if (!abd) + return (NULL); + if (ABD_IS_LINEAR(abd)) + return (ABD_TO_BUF(abd)); + return (zio_buf_alloc(size)); +} /* * Borrow a linear buffer for an ABD * Will allocate and copy if ABD is scatter */ -#define abd_borrow_buf_copy(a, n) \ -( \ -{ \ - void *___b = abd_borrow_buf(a, n); \ - if (!ABD_IS_LINEAR(a)) \ - abd_copy_to_buf(___b, a, n); \ - ___b; \ -} \ -) +static inline void * +abd_borrow_buf_copy(abd_t *abd, size_t size) +{ + void *buf = abd_borrow_buf(abd, size); + if (buf && !ABD_IS_LINEAR(abd)) + abd_copy_to_buf_off(buf, abd, size, 0); + return (buf); +} /* * Return the borrowed linear buffer */ -#define abd_return_buf(a, b, n) \ -do { \ - if (ABD_IS_LINEAR(a)) \ - ASSERT((b) == ABD_TO_BUF(a)); \ - else \ - zio_buf_free(b, n); \ -} while (0) +static inline void +abd_return_buf(abd_t *abd, void *buf, size_t size) +{ + if (buf) { + if (ABD_IS_LINEAR(abd)) + ASSERT(buf == ABD_TO_BUF(abd)); + else + zio_buf_free(buf, size); + } +} /* * Copy back to ABD and return the borrowed linear buffer */ -#define abd_return_buf_copy(a, b, n) \ -do { \ - if (!ABD_IS_LINEAR(a)) \ - abd_copy_from_buf(a, b, n); \ - abd_return_buf(a, b, n); \ -} while (0) +static inline void +abd_return_buf_copy(abd_t *abd, void *buf, size_t size) +{ + if (buf && !ABD_IS_LINEAR(abd)) + abd_copy_from_buf_off(abd, buf, size, 0); + abd_return_buf(abd, buf, size); +} /* * Wrappers for zero off functions @@ -201,6 +219,9 @@ do { \ #define abd_zero(abd, size) \ abd_zero_off(abd, size, 0) +#define abd_array(abd, index, type) \ + abd_array_off(abd, index, type, 0) + #ifdef _KERNEL #define abd_copy_to_user(buf, abd, size) \ abd_copy_to_user_off(buf, abd, size, 0) diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 8d599e8d1b07..2e44545bfe2b 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 
*/ #include @@ -43,17 +43,17 @@ /* * page */ -#ifndef PAGE_SIZE -#define PAGE_SIZE 4096 +#ifndef PAGE_SHIFT +#define PAGE_SHIFT (highbit64(PAGESIZE)-1) #endif struct page; #define alloc_page(gfp) \ - ((struct page *)umem_alloc_aligned(PAGE_SIZE, PAGE_SIZE, UMEM_DEFAULT)) + ((struct page *)umem_alloc_aligned(PAGESIZE, PAGESIZE, UMEM_DEFAULT)) #define __free_page(page) \ - umem_free(page, PAGE_SIZE) + umem_free(page, PAGESIZE) /* * scatterlist @@ -70,9 +70,13 @@ sg_init_table(struct scatterlist *sg, int nr) { sg[nr - 1].end = 1; } +#define for_each_sg(sgl, sg, nr, i) \ + for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) + static inline void sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, - unsigned int offset) { + unsigned int offset) +{ /* currently we don't use offset */ ASSERT(offset == 0); sg->page = page; @@ -80,7 +84,8 @@ sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, } static inline struct page * -sg_page(struct scatterlist *sg) { +sg_page(struct scatterlist *sg) +{ return (sg->page); } @@ -103,6 +108,7 @@ sg_next(struct scatterlist *sg) #define unlikely(x) (x) #endif +#define page_address(page) ((void *)page) #define kmap(page) ((void *)page) #define kunmap(page) do { } while (0) #define zfs_kmap_atomic(page, type) ((void *)page) @@ -231,7 +237,8 @@ abd_miter_unmap_x(struct abd_miter *aiter, int atomic) ASSERT(aiter->addr); if (aiter->is_linear) { - pagefault_enable(); + if (atomic) + pagefault_enable(); } else { paddr = aiter->addr - aiter->offset; if (atomic) { @@ -306,19 +313,17 @@ abd_miter_advance(struct abd_miter *aiter, int offset) } #define ABD_CHECK(abd) \ -( \ -{ \ +do { \ ASSERT((abd)->abd_magic == ARC_BUF_DATA_MAGIC); \ ASSERT((abd)->abd_size > 0); \ if (ABD_IS_LINEAR(abd)) { \ ASSERT((abd)->abd_offset == 0); \ ASSERT((abd)->abd_nents == 1); \ } else { \ - ASSERT((abd)->abd_offset < PAGE_SIZE); \ + ASSERT((abd)->abd_offset < PAGESIZE); \ ASSERT((abd)->abd_nents > 0); \ } \ -} \ -) +} while (0) static void abd_iterate_func(abd_t *abd, size_t size, @@ -649,6 +654,49 @@ abd_zero_off(abd_t *abd, size_t size, size_t off) } } +/* + * abd_buf_segment - returns a pointer to a buffer range in ABD. + * @start is the starting offset in the @abd + * @len is the length of the buffer range + * + * @abd must not be highmem scatter ABD. If @abd is linear, the range + * specified by @start and @len should be in the range of @abd. If @abd is + * scatter, the range should not cross page boundary. + * This function is mainly used for allowing *_phys_t to point to scatter ABD. + */ +void * +abd_buf_segment(abd_t *abd, size_t start, size_t len) +{ + struct scatterlist *sg; + struct abd_miter aiter; + size_t offset; + ABD_CHECK(abd); + ASSERT(!(abd->abd_flags & ABD_F_HIGHMEM)); + ASSERT(start + len <= abd->abd_size); + + if (ABD_IS_LINEAR(abd)) + return (abd->abd_buf + start); + + /* + * If the scatterlist is not chained, we can safely treat it as an + * array. Otherwise we need to walk the chained scatterlist via miter. + */ + if (!(abd->abd_flags & ABD_F_SG_CHAIN)) { + offset = abd->abd_offset + start; + sg = &abd->abd_sgl[offset >> PAGE_SHIFT]; + offset &= (PAGESIZE -1); + } else { + abd_miter_init(&aiter, abd, ABD_MITER_R); + abd_miter_advance(&aiter, start); + sg = aiter.sg; + offset = aiter.offset; + } + + ASSERT(offset + len <= sg->length); + + return (page_address(sg_page(sg)) + offset); +} + #ifdef _KERNEL /* * Copy from @abd to user buffer @buf. 
@@ -898,49 +946,11 @@ abd_bio_nr_pages_off(abd_t *abd, unsigned int bio_size, size_t off) pos = (unsigned long)abd->abd_buf + off; else pos = abd->abd_offset + off; - return ((pos + bio_size + PAGE_SIZE-1)>>PAGE_SHIFT)-(pos>>PAGE_SHIFT); + return ((pos + bio_size + PAGESIZE-1)>>PAGE_SHIFT)-(pos>>PAGE_SHIFT); } #endif /* _KERNEL */ -static inline abd_t * -abd_alloc_struct(int nr_pages) -{ - abd_t *abd; - size_t asize = sizeof (abd_t) + nr_pages*sizeof (struct scatterlist); - /* - * If the maximum block size increases, inline sgl might not fit into - * a single page. We might want to consider using chained sgl if - * that's the case. - */ - ASSERT(nr_pages * sizeof (struct scatterlist) <= PAGE_SIZE); -#ifndef DEBUG_ABD - abd = kmem_alloc(asize, KM_PUSHPAGE); -#else - abd = umem_alloc_aligned(asize, PAGE_SIZE, UMEM_DEFAULT); - /* deny access to padding */ - if (mprotect(abd, PAGE_SIZE, PROT_NONE) != 0) { - perror("mprotect failed"); - ASSERT(0); - } -#endif - ASSERT(abd); - - return (abd); -} - -static inline void -abd_free_struct(abd_t *abd, int nr_pages) -{ -#ifndef DEBUG_ABD - kmem_free(abd, sizeof (abd_t) + nr_pages*sizeof (struct scatterlist)); -#else - if (mprotect(abd, PAGE_SIZE, PROT_READ|PROT_WRITE) != 0) { - perror("mprotect failed"); - ASSERT(0); - } - umem_free(abd, sizeof (abd_t) + nr_pages*sizeof (struct scatterlist)); -#endif -} +static kmem_cache_t *abd_struct_cache = NULL; /* * Allocate a new ABD to point to offset @off of the original ABD. @@ -957,23 +967,35 @@ abd_get_offset(abd_t *sabd, size_t off) ABD_CHECK(sabd); ASSERT(off <= sabd->abd_size); - abd = abd_alloc_struct(0); + abd = kmem_cache_alloc(abd_struct_cache, KM_PUSHPAGE); abd->abd_magic = ARC_BUF_DATA_MAGIC; abd->abd_size = sabd->abd_size - off; + abd->abd_flags = sabd->abd_flags & ~ABD_F_OWNER; if (ABD_IS_LINEAR(sabd)) { - abd->abd_flags = ABD_F_LINEAR; abd->abd_offset = 0; abd->abd_nents = 1; abd->abd_buf = sabd->abd_buf + off; - } else { - abd->abd_flags = ABD_F_SCATTER; + } else if (!(sabd->abd_flags & ABD_F_SG_CHAIN)) { + /* scatterlist is not chained, treat it as an array. */ offset = sabd->abd_offset + off; - abd->abd_offset = offset & (PAGE_SIZE - 1); + abd->abd_offset = offset & (PAGESIZE - 1); /* make sure the new abd start as sgl[0] */ abd->abd_sgl = &sabd->abd_sgl[offset >> PAGE_SHIFT]; abd->abd_nents = sabd->abd_nents - (offset >> PAGE_SHIFT); + } else { + /* Chained scatterlist, need to walk through it. 
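+ * A chained scatterlist cannot be indexed as a flat array, so we
+ * advance one entry (one page) at a time with sg_next().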
*/ + abd->abd_sgl = sabd->abd_sgl; + abd->abd_nents = sabd->abd_nents; + + offset = sabd->abd_offset + off; + while (offset >= PAGESIZE) { + abd->abd_sgl = sg_next(abd->abd_sgl); + abd->abd_nents--; + offset -= PAGESIZE; + } + abd->abd_offset = offset; } return (abd); @@ -988,7 +1010,7 @@ abd_get_from_buf(void *buf, size_t size) { abd_t *abd; - abd = abd_alloc_struct(0); + abd = kmem_cache_alloc(abd_struct_cache, KM_PUSHPAGE); abd->abd_magic = ARC_BUF_DATA_MAGIC; abd->abd_flags = ABD_F_LINEAR; @@ -1012,45 +1034,102 @@ abd_put(abd_t *abd) ASSERT(!(abd->abd_flags & ABD_F_OWNER)); abd->abd_magic = 0; - abd_free_struct(abd, 0); + kmem_cache_free(abd_struct_cache, abd); +} + +static void +abd_sg_alloc_table(abd_t *abd) +{ + int n = abd->abd_nents; +#if defined(_KERNEL) && \ + (defined(CONFIG_ARCH_HAS_SG_CHAIN) || defined(ARCH_HAS_SG_CHAIN)) + struct sg_table table; + while (sg_alloc_table(&table, n, GFP_NOIO)) + schedule_timeout(1); + + ASSERT3U(table.nents, ==, n); + abd->abd_sgl = table.sgl; + /* scatterlist is chained (see sg_alloc_table) */ + if (n > SG_MAX_SINGLE_ALLOC) + abd->abd_flags |= ABD_F_SG_CHAIN; +#else + /* + * Unfortunately, some architectures don't support chained + * scatterlists. For them and for user space, we use a contiguous + * scatterlist. For a 16MB buffer size with 4KB pages, this would + * mean around 128KB of scatterlist. + */ + abd->abd_sgl = vmem_alloc(n * sizeof (struct scatterlist), KM_PUSHPAGE); + ASSERT(abd->abd_sgl); + sg_init_table(abd->abd_sgl, n); +#endif +} + +static void +abd_sg_free_table(abd_t *abd) +{ +#if defined(_KERNEL) && \ + (defined(CONFIG_ARCH_HAS_SG_CHAIN) || defined(ARCH_HAS_SG_CHAIN)) + struct sg_table table; + table.sgl = abd->abd_sgl; + table.nents = table.orig_nents = abd->abd_nents; + sg_free_table(&table); +#else + vmem_free(abd->abd_sgl, abd->abd_nents * sizeof (struct scatterlist)); +#endif } /* * Allocate a scatter ABD + * + * @highmem indicates whether the pages should be in highmem. + * Highmem is mainly for userdata, while non-highmem is mainly for metadata + * which allows scatter ABD. */ -abd_t * -abd_alloc_scatter(size_t size) +static abd_t * +__abd_alloc_scatter(size_t size, int highmem) { abd_t *abd; struct page *page; - int i, n = DIV_ROUND_UP(size, PAGE_SIZE); + struct scatterlist *sg; + int i, n = DIV_ROUND_UP(size, PAGESIZE); size_t last_size = size - ((n-1) << PAGE_SHIFT); - abd = abd_alloc_struct(n); + abd = kmem_cache_alloc(abd_struct_cache, KM_PUSHPAGE); abd->abd_magic = ARC_BUF_DATA_MAGIC; abd->abd_flags = ABD_F_SCATTER|ABD_F_OWNER; + if (highmem) + abd->abd_flags |= ABD_F_HIGHMEM; abd->abd_size = size; abd->abd_offset = 0; abd->abd_nents = n; - abd->abd_sgl = (struct scatterlist *)&abd->__abd_sgl[0]; - sg_init_table(abd->abd_sgl, n); - for (i = 0; i < n; i++) { + abd_sg_alloc_table(abd); + + for_each_sg(abd->abd_sgl, sg, n, i) { retry: - page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); + page = alloc_page(GFP_NOIO|(highmem ? __GFP_HIGHMEM : 0)); if (unlikely(page == NULL)) { set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(1); goto retry; } - sg_set_page(&abd->abd_sgl[i], page, - (i == n-1 ? last_size : PAGE_SIZE), 0); + sg_set_page(sg, page, (i == n-1 ? 
last_size : PAGESIZE), 0); } return (abd); } +abd_t * +_abd_alloc_scatter(size_t size, int highmem) +{ + /* fallback to linear to save memory */ + if (size < PAGESIZE) + return (abd_alloc_linear(size)); + return (__abd_alloc_scatter(size, highmem)); +} + /* * Allocate a linear ABD */ @@ -1059,7 +1138,7 @@ abd_alloc_linear(size_t size) { abd_t *abd; - abd = abd_alloc_struct(0); + abd = kmem_cache_alloc(abd_struct_cache, KM_PUSHPAGE); abd->abd_magic = ARC_BUF_DATA_MAGIC; abd->abd_flags = ABD_F_LINEAR|ABD_F_OWNER; @@ -1076,20 +1155,21 @@ static void abd_free_scatter(abd_t *abd, size_t size) { int i, n; + struct scatterlist *sg; struct page *page; - ASSERT(abd->abd_sgl == (struct scatterlist *)&abd->__abd_sgl[0]); - ASSERT(abd->abd_size == size); - ASSERT(abd->abd_nents == DIV_ROUND_UP(abd->abd_size, PAGE_SIZE)); + ASSERT(abd->abd_nents == DIV_ROUND_UP(abd->abd_size, PAGESIZE)); n = abd->abd_nents; abd->abd_magic = 0; - for (i = 0; i < n; i++) { - page = sg_page(&abd->abd_sgl[i]); + for_each_sg(abd->abd_sgl, sg, n, i) { + page = sg_page(sg); if (page) __free_page(page); } - abd_free_struct(abd, n); + + abd_sg_free_table(abd); + kmem_cache_free(abd_struct_cache, abd); } static void @@ -1097,7 +1177,7 @@ abd_free_linear(abd_t *abd, size_t size) { abd->abd_magic = 0; zio_buf_free(abd->abd_buf, size); - abd_free_struct(abd, 0); + kmem_cache_free(abd_struct_cache, abd); } /* @@ -1109,8 +1189,22 @@ abd_free(abd_t *abd, size_t size) { ABD_CHECK(abd); ASSERT(abd->abd_flags & ABD_F_OWNER); + ASSERT(abd->abd_size == size); if (ABD_IS_LINEAR(abd)) abd_free_linear(abd, size); else abd_free_scatter(abd, size); } + +void +abd_init(void) +{ + abd_struct_cache = kmem_cache_create("abd_struct", sizeof (abd_t), 0, + NULL, NULL, NULL, NULL, NULL, 0); +} + +void +abd_fini(void) +{ + kmem_cache_destroy(abd_struct_cache); +} diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 1c380e86f3ec..0c6a61191921 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -3000,7 +3000,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ - void *contents = (data != NULL) ? data->b_data : NULL; + abd_t *contents = (data != NULL) ? data->b_data : NULL; dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, contents, db->db.db_size, &zp, diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 7a96ea18bfc5..f51754838c41 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1736,6 +1736,7 @@ spa_init(int mode) } #endif + abd_init(); fm_init(); refcount_init(); unique_init(); @@ -1770,6 +1771,7 @@ spa_fini(void) unique_fini(); refcount_fini(); fm_fini(); + abd_fini(); avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 6dfaf2578daa..5ffcc275c410 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -535,10 +535,12 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, * their volume block size to match the maximum request size and * the common case will be one bio per vdev IO request. 
*/ - if (zio) - zio_data = zio->io_data; - else - zio_data = abd_get_from_buf(kbuf_ptr, kbuf_size); + if (zio_data == NULL) { + if (zio) + zio_data = zio->io_data; + else + zio_data = abd_get_from_buf(kbuf_ptr, kbuf_size); + } zio_offset = 0; bio_offset = kbuf_offset; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index bdf4171445cd..7fdb668a783e 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1430,7 +1430,7 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) int code = 0; /* - * matrix reconstruction can use scatter buffer yet, so we allocate + * matrix reconstruction cannot use scatter buffer yet, so we allocate * temporary linear abds. */ if (!ABD_IS_LINEAR(rm->rm_col[rm->rm_firstdatacol].rc_data)) { diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 6dc8e263b30d..24a98887ec01 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -263,8 +263,6 @@ zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, if (ABD_IS_LINEAR(zio->io_data)) ASSERT_ABD_LINEAR(data); - else - ASSERT_ABD_SCATTER(data); zt->zt_orig_data = zio->io_data; zt->zt_orig_size = zio->io_size;
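
Note for reviewers: the conversion pattern throughout this series is to route bulk data access through the iterator API instead of assuming a flat buffer. As a quick orientation, here is a minimal consumer sketch written against the interfaces above; xor_abd, xor_cb and xor_ctx are hypothetical names for illustration only, and the callback signature is assumed to match the one used by vdev_raidz_p_func.

struct xor_ctx {
	uint64_t acc;
};

/* Invoked once per mapped segment; buf is only valid during the call. */
static int
xor_cb(const void *buf, uint64_t size, void *private)
{
	struct xor_ctx *ctx = private;
	const uint64_t *w = buf;
	uint64_t i, cnt = size / sizeof (w[0]);

	for (i = 0; i < cnt; i++)
		ctx->acc ^= w[i];
	return (0);
}

/*
 * Works for linear and scatter ABDs alike: abd_iterate_rfunc maps each
 * segment in turn and hands it to the callback, exactly as the raidz
 * parity functions are driven.
 */
static uint64_t
xor_abd(abd_t *abd, size_t size)
{
	struct xor_ctx ctx = { 0 };

	abd_iterate_rfunc(abd, size, xor_cb, &ctx);
	return (ctx.acc);
}

Code that still needs a contiguous view of the data should instead borrow one with abd_borrow_buf_copy()/abd_return_buf(), as raidz_checksum_error does.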