From cf1f40f6bb21d618bcdf88f1de632eb4a57839c8 Mon Sep 17 00:00:00 2001
From: Chunwei Chen
Date: Wed, 11 Feb 2015 16:45:53 +0800
Subject: [PATCH 01/26] Add compatibility layer for {kmap,kunmap}_atomic

Starting from Linux 2.6.37, {kmap,kunmap}_atomic take one argument
instead of two.

Signed-off-by: Chunwei Chen
---
 config/kernel-kmap-atomic-args.m4 | 24 +++++++++++++++++++
 config/kernel.m4                  |  1 +
 include/linux/Makefile.am         |  3 ++-
 include/linux/kmap_compat.h       | 40 +++++++++++++++++++++++++++
 4 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100644 config/kernel-kmap-atomic-args.m4
 create mode 100644 include/linux/kmap_compat.h

diff --git a/config/kernel-kmap-atomic-args.m4 b/config/kernel-kmap-atomic-args.m4
new file mode 100644
index 000000000000..f7228907af49
--- /dev/null
+++ b/config/kernel-kmap-atomic-args.m4
@@ -0,0 +1,24 @@
+dnl #
+dnl # 2.6.37 API change
+dnl # kmap_atomic changed from assigning a hard-coded named slot to using
+dnl # push/pop based dynamic allocation.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS], [
+	AC_MSG_CHECKING([whether kmap_atomic wants 1 arg])
+	ZFS_LINUX_TRY_COMPILE([
+		#include
+
+		void test_kmap_atomic(void)
+		{
+			struct page page;
+			kmap_atomic(&page);
+		}
+	],[
+	],[
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_1ARG_KMAP_ATOMIC, 1,
+		    [kmap_atomic wants 1 arg])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel.m4 b/config/kernel.m4
index ce10707e4954..ec25b64b110a 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -96,6 +96,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
 	ZFS_AC_KERNEL_5ARG_SGET
 	ZFS_AC_KERNEL_LSEEK_EXECUTE
 	ZFS_AC_KERNEL_VFS_ITERATE
+	ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS
 
 	AS_IF([test "$LINUX_OBJ" != "$LINUX"], [
 		KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$LINUX_OBJ"
diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am
index d00b1c8ad798..595d1db01128 100644
--- a/include/linux/Makefile.am
+++ b/include/linux/Makefile.am
@@ -5,7 +5,8 @@ KERNEL_H = \
 	$(top_srcdir)/include/linux/xattr_compat.h \
 	$(top_srcdir)/include/linux/vfs_compat.h \
 	$(top_srcdir)/include/linux/blkdev_compat.h \
-	$(top_srcdir)/include/linux/utsname_compat.h
+	$(top_srcdir)/include/linux/utsname_compat.h \
+	$(top_srcdir)/include/linux/kmap_compat.h
 
 USER_H =
 
diff --git a/include/linux/kmap_compat.h b/include/linux/kmap_compat.h
new file mode 100644
index 000000000000..f581fb20965a
--- /dev/null
+++ b/include/linux/kmap_compat.h
@@ -0,0 +1,40 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
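+ *
+ * Usage of the wrappers declared below, as a minimal illustrative
+ * sketch (not part of the original change; "pg" is a page the caller
+ * already holds):
+ *
+ *	void *va = zfs_kmap_atomic(pg, KM_USER0);
+ *	memset(va, 0, PAGE_SIZE);
+ *	zfs_kunmap_atomic(va, KM_USER0);
+ *
+ * Callers always pass a km_type; on kernels >= 2.6.37 the extra
+ * argument simply compiles away.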
+ */
+
+#ifndef	_ZFS_KMAP_H
+#define	_ZFS_KMAP_H
+
+#include
+
+#ifdef HAVE_1ARG_KMAP_ATOMIC
+/* 2.6.37 API change */
+#define	zfs_kmap_atomic(page, km_type)		kmap_atomic(page)
+#define	zfs_kunmap_atomic(addr, km_type)	kunmap_atomic(addr)
+#else
+#define	zfs_kmap_atomic(page, km_type)		kmap_atomic(page, km_type)
+#define	zfs_kunmap_atomic(addr, km_type)	kunmap_atomic(addr, km_type)
+#endif
+
+#endif	/* _ZFS_KMAP_H */

From 19b05897adb40a37d29a0efd6b8183a32a949862 Mon Sep 17 00:00:00 2001
From: Chunwei Chen
Date: Wed, 11 Feb 2015 16:45:53 +0800
Subject: [PATCH 02/26] Introduce ABD: linear/scatter dual-typed buffer for
 ARC

ZFS on Linux currently uses vmalloc-backed slab caches for ARC buffers.
This approach has two major problems: 32-bit systems have only a small
amount of vmalloc space, and slab fragmentation can easily trigger OOM
on a busy system.

With ABD, we use a scatterlist to allocate data buffers. This lets us
allocate from HIGHMEM, which relieves the vmalloc space pressure on
32-bit systems, and since we no longer rely on the slab there is no
fragmentation issue.

Metadata buffers, however, still use linear buffers from the slab,
because a lot of *_phys pointers point directly into metadata buffers
and changing all of that code is impractical for now.

Currently ABD is not enabled, and its API treats every buffer as a
normal linear buffer. We will enable it once all relevant code is
modified to use the API.

Signed-off-by: Chunwei Chen
---
 include/sys/Makefile.am  |    1 +
 include/sys/abd.h        |  303 +++++
 lib/libzpool/Makefile.am |    1 +
 module/zfs/abd.c         | 1116 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 1421 insertions(+)
 create mode 100644 include/sys/abd.h
 create mode 100644 module/zfs/abd.c

diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index 5211e656456d..8b3fd524c555 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -3,6 +3,7 @@ SUBDIRS = fm fs
 COMMON_H = \
 	$(top_srcdir)/include/sys/arc.h \
 	$(top_srcdir)/include/sys/arc_impl.h \
+	$(top_srcdir)/include/sys/abd.h \
 	$(top_srcdir)/include/sys/avl.h \
 	$(top_srcdir)/include/sys/avl_impl.h \
 	$(top_srcdir)/include/sys/blkptr.h \
diff --git a/include/sys/abd.h b/include/sys/abd.h
new file mode 100644
index 000000000000..561b43e18188
--- /dev/null
+++ b/include/sys/abd.h
@@ -0,0 +1,303 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ */
+
+/*
+ * ABD - ARC buffer data
+ * ABD is an abstract data structure for ARC. There are two types of ABD:
+ * linear for metadata and scatter for data.
+ * Their type is recorded in the lowest bit of abd_flags.
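+ *
+ * A minimal usage sketch built only from the public API declared
+ * below (the size and names are illustrative):
+ *
+ *	abd_t *data = abd_alloc_scatter(size);
+ *	abd_t *meta = abd_alloc_linear(size);
+ *	abd_copy(meta, data, size);	(copies work across both types)
+ *	abd_free(meta, size);
+ *	abd_free(data, size);
+ *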
+ * The public API will automatically determine the type.
+ */
+
+#ifndef	_ABD_H
+#define	_ABD_H
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if 0
+#define	ARC_BUF_DATA_MAGIC	0xa7cb0fda
+
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+#define	DEBUG_ABD
+#endif
+
+typedef struct arc_buf_data {
+#ifdef DEBUG_ABD
+	char		pad[PAGE_SIZE];	/* debug, coredumps when accessed */
+#endif
+	uint32_t	abd_magic;	/* ARC_BUF_DATA_MAGIC */
+	uint32_t	abd_flags;
+	size_t		abd_size;	/* buffer size, excluding offset */
+	size_t		abd_offset;	/* offset in the first segment */
+	int		abd_nents;	/* num of sgl entries */
+	union {
+		struct scatterlist *abd_sgl;
+		void *abd_buf;
+	};
+	uint64_t	__abd_sgl[0];
+} abd_t;

+#define	ABD_F_SCATTER	(0x0)
+#define	ABD_F_LINEAR	(0x1)
+#define	ABD_F_OWNER	(0x2)
+
+/*
+ * Convert a linear ABD to a normal buffer
+ */
+#define	ABD_TO_BUF(abd)						\
+(								\
+{								\
+	ASSERT((abd)->abd_magic == ARC_BUF_DATA_MAGIC);		\
+	ASSERT_ABD_LINEAR(abd);					\
+	abd->abd_buf;						\
+}								\
+)
+
+#define	ABD_IS_SCATTER(abd)	(!((abd)->abd_flags & ABD_F_LINEAR))
+#define	ABD_IS_LINEAR(abd)	(!ABD_IS_SCATTER(abd))
+#define	ASSERT_ABD_SCATTER(abd)	ASSERT(ABD_IS_SCATTER(abd))
+#define	ASSERT_ABD_LINEAR(abd)	ASSERT(ABD_IS_LINEAR(abd))
+
+/*
+ * Allocations and deallocations
+ */
+abd_t *abd_alloc_scatter(size_t);
+abd_t *abd_alloc_linear(size_t);
+void abd_free(abd_t *, size_t);
+abd_t *abd_get_offset(abd_t *, size_t);
+abd_t *abd_get_from_buf(void *, size_t);
+void abd_put(abd_t *);
+
+/*
+ * ABD operations
+ */
+void abd_iterate_rfunc(abd_t *, size_t,
+    int (*)(const void *, uint64_t, void *), void *);
+void abd_iterate_wfunc(abd_t *, size_t,
+    int (*)(void *, uint64_t, void *), void *);
+void abd_iterate_func2(abd_t *, abd_t *, size_t, size_t,
+    int (*)(void *, void *, uint64_t, uint64_t, void *), void *);
+void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
+void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
+void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
+int abd_cmp(abd_t *, abd_t *, size_t);
+int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
+void abd_zero_off(abd_t *, size_t, size_t);
+#ifdef _KERNEL
+int abd_copy_to_user_off(void __user *, abd_t *, size_t, size_t);
+int abd_copy_from_user_off(abd_t *, const void __user *, size_t, size_t);
+int abd_uiomove_off(abd_t *, size_t, enum uio_rw, uio_t *, size_t);
+int abd_uiocopy_off(abd_t *, size_t, enum uio_rw, uio_t *, size_t *,
+    size_t);
+unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int,
+    size_t);
+unsigned long abd_bio_nr_pages_off(abd_t *, unsigned int, size_t);
+
+#define	abd_bio_map_off(bio, abd, size, off)				\
+(									\
+{									\
+	unsigned int ___ret;						\
+	if (ABD_IS_LINEAR(abd))						\
+		___ret = bio_map(bio, ABD_TO_BUF(abd) + (off), size);	\
+	else								\
+		___ret = abd_scatter_bio_map_off(bio, abd, size, off);	\
+	___ret;								\
+}									\
+)
+#endif	/* _KERNEL */
+
+/*
+ * Borrow a linear buffer for an ABD.
+ * Will allocate if ABD is scatter.
+ */
+#define	abd_borrow_buf(a, n)			\
+(						\
+{						\
+	void *___b;				\
+	if (ABD_IS_LINEAR(a)) {			\
+		___b = ABD_TO_BUF(a);		\
+	} else {				\
+		___b = zio_buf_alloc(n);	\
+	}					\
+	___b;					\
+}						\
+)
+
+/*
+ * Borrow a linear buffer for an ABD.
+ * Will allocate and copy if ABD is scatter.
+ */
+#define	abd_borrow_buf_copy(a, n)		\
+(						\
+{						\
+	void *___b = abd_borrow_buf(a, n);	\
+	if (!ABD_IS_LINEAR(a))			\
+		abd_copy_to_buf(___b, a, n);	\
+	___b;					\
+}						\
+)
+
+/*
+ * Return the borrowed linear buffer
+ */
+#define	abd_return_buf(a, b, n)			\
+do {						\
+	if (ABD_IS_LINEAR(a))			\
+		ASSERT((b) ==
ABD_TO_BUF(a)); \ + else \ + zio_buf_free(b, n); \ +} while (0) + +/* + * Copy back to ABD and return the borrowed linear buffer + */ +#define abd_return_buf_copy(a, b, n) \ +do { \ + if (!ABD_IS_LINEAR(a)) \ + abd_copy_from_buf(a, b, n); \ + abd_return_buf(a, b, n); \ +} while (0) +#else /* 0 */ +typedef void abd_t; +#define ABD_TO_BUF(abd) ((void *)abd) +#define ABD_IS_SCATTER(abd) (0) +#define ABD_IS_LINEAR(abd) (1) +#define ASSERT_ABD_SCATTER(abd) ((void)0) +#define ASSERT_ABD_LINEAR(abd) ((void)0) +void *zio_buf_alloc(size_t); +void zio_buf_free(void *, size_t); +static inline abd_t *abd_alloc_linear(size_t size) +{ + return ((abd_t *)zio_buf_alloc(size)); +} +static inline void abd_free(abd_t *abd, size_t size) +{ + zio_buf_free((void *)abd, size); +} +#define abd_alloc_scatter abd_alloc_linear +#define abd_get_offset(abd, off) ((void *)(abd)+(off)) +#define abd_get_from_buf(buf, size) (buf) +#define abd_put(abd) do { } while (0) + +#define abd_iterate_rfunc(a, n, f, p) \ + (void) f(a, n, p) + +#define abd_iterate_wfunc(a, n, f, p) \ + (void) f(a, n, p) + +#define abd_iterate_func2(a, b, an, bn, f, p) \ + (void) f(a, b, an, bn, p) + +#define abd_copy_off(a, b, n, aoff, boff) \ + (void) memcpy((void *)(a)+(aoff), (void *)(b)+(boff), n) + +#define abd_copy_from_buf_off(a, b, n, off) \ + (void) memcpy((void *)(a)+(off), b, n) + +#define abd_copy_to_buf_off(a, b, n, off) \ + (void) memcpy(a, (void *)(b)+(off), n) + +#define abd_cmp(a, b, n) \ + memcmp(a, b, n) + +#define abd_cmp_buf_off(a, b, n, off) \ + memcmp((void *)(a)+(off), b, n) + +#define abd_zero_off(a, n, off) \ + (void) memset((void *)(a)+(off), 0, n) + +#ifdef _KERNEL +#define abd_copy_to_user_off(a, b, n, off) \ + copy_to_user(a, (void *)(b)+(off), n) + +#define abd_copy_from_user_off(a, b, n, off) \ + copy_from_user((void *)(a)+(off), b, n) + +#define abd_uiomove_off(p, n, rw, uio, off) \ + uiomove((void *)(p)+(off), n, rw, uio) + +#define abd_uiocopy_off(p, n, rw, uio, c, off) \ + uiocopy((void *)(p)+(off), n, rw, uio, c) + +#define abd_bio_map_off(bio, a, n, off) \ + bio_map(bio, (void *)(a)+(off), n) + +#define abd_bio_nr_pages_off(a, n, off) \ + bio_nr_pages((void *)(a)+(off), n) +#endif /* _KERNEL */ + +#define abd_borrow_buf(a, n) \ + ((void *)a) + +#define abd_borrow_buf_copy(a, n) \ + ((void *)a) + +#define abd_return_buf(a, b, n) \ + do { } while (0) + +#define abd_return_buf_copy(a, b, n) \ + do { } while (0) +#endif /* 0 */ + +/* + * Wrappers for zero off functions + */ +#define abd_copy(dabd, sabd, size) \ + abd_copy_off(dabd, sabd, size, 0, 0) + +#define abd_copy_from_buf(abd, buf, size) \ + abd_copy_from_buf_off(abd, buf, size, 0) + +#define abd_copy_to_buf(buf, abd, size) \ + abd_copy_to_buf_off(buf, abd, size, 0) + +#define abd_cmp_buf(abd, buf, size) \ + abd_cmp_buf_off(abd, buf, size, 0) + +#define abd_zero(abd, size) \ + abd_zero_off(abd, size, 0) + +#ifdef _KERNEL +#define abd_copy_to_user(buf, abd, size) \ + abd_copy_to_user_off(buf, abd, size, 0) + +#define abd_copy_from_user(abd, buf, size) \ + abd_copy_from_user_off(abd, buf, size, 0) + +#define abd_uiomove(abd, n, rw, uio) \ + abd_uiomove_off(abd, n, rw, uio, 0) + +#define abd_uiocopy(abd, n, rw, uio, c) \ + abd_uiocopy_off(abd, n, rw, uio, c, 0) +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_H */ diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 85bc0510a81d..caa64787a95b 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -110,6 +110,7 @@ libzpool_la_LIBADD += $(ZLIB) 
libzpool_la_LDFLAGS = -version-info 2:0:0 EXTRA_DIST = \ + $(top_srcdir)/module/zfs/abd.c \ $(top_srcdir)/module/zfs/vdev_disk.c \ $(top_srcdir)/module/zfs/zfs_acl.c \ $(top_srcdir)/module/zfs/zfs_ctldir.c \ diff --git a/module/zfs/abd.c b/module/zfs/abd.c new file mode 100644 index 000000000000..8d599e8d1b07 --- /dev/null +++ b/module/zfs/abd.c @@ -0,0 +1,1116 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + */ + +#include +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include + +#else /* _KERNEL */ + +/* + * Userspace compatibility layer + */ + +/* + * page + */ +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +struct page; + +#define alloc_page(gfp) \ + ((struct page *)umem_alloc_aligned(PAGE_SIZE, PAGE_SIZE, UMEM_DEFAULT)) + +#define __free_page(page) \ + umem_free(page, PAGE_SIZE) + +/* + * scatterlist + */ +struct scatterlist { + struct page *page; + int length; + int end; +}; + +static void +sg_init_table(struct scatterlist *sg, int nr) { + memset(sg, 0, nr * sizeof (struct scatterlist)); + sg[nr - 1].end = 1; +} + +static inline void +sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, + unsigned int offset) { + /* currently we don't use offset */ + ASSERT(offset == 0); + sg->page = page; + sg->length = len; +} + +static inline struct page * +sg_page(struct scatterlist *sg) { + return (sg->page); +} + +static inline struct scatterlist * +sg_next(struct scatterlist *sg) +{ + if (sg->end) + return (NULL); + return (sg + 1); +} + +/* + * misc + */ +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +#ifndef unlikely +#define unlikely(x) (x) +#endif + +#define kmap(page) ((void *)page) +#define kunmap(page) do { } while (0) +#define zfs_kmap_atomic(page, type) ((void *)page) +#define zfs_kunmap_atomic(addr, type) do { } while (0) +#define pagefault_disable() do { } while (0) +#define pagefault_enable() do { } while (0) +#define flush_kernel_dcache_page(page) do { } while (0) +#define set_current_state(state) do { } while (0) +static inline long +schedule_timeout(long timeout) +{ + sleep(timeout); + return (0); +} + +#endif /* _KERNEL */ + + +struct abd_miter { + void *addr; /* mapped addr, adjusted by offset */ + int length; /* current segment length, adjusted by offset */ + int offset; /* offset in current segment */ + int is_linear; /* the type of the abd */ + union { + struct scatterlist *sg; + void *buf; + }; + int nents; /* num of sg entries */ + int rw; /* r/w access, whether to flush cache */ +#ifndef HAVE_1ARG_KMAP_ATOMIC + int km_type; /* KM_USER0 or KM_USER1 */ +#endif +}; + +#define ABD_MITER_W (1) +#define ABD_MITER_R (0) + +/* + * 
Initialize the abd_miter.
+ * Pass ABD_MITER_W to rw if you will write to the abd buffer.
+ * Please use abd_miter_init or abd_miter_init2 for one or two iterators
+ * respectively; they will set up KM_USERx accordingly.
+ */
+static void
+abd_miter_init_km(struct abd_miter *aiter, abd_t *abd, int rw, int km)
+{
+	ASSERT(abd->abd_nents != 0);
+	aiter->addr = NULL;
+	if (ABD_IS_LINEAR(abd)) {
+		ASSERT(abd->abd_nents == 1);
+		aiter->is_linear = 1;
+		aiter->buf = abd->abd_buf;
+		aiter->length = abd->abd_size;
+	} else {
+		aiter->is_linear = 0;
+		aiter->sg = abd->abd_sgl;
+		aiter->length = aiter->sg->length - abd->abd_offset;
+	}
+	aiter->offset = abd->abd_offset;
+	aiter->nents = abd->abd_nents;
+	aiter->rw = rw;
+#ifndef HAVE_1ARG_KMAP_ATOMIC
+	aiter->km_type = km;
+#endif
+}
+
+#define	abd_miter_init(a, abd, rw)	abd_miter_init_km(a, abd, rw, 0)
+#define	abd_miter_init2(a, aabd, arw, b, babd, brw)	\
+do {							\
+	abd_miter_init_km(a, aabd, arw, 0);		\
+	abd_miter_init_km(b, babd, brw, 1);		\
+} while (0)
+
+/*
+ * Map the current page in abd_miter.
+ * Pass 1 to atomic if you want to use kmap_atomic.
+ * This can be safely called when the aiter has already been exhausted, in
+ * which case this does nothing.
+ * The mapped address and length will be aiter->addr and aiter->length.
+ */
+static void
+abd_miter_map_x(struct abd_miter *aiter, int atomic)
+{
+	void *paddr;
+
+	ASSERT(!aiter->addr);
+
+	if (!aiter->nents)
+		return;
+
+	if (aiter->is_linear) {
+		paddr = aiter->buf;
+		/*
+		 * Turn off pagefaults to keep the context the same as
+		 * kmap_atomic.
+		 */
+		if (atomic)
+			pagefault_disable();
+	} else {
+		ASSERT(aiter->length == aiter->sg->length - aiter->offset);
+
+		if (atomic)
+			paddr = zfs_kmap_atomic(sg_page(aiter->sg),
+			    (aiter->km_type ? KM_USER1 : KM_USER0));
+		else
+			paddr = kmap(sg_page(aiter->sg));
+	}
+	aiter->addr = paddr + aiter->offset;
+}
+
+/*
+ * Unmap the current page in abd_miter.
+ * Pass 1 to atomic if you want to use kmap_atomic.
+ * This can be safely called when the aiter has already been exhausted, in
+ * which case this does nothing.
+ */
+static void
+abd_miter_unmap_x(struct abd_miter *aiter, int atomic)
+{
+	void *paddr;
+
+	if (!aiter->nents)
+		return;
+
+	ASSERT(aiter->addr);
+
+	if (aiter->is_linear) {
+		/* only undo the pagefault_disable() done by the map */
+		if (atomic)
+			pagefault_enable();
+	} else {
+		paddr = aiter->addr - aiter->offset;
+		if (atomic) {
+			if (aiter->rw == ABD_MITER_W)
+				flush_kernel_dcache_page(sg_page(aiter->sg));
+			zfs_kunmap_atomic(paddr,
+			    (aiter->km_type ? KM_USER1 : KM_USER0));
+		} else {
+			kunmap(sg_page(aiter->sg));
+		}
+	}
+	aiter->addr = NULL;
+}
+
+#define	abd_miter_map_atomic(a)		abd_miter_map_x(a, 1)
+#define	abd_miter_map(a)		abd_miter_map_x(a, 0)
+#define	abd_miter_unmap_atomic(a)	abd_miter_unmap_x(a, 1)
+#define	abd_miter_unmap(a)		abd_miter_unmap_x(a, 0)
+
+/*
+ * Use abd_miter_{,un}map_atomic2 if you want to map 2 abd_miters.
+ * You need to pass the arguments in the same order for these two.
+ */
+#define	abd_miter_map_atomic2(a, b)	\
+do {					\
+	abd_miter_map_atomic(a);	\
+	abd_miter_map_atomic(b);	\
+} while (0)
+
+#define	abd_miter_unmap_atomic2(a, b)	\
+do {					\
+	abd_miter_unmap_atomic(b);	\
+	abd_miter_unmap_atomic(a);	\
+} while (0)
+
+/*
+ * Advance the iterator by offset.
+ * Cannot be called while a page is mapped.
+ * Returns 0 if exhausted.
+ * This can be safely called when the aiter has already been exhausted, in
+ * which case this does nothing.
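+ *
+ * These helpers compose into the canonical loop below (a sketch of
+ * what abd_iterate_func() does):
+ *
+ *	struct abd_miter aiter;
+ *	abd_miter_init(&aiter, abd, ABD_MITER_R);
+ *	while (size > 0) {
+ *		size_t len = MIN(aiter.length, size);
+ *		abd_miter_map_atomic(&aiter);
+ *		(consume len bytes starting at aiter.addr)
+ *		abd_miter_unmap_atomic(&aiter);
+ *		size -= len;
+ *		abd_miter_advance(&aiter, len);
+ *	}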
+ */
+static int
+abd_miter_advance(struct abd_miter *aiter, int offset)
+{
+	ASSERT(!aiter->addr);
+
+	if (!aiter->nents)
+		return (0);
+
+	aiter->offset += offset;
+	if (aiter->is_linear) {
+		aiter->length -= offset;
+		if (aiter->length <= 0) {
+			aiter->nents--;
+			aiter->length = 0;
+			return (0);
+		}
+	} else {
+		while (aiter->offset >= aiter->sg->length) {
+			aiter->offset -= aiter->sg->length;
+			aiter->nents--;
+			aiter->sg = sg_next(aiter->sg);
+			if (!aiter->nents) {
+				aiter->length = 0;
+				return (0);
+			}
+		}
+		aiter->length = aiter->sg->length - aiter->offset;
+	}
+	return (1);
+}
+
+#define	ABD_CHECK(abd)						\
+(								\
+{								\
+	ASSERT((abd)->abd_magic == ARC_BUF_DATA_MAGIC);		\
+	ASSERT((abd)->abd_size > 0);				\
+	if (ABD_IS_LINEAR(abd)) {				\
+		ASSERT((abd)->abd_offset == 0);			\
+		ASSERT((abd)->abd_nents == 1);			\
+	} else {						\
+		ASSERT((abd)->abd_offset < PAGE_SIZE);		\
+		ASSERT((abd)->abd_nents > 0);			\
+	}							\
+}								\
+)
+
+static void
+abd_iterate_func(abd_t *abd, size_t size,
+    int (*func)(void *, uint64_t, void *), void *private, int rw)
+{
+	size_t len;
+	int stop;
+	struct abd_miter aiter;
+
+	ABD_CHECK(abd);
+	ASSERT(size <= abd->abd_size);
+
+	abd_miter_init(&aiter, abd, rw);
+
+	while (size > 0) {
+		len = MIN(aiter.length, size);
+		ASSERT(len > 0);
+		/*
+		 * The iterated function likely will not do well if each
+		 * segment except the last one is not a multiple of 16
+		 * (e.g. fletcher-2 consumes two uint64_t words, i.e. 16
+		 * bytes, per step).
+		 */
+		ASSERT(size == len || (len & 15) == 0);
+
+		abd_miter_map_atomic(&aiter);
+
+		stop = func(aiter.addr, len, private);
+
+		abd_miter_unmap_atomic(&aiter);
+
+		if (stop)
+			break;
+		size -= len;
+		abd_miter_advance(&aiter, len);
+	}
+}
+
+/*
+ * Iterate over an ABD and call the read function @func.
+ * @func should be implemented so that it behaves the same whether it is
+ * handed linear or scatter segments.
+ */
+void
+abd_iterate_rfunc(abd_t *abd, size_t size,
+    int (*func)(const void *, uint64_t, void *), void *private)
+{
+	/* skip type checking on func */
+	abd_iterate_func(abd, size, (void *)func, private, ABD_MITER_R);
+}
+
+/*
+ * Iterate over an ABD and call the write function @func.
+ * @func should be implemented so that it behaves the same whether it is
+ * handed linear or scatter segments.
+ */
+void
+abd_iterate_wfunc(abd_t *abd, size_t size,
+    int (*func)(void *, uint64_t, void *), void *private)
+{
+	abd_iterate_func(abd, size, func, private, ABD_MITER_W);
+}
+
+/*
+ * Iterate over two ABDs and call @func2.
+ * @func2 should be implemented so that it behaves the same whether it is
+ * handed linear or scatter segments.
+ */
+void
+abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t dsize, size_t ssize,
+    int (*func2)(void *, void *, uint64_t, uint64_t, void *), void *private)
+{
+	size_t dlen, slen;
+	int stop;
+	struct abd_miter daiter, saiter;
+
+	ABD_CHECK(dabd);
+	ABD_CHECK(sabd);
+
+	ASSERT(dsize <= dabd->abd_size);
+	ASSERT(ssize <= sabd->abd_size);
+
+	abd_miter_init2(&daiter, dabd, ABD_MITER_W,
+	    &saiter, sabd, ABD_MITER_W);
+
+	while (dsize > 0 || ssize > 0) {
+		dlen = MIN(daiter.length, dsize);
+		slen = MIN(saiter.length, ssize);
+
+		/* if data remains after this run, use equal lengths */
+		if (dsize > dlen || ssize > slen) {
+			if (MIN(dlen, slen) > 0)
+				slen = dlen = MIN(dlen, slen);
+		}
+
+		/* the loop must make progress */
+		ASSERT(dlen > 0 || slen > 0);
+		/*
+		 * The iterated function likely will not do well if each
+		 * segment except the last one is not a multiple of 16.
+ */ + ASSERT(dsize == dlen || (dlen & 15) == 0); + ASSERT(ssize == slen || (slen & 15) == 0); + + abd_miter_map_atomic2(&daiter, &saiter); + + stop = func2(daiter.addr, saiter.addr, dlen, slen, private); + + abd_miter_unmap_atomic2(&daiter, &saiter); + + if (stop) + break; + + dsize -= dlen; + ssize -= slen; + abd_miter_advance(&daiter, dlen); + abd_miter_advance(&saiter, slen); + } +} + +/* + * Copy from @sabd to @dabd + * @doff is offset in dabd + * @soff is offset in sabd + */ +void +abd_copy_off(abd_t *dabd, abd_t *sabd, size_t size, size_t doff, + size_t soff) +{ + size_t len; + struct abd_miter daiter, saiter; + + ABD_CHECK(dabd); + ABD_CHECK(sabd); + + ASSERT(size <= dabd->abd_size); + ASSERT(size <= sabd->abd_size); + + abd_miter_init2(&daiter, dabd, ABD_MITER_W, + &saiter, sabd, ABD_MITER_R); + abd_miter_advance(&daiter, doff); + abd_miter_advance(&saiter, soff); + + while (size > 0) { + len = MIN(daiter.length, size); + len = MIN(len, saiter.length); + ASSERT(len > 0); + + abd_miter_map_atomic2(&daiter, &saiter); + + memcpy(daiter.addr, saiter.addr, len); + + abd_miter_unmap_atomic2(&daiter, &saiter); + + size -= len; + abd_miter_advance(&daiter, len); + abd_miter_advance(&saiter, len); + } +} + +/* + * Copy from @buf to @abd + * @off is the offset in @abd + */ +void +abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t size, + size_t off) +{ + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_W); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + memcpy(aiter.addr, buf, len); + + abd_miter_unmap_atomic(&aiter); + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } +} + +/* + * Copy from @abd to @buf + * @off is the offset in @abd + */ +void +abd_copy_to_buf_off(void *buf, abd_t *abd, size_t size, size_t off) +{ + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_R); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + memcpy(buf, aiter.addr, len); + + abd_miter_unmap_atomic(&aiter); + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } +} + +/* + * Compare between @dabd and @sabd. + */ +int +abd_cmp(abd_t *dabd, abd_t *sabd, size_t size) +{ + size_t len; + int ret = 0; + struct abd_miter daiter, saiter; + + ABD_CHECK(dabd); + ABD_CHECK(sabd); + ASSERT(size <= dabd->abd_size); + ASSERT(size <= sabd->abd_size); + + abd_miter_init2(&daiter, dabd, ABD_MITER_R, + &saiter, sabd, ABD_MITER_R); + + while (size > 0) { + len = MIN(daiter.length, size); + len = MIN(len, saiter.length); + ASSERT(len > 0); + + abd_miter_map_atomic2(&daiter, &saiter); + + ret = memcmp(daiter.addr, saiter.addr, len); + + abd_miter_unmap_atomic2(&daiter, &saiter); + + if (ret) + break; + + size -= len; + abd_miter_advance(&daiter, len); + abd_miter_advance(&saiter, len); + } + return (ret); +} + +/* + * Compare between @abd and @buf. 
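+ * A short sketch of the copy/compare pairing (the 256-byte header
+ * below is illustrative):
+ *
+ *	char hdr[256];
+ *	abd_copy_to_buf_off(hdr, abd, sizeof (hdr), 0);
+ *	ASSERT0(abd_cmp_buf_off(abd, hdr, sizeof (hdr), 0));
+ *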
+ * @off is the offset in @abd + */ +int +abd_cmp_buf_off(abd_t *abd, const void *buf, size_t size, size_t off) +{ + size_t len; + int ret = 0; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_R); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + ret = memcmp(aiter.addr, buf, len); + + abd_miter_unmap_atomic(&aiter); + + if (ret) + break; + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } + return (ret); +} + +/* + * Zero out @abd. + * @off is the offset in @abd + */ +void +abd_zero_off(abd_t *abd, size_t size, size_t off) +{ + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_W); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + memset(aiter.addr, 0, len); + + abd_miter_unmap_atomic(&aiter); + + size -= len; + abd_miter_advance(&aiter, len); + } +} + +#ifdef _KERNEL +/* + * Copy from @abd to user buffer @buf. + * @off is the offset in @abd + */ +int +abd_copy_to_user_off(void __user *buf, abd_t *abd, size_t size, + size_t off) +{ + int ret = 0; + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_R); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + ret = __copy_to_user_inatomic(buf, aiter.addr, len); + + abd_miter_unmap_atomic(&aiter); + if (ret) { + abd_miter_map(&aiter); + ret = copy_to_user(buf, aiter.addr, len); + abd_miter_unmap(&aiter); + if (ret) + break; + } + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } + return (ret ? EFAULT : 0); +} + +/* + * Copy from user buffer @buf to @abd. + * @off is the offset in @abd + */ +int +abd_copy_from_user_off(abd_t *abd, const void __user *buf, size_t size, + size_t off) +{ + int ret = 0; + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_W); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + ret = __copy_from_user_inatomic(aiter.addr, buf, len); + + abd_miter_unmap_atomic(&aiter); + if (ret) { + abd_miter_map(&aiter); + ret = copy_from_user(aiter.addr, buf, len); + abd_miter_unmap(&aiter); + if (ret) + break; + } + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } + return (ret ? EFAULT : 0); +} + +/* + * uiomove for ABD. 
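+ * A read(2)-style caller might use it as follows (sketch only; the
+ * uio is supplied by the VFS and abd holds the cached block):
+ *
+ *	error = abd_uiomove_off(abd, n, UIO_READ, uio, off);
+ *	if (error)
+ *		return (error);
+ *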
+ * @off is the offset in @abd + */ +int +abd_uiomove_off(abd_t *abd, size_t n, enum uio_rw rw, uio_t *uio, + size_t off) +{ + struct iovec *iov; + ulong_t cnt; + + while (n && uio->uio_resid) { + iov = uio->uio_iov; + cnt = MIN(iov->iov_len, n); + if (cnt == 0l) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + switch (uio->uio_segflg) { + case UIO_USERSPACE: + case UIO_USERISPACE: + /* + * p = kernel data pointer + * iov->iov_base = user data pointer + */ + if (rw == UIO_READ) { + if (abd_copy_to_user_off(iov->iov_base, + abd, cnt, off)) + return (EFAULT); + } else { + if (abd_copy_from_user_off(abd, + iov->iov_base, cnt, off)) + return (EFAULT); + } + break; + case UIO_SYSSPACE: + if (rw == UIO_READ) + abd_copy_to_buf_off(iov->iov_base, abd, + cnt, off); + else + abd_copy_from_buf_off(abd, iov->iov_base, + cnt, off); + break; + } + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_loffset += cnt; + off += cnt; + n -= cnt; + } + return (0); +} + +/* + * uiocopy for ABD. + * @off is the offset in @abd + */ +int +abd_uiocopy_off(abd_t *abd, size_t n, enum uio_rw rw, uio_t *uio, + size_t *cbytes, size_t off) +{ + struct iovec *iov; + ulong_t cnt; + int iovcnt; + + iovcnt = uio->uio_iovcnt; + *cbytes = 0; + + for (iov = uio->uio_iov; n && iovcnt; iov++, iovcnt--) { + cnt = MIN(iov->iov_len, n); + if (cnt == 0) + continue; + + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + case UIO_USERISPACE: + /* + * p = kernel data pointer + * iov->iov_base = user data pointer + */ + if (rw == UIO_READ) { + /* UIO_READ = copy data from kernel to user */ + if (abd_copy_to_user_off(iov->iov_base, + abd, cnt, off)) + return (EFAULT); + } else { + /* UIO_WRITE = copy data from user to kernel */ + if (abd_copy_from_user_off(abd, + iov->iov_base, cnt, off)) + return (EFAULT); + } + break; + + case UIO_SYSSPACE: + if (rw == UIO_READ) + abd_copy_to_buf_off(iov->iov_base, abd, + cnt, off); + else + abd_copy_from_buf_off(abd, iov->iov_base, + cnt, off); + break; + } + off += cnt; + n -= cnt; + *cbytes += cnt; + } + return (0); +} + +/* + * bio_map for scatter ABD. + * @off is the offset in @abd + * You should use abd_bio_map_off, it will choose the right function according + * to the ABD type. + */ +unsigned int +abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, unsigned int bio_size, + size_t off) +{ + int i; + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT_ABD_SCATTER(abd); + ASSERT(bio_size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_R); + abd_miter_advance(&aiter, off); + + for (i = 0; i < bio->bi_max_vecs; i++) { + if (bio_size <= 0) + break; + + len = MIN(bio_size, aiter.length); + ASSERT(len > 0); + + if (bio_add_page(bio, sg_page(aiter.sg), len, + aiter.offset) != len) + break; + + bio_size -= len; + abd_miter_advance(&aiter, len); + } + return (bio_size); +} + +/* + * bio_nr_pages for ABD. + * @off is the offset in @abd + */ +unsigned long +abd_bio_nr_pages_off(abd_t *abd, unsigned int bio_size, size_t off) +{ + unsigned long pos = 0; + ABD_CHECK(abd); + + if (ABD_IS_LINEAR(abd)) + pos = (unsigned long)abd->abd_buf + off; + else + pos = abd->abd_offset + off; + return ((pos + bio_size + PAGE_SIZE-1)>>PAGE_SHIFT)-(pos>>PAGE_SHIFT); +} +#endif /* _KERNEL */ + +static inline abd_t * +abd_alloc_struct(int nr_pages) +{ + abd_t *abd; + size_t asize = sizeof (abd_t) + nr_pages*sizeof (struct scatterlist); + /* + * If the maximum block size increases, inline sgl might not fit into + * a single page. 
We might want to consider using chained sgl if + * that's the case. + */ + ASSERT(nr_pages * sizeof (struct scatterlist) <= PAGE_SIZE); +#ifndef DEBUG_ABD + abd = kmem_alloc(asize, KM_PUSHPAGE); +#else + abd = umem_alloc_aligned(asize, PAGE_SIZE, UMEM_DEFAULT); + /* deny access to padding */ + if (mprotect(abd, PAGE_SIZE, PROT_NONE) != 0) { + perror("mprotect failed"); + ASSERT(0); + } +#endif + ASSERT(abd); + + return (abd); +} + +static inline void +abd_free_struct(abd_t *abd, int nr_pages) +{ +#ifndef DEBUG_ABD + kmem_free(abd, sizeof (abd_t) + nr_pages*sizeof (struct scatterlist)); +#else + if (mprotect(abd, PAGE_SIZE, PROT_READ|PROT_WRITE) != 0) { + perror("mprotect failed"); + ASSERT(0); + } + umem_free(abd, sizeof (abd_t) + nr_pages*sizeof (struct scatterlist)); +#endif +} + +/* + * Allocate a new ABD to point to offset @off of the original ABD. + * It shares the underlying buffer with the original ABD. + * Use abd_put to free. The original ABD(allocated from abd_alloc) must + * not be freed before any of its derived ABD. + */ +abd_t * +abd_get_offset(abd_t *sabd, size_t off) +{ + size_t offset; + abd_t *abd; + + ABD_CHECK(sabd); + ASSERT(off <= sabd->abd_size); + + abd = abd_alloc_struct(0); + + abd->abd_magic = ARC_BUF_DATA_MAGIC; + abd->abd_size = sabd->abd_size - off; + + if (ABD_IS_LINEAR(sabd)) { + abd->abd_flags = ABD_F_LINEAR; + abd->abd_offset = 0; + abd->abd_nents = 1; + abd->abd_buf = sabd->abd_buf + off; + } else { + abd->abd_flags = ABD_F_SCATTER; + offset = sabd->abd_offset + off; + abd->abd_offset = offset & (PAGE_SIZE - 1); + /* make sure the new abd start as sgl[0] */ + abd->abd_sgl = &sabd->abd_sgl[offset >> PAGE_SHIFT]; + abd->abd_nents = sabd->abd_nents - (offset >> PAGE_SHIFT); + } + + return (abd); +} + +/* + * Allocate a linear ABD structure for @buf + * Use abd_put to free. + */ +abd_t * +abd_get_from_buf(void *buf, size_t size) +{ + abd_t *abd; + + abd = abd_alloc_struct(0); + + abd->abd_magic = ARC_BUF_DATA_MAGIC; + abd->abd_flags = ABD_F_LINEAR; + abd->abd_size = size; + abd->abd_offset = 0; + abd->abd_nents = 1; + abd->abd_buf = buf; + + return (abd); +} + +/* + * Free an ABD allocated from abd_get_{offset,from_buf}. + * Must not be used on ABD from elsewhere. + * Will not free the underlying scatterlist or buffer. + */ +void +abd_put(abd_t *abd) +{ + ABD_CHECK(abd); + ASSERT(!(abd->abd_flags & ABD_F_OWNER)); + + abd->abd_magic = 0; + abd_free_struct(abd, 0); +} + +/* + * Allocate a scatter ABD + */ +abd_t * +abd_alloc_scatter(size_t size) +{ + abd_t *abd; + struct page *page; + int i, n = DIV_ROUND_UP(size, PAGE_SIZE); + size_t last_size = size - ((n-1) << PAGE_SHIFT); + + abd = abd_alloc_struct(n); + + abd->abd_magic = ARC_BUF_DATA_MAGIC; + abd->abd_flags = ABD_F_SCATTER|ABD_F_OWNER; + abd->abd_size = size; + abd->abd_offset = 0; + abd->abd_nents = n; + abd->abd_sgl = (struct scatterlist *)&abd->__abd_sgl[0]; + sg_init_table(abd->abd_sgl, n); + + for (i = 0; i < n; i++) { +retry: + page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); + if (unlikely(page == NULL)) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + goto retry; + } + sg_set_page(&abd->abd_sgl[i], page, + (i == n-1 ? 
last_size : PAGE_SIZE), 0); + } + + return (abd); +} + +/* + * Allocate a linear ABD + */ +abd_t * +abd_alloc_linear(size_t size) +{ + abd_t *abd; + + abd = abd_alloc_struct(0); + + abd->abd_magic = ARC_BUF_DATA_MAGIC; + abd->abd_flags = ABD_F_LINEAR|ABD_F_OWNER; + abd->abd_size = size; + abd->abd_offset = 0; + abd->abd_nents = 1; + + abd->abd_buf = zio_buf_alloc(size); + + return (abd); +} + +static void +abd_free_scatter(abd_t *abd, size_t size) +{ + int i, n; + struct page *page; + + ASSERT(abd->abd_sgl == (struct scatterlist *)&abd->__abd_sgl[0]); + ASSERT(abd->abd_size == size); + ASSERT(abd->abd_nents == DIV_ROUND_UP(abd->abd_size, PAGE_SIZE)); + + n = abd->abd_nents; + abd->abd_magic = 0; + for (i = 0; i < n; i++) { + page = sg_page(&abd->abd_sgl[i]); + if (page) + __free_page(page); + } + abd_free_struct(abd, n); +} + +static void +abd_free_linear(abd_t *abd, size_t size) +{ + abd->abd_magic = 0; + zio_buf_free(abd->abd_buf, size); + abd_free_struct(abd, 0); +} + +/* + * Free a ABD. + * Only use this on ABD allocated with abd_alloc_{scatter,linear}. + */ +void +abd_free(abd_t *abd, size_t size) +{ + ABD_CHECK(abd); + ASSERT(abd->abd_flags & ABD_F_OWNER); + if (ABD_IS_LINEAR(abd)) + abd_free_linear(abd, size); + else + abd_free_scatter(abd, size); +} From 2750a182d86093630e337c6e82195cc45d019959 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 03/26] Modify/Add incremental checksum function for abd_iterate_rfunc Modify/Add incremental fletcher function prototype to match abd_iterate_rfunc callback type. Also, reduce duplicated code a bit in zfs_fletcher.c. Signed-off-by: Chunwei Chen --- include/zfs_fletcher.h | 13 +++-- module/zcommon/zfs_fletcher.c | 93 ++++++++++++++++++++--------------- 2 files changed, 62 insertions(+), 44 deletions(-) diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h index b49df0cf4f0f..8f2e7bb09fcd 100644 --- a/include/zfs_fletcher.h +++ b/include/zfs_fletcher.h @@ -37,14 +37,19 @@ extern "C" { * fletcher checksum functions */ +#define fletcher_init(zcp) ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0) +#define fletcher_2_native_init fletcher_init +#define fletcher_2_byteswap_init fletcher_init +#define fletcher_4_native_init fletcher_init +#define fletcher_4_byteswap_init fletcher_init void fletcher_2_native(const void *, uint64_t, zio_cksum_t *); void fletcher_2_byteswap(const void *, uint64_t, zio_cksum_t *); +int fletcher_2_incremental_native(const void *, uint64_t, void *); +int fletcher_2_incremental_byteswap(const void *, uint64_t, void *); void fletcher_4_native(const void *, uint64_t, zio_cksum_t *); void fletcher_4_byteswap(const void *, uint64_t, zio_cksum_t *); -void fletcher_4_incremental_native(const void *, uint64_t, - zio_cksum_t *); -void fletcher_4_incremental_byteswap(const void *, uint64_t, - zio_cksum_t *); +int fletcher_4_incremental_native(const void *, uint64_t, void *); +int fletcher_4_incremental_byteswap(const void *, uint64_t, void *); #ifdef __cplusplus } diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index edd0cbe6c611..0a7082894866 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -130,15 +130,22 @@ #include #include #include +#include -void -fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +int +fletcher_2_incremental_native(const void *buf, uint64_t size, void *private) { + zio_cksum_t *zcp = private; const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, 
b1; - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { a0 += ip[0]; a1 += ip[1]; b0 += a0; @@ -146,16 +153,30 @@ fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); + return (0); } void -fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_2_native_init(zcp); + fletcher_2_incremental_native(buf, size, zcp); +} + +int +fletcher_2_incremental_byteswap(const void *buf, uint64_t size, void *private) { + zio_cksum_t *zcp = private; const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { a0 += BSWAP_64(ip[0]); a1 += BSWAP_64(ip[1]); b0 += a0; @@ -163,46 +184,20 @@ fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); + return (0); } void -fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) -{ - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - for (a = b = c = d = 0; ip < ipend; ip++) { - a += ip[0]; - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); -} - -void -fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) { - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - for (a = b = c = d = 0; ip < ipend; ip++) { - a += BSWAP_32(ip[0]); - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); + fletcher_2_byteswap_init(zcp); + fletcher_2_incremental_byteswap(buf, size, zcp); } -void -fletcher_4_incremental_native(const void *buf, uint64_t size, - zio_cksum_t *zcp) +int +fletcher_4_incremental_native(const void *buf, uint64_t size, void *private) { + zio_cksum_t *zcp = private; const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; @@ -220,12 +215,20 @@ fletcher_4_incremental_native(const void *buf, uint64_t size, } ZIO_SET_CHECKSUM(zcp, a, b, c, d); + return (0); } void -fletcher_4_incremental_byteswap(const void *buf, uint64_t size, - zio_cksum_t *zcp) +fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_4_native_init(zcp); + fletcher_4_incremental_native(buf, size, zcp); +} + +int +fletcher_4_incremental_byteswap(const void *buf, uint64_t size, void *private) { + zio_cksum_t *zcp = private; const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; @@ -243,11 +246,21 @@ fletcher_4_incremental_byteswap(const void *buf, uint64_t size, } ZIO_SET_CHECKSUM(zcp, a, b, c, d); + return (0); +} + +void +fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_4_byteswap_init(zcp); + fletcher_4_incremental_byteswap(buf, size, zcp); } #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(fletcher_2_native); EXPORT_SYMBOL(fletcher_2_byteswap); +EXPORT_SYMBOL(fletcher_2_incremental_native); +EXPORT_SYMBOL(fletcher_2_incremental_byteswap); EXPORT_SYMBOL(fletcher_4_native); EXPORT_SYMBOL(fletcher_4_byteswap); 
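 /*
  * Sketch of what the exported incremental variants enable: one checksum
  * spanning discontiguous buffers (buf1/buf2/len1/len2 are illustrative;
  * each length must be a multiple of the checksum word stride):
  *
  *	zio_cksum_t zc;
  *	fletcher_4_native_init(&zc);
  *	(void) fletcher_4_incremental_native(buf1, len1, &zc);
  *	(void) fletcher_4_incremental_native(buf2, len2, &zc);
  */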
EXPORT_SYMBOL(fletcher_4_incremental_native); From 4ef44f3ac35e551a1e8b4712f2b48c7003ab2381 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 04/26] Use abd_t in arc.h, ddt.h, dmu.h and zio.h 1. Use abd_t in arc_buf_t->b_data, dmu_buf_t->db_data, zio_t->io_data and zio_transform_t->zt_orig_data 2. zio_* function take abd_t for data Signed-off-by: Chunwei Chen --- include/sys/arc.h | 3 ++- include/sys/ddt.h | 2 +- include/sys/dmu.h | 3 ++- include/sys/zio.h | 23 ++++++++++++----------- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/include/sys/arc.h b/include/sys/arc.h index 215c75b6dfa3..8ffc893e0d28 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -28,6 +28,7 @@ #define _SYS_ARC_H #include +#include #ifdef __cplusplus extern "C" { @@ -61,7 +62,7 @@ struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; kmutex_t b_evict_lock; - void *b_data; + abd_t *b_data; arc_evict_func_t *b_efunc; void *b_private; }; diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 3befcb84427c..b7977aae0357 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -108,7 +108,7 @@ struct ddt_entry { ddt_key_t dde_key; ddt_phys_t dde_phys[DDT_PHYS_TYPES]; zio_t *dde_lead_zio[DDT_PHYS_TYPES]; - void *dde_repair_data; + abd_t *dde_repair_data; enum ddt_type dde_type; enum ddt_class dde_class; uint8_t dde_loading; diff --git a/include/sys/dmu.h b/include/sys/dmu.h index c9c687b5aa62..3996b07cbaea 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -46,6 +46,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -285,7 +286,7 @@ typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ uint64_t db_offset; /* byte offset in this object */ uint64_t db_size; /* size of buffer in bytes */ - void *db_data; /* data in buffer */ + abd_t *db_data; /* data in buffer */ } dmu_buf_t; typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); diff --git a/include/sys/zio.h b/include/sys/zio.h index 18e7a40a3080..7bed94a6036c 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -35,6 +35,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -353,10 +354,10 @@ typedef struct zio_gang_node { typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, zio_gang_node_t *gn, void *data); -typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); +typedef void zio_transform_func_t(zio_t *zio, abd_t *data, uint64_t size); typedef struct zio_transform { - void *zt_orig_data; + abd_t *zt_orig_data; uint64_t zt_orig_size; uint64_t zt_bufsize; zio_transform_func_t *zt_transform; @@ -412,8 +413,8 @@ struct zio { blkptr_t io_bp_orig; /* Data represented by this I/O */ - void *io_data; - void *io_orig_data; + abd_t *io_data; + abd_t *io_orig_data; uint64_t io_size; uint64_t io_orig_size; @@ -463,18 +464,18 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags); -extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, +extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, const zio_prop_t *zp, + abd_t *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, 
zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, @@ -490,12 +491,12 @@ extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, + uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, + uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); @@ -526,12 +527,12 @@ extern void zio_data_buf_free(void *buf, size_t size); extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, - uint64_t offset, void *data, uint64_t size, int type, + uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, + abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern void zio_vdev_io_bypass(zio_t *zio); From d5e9ab8094c442007c1e5c5b38042dae1460d786 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 05/26] Convert zio_checksum to ABD version 1. Add checksum function for abd_t 2. Use abd_t version checksum function in zio_checksum_table 3. Make zio_checksum_compute and zio_checksum_error handle abd_t Signed-off-by: Chunwei Chen --- include/sys/zio_checksum.h | 12 +++++-- module/zfs/zio_checksum.c | 73 +++++++++++++++++++++++++++++--------- 2 files changed, 65 insertions(+), 20 deletions(-) diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index de89bc9a7967..dc01673e653a 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -34,7 +34,7 @@ extern "C" { /* * Signature for checksum functions. */ -typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); +typedef void zio_checksum_t(abd_t *data, uint64_t size, zio_cksum_t *zcp); /* * Information about each checksum function. @@ -61,10 +61,16 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* * Checksum routines. 
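 * The abd_* wrappers below walk an ABD segment by segment so a whole
 * scatter buffer can be checksummed, e.g. (sketch):
 *
 *	zio_cksum_t zc;
 *	abd_fletcher_4_native(abd, size, &zc);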
*/ -extern zio_checksum_t zio_checksum_SHA256; +extern void abd_checksum_SHA256(abd_t *, uint64_t, zio_cksum_t *); +extern void abd_fletcher_2_native(abd_t *, uint64_t, zio_cksum_t *); +extern void abd_fletcher_2_byteswap(abd_t *, uint64_t, zio_cksum_t *); +extern void abd_fletcher_4_native(abd_t *, uint64_t, zio_cksum_t *); +extern void abd_fletcher_4_byteswap(abd_t *, uint64_t, zio_cksum_t *); + +extern void zio_checksum_SHA256(const void *, uint64_t, zio_cksum_t *); extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, - void *data, uint64_t size); + abd_t *data, uint64_t size); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index 3a5c73a6a1e9..b96149f96a00 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -28,6 +28,7 @@ #include #include #include +#include #include /* @@ -62,22 +63,58 @@ /*ARGSUSED*/ static void -zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) +abd_checksum_off(abd_t *abd, uint64_t size, zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } +void +abd_checksum_SHA256(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + void *buf = abd_borrow_buf_copy(abd, size); + zio_checksum_SHA256(buf, size, zcp); + abd_return_buf(abd, buf, size); +} + +void +abd_fletcher_2_native(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_2_native_init(zcp); + abd_iterate_rfunc(abd, size, fletcher_2_incremental_native, zcp); +} + +void +abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_2_byteswap_init(zcp); + abd_iterate_rfunc(abd, size, fletcher_2_incremental_byteswap, zcp); +} + +void +abd_fletcher_4_native(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_4_native_init(zcp); + abd_iterate_rfunc(abd, size, fletcher_4_incremental_native, zcp); +} + +void +abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_4_byteswap_init(zcp); + abd_iterate_rfunc(abd, size, fletcher_4_incremental_byteswap, zcp); +} + zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { - {{NULL, NULL}, 0, 0, 0, "inherit"}, - {{NULL, NULL}, 0, 0, 0, "on"}, - {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, - {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"}, - {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, +{{NULL, NULL}, 0, 0, 0, "inherit"}, +{{NULL, NULL}, 0, 0, 0, "on"}, +{{abd_checksum_off, abd_checksum_off}, 0, 0, 0, "off"}, +{{abd_checksum_SHA256, abd_checksum_SHA256}, 1, 1, 0, "label"}, +{{abd_checksum_SHA256, abd_checksum_SHA256}, 1, 1, 0, "gang_header"}, +{{abd_fletcher_2_native, abd_fletcher_2_byteswap}, 0, 1, 0, "zilog"}, +{{abd_fletcher_2_native, abd_fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, +{{abd_fletcher_4_native, abd_fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, +{{abd_checksum_SHA256, abd_checksum_SHA256}, 1, 0, 1, "sha256"}, +{{abd_fletcher_4_native, abd_fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, }; enum zio_checksum @@ -150,7 +187,7 @@ zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) */ void zio_checksum_compute(zio_t *zio, enum 
zio_checksum checksum, - void *data, uint64_t size) + abd_t *data, uint64_t size) { blkptr_t *bp = zio->io_bp; uint64_t offset = zio->io_offset; @@ -162,15 +199,16 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, if (ci->ci_eck) { zio_eck_t *eck; + void *buf = ABD_TO_BUF(data); if (checksum == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = data; + zil_chain_t *zilc = buf; size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ, uint64_t); eck = &zilc->zc_eck; } else { - eck = (zio_eck_t *)((char *)data + size) - 1; + eck = (zio_eck_t *)((char *)buf + size) - 1; } if (checksum == ZIO_CHECKSUM_GANG_HEADER) zio_checksum_gang_verifier(&eck->zec_cksum, bp); @@ -197,7 +235,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) uint64_t size = (bp == NULL ? zio->io_size : (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); uint64_t offset = zio->io_offset; - void *data = zio->io_data; + abd_t *data = zio->io_data; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum, verifier; @@ -206,9 +244,10 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) if (ci->ci_eck) { zio_eck_t *eck; + void *buf = ABD_TO_BUF(data); if (checksum == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = data; + zil_chain_t *zilc = buf; uint64_t nused; eck = &zilc->zc_eck; @@ -224,7 +263,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); } else { - eck = (zio_eck_t *)((char *)data + size) - 1; + eck = (zio_eck_t *)((char *)buf + size) - 1; } if (checksum == ZIO_CHECKSUM_GANG_HEADER) From 8180345ada180a0ba776a317a7245fa74280fb45 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 06/26] Handle abd_t in arc.c, bpobj.c and bptree.c Signed-off-by: Chunwei Chen --- module/zfs/arc.c | 105 ++++++++++++++++++++++++++++---------------- module/zfs/bpobj.c | 13 +++--- module/zfs/bptree.c | 10 ++--- 3 files changed, 79 insertions(+), 49 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index bdf116c35c52..8f48658ff370 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -135,6 +135,7 @@ #include #include #include +#include #ifdef _KERNEL #include #include @@ -643,14 +644,14 @@ struct l2arc_buf_hdr { uint32_t b_hits; uint64_t b_asize; /* temporary buffer holder for in-flight compressed data */ - void *b_tmp_cdata; + abd_t *b_tmp_cdata; }; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ - void *l2df_data; + abd_t *l2df_data; size_t l2df_size; - void (*l2df_func)(void *, size_t); + void (*l2df_func)(abd_t *, size_t); list_node_t l2df_list_node; } l2arc_data_free_t; @@ -953,7 +954,7 @@ arc_cksum_verify(arc_buf_t *buf) mutex_exit(&buf->b_hdr->b_freeze_lock); return; } - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + abd_fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&buf->b_hdr->b_freeze_lock); @@ -966,7 +967,7 @@ arc_cksum_equal(arc_buf_t *buf) int equal; mutex_enter(&buf->b_hdr->b_freeze_lock); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + abd_fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); mutex_exit(&buf->b_hdr->b_freeze_lock); @@ -986,7 +987,7 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) } buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - fletcher_2_native(buf->b_data, 
buf->b_hdr->b_size, + abd_fletcher_2_native(buf->b_data, buf->b_hdr->b_size, buf->b_hdr->b_freeze_cksum); mutex_exit(&buf->b_hdr->b_freeze_lock); arc_buf_watch(buf); @@ -998,6 +999,13 @@ arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr); } + +static int +arc_abd_watch(const void *buf, uint64_t len, void *private) +{ + ASSERT0(mprotect((void *)buf, len, *(int *)private)); + return (0); +} #endif /* ARGSUSED */ @@ -1006,8 +1014,9 @@ arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { - ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, - PROT_READ | PROT_WRITE)); + int prot = PROT_READ | PROT_WRITE; + abd_iterate_rfunc(buf->b_data, buf->b_hdr->b_size, + arc_abd_watch, &prot); } #endif } @@ -1017,8 +1026,11 @@ static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL - if (arc_watch) - ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ)); + if (arc_watch) { + int prot = PROT_READ; + abd_iterate_rfunc(buf->b_data, buf->b_hdr->b_size, + arc_abd_watch, &prot); + } #endif } @@ -1425,7 +1437,7 @@ arc_buf_clone(arc_buf_t *from) buf->b_next = hdr->b_buf; hdr->b_buf = buf; arc_get_data_buf(buf); - bcopy(from->b_data, buf->b_data, size); + abd_copy(buf->b_data, from->b_data, size); /* * This buffer already exists in the arc so create a duplicate @@ -1475,8 +1487,8 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) } static void -arc_buf_free_on_write(void *data, size_t size, - void (*free_func)(void *, size_t)) +arc_buf_free_on_write(abd_t *data, size_t size, + void (*free_func)(abd_t *, size_t)) { l2arc_data_free_t *df; @@ -1494,7 +1506,7 @@ arc_buf_free_on_write(void *data, size_t size, * the buffer is placed on l2arc_free_on_write to be freed later. */ static void -arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) +arc_buf_data_free(arc_buf_t *buf, void (*free_func)(abd_t *, size_t)) { arc_buf_hdr_t *hdr = buf->b_hdr; @@ -1522,7 +1534,7 @@ arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) ASSERT(HDR_L2_WRITING(hdr)); arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size, - zio_data_buf_free); + abd_free); ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); l2hdr->b_tmp_cdata = NULL; } @@ -1543,11 +1555,11 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) if (!recycle) { if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf, zio_buf_free); + arc_buf_data_free(buf, abd_free); arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf, zio_data_buf_free); + arc_buf_data_free(buf, abd_free); arc_space_return(size, ARC_SPACE_DATA); } } @@ -1814,7 +1826,7 @@ arc_buf_eviction_needed(arc_buf_t *buf) * it can't get a hash_lock on, and so may not catch all candidates. * It may also return without evicting as much space as requested. 
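 * With ABD, the stolen buffer is handed back as an abd_t rather than a
 * raw pointer, so the caller (see arc_get_data_buf()) may receive either
 * a linear or a scatter buffer.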
*/ -static void * +static abd_t * arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { @@ -1824,7 +1836,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, list_t *list = &state->arcs_list[type]; kmutex_t *hash_lock; boolean_t have_lock; - void *stolen = NULL; + abd_t *stolen = NULL; arc_buf_hdr_t marker = {{{ 0 }}}; int count = 0; @@ -2752,11 +2764,11 @@ arc_get_data_buf(arc_buf_t *buf) */ if (!arc_evict_needed(type)) { if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); + buf->b_data = abd_alloc_linear(size); arc_space_consume(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); + buf->b_data = abd_alloc_scatter(size); arc_space_consume(size, ARC_SPACE_DATA); } goto out; @@ -2801,7 +2813,7 @@ arc_get_data_buf(arc_buf_t *buf) if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) { if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); + buf->b_data = abd_alloc_linear(size); arc_space_consume(size, ARC_SPACE_META); /* @@ -2816,7 +2828,7 @@ arc_get_data_buf(arc_buf_t *buf) cv_signal(&arc_reclaim_thr_cv); } else { ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); + buf->b_data = abd_alloc_scatter(size); arc_space_consume(size, ARC_SPACE_DATA); } @@ -2994,7 +3006,7 @@ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, buf->b_hdr->b_size); + abd_copy_to_buf(arg, buf->b_data, buf->b_hdr->b_size); VERIFY(arc_buf_remove_ref(buf, arg)); } @@ -3062,10 +3074,15 @@ arc_read_done(zio_t *zio) if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); + + void *ziobuf = abd_borrow_buf_copy(zio->io_data, zio->io_size); + if (BP_GET_LEVEL(zio->io_bp) > 0) - byteswap_uint64_array(buf->b_data, hdr->b_size); + byteswap_uint64_array(ziobuf, hdr->b_size); else - dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size); + dmu_ot_byteswap[bswap].ob_func(ziobuf, hdr->b_size); + + abd_return_buf_copy(zio->io_data, ziobuf, zio->io_size); } arc_cksum_compute(buf, B_FALSE); @@ -4956,7 +4973,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, list_t *list; uint64_t write_asize, write_psize, write_sz, headroom, buf_compress_minsz; - void *buf_data; + abd_t *buf_data; kmutex_t *list_lock = NULL; boolean_t full; l2arc_write_callback_t *cb; @@ -5230,26 +5247,32 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) { - void *cdata; + abd_t *cdata; + void *ddata; size_t csize, len, rounded; ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); ASSERT(l2hdr->b_tmp_cdata != NULL); len = l2hdr->b_asize; - cdata = zio_data_buf_alloc(len); - csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, - cdata, l2hdr->b_asize); + cdata = abd_alloc_linear(len); + + ddata = abd_borrow_buf_copy(l2hdr->b_tmp_cdata, l2hdr->b_asize); + + csize = zio_compress_data(ZIO_COMPRESS_LZ4, ddata, + ABD_TO_BUF(cdata), l2hdr->b_asize); + + abd_return_buf(l2hdr->b_tmp_cdata, ddata, l2hdr->b_asize); rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); if (rounded > csize) { - bzero((char *)cdata + csize, rounded - csize); + abd_zero_off(cdata, rounded - csize, csize); csize = rounded; } if (csize == 0) { /* zero block, indicate that there's nothing to write */ - zio_data_buf_free(cdata, len); + 
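/* release the temporary linear abd in place of a zio_data_buf */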
abd_free(cdata, len); l2hdr->b_compress = ZIO_COMPRESS_EMPTY; l2hdr->b_asize = 0; l2hdr->b_tmp_cdata = NULL; @@ -5270,7 +5293,7 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) * Compression failed, release the compressed buffer. * l2hdr will be left unmodified. */ - zio_data_buf_free(cdata, len); + abd_free(cdata, len); ARCSTAT_BUMP(arcstat_l2_compress_failures); return (B_FALSE); } @@ -5311,9 +5334,10 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) * buffer's contents. */ ASSERT(hdr->b_buf != NULL); - bzero(hdr->b_buf->b_data, hdr->b_size); + abd_zero(hdr->b_buf->b_data, hdr->b_size); zio->io_data = zio->io_orig_data = hdr->b_buf->b_data; } else { + void *ddata; ASSERT(zio->io_data != NULL); /* * We copy the compressed data from the start of the arc buffer @@ -5327,10 +5351,15 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) */ csize = zio->io_size; cdata = zio_data_buf_alloc(csize); - bcopy(zio->io_data, cdata, csize); - if (zio_decompress_data(c, cdata, zio->io_data, csize, + + abd_copy_to_buf(cdata, zio->io_data, csize); + ddata = abd_borrow_buf(zio->io_data, hdr->b_size); + + if (zio_decompress_data(c, cdata, ddata, csize, hdr->b_size) != 0) zio->io_error = SET_ERROR(EIO); + + abd_return_buf_copy(zio->io_data, ddata, hdr->b_size); zio_data_buf_free(cdata, csize); } @@ -5356,7 +5385,7 @@ l2arc_release_cdata_buf(arc_buf_hdr_t *ab) * temporary buffer for it, so now we need to release it. */ ASSERT(l2hdr->b_tmp_cdata != NULL); - zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size); + abd_free(l2hdr->b_tmp_cdata, ab->b_size); l2hdr->b_tmp_cdata = NULL; } else { ASSERT(l2hdr->b_tmp_cdata == NULL); diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 7c8f932f5cf3..166eb33e4d07 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -126,7 +126,7 @@ bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) ASSERT3U(offset, >=, dbuf->db_offset); ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - objarray = dbuf->db_data; + objarray = ABD_TO_BUF(dbuf->db_data); bpobj_free(os, objarray[blkoff], tx); } if (dbuf) { @@ -170,7 +170,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); - bpo->bpo_phys = bpo->bpo_dbuf->db_data; + bpo->bpo_phys = ABD_TO_BUF(bpo->bpo_dbuf->db_data); return (0); } @@ -234,7 +234,7 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, ASSERT3U(offset, >=, dbuf->db_offset); ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - bparray = dbuf->db_data; + bparray = ABD_TO_BUF(dbuf->db_data); bp = &bparray[blkoff]; err = func(arg, bp, tx); if (err) @@ -294,7 +294,7 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, ASSERT3U(offset, >=, dbuf->db_offset); ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - objarray = dbuf->db_data; + objarray = ABD_TO_BUF(dbuf->db_data); err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); if (err) break; @@ -433,7 +433,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) numsubsub * sizeof (subobj)); dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - numsubsub * sizeof (subobj), subdb->db_data, tx); + numsubsub * sizeof (subobj), + ABD_TO_BUF(subdb->db_data), tx); dmu_buf_rele(subdb, FTAG); bpo->bpo_phys->bpo_num_subobjs += numsubsub; @@ -501,7 +502,7 @@ 
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) } dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); - bparray = bpo->bpo_cached_dbuf->db_data; + bparray = ABD_TO_BUF(bpo->bpo_cached_dbuf->db_data); bparray[blkoff] = stored_bp; dmu_buf_will_dirty(bpo->bpo_dbuf, tx); diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c index d6ea9d7c6451..acd22f7d8f70 100644 --- a/module/zfs/bptree.c +++ b/module/zfs/bptree.c @@ -74,7 +74,7 @@ bptree_alloc(objset_t *os, dmu_tx_t *tx) */ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); - bt = db->db_data; + bt = ABD_TO_BUF(db->db_data); bt->bt_begin = 0; bt->bt_end = 0; bt->bt_bytes = 0; @@ -92,7 +92,7 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) bptree_phys_t *bt; VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; + bt = ABD_TO_BUF(db->db_data); ASSERT3U(bt->bt_begin, ==, bt->bt_end); ASSERT0(bt->bt_bytes); ASSERT0(bt->bt_comp); @@ -110,7 +110,7 @@ bptree_is_empty(objset_t *os, uint64_t obj) boolean_t rv; VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; + bt = ABD_TO_BUF(db->db_data); rv = (bt->bt_begin == bt->bt_end); dmu_buf_rele(db, FTAG); return (rv); @@ -132,7 +132,7 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, ASSERT(dmu_tx_is_syncing(tx)); VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; + bt = ABD_TO_BUF(db->db_data); bte = kmem_zalloc(sizeof (*bte), KM_SLEEP); bte->be_birth_txg = birth_txg; @@ -203,7 +203,7 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, if (free) dmu_buf_will_dirty(db, tx); - ba.ba_phys = db->db_data; + ba.ba_phys = ABD_TO_BUF(db->db_data); ba.ba_free = free; ba.ba_func = func; ba.ba_arg = arg; From d001d873cc2c903b7777135a47f0b3443542d4b1 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 07/26] Handle abd_t in dbuf.c, ddt.c, dmu*.c Signed-off-by: Chunwei Chen --- module/zfs/dbuf.c | 90 ++++++++++++++++++++++++--------------- module/zfs/ddt.c | 3 +- module/zfs/dmu.c | 35 +++++++++------ module/zfs/dmu_diff.c | 2 +- module/zfs/dmu_objset.c | 14 +++--- module/zfs/dmu_send.c | 49 ++++++++++++++------- module/zfs/dmu_traverse.c | 10 ++--- module/zfs/dmu_tx.c | 2 +- 8 files changed, 127 insertions(+), 78 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index f10a04d112a8..170e7b996469 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -255,8 +255,9 @@ dbuf_evict_user(dmu_buf_impl_t *db) if (db->db_level != 0 || db->db_evict_func == NULL) return; - if (db->db_user_data_ptr_ptr) - *db->db_user_data_ptr_ptr = db->db.db_data; + /* XXX ABD: db_data might be freed */ +// if (db->db_user_data_ptr_ptr) +// *db->db_user_data_ptr_ptr = ABD_TO_BUF(db->db.db_data); db->db_evict_func(&db->db, db->db_user_ptr); db->db_user_ptr = NULL; db->db_user_data_ptr_ptr = NULL; @@ -363,6 +364,21 @@ dbuf_fini(void) */ #ifdef ZFS_DEBUG +static int +checkzero(const void *p, uint64_t size, void *private) +{ + int i; + const uint64_t *x = p; + uint64_t *t = private; + for (i = 0; i < size >> 3; i++) { + if (x[i] != 0) { + *t = x[i]; + return (1); + } + } + return (0); +} + static void dbuf_verify(dmu_buf_impl_t *db) { @@ -446,7 +462,8 @@ dbuf_verify(dmu_buf_impl_t *db) */ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ASSERT3P(db->db_blkptr, ==, - ((blkptr_t *)db->db_parent->db.db_data + + ((blkptr_t *) + ABD_TO_BUF(db->db_parent->db.db_data) + db->db_blkid % epb)); } } @@ -461,12 +478,10 @@ dbuf_verify(dmu_buf_impl_t 
*db) * data when we evict this buffer. */ if (db->db_dirtycnt == 0) { - ASSERTV(uint64_t *buf = db->db.db_data); - int i; - - for (i = 0; i < db->db.db_size >> 3; i++) { - ASSERT(buf[i] == 0); - } + uint64_t tmp = 0; + abd_iterate_rfunc(db->db.db_data, db->db.db_size, + checkzero, &tmp); + ASSERT(tmp == 0); } } DB_DNODE_EXIT(db); @@ -479,7 +494,7 @@ dbuf_update_data(dmu_buf_impl_t *db) ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_level == 0 && db->db_user_data_ptr_ptr) { ASSERT(!refcount_is_zero(&db->db_holds)); - *db->db_user_data_ptr_ptr = db->db.db_data; + *db->db_user_data_ptr_ptr = ABD_TO_BUF(db->db.db_data); } } @@ -517,7 +532,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) mutex_exit(&db->db_mtx); abuf = arc_loan_buf(spa, blksz); - bcopy(db->db.db_data, abuf->b_data, blksz); + abd_copy(abuf->b_data, db->db.db_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); @@ -554,7 +569,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_level == 0 && db->db_freed_in_flight) { /* we were freed in flight; disregard any error */ arc_release(buf, db); - bzero(buf->b_data, db->db.db_size); + abd_zero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); @@ -593,12 +608,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); ASSERT3U(bonuslen, <=, db->db.db_size); - db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); + db->db.db_data = abd_alloc_linear(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) - bzero(db->db.db_data, DN_MAX_BONUSLEN); + abd_zero(db->db.db_data, DN_MAX_BONUSLEN); if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + abd_copy_from_buf(db->db.db_data, + DN_BONUS(dn->dn_phys), bonuslen); DB_DNODE_EXIT(db); dbuf_update_data(db); db->db_state = DB_CACHED; @@ -619,7 +635,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) DB_DNODE_EXIT(db); dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, db->db.db_size, db, type)); - bzero(db->db.db_data, db->db.db_size); + abd_zero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; *flags |= DB_RF_CACHED; mutex_exit(&db->db_mtx); @@ -794,7 +810,8 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) if (dr == NULL || (dr->dt.dl.dr_data != - ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) + ((db->db_blkid == DMU_BONUS_BLKID) ? 
ABD_TO_BUF(db->db.db_data) : + db->db_buf))) return; /* @@ -809,14 +826,15 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); + abd_copy_to_buf(dr->dt.dl.dr_data, db->db.db_data, + DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); - bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); + abd_copy(dr->dt.dl.dr_data->b_data, db->db.db_data, size); } else { dbuf_set_data(db, NULL); } @@ -957,7 +975,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); - bzero(db->db.db_data, db->db.db_size); + abd_zero(db->db.db_data, db->db.db_size); arc_buf_freeze(db->db_buf); } @@ -1034,10 +1052,10 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) /* copy old block data to the new block */ obuf = db->db_buf; - bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); + abd_copy(buf->b_data, obuf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) - bzero((uint8_t *)buf->b_data + osize, size - osize); + abd_zero_off(buf->b_data, size - osize, osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); @@ -1206,7 +1224,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db.db_data; + data_old = ABD_TO_BUF(db->db.db_data); } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* * Release the data buffer from the cache so @@ -1493,7 +1511,7 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? 
*/ - bzero(db->db.db_data, db->db.db_size); + abd_zero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; } db->db_state = DB_CACHED; @@ -1563,7 +1581,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); - bcopy(buf->b_data, db->db.db_data, db->db.db_size); + abd_copy(db->db.db_data, buf->b_data, db->db.db_size); VERIFY(arc_buf_remove_ref(buf, db)); xuio_stat_wbuf_copied(); return; @@ -1629,7 +1647,7 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); if (db->db_blkid == DMU_BONUS_BLKID) { - zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + abd_free(db->db.db_data, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db.db_data = NULL; @@ -1737,7 +1755,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, *parentp = NULL; return (err); } - *bpp = ((blkptr_t *)(*parentp)->db.db_data) + + *bpp = ((blkptr_t *)ABD_TO_BUF((*parentp)->db.db_data)) + (blkid & ((1ULL << epbs) - 1)); return (0); } else { @@ -2023,8 +2041,9 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh) dbuf_set_data(dh->dh_db, arc_buf_alloc(dh->dh_dn->dn_objset->os_spa, dh->dh_db->db.db_size, dh->dh_db, dh->dh_type)); - bcopy(dh->dh_dr->dt.dl.dr_data->b_data, - dh->dh_db->db.db_data, dh->dh_db->db.db_size); + abd_copy(dh->dh_db->db.db_data, + dh->dh_dr->dt.dl.dr_data->b_data, + dh->dh_db->db.db_size); } } @@ -2430,7 +2449,7 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) mutex_enter(&db->db_mtx); db->db_parent = parent; } - db->db_blkptr = (blkptr_t *)parent->db.db_data + + db->db_blkptr = (blkptr_t *)ABD_TO_BUF(parent->db.db_data) + (db->db_blkid & ((1ULL << epbs) - 1)); DBUF_VERIFY(db); } @@ -2515,7 +2534,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ - ASSERT(db->db.db_data != dr->dt.dl.dr_data); + ASSERT(!ABD_IS_LINEAR(db->db.db_data) || + ABD_TO_BUF(db->db.db_data) != dr->dt.dl.dr_data); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } @@ -2545,7 +2565,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); DB_DNODE_EXIT(db); - if (*datap != db->db.db_data) { + if (*datap != ABD_TO_BUF(db->db.db_data)) { zio_buf_free(*datap, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } @@ -2606,7 +2626,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) int blksz = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); *datap = arc_buf_alloc(os->os_spa, blksz, db, type); - bcopy(db->db.db_data, (*datap)->b_data, blksz); + abd_copy((*datap)->b_data, db->db.db_data, blksz); } db->db_data_pending = dr; @@ -2705,7 +2725,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { - dnode_phys_t *dnp = db->db.db_data; + dnode_phys_t *dnp = ABD_TO_BUF(db->db.db_data); for (i = db->db.db_size >> DNODE_SHIFT; i > 0; i--, dnp++) { if (dnp->dn_type != DMU_OT_NONE) @@ -2719,7 +2739,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) } } } else { - blkptr_t *ibp = db->db.db_data; + blkptr_t *ibp = ABD_TO_BUF(db->db.db_data); ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) diff 
--git a/module/zfs/ddt.c b/module/zfs/ddt.c index 18557ffb5c1f..4d9fab6392b8 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -705,8 +705,7 @@ ddt_free(ddt_entry_t *dde) ASSERT(dde->dde_lead_zio[p] == NULL); if (dde->dde_repair_data != NULL) - zio_buf_free(dde->dde_repair_data, - DDK_GET_PSIZE(&dde->dde_key)); + abd_free(dde->dde_repair_data, DDK_GET_PSIZE(&dde->dde_key)); cv_destroy(&dde->dde_cv); kmem_cache_free(ddt_entry_cache, dde); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 1501ae8046ad..084e0b970bd2 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -793,7 +793,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - bcopy((char *)db->db_data + bufoff, buf, tocpy); + abd_copy_to_buf_off(buf, db->db_data, tocpy, bufoff); offset += tocpy; size -= tocpy; @@ -835,7 +835,7 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, else dmu_buf_will_dirty(db, tx); - (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); + abd_copy_from_buf_off(db->db_data, buf, tocpy, bufoff); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); @@ -960,6 +960,7 @@ dmu_xuio_fini(xuio_t *xuio) * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } * and increase priv->next by 1. */ +/* TODO: abd handle xuio */ int dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) { @@ -1044,7 +1045,8 @@ xuio_stat_wbuf_nocopy() * return value is the number of bytes successfully copied to arg_buf. */ static int -dmu_req_copy(void *arg_buf, int size, struct request *req, size_t req_offset) +dmu_req_copy(abd_t *db_data, int db_offset, int size, struct request *req, + size_t req_offset) { struct bio_vec bv, *bvp; struct req_iterator iter; @@ -1078,9 +1080,11 @@ dmu_req_copy(void *arg_buf, int size, struct request *req, size_t req_offset) ASSERT3P(bv_buf, !=, NULL); if (rq_data_dir(req) == WRITE) - memcpy(arg_buf + offset, bv_buf, tocpy); + abd_copy_from_buf_off(db_data, bv_buf, tocpy, + db_offset + offset); else - memcpy(bv_buf, arg_buf + offset, tocpy); + abd_copy_to_buf_off(bv_buf, db_data, tocpy, + db_offset + offset); offset += tocpy; } @@ -1118,7 +1122,7 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req) if (tocpy == 0) break; - didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req, + didcpy = dmu_req_copy(db->db_data, bufoff, tocpy, req, req_offset); if (didcpy < tocpy) @@ -1173,7 +1177,7 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) else dmu_buf_will_dirty(db, tx); - didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req, + didcpy = dmu_req_copy(db->db_data, bufoff, tocpy, req, req_offset); if (tocpy == db->db_size) @@ -1236,8 +1240,8 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) else XUIOSTAT_BUMP(xuiostat_rbuf_copied); } else { - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); + err = abd_uiomove_off(db->db_data, tocpy, UIO_READ, + uio, bufoff); } if (err) break; @@ -1285,8 +1289,8 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) * to lock the pages in memory, so that uiomove won't * block. 
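 * With ABD the transfer goes through abd_uiomove_off(), which takes the
 * dbuf's abd_t plus a byte offset in place of a raw data pointer.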
*/ - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_WRITE, uio); + err = abd_uiomove_off(db->db_data, tocpy, UIO_WRITE, uio, + bufoff); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); @@ -1399,6 +1403,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, } else { objset_t *os; uint64_t object; + void *tmp_buf; DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); @@ -1407,7 +1412,13 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, DB_DNODE_EXIT(dbuf); dbuf_rele(db, FTAG); - dmu_write(os, object, offset, blksz, buf->b_data, tx); + + tmp_buf = abd_borrow_buf_copy(buf->b_data, blksz); + + dmu_write(os, object, offset, blksz, tmp_buf, tx); + + abd_return_buf(buf->b_data, tmp_buf, blksz); + dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index 30fabbb07957..c7cbf55bdde6 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -138,7 +138,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, &aflags, zb) != 0) return (SET_ERROR(EIO)); - blk = abuf->b_data; + blk = ABD_TO_BUF(abuf->b_data); for (i = 0; i < blksz >> DNODE_SHIFT; i++) { uint64_t dnobj = (zb->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index f438ca62a11f..ad8082d3dfef 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -312,22 +312,22 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, arc_buf_t *buf = arc_buf_alloc(spa, sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); - bzero(buf->b_data, sizeof (objset_phys_t)); - bcopy(os->os_phys_buf->b_data, buf->b_data, + abd_zero(buf->b_data, sizeof (objset_phys_t)); + abd_copy(buf->b_data, os->os_phys_buf->b_data, arc_buf_size(os->os_phys_buf)); (void) arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } - os->os_phys = os->os_phys_buf->b_data; + os->os_phys = ABD_TO_BUF(os->os_phys_buf->b_data); os->os_flags = os->os_phys->os_flags; } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; os->os_phys_buf = arc_buf_alloc(spa, size, &os->os_phys_buf, ARC_BUFC_METADATA); - os->os_phys = os->os_phys_buf->b_data; + os->os_phys = ABD_TO_BUF(os->os_phys_buf->b_data); bzero(os->os_phys, size); } @@ -1219,7 +1219,7 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) void *data; if (db->db_dirtycnt == 0) - return (db->db.db_data); /* Nothing is changing */ + return (ABD_TO_BUF(db->db.db_data)); /* Nothing is changing */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg == tx->tx_txg) @@ -1235,7 +1235,7 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dn->dn_bonuslen == 0 && dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) - data = dr->dt.dl.dr_data->b_data; + data = ABD_TO_BUF(dr->dt.dl.dr_data->b_data); else data = dr->dt.dl.dr_data; @@ -1284,7 +1284,7 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) FTAG, (dmu_buf_t **)&db); ASSERT(error == 0); mutex_enter(&db->db_mtx); - data = (before) ? db->db.db_data : + data = (before) ? 
ABD_TO_BUF(db->db.db_data) : dmu_objset_userquota_find_data(db, tx); have_spill = B_TRUE; } else { diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 1f61368c5d65..4f3c07fd5ac2 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -446,6 +446,16 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) +static int +fillbadblock(void *p, uint64_t size, void *private) +{ + int i; + uint64_t *x = p; + for (i = 0; i < size >> 3; i++) + x[i] = 0x2f5baddb10cULL; + return (0); +} + /* ARGSUSED */ static int backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, @@ -489,7 +499,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, &aflags, zb) != 0) return (SET_ERROR(EIO)); - blk = abuf->b_data; + blk = ABD_TO_BUF(abuf->b_data); for (i = 0; i < blksz >> DNODE_SHIFT; i++) { uint64_t dnobj = (zb->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; @@ -508,7 +518,8 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, &aflags, zb) != 0) return (SET_ERROR(EIO)); - err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); + err = dump_spill(dsp, zb->zb_object, blksz, + ABD_TO_BUF(abuf->b_data)); (void) arc_buf_remove_ref(abuf, &abuf); } else if (backup_do_embed(dsp, bp)) { /* it's an embedded level-0 block of a regular object */ @@ -519,6 +530,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, uint32_t aflags = ARC_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); + void *buf; ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT0(zb->zb_level); @@ -526,21 +538,21 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { if (zfs_send_corrupt_data) { - uint64_t *ptr; /* Send a block filled with 0x"zfs badd bloc" */ abuf = arc_buf_alloc(spa, blksz, &abuf, ARC_BUFC_DATA); - for (ptr = abuf->b_data; - (char *)ptr < (char *)abuf->b_data + blksz; - ptr++) - *ptr = 0x2f5baddb10cULL; + + abd_iterate_wfunc(abuf->b_data, blksz, + fillbadblock, NULL); } else { return (SET_ERROR(EIO)); } } + buf = abd_borrow_buf_copy(abuf->b_data, blksz); err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz, - blksz, bp, abuf->b_data); + blksz, bp, buf); + abd_return_buf(abuf->b_data, buf, blksz); (void) arc_buf_remove_ref(abuf, &abuf); } @@ -1431,12 +1443,12 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); - bcopy(data, db->db_data, drro->drr_bonuslen); + bcopy(data, ABD_TO_BUF(db->db_data), drro->drr_bonuslen); if (ra->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drro->drr_bonustype); - dmu_ot_byteswap[byteswap].ob_func(db->db_data, - drro->drr_bonuslen); + dmu_ot_byteswap[byteswap].ob_func( + ABD_TO_BUF(db->db_data), drro->drr_bonuslen); } dmu_buf_rele(db, FTAG); } @@ -1476,7 +1488,7 @@ restore_write(struct restorearg *ra, objset_t *os, dmu_tx_t *tx; dmu_buf_t *bonus; arc_buf_t *abuf; - void *data; + void *data, *lbuf; int err; if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || @@ -1490,9 +1502,11 @@ restore_write(struct restorearg *ra, objset_t *os, return (SET_ERROR(EINVAL)); abuf = dmu_request_arcbuf(bonus, drrw->drr_length); + lbuf = abd_borrow_buf(abuf->b_data, drrw->drr_length); - data = restore_read(ra, drrw->drr_length, abuf->b_data); + data = restore_read(ra, drrw->drr_length, lbuf); if (data == NULL) { + 
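/* restore_read() failed: hand the borrowed buffer back without copying it into the abd */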
abd_return_buf(abuf->b_data, lbuf, drrw->drr_length); dmu_return_arcbuf(abuf); dmu_buf_rele(bonus, FTAG); return (ra->err); @@ -1504,6 +1518,7 @@ restore_write(struct restorearg *ra, objset_t *os, drrw->drr_offset, drrw->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { + abd_return_buf(abuf->b_data, lbuf, drrw->drr_length); dmu_return_arcbuf(abuf); dmu_buf_rele(bonus, FTAG); dmu_tx_abort(tx); @@ -1514,6 +1529,7 @@ restore_write(struct restorearg *ra, objset_t *os, DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); } + abd_return_buf_copy(abuf->b_data, lbuf, drrw->drr_length); dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); dmu_tx_commit(tx); dmu_buf_rele(bonus, FTAG); @@ -1538,6 +1554,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os, avl_index_t where; objset_t *ref_os = NULL; dmu_buf_t *dbp; + void *buf; if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) return (SET_ERROR(EINVAL)); @@ -1572,8 +1589,10 @@ restore_write_byref(struct restorearg *ra, objset_t *os, dmu_tx_abort(tx); return (err); } + buf = abd_borrow_buf_copy(dbp->db_data, drrwbr->drr_length); dmu_write(os, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); + drrwbr->drr_offset, drrwbr->drr_length, buf, tx); + abd_return_buf(dbp->db_data, buf, drrwbr->drr_length); dmu_buf_rele(dbp, FTAG); dmu_tx_commit(tx); return (0); @@ -1662,7 +1681,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) if (db_spill->db_size < drrs->drr_length) VERIFY(0 == dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); - bcopy(data, db_spill->db_data, drrs->drr_length); + abd_copy_from_buf(db_spill->db_data, data, drrs->drr_length); dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index b5c1ec758f8b..5b053aa4d5a8 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -295,7 +295,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, zb->zb_level - 1, zb->zb_blkid * epb + i); traverse_prefetch_metadata(td, - &((blkptr_t *)buf->b_data)[i], czb); + &((blkptr_t *)ABD_TO_BUF(buf->b_data))[i], czb); } /* recursively visitbp() blocks below this */ @@ -304,7 +304,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, zb->zb_level - 1, zb->zb_blkid * epb + i); err = traverse_visitbp(td, dnp, - &((blkptr_t *)buf->b_data)[i], czb); + &((blkptr_t *)ABD_TO_BUF(buf->b_data))[i], czb); if (err != 0) break; } @@ -320,7 +320,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; - dnp = buf->b_data; + dnp = ABD_TO_BUF(buf->b_data); for (i = 0; i < epb; i++) { prefetch_dnode_metadata(td, &dnp[i], zb->zb_objset, @@ -344,7 +344,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, if (err != 0) goto post; - osp = buf->b_data; + osp = ABD_TO_BUF(buf->b_data); dnp = &osp->os_meta_dnode; prefetch_dnode_metadata(td, dnp, zb->zb_objset, DMU_META_DNODE_OBJECT); @@ -552,7 +552,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, if (err != 0) return (err); - osp = buf->b_data; + osp = ABD_TO_BUF(buf->b_data); traverse_zil(td, &osp->os_zil_header); (void) arc_buf_remove_ref(buf, &buf); } diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index cdf5a6d0fcfa..4b828e5bbc0a 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -548,7 +548,7 @@ dmu_tx_count_free(dmu_tx_hold_t 
*txh, uint64_t off, uint64_t len) break; } - bp = dbuf->db.db_data; + bp = ABD_TO_BUF(dbuf->db.db_data); bp += blkoff; for (i = 0; i < tochk; i++) { From 4812c902906b47a415bf8067b769542f77dd935b Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 08/26] Handle abd_t in dnode*.c, dsl_*.c Signed-off-by: Chunwei Chen --- module/zfs/dnode.c | 14 +++++----- module/zfs/dnode_sync.c | 56 ++++++++++++++++++++++++++------------- module/zfs/dsl_dataset.c | 10 +++---- module/zfs/dsl_deadlist.c | 4 +-- module/zfs/dsl_dir.c | 6 ++--- module/zfs/dsl_scan.c | 18 ++++++++----- 6 files changed, 64 insertions(+), 44 deletions(-) diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index ef74621a0f6c..eb8de04470d4 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -235,7 +235,7 @@ dnode_verify(dnode_t *dn) ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, - (dnode_phys_t *)dn->dn_dbuf->db.db_data + + (dnode_phys_t *)ABD_TO_BUF(dn->dn_dbuf->db.db_data) + (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); } if (drop_struct_lock) @@ -1089,7 +1089,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, dnh = &children_dnodes->dnc_children[idx]; zrl_add(&dnh->dnh_zrlock); if ((dn = dnh->dnh_dnode) == NULL) { - dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; + dnode_phys_t *phys = + (dnode_phys_t *)ABD_TO_BUF(db->db.db_data) + idx; dnode_t *winner; dn = dnode_create(os, phys, db, object, dnh); @@ -1509,16 +1510,13 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) head = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, FTAG, &db) == 0) { - caddr_t data; - /* don't dirty if it isn't on disk and isn't dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - data = db->db.db_data; - bzero(data + blkoff, head); + abd_zero_off(db->db.db_data, head, blkoff); } dbuf_rele(db, FTAG); } @@ -1553,7 +1551,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) rw_exit(&dn->dn_struct_rwlock); dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - bzero(db->db.db_data, tail); + abd_zero(db->db.db_data, tail); } dbuf_rele(db, FTAG); } @@ -1780,7 +1778,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, dbuf_rele(db, FTAG); return (error); } - data = db->db.db_data; + data = ABD_TO_BUF(db->db.db_data); } diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 1825e983551c..efd52c8f6f1f 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -69,7 +69,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) ASSERT(db->db.db_data); ASSERT(arc_released(db->db_buf)); ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); - bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, + bcopy(dn->dn_phys->dn_blkptr, ABD_TO_BUF(db->db.db_data), sizeof (blkptr_t) * nblkptr); arc_buf_freeze(db->db_buf); } @@ -98,7 +98,8 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) child->db_parent = db; dbuf_add_ref(db, child); if (db->db.db_data) - child->db_blkptr = (blkptr_t *)db->db.db_data + i; + child->db_blkptr = + (blkptr_t *)ABD_TO_BUF(db->db.db_data) + i; else child->db_blkptr = NULL; dprintf_dbuf_bp(child, child->db_blkptr, @@ -159,6 +160,21 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) } #ifdef ZFS_DEBUG +static 
int +checkzero(const void *p, uint64_t size, void *private) +{ + int i; + const uint64_t *x = p; + uint64_t *t = private; + for (i = 0; i < size >> 3; i++) { + if (x[i] != 0) { + *t = x[i]; + return (1); + } + } + return (0); +} + static void free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) { @@ -181,10 +197,9 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) ASSERT(db->db_blkptr != NULL); for (i = off; i < off+num; i++) { - uint64_t *buf; + abd_t *buf; dmu_buf_impl_t *child; dbuf_dirty_record_t *dr; - int j; ASSERT(db->db_level == 1); @@ -203,13 +218,14 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) /* data_old better be zeroed */ if (dr) { + uint64_t tmp = 0; buf = dr->dt.dl.dr_data->b_data; - for (j = 0; j < child->db.db_size >> 3; j++) { - if (buf[j] != 0) { - panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - (void *)child, i, off, num); - } + abd_iterate_rfunc(buf, child->db.db_size, + checkzero, &tmp); + if (tmp) { + panic("freed data not zero: " + "child=%p i=%d off=%d num=%d\n", + (void *)child, i, off, num); } } @@ -221,12 +237,13 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) buf = child->db.db_data; if (buf != NULL && child->db_state != DB_FILL && child->db_last_dirty == NULL) { - for (j = 0; j < child->db.db_size >> 3; j++) { - if (buf[j] != 0) { - panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - (void *)child, i, off, num); - } + uint64_t tmp = 0; + abd_iterate_rfunc(buf, child->db.db_size, + checkzero, &tmp); + if (tmp) { + panic("freed data not zero: " + "child=%p i=%d off=%d num=%d\n", + (void *)child, i, off, num); } } mutex_exit(&child->db_mtx); @@ -257,7 +274,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); dbuf_release_bp(db); - bp = db->db.db_data; + bp = (blkptr_t *)ABD_TO_BUF(db->db.db_data); DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -296,13 +313,14 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, } /* If this whole block is free, free ourself too. 
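 * Indirect blocks are ARC metadata and thus linear ABDs (arc_get_data_buf()
 * allocates metadata with abd_alloc_linear()), so the direct ABD_TO_BUF()
 * access below is safe.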
*/ - for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { + bp = ABD_TO_BUF(db->db.db_data); + for (i = 0; i < 1 << epbs; i++, bp++) { if (!BP_IS_HOLE(bp)) break; } if (i == 1 << epbs) { /* didn't find any non-holes */ - bzero(db->db.db_data, db->db.db_size); + bzero(ABD_TO_BUF(db->db.db_data), db->db.db_size); free_blocks(dn, db->db_blkptr, 1, tx); } else { /* diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 79cb6a3a25e5..322809e8ecd3 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -290,7 +290,7 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds) FTAG, &headdbuf); if (err != 0) return (err); - headphys = headdbuf->db_data; + headphys = ABD_TO_BUF(headdbuf->db_data); err = zap_value_search(dp->dp_meta_objset, headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); dmu_buf_rele(headdbuf, FTAG); @@ -368,7 +368,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ds->ds_dbuf = dbuf; ds->ds_object = dsobj; - ds->ds_phys = dbuf->db_data; + ds->ds_phys = ABD_TO_BUF(dbuf->db_data); list_link_init(&ds->ds_synced_link); mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); @@ -459,7 +459,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, } } ASSERT3P(ds->ds_dbuf, ==, dbuf); - ASSERT3P(ds->ds_phys, ==, dbuf->db_data); + ASSERT3P(ds->ds_phys, ==, ABD_TO_BUF(dbuf->db_data)); ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); @@ -658,7 +658,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; + dsphys = ABD_TO_BUF(dbuf->db_data); bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = dd->dd_object; dsphys->ds_flags = flags; @@ -1064,7 +1064,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; + dsphys = ABD_TO_BUF(dbuf->db_data); bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = ds->ds_dir->dd_object; dsphys->ds_fsid_guid = unique_create(); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 8a4362ff9c1a..6a34f4e7a01e 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -111,7 +111,7 @@ dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) } dl->dl_oldfmt = B_FALSE; - dl->dl_phys = dl->dl_dbuf->db_data; + dl->dl_phys = ABD_TO_BUF(dl->dl_dbuf->db_data); dl->dl_havetree = B_FALSE; } @@ -482,7 +482,7 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) zap_cursor_fini(&zc); VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); - dlp = bonus->db_data; + dlp = ABD_TO_BUF(bonus->db_data); dmu_buf_will_dirty(bonus, tx); bzero(dlp, sizeof (*dlp)); dmu_buf_rele(bonus, FTAG); diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index b94b68e15774..b61b3a67ff41 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -101,7 +101,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_object = ddobj; dd->dd_dbuf = dbuf; dd->dd_pool = dp; - dd->dd_phys = dbuf->db_data; + dd->dd_phys = ABD_TO_BUF(dbuf->db_data); mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&dd->dd_prop_cbs, sizeof 
(dsl_prop_cb_record_t), @@ -148,7 +148,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus); if (err != 0) goto errout; - origin_phys = origin_bonus->db_data; + origin_phys = ABD_TO_BUF(origin_bonus->db_data); dd->dd_origin_txg = origin_phys->ds_creation_txg; dmu_buf_rele(origin_bonus, FTAG); @@ -405,7 +405,7 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, } VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); - ddphys = dbuf->db_data; + ddphys = ABD_TO_BUF(dbuf->db_data); ddphys->dd_creation_time = gethrestime_sec(); if (pds) diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 8b166bcc68eb..34233b801b01 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -648,11 +648,13 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, scn->scn_phys.scn_errors++; return (err); } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + cbp = ABD_TO_BUF(buf->b_data); + for (i = 0; i < epb; i++, cbp++) { dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, zb->zb_object, zb->zb_blkid * epb + i); } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + cbp = ABD_TO_BUF(buf->b_data); + for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, @@ -675,14 +677,16 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, scn->scn_phys.scn_errors++; return (err); } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + cdnp = ABD_TO_BUF(buf->b_data); + for (i = 0; i < epb; i++, cdnp++) { for (j = 0; j < cdnp->dn_nblkptr; j++) { blkptr_t *cbp = &cdnp->dn_blkptr[j]; dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, zb->zb_blkid * epb + i, j); } } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + cdnp = ABD_TO_BUF(buf->b_data); + for (i = 0; i < epb; i++, cdnp++) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, zb->zb_blkid * epb + i, tx); } @@ -700,7 +704,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, return (err); } - osp = buf->b_data; + osp = ABD_TO_BUF(buf->b_data); dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); @@ -1717,7 +1721,7 @@ dsl_scan_scrub_done(zio_t *zio) { spa_t *spa = zio->io_spa; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -1801,7 +1805,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (needs_io && !zfs_no_scrub_io) { vdev_t *rvd = spa->spa_root_vdev; uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; - void *data = zio_data_buf_alloc(size); + abd_t *data = abd_alloc_linear(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= maxinflight) From a4f28b986c3fe19eadb1e1ab88cd9cef79bbf004 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 09/26] Handle abd_t in sa_impl.h, sa.c, space_map.c, spa.c, spa_history.c and zil.c Signed-off-by: Chunwei Chen --- include/sys/sa_impl.h | 3 +-- module/zfs/sa.c | 13 ++++++++----- module/zfs/spa.c | 10 +++++----- module/zfs/spa_history.c | 6 +++--- module/zfs/space_map.c | 2 +- module/zfs/zil.c | 17 +++++++++++------ 6 files changed, 29 insertions(+), 22 deletions(-) diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h index fcbd8eb34e91..9dbe8f105d91 100644 --- a/include/sys/sa_impl.h +++ b/include/sys/sa_impl.h @@ -221,8 +221,7 @@ struct sa_handle { 
(dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill) #define SA_GET_HDR(hdl, type) \ - ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \ - type))->db.db_data)) + ((sa_hdr_phys_t *)ABD_TO_BUF((SA_GET_DB(hdl, type))->db.db_data)) #define SA_IDX_TAB_GET(hdl, type) \ (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab) diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 9063d1dae449..60a4c586fb9f 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -724,8 +724,9 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, } /* setup starting pointers to lay down data */ - data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); - sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; + data_start = + (void *)((uintptr_t)ABD_TO_BUF(hdl->sa_bonus->db_data) + hdrsize); + sahdr = (sa_hdr_phys_t *)ABD_TO_BUF(hdl->sa_bonus->db_data); buftype = SA_BONUS; attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, @@ -753,7 +754,9 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, hash = -1ULL; len_idx = 0; - sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; + sahdr = (sa_hdr_phys_t *) + ABD_TO_BUF(hdl->sa_spill->db_data); + sahdr->sa_magic = SA_MAGIC; data_start = (void *)((uintptr_t)sahdr + spillhdrsize); @@ -1676,7 +1679,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, if (dn->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); - bcopy(hdl->sa_bonus->db_data, old_data[0], + abd_copy_to_buf(old_data[0], hdl->sa_bonus->db_data, hdl->sa_bonus->db_size); bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; } else { @@ -1689,7 +1692,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, if ((error = sa_get_spill(hdl)) == 0) { spill_data_size = hdl->sa_spill->db_size; old_data[1] = zio_buf_alloc(spill_data_size); - bcopy(hdl->sa_spill->db_data, old_data[1], + abd_copy_to_buf(old_data[1], hdl->sa_spill->db_data, hdl->sa_spill->db_size); spill_attr_count = hdl->sa_spill_tab->sa_layout->lot_attr_count; diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 8bee15094685..10a4ba8349a5 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1583,7 +1583,7 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) if (error) return (error); - nvsize = *(uint64_t *)db->db_data; + nvsize = *(uint64_t *)ABD_TO_BUF(db->db_data); dmu_buf_rele(db, FTAG); packed = vmem_alloc(nvsize, KM_SLEEP); @@ -1866,7 +1866,7 @@ spa_load_verify_done(zio_t *zio) else atomic_add_64(&sle->sle_data_count, 1); } - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -1889,7 +1889,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, { zio_t *rio; size_t size; - void *data; + abd_t *data; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); @@ -1905,7 +1905,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, rio = arg; size = BP_GET_PSIZE(bp); - data = zio_data_buf_alloc(size); + data = abd_alloc_linear(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) @@ -5961,7 +5961,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = nvsize; + *(uint64_t *)ABD_TO_BUF(db->db_data) = nvsize; dmu_buf_rele(db, FTAG); } diff --git 
a/module/zfs/spa_history.c b/module/zfs/spa_history.c index 14e681e77d8b..875c671e5bd8 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -99,7 +99,7 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); ASSERT(dbp->db_size >= sizeof (spa_history_phys_t)); - shpp = dbp->db_data; + shpp = ABD_TO_BUF(dbp->db_data); dmu_buf_will_dirty(dbp, tx); /* @@ -222,7 +222,7 @@ spa_history_log_sync(void *arg, dmu_tx_t *tx) * Update the offset when the write completes. */ VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); - shpp = dbp->db_data; + shpp = ABD_TO_BUF(dbp->db_data); dmu_buf_will_dirty(dbp, tx); @@ -361,7 +361,7 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) return (err); - shpp = dbp->db_data; + shpp = ABD_TO_BUF(dbp->db_data); #ifdef ZFS_DEBUG { diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index b3aa469bf45b..d13f55ee3f18 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -349,7 +349,7 @@ space_map_open_impl(space_map_t *sm) return (error); dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks); - sm->sm_phys = sm->sm_dbuf->db_data; + sm->sm_phys = ABD_TO_BUF(sm->sm_dbuf->db_data); return (0); } diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 15897b363de6..d2ae8336c536 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -235,7 +235,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, cksum.zc_word[ZIL_ZC_SEQ]++; if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = abuf->b_data; + zil_chain_t *zilc = ABD_TO_BUF(abuf->b_data); char *lr = (char *)(zilc + 1); uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); @@ -243,12 +243,12 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { error = SET_ERROR(ECKSUM); } else { - bcopy(lr, dst, len); + memcpy(dst, lr, len); *end = (char *)dst + len; *nbp = zilc->zc_next_blk; } } else { - char *lr = abuf->b_data; + char *lr = ABD_TO_BUF(abuf->b_data); uint64_t size = BP_GET_LSIZE(bp); zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; @@ -257,7 +257,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { - bcopy(lr, dst, zilc->zc_nused); + memcpy(dst, lr, zilc->zc_nused); *end = (char *)dst + zilc->zc_nused; *nbp = zilc->zc_next_blk; } @@ -299,7 +299,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (error == 0) { if (wbuf != NULL) - bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); + abd_copy_to_buf(wbuf, abuf->b_data, arc_buf_size(abuf)); (void) arc_buf_remove_ref(abuf, &abuf); } @@ -887,6 +887,7 @@ zil_lwb_write_done(zio_t *zio) * one in zil_commit_writer(). zil_sync() will only remove * the lwb if lwb_buf is null. 
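 * The zio's io_data is only an abd_t wrapper obtained with
 * abd_get_from_buf() in zil_lwb_write_init(), so release the wrapper
 * with abd_put() before freeing the underlying lwb_buf.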
*/ + abd_put(zio->io_data); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_zio = NULL; @@ -923,12 +924,16 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ mutex_enter(&zilog->zl_lock); if (lwb->lwb_zio == NULL) { + abd_t *lwb_abd; if (!lwb->lwb_fastwrite) { metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); lwb->lwb_fastwrite = 1; } + + lwb_abd = abd_get_from_buf(lwb->lwb_buf, + BP_GET_LSIZE(&lwb->lwb_blk)); lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), + 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_FASTWRITE, &zb); From 7364eacd3bc1edb99cfb47f9910ff5a43e316a80 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 10/26] Handle abd_t in zap*.c, zfs_fuid.c, zfs_sa.c and zfs_vnops.c Signed-off-by: Chunwei Chen --- module/zfs/zap.c | 29 +++++++++++++++-------------- module/zfs/zap_micro.c | 6 +++--- module/zfs/zfs_fuid.c | 4 ++-- module/zfs/zfs_sa.c | 28 ++++++++++++++-------------- module/zfs/zfs_vnops.c | 5 +++-- 5 files changed, 37 insertions(+), 35 deletions(-) diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 5ffa138a6b4b..5ebd3b3d3988 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -91,7 +91,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) * explicitly zero it since it might be coming from an * initialized microzap */ - bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); + bzero(ABD_TO_BUF(zap->zap_dbuf->db_data), zap->zap_dbuf->db_size); zp->zap_block_type = ZBT_HEADER; zp->zap_magic = ZAP_MAGIC; @@ -117,7 +117,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); l->l_dbuf = db; - l->l_phys = db->db_data; + l->l_phys = ABD_TO_BUF(db->db_data); zap_leaf_init(l, zp->zap_normflags != 0); @@ -181,15 +181,16 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); - transfer_func(db_old->db_data, db_new->db_data, hepb); + transfer_func(ABD_TO_BUF(db_old->db_data), + ABD_TO_BUF(db_new->db_data), hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); - transfer_func((uint64_t *)db_old->db_data + hepb, - db_new->db_data, hepb); + transfer_func((uint64_t *)ABD_TO_BUF(db_old->db_data) + hepb, + ABD_TO_BUF(db_new->db_data), hepb); dmu_buf_rele(db_new, FTAG); dmu_buf_rele(db_old, FTAG); @@ -253,12 +254,11 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, return (err); } dmu_buf_will_dirty(db2, tx); - ((uint64_t *)db2->db_data)[off2] = val; - ((uint64_t *)db2->db_data)[off2+1] = val; + ((uint64_t *)ABD_TO_BUF(db2->db_data))[off2] = val; + ((uint64_t *)ABD_TO_BUF(db2->db_data))[off2+1] = val; dmu_buf_rele(db2, FTAG); } - - ((uint64_t *)db->db_data)[off] = val; + ((uint64_t *)ABD_TO_BUF(db->db_data))[off] = val; dmu_buf_rele(db, FTAG); return (0); @@ -281,7 +281,7 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if 
(err) return (err); - *valp = ((uint64_t *)db->db_data)[off]; + *valp = ((uint64_t *)ABD_TO_BUF(db->db_data))[off]; dmu_buf_rele(db, FTAG); if (tbl->zt_nextblk != 0) { @@ -350,7 +350,8 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) return (err); dmu_buf_will_dirty(db_new, tx); zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), - db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + ABD_TO_BUF(db_new->db_data), + 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); dmu_buf_rele(db_new, FTAG); zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk; @@ -530,7 +531,7 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, dmu_buf_will_dirty(db, tx); ASSERT3U(l->l_blkid, ==, blkid); ASSERT3P(l->l_dbuf, ==, db); - ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data); + ASSERT3P(l->l_phys, ==, ABD_TO_BUF(l->l_dbuf->db_data)); ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF); ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); @@ -576,7 +577,7 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) int err; ASSERT(zap->zap_dbuf == NULL || - zap->zap_f.zap_phys == zap->zap_dbuf->db_data); + zap->zap_f.zap_phys == ABD_TO_BUF(zap->zap_dbuf->db_data)); ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC); idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); err = zap_idx_to_blk(zap, idx, &blk); @@ -1324,7 +1325,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { - zap_stats_ptrtbl(zap, db->db_data, + zap_stats_ptrtbl(zap, ABD_TO_BUF(db->db_data), 1<<(bs-3), zs); dmu_buf_rele(db, FTAG); } diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index dfa7c6615659..b7f1654f7a0a 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -372,7 +372,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap->zap_object = obj; zap->zap_dbuf = db; - if (*(uint64_t *)db->db_data != ZBT_MICRO) { + if (*(uint64_t *)ABD_TO_BUF(db->db_data) != ZBT_MICRO) { mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; } else { @@ -531,7 +531,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) sz = zap->zap_dbuf->db_size; mzp = zio_buf_alloc(sz); - bcopy(zap->zap_dbuf->db_data, mzp, sz); + bcopy(ABD_TO_BUF(zap->zap_dbuf->db_data), mzp, sz); nchunks = zap->zap_m.zap_num_chunks; if (!flags) { @@ -587,7 +587,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, #endif dmu_buf_will_dirty(db, tx); - zp = db->db_data; + zp = ABD_TO_BUF(db->db_data); zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; zp->mz_normflags = normflags; diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c index 6ca61b87242f..67154cf93dab 100644 --- a/module/zfs/zfs_fuid.c +++ b/module/zfs/zfs_fuid.c @@ -120,7 +120,7 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, ASSERT(fuid_obj != 0); VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db)); - fuid_size = *(uint64_t *)db->db_data; + fuid_size = *(uint64_t *)ABD_TO_BUF(db->db_data); dmu_buf_rele(db, FTAG); if (fuid_size) { @@ -281,7 +281,7 @@ zfs_fuid_sync(zfs_sb_t *zsb, dmu_tx_t *tx) VERIFY(0 == dmu_bonus_hold(zsb->z_os, zsb->z_fuid_obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = zsb->z_fuid_size; + *(uint64_t *)ABD_TO_BUF(db->db_data) = zsb->z_fuid_size; dmu_buf_rele(db, FTAG); zsb->z_fuid_dirty = B_FALSE; diff --git a/module/zfs/zfs_sa.c 
b/module/zfs/zfs_sa.c index 257ab4254bbd..c15098373c9b 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -77,14 +77,14 @@ zfs_sa_readlink(znode_t *zp, uio_t *uio) bufsz = zp->z_size; if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) { - error = uiomove((caddr_t)db->db_data + - ZFS_OLD_ZNODE_PHYS_SIZE, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + error = abd_uiomove_off(db->db_data, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio, + ZFS_OLD_ZNODE_PHYS_SIZE); } else { dmu_buf_t *dbp; if ((error = dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) { - error = uiomove(dbp->db_data, + error = abd_uiomove(dbp->db_data, MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); dmu_buf_rele(dbp, FTAG); } @@ -101,8 +101,8 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) VERIFY(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0); if (len) { - bcopy(link, (caddr_t)db->db_data + - ZFS_OLD_ZNODE_PHYS_SIZE, len); + abd_copy_from_buf_off(db->db_data, link, len, + ZFS_OLD_ZNODE_PHYS_SIZE); } } else { dmu_buf_t *dbp; @@ -114,7 +114,7 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) dmu_buf_will_dirty(dbp, tx); ASSERT3U(len, <=, dbp->db_size); - bcopy(link, dbp->db_data, len); + abd_copy_from_buf(dbp->db_data, link, len); dmu_buf_rele(dbp, FTAG); } } @@ -145,9 +145,9 @@ zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap) ZFS_OLD_ZNODE_PHYS_SIZE; if (len <= doi.doi_bonus_size) { - (void) memcpy(xoap->xoa_av_scanstamp, - (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - sizeof (xoap->xoa_av_scanstamp)); + abd_copy_to_buf_off(xoap->xoa_av_scanstamp, + db->db_data, sizeof (xoap->xoa_av_scanstamp), + ZFS_OLD_ZNODE_PHYS_SIZE); } } XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); @@ -175,8 +175,8 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) ZFS_OLD_ZNODE_PHYS_SIZE; if (len > doi.doi_bonus_size) VERIFY(dmu_set_bonus(db, len, tx) == 0); - (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp)); + abd_copy_from_buf_off(db->db_data, xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp), ZFS_OLD_ZNODE_PHYS_SIZE); zp->z_pflags |= ZFS_BONUS_SCANSTAMP; VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zsb), @@ -375,8 +375,8 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) /* if scanstamp then add scanstamp */ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { - bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - scanstamp, AV_SCANSTAMP_SZ); + abd_copy_to_buf_off(scanstamp, db->db_data, + AV_SCANSTAMP_SZ, ZFS_OLD_ZNODE_PHYS_SIZE); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zsb), NULL, scanstamp, AV_SCANSTAMP_SZ); zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 723d6210f26f..f14387aaa36d 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -711,7 +711,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) error = SET_ERROR(EDQUOT); break; } - + /* TODO: abd can't handle xuio */ if (xuio && abuf == NULL) { ASSERT(i_iov < iovcnt); aiov = &iovp[i_iov]; @@ -738,7 +738,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = uiocopy(abuf->b_data, max_blksz, + if ((error = abd_uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes))) { dmu_return_arcbuf(abuf); break; @@ -800,6 +800,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) * block-aligned, 
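The zfs_sa.c hunks above replace pointer arithmetic on db->db_data with the offset-taking copy helpers. A minimal sketch (not part of this patch) restating their argument order as inferred from these call sites; bonus_off stands in for ZFS_OLD_ZNODE_PHYS_SIZE:

#include <sys/abd.h>

/*
 * Illustrative only: both helpers take the length before the offset,
 * and the offset always applies to the abd_t side of the copy.
 */
static void
stamp_write(abd_t *bonus, void *stamp, uint64_t len, uint64_t bonus_off)
{
	/* buffer -> ABD, starting 'bonus_off' bytes into the ABD */
	abd_copy_from_buf_off(bonus, stamp, len, bonus_off);
}

static void
stamp_read(abd_t *bonus, void *stamp, uint64_t len, uint64_t bonus_off)
{
	/* ABD -> buffer, starting 'bonus_off' bytes into the ABD */
	abd_copy_to_buf_off(stamp, bonus, len, bonus_off);
}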
use assign_arcbuf(). Otherwise, * write via dmu_write(). */ + /* TODO: abd can't handle xuio */ if (tx_bytes < max_blksz && (!write_eof || aiov->iov_base != abuf->b_data)) { ASSERT(xuio); From 76f2c36b86bf117f1db95a9079ba997222d20334 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 11/26] Handle abd_t in zio.c Signed-off-by: Chunwei Chen --- include/sys/zio.h | 2 +- module/zfs/zio.c | 240 +++++++++++++++++++++++++++++++--------------- 2 files changed, 166 insertions(+), 76 deletions(-) diff --git a/include/sys/zio.h b/include/sys/zio.h index 7bed94a6036c..e1b1b8c705cc 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -352,7 +352,7 @@ typedef struct zio_gang_node { } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, - zio_gang_node_t *gn, void *data); + zio_gang_node_t *gn, abd_t *data, uint64_t offset); typedef void zio_transform_func_t(zio_t *zio, abd_t *data, uint64_t size); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 49e2d93b6783..87342090a0dc 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -256,11 +256,16 @@ zio_data_buf_free(void *buf, size_t size) * ========================================================================== */ static void -zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, +zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); + if (ABD_IS_LINEAR(zio->io_data)) + ASSERT_ABD_LINEAR(data); + else + ASSERT_ABD_SCATTER(data); + zt->zt_orig_data = zio->io_data; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; @@ -284,7 +289,7 @@ zio_pop_transforms(zio_t *zio) zt->zt_orig_data, zt->zt_orig_size); if (zt->zt_bufsize != 0) - zio_buf_free(zio->io_data, zt->zt_bufsize); + abd_free(zio->io_data, zt->zt_bufsize); zio->io_data = zt->zt_orig_data; zio->io_size = zt->zt_orig_size; @@ -300,21 +305,29 @@ zio_pop_transforms(zio_t *zio) * ========================================================================== */ static void -zio_subblock(zio_t *zio, void *data, uint64_t size) +zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) - bcopy(zio->io_data, data, size); + abd_copy(data, zio->io_data, size); } static void -zio_decompress(zio_t *zio, void *data, uint64_t size) +zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { - if (zio->io_error == 0 && - zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_data, data, zio->io_size, size) != 0) - zio->io_error = SET_ERROR(EIO); + void *buf1, *buf2; + if (zio->io_error == 0) { + buf1 = abd_borrow_buf_copy(zio->io_data, zio->io_size); + buf2 = abd_borrow_buf(data, size); + + if (zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), + buf1, buf2, zio->io_size, size) != 0) + zio->io_error = SET_ERROR(EIO); + + abd_return_buf_copy(data, buf2, size); + abd_return_buf(zio->io_data, buf1, zio->io_size); + } } /* @@ -483,7 +496,7 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c) */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) @@ -595,7 +608,7 @@ zio_root(spa_t *spa, 
zio_done_func_t *done, void *private, enum zio_flag flags) zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; @@ -611,7 +624,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, const zio_prop_t *zp, + abd_t *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) @@ -650,7 +663,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, } zio_t * -zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, +zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { @@ -802,7 +815,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, + abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -823,7 +836,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, + abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -846,8 +859,13 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ - void *wbuf = zio_buf_alloc(size); - bcopy(data, wbuf, size); + abd_t *wbuf; + if (ABD_IS_LINEAR(data)) + wbuf = abd_alloc_linear(size); + else + wbuf = abd_alloc_scatter(size); + abd_copy(wbuf, data, size); + zio_push_transform(zio, wbuf, size, size, NULL); } @@ -859,7 +877,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, + abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; @@ -903,7 +921,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, } zio_t * -zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, +zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { @@ -961,14 +979,24 @@ zio_read_bp_init(zio_t *zio) !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t psize = BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); - void *cbuf = zio_buf_alloc(psize); + abd_t *cbuf; + if (ABD_IS_LINEAR(zio->io_data)) + cbuf = abd_alloc_linear(psize); + else + cbuf = abd_alloc_scatter(psize); zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { + void *data; + int psize; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - decode_embedded_bp_compressed(bp, zio->io_data); + + psize = BPE_GET_PSIZE(bp); + data = abd_borrow_buf(zio->io_data, psize); + decode_embedded_bp_compressed(bp, data); + abd_return_buf_copy(zio->io_data, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } @@ -1072,11 +1100,25 @@ zio_write_bp_init(zio_t *zio) } if (compress != ZIO_COMPRESS_OFF) { - void *cbuf = zio_buf_alloc(lsize); - psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + void *cbuf; + void *dbuf; + abd_t *cdata; + + if (ABD_IS_LINEAR(zio->io_data)) + cdata = abd_alloc_linear(lsize); + else + cdata = abd_alloc_scatter(lsize); + + cbuf = abd_borrow_buf(cdata, lsize); + + dbuf = abd_borrow_buf_copy(zio->io_data, lsize); + psize = zio_compress_data(compress, dbuf, cbuf, lsize); + abd_return_buf(zio->io_data, dbuf, lsize); + if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); + abd_return_buf(cdata, cbuf, lsize); + abd_free(cdata, lsize); } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { @@ -1085,7 +1127,8 @@ zio_write_bp_init(zio_t *zio) BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); - zio_buf_free(cbuf, lsize); + abd_return_buf(cdata, cbuf, lsize); + abd_free(cdata, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, @@ -1098,15 +1141,16 @@ zio_write_bp_init(zio_t *zio) */ size_t rounded = P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE); + abd_return_buf_copy(cdata, cbuf, lsize); if (rounded > psize) { - bzero((char *)cbuf + psize, rounded - psize); + abd_zero_off(cdata, rounded - psize, psize); psize = rounded; } if (psize == lsize) { compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); + abd_free(cdata, lsize); } else { - zio_push_transform(zio, cbuf, + zio_push_transform(zio, cdata, psize, lsize, NULL); } } @@ -1600,26 +1644,38 @@ zio_resume_wait(spa_t *spa) * ========================================================================== */ +static void +zio_gang_issue_func_done(zio_t *zio) +{ + abd_put(zio->io_data); +} + static zio_t * -zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { if (gn != NULL) return (pio); - return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), - NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), + BP_GET_PSIZE(bp), zio_gang_issue_func_done, + NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } -zio_t * -zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { zio_t *zio; + abd_t *gbh_abd; if (gn != NULL) { + gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - gn->gn_gbh, 
SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will @@ -1630,8 +1686,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { + abd_t *buf = abd_get_offset(data, offset); + zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), - data, BP_GET_PSIZE(bp)); + buf, BP_GET_PSIZE(bp)); + + abd_put(buf); } /* * If we are here to damage data for testing purposes, @@ -1641,7 +1701,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, + abd_get_offset(data, offset), BP_GET_PSIZE(bp), + zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } @@ -1649,16 +1710,18 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) } /* ARGSUSED */ -zio_t * -zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ -zio_t * -zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); @@ -1722,13 +1785,14 @@ static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); + abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, - SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, - gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); + zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_gang_tree_assemble_done, gn, gio->io_priority, + ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void @@ -1746,12 +1810,14 @@ zio_gang_tree_assemble_done(zio_t *zio) return; if (BP_SHOULD_BYTESWAP(bp)) - byteswap_uint64_array(zio->io_data, zio->io_size); + byteswap_uint64_array(ABD_TO_BUF(zio->io_data), zio->io_size); - ASSERT(zio->io_data == gn->gn_gbh); + ASSERT(ABD_TO_BUF(zio->io_data) == gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + abd_put(zio->io_data); + for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) @@ -1761,7 +1827,8 @@ zio_gang_tree_assemble_done(zio_t *zio) } static void -zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) +zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, + uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; @@ -1775,7 +1842,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) * If you're a gang header, your data is in gn->gn_gbh. 
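Note the ownership split in the gang assembly path above: gn->gn_gbh stays a plain linear allocation, abd_get_from_buf() merely wraps it so ABD-typed consumers such as zio_read()/zio_rewrite() can take it, and abd_put() drops the wrapper without freeing the memory underneath. A minimal sketch (not part of this patch; the consumer callback is a hypothetical stand-in for handing the wrapper to a zio) of that wrap/unwrap discipline:

#include <sys/abd.h>
#include <sys/kmem.h>

static void
use_as_abd(uint64_t size, void (*consumer)(abd_t *, uint64_t))
{
	void *raw = kmem_zalloc(size, KM_SLEEP);
	abd_t *wrapper = abd_get_from_buf(raw, size);

	consumer(wrapper, size);	/* e.g. handed to a child zio */

	abd_put(wrapper);		/* releases the wrapper only */
	kmem_free(raw, size);		/* the caller still owns 'raw' */
}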
* If you're a gang member, your data is in 'data' and gn == NULL. */ - zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); + zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); @@ -1784,13 +1851,14 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; - zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); - data = (char *)data + BP_GET_PSIZE(gbp); + zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, + offset); + offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) - ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); + ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); @@ -1823,7 +1891,8 @@ zio_gang_issue(zio_t *zio) ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) - zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); + zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data, + 0); else zio_gang_tree_free(&zio->io_gang_tree); @@ -1863,6 +1932,12 @@ zio_write_gang_member_ready(zio_t *zio) mutex_exit(&pio->io_lock); } +static void +zio_write_gang_done(zio_t *zio) +{ + abd_put(zio->io_data); +} + static int zio_write_gang_block(zio_t *pio) { @@ -1872,6 +1947,7 @@ zio_write_gang_block(zio_t *pio) zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; + abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; @@ -1898,12 +1974,14 @@ zio_write_gang_block(zio_t *pio) gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); + gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ - zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_write_gang_done, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. 
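The zio_write_gang_block() hunk below carves the parent's buffer into per-child views instead of doing pointer arithmetic on a char pointer. A minimal sketch (not part of this patch; the issue_child callback is hypothetical) of the abd_get_offset() walk it relies on, with each view released later by the child's done callback exactly as zio_gang_issue_func_done() does above:

#include <sys/abd.h>

static void
issue_children(abd_t *parent, const uint64_t *sizes, int nchildren,
    void (*issue_child)(abd_t *view, uint64_t size))
{
	uint64_t off = 0;
	int c;

	for (c = 0; c < nchildren; c++) {
		/* zero-copy sub-view; the child's done callback abd_put()s it */
		issue_child(abd_get_offset(parent, off), sizes[c]);
		off += sizes[c];
	}
}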
@@ -1923,10 +2001,10 @@ zio_write_gang_block(zio_t *pio) zp.zp_nopwrite = B_FALSE; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], - (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, - zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), - &pio->io_bookmark)); + abd_get_offset(pio->io_data, pio->io_size - resid), lsize, + &zp, zio_write_gang_member_ready, NULL, + zio_write_gang_done, &gn->gn_child[g], pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } /* @@ -2017,10 +2095,11 @@ zio_ddt_child_read_done(zio_t *zio) ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ + if (zio->io_error == 0 && dde->dde_repair_data == NULL) dde->dde_repair_data = zio->io_data; else - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mutex_exit(&pio->io_lock); } @@ -2053,10 +2132,10 @@ zio_ddt_read_start(zio_t *zio) ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, - zio_buf_alloc(zio->io_size), zio->io_size, - zio_ddt_child_read_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, - &zio->io_bookmark)); + abd_alloc_linear(zio->io_size), + zio->io_size, zio_ddt_child_read_done, dde, + zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | + ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (ZIO_PIPELINE_CONTINUE); } @@ -2093,7 +2172,8 @@ zio_ddt_read_done(zio_t *zio) return (ZIO_PIPELINE_STOP); } if (dde->dde_repair_data != NULL) { - bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); + abd_copy(zio->io_data, dde->dde_repair_data, + zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); @@ -2122,7 +2202,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || - bcmp(zio->io_orig_data, lio->io_orig_data, + abd_cmp(zio->io_orig_data, lio->io_orig_data, zio->io_orig_size) != 0); } } @@ -2147,7 +2227,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (error == 0) { if (arc_buf_size(abuf) != zio->io_orig_size || - bcmp(abuf->b_data, zio->io_orig_data, + abd_cmp(abuf->b_data, zio->io_orig_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); VERIFY(arc_buf_remove_ref(abuf, &abuf)); @@ -2574,11 +2654,15 @@ zio_vdev_io_start(zio_t *zio) P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. 
*/ uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio_buf_alloc(asize); + abd_t *abuf; + if (ABD_IS_LINEAR(zio->io_data)) + abuf = abd_alloc_linear(asize); + else + abuf = abd_alloc_scatter(asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { - bcopy(zio->io_data, abuf, zio->io_size); - bzero(abuf + zio->io_size, asize - zio->io_size); + abd_copy(abuf, zio->io_data, zio->io_size); + abd_zero_off(abuf, asize - zio->io_size, zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } @@ -2703,7 +2787,7 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) { void *buf = zio_buf_alloc(zio->io_size); - bcopy(zio->io_data, buf, zio->io_size); + abd_copy_to_buf(buf, zio->io_data, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = buf; @@ -3028,21 +3112,27 @@ zio_done(zio_t *zio) zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio->io_data; + char *abuf; + abd_t *adata = zio->io_data; if (asize != zio->io_size) { - abuf = zio_buf_alloc(asize); - bcopy(zio->io_data, abuf, zio->io_size); - bzero(abuf+zio->io_size, asize-zio->io_size); + adata = abd_alloc_linear(asize); + abd_copy(adata, zio->io_data, zio->io_size); + abd_zero_off(adata, asize-zio->io_size, + zio->io_size); } + abuf = abd_borrow_buf_copy(adata, asize); + zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, abuf); zfs_ereport_free_checksum(zcr); + abd_return_buf(adata, abuf, asize); + if (asize != zio->io_size) - zio_buf_free(abuf, asize); + abd_free(adata, asize); } } From 5badaba2e003c340ce9c5dbbb4909c84ec8507ed Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 12/26] Handle abd_t in vdev*.c sans vdev_raidz.c Signed-off-by: Chunwei Chen --- include/sys/vdev_impl.h | 2 +- module/zfs/vdev.c | 6 ++--- module/zfs/vdev_cache.c | 11 ++++---- module/zfs/vdev_disk.c | 28 ++++++++++++++----- module/zfs/vdev_file.c | 13 ++++++++- module/zfs/vdev_label.c | 58 +++++++++++++++++++++++----------------- module/zfs/vdev_mirror.c | 13 ++++++--- module/zfs/vdev_queue.c | 17 ++++++------ 8 files changed, 94 insertions(+), 54 deletions(-) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index a8dc9510e3e9..6dae2955b847 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -83,7 +83,7 @@ typedef const struct vdev_ops { * Virtual device properties */ struct vdev_cache_entry { - char *ve_data; + abd_t *ve_data; uint64_t ve_offset; clock_t ve_lastused; avl_node_t ve_offset_node; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 52198261e434..e53af316f3dc 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -945,12 +945,12 @@ vdev_probe_done(zio_t *zio) ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; @@ -1067,7 +1067,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_pad2)), - VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), + VDEV_PAD_SIZE, abd_alloc_linear(VDEV_PAD_SIZE), ZIO_CHECKSUM_OFF, 
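Both conversions above (the sub-block padding in zio_vdev_io_start() and the checksum-report copy in zio_done()) share one shape: allocate a full-size ABD, copy the valid bytes, and zero only the tail. A minimal sketch (not part of this patch) of that idiom, preserving the source's linear/scatter type as the vdev path does:

#include <sys/abd.h>

static abd_t *
pad_to_asize(abd_t *src, uint64_t size, uint64_t asize)
{
	abd_t *padded;

	/* keep the allocation type so downstream ABD_TO_BUF users agree */
	if (ABD_IS_LINEAR(src))
		padded = abd_alloc_linear(asize);
	else
		padded = abd_alloc_scatter(asize);

	abd_copy(padded, src, size);
	/* abd_zero_off(abd, length, offset): clear only the tail */
	abd_zero_off(padded, asize - size, size);
	return (padded);
}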
vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c index 389fa6fd9d07..2d0a878dc653 100644 --- a/module/zfs/vdev_cache.c +++ b/module/zfs/vdev_cache.c @@ -146,7 +146,7 @@ vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) avl_remove(&vc->vc_lastused_tree, ve); avl_remove(&vc->vc_offset_tree, ve); - zio_buf_free(ve->ve_data, VCBS); + abd_free(ve->ve_data, VCBS); kmem_free(ve, sizeof (vdev_cache_entry_t)); } @@ -183,7 +183,7 @@ vdev_cache_allocate(zio_t *zio) ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); ve->ve_offset = offset; ve->ve_lastused = ddi_get_lbolt(); - ve->ve_data = zio_buf_alloc(VCBS); + ve->ve_data = abd_alloc_scatter(VCBS); avl_add(&vc->vc_offset_tree, ve); avl_add(&vc->vc_lastused_tree, ve); @@ -206,7 +206,8 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) } ve->ve_hits++; - bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); + abd_copy_off(zio->io_data, ve->ve_data, zio->io_size, + 0, cache_phase); } /* @@ -357,8 +358,8 @@ vdev_cache_write(zio_t *zio) if (ve->ve_fill_io != NULL) { ve->ve_missed_update = 1; } else { - bcopy((char *)zio->io_data + start - io_start, - ve->ve_data + start - ve->ve_offset, end - start); + abd_copy_off(ve->ve_data, zio->io_data, end - start, + start - ve->ve_offset, start - io_start); } ve = AVL_NEXT(&vc->vc_offset_tree, ve); } diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 7f2263457177..6dfaf2578daa 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -507,12 +507,15 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, size_t kbuf_size, uint64_t kbuf_offset, int flags) { dio_request_t *dr; - caddr_t bio_ptr; + uint64_t zio_offset; uint64_t bio_offset; int bio_size, bio_count = 16; int i = 0, error = 0; + abd_t *zio_data = NULL; ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size); + /* we take either zio or kbuf_ptr, not both */ + ASSERT((!zio || !kbuf_ptr) && (zio || kbuf_ptr)); retry: dr = vdev_disk_dio_alloc(bio_count); @@ -532,9 +535,14 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, * their volume block size to match the maximum request size and * the common case will be one bio per vdev IO request. 
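The vdev_cache.c hunks above are a good reference for abd_copy_off()'s argument order, since they copy in both directions: abd_copy_off(dst, src, size, dst_off, src_off). A minimal sketch (not part of this patch) restating both call sites:

#include <sys/abd.h>

/* read hit: cache -> I/O buffer; the offset applies to the cache side */
static void
cache_hit(abd_t *io_data, abd_t *ve_data, uint64_t io_size,
    uint64_t cache_phase)
{
	abd_copy_off(io_data, ve_data, io_size, 0, cache_phase);
}

/* write-through: I/O buffer -> cache; offsets on both sides */
static void
cache_update(abd_t *ve_data, abd_t *io_data, uint64_t len,
    uint64_t cache_off, uint64_t io_off)
{
	abd_copy_off(ve_data, io_data, len, cache_off, io_off);
}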
*/ - bio_ptr = kbuf_ptr; + if (zio) + zio_data = zio->io_data; + else + zio_data = abd_get_from_buf(kbuf_ptr, kbuf_size); + + zio_offset = 0; bio_offset = kbuf_offset; - bio_size = kbuf_size; + bio_size = kbuf_size; for (i = 0; i <= dr->dr_bio_count; i++) { /* Finished constructing bio's for given buffer */ @@ -553,9 +561,11 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, } dr->dr_bio[i] = bio_alloc(GFP_NOIO, - bio_nr_pages(bio_ptr, bio_size)); + abd_bio_nr_pages_off(zio_data, bio_size, zio_offset)); /* bio_alloc() with __GFP_WAIT never returns NULL */ if (unlikely(dr->dr_bio[i] == NULL)) { + if (kbuf_ptr) + abd_put(zio_data); vdev_disk_dio_free(dr); return (ENOMEM); } @@ -570,10 +580,11 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, dr->dr_bio[i]->bi_private = dr; /* Remaining size is returned to become the new size */ - bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size); + bio_size = abd_bio_map_off(dr->dr_bio[i], zio_data, + bio_size, zio_offset); /* Advance in buffer and construct another bio if needed */ - bio_ptr += BIO_BI_SIZE(dr->dr_bio[i]); + zio_offset += BIO_BI_SIZE(dr->dr_bio[i]); bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); } @@ -601,6 +612,9 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, ASSERT3S(atomic_read(&dr->dr_ref), ==, 1); } + if (kbuf_ptr) + abd_put(zio_data); + (void) vdev_disk_dio_put(dr); return (error); @@ -712,7 +726,7 @@ vdev_disk_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } - error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data, + error = __vdev_disk_physio(vd->vd_bdev, zio, NULL, zio->io_size, zio->io_offset, flags); if (error) { zio->io_error = error; diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index 7f43ad8001f4..296e27ebd889 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -149,12 +149,23 @@ vdev_file_io_strategy(void *arg) vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; ssize_t resid; + void *buf; + + if (zio->io_type == ZIO_TYPE_READ) + buf = abd_borrow_buf(zio->io_data, zio->io_size); + else + buf = abd_borrow_buf_copy(zio->io_data, zio->io_size); zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? 
- UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, + UIO_READ : UIO_WRITE, vf->vf_vnode, buf, zio->io_size, zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_data, buf, zio->io_size); + else + abd_return_buf(zio->io_data, buf, zio->io_size); + if (resid != 0 && zio->io_error == 0) zio->io_error = SET_ERROR(ENOSPC); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 7f588ed6b0b5..7dcfb95436bd 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -178,7 +178,7 @@ vdev_label_number(uint64_t psize, uint64_t offset) } static void -vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, +vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == @@ -192,7 +192,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, } static void -vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, +vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || @@ -430,6 +430,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp; + abd_t *vp_abd; zio_t *zio; uint64_t best_txg = 0; int error = 0; @@ -442,7 +443,8 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (!vdev_readable(vd)) return (NULL); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t)); + vp = ABD_TO_BUF(vp_abd); retry: for (l = 0; l < VDEV_LABELS; l++) { @@ -450,7 +452,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) zio = zio_root(spa, NULL, NULL, flags); - vdev_label_read(zio, vd, l, vp, + vdev_label_read(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -489,7 +491,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) goto retry; } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd, sizeof (vdev_phys_t)); return (config); } @@ -627,6 +629,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) vdev_phys_t *vp; char *pad2; uberblock_t *ub; + abd_t *vp_abd, *pad2_abd, *ub_abd; zio_t *zio; char *buf; size_t buflen; @@ -710,7 +713,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize its label. */ - vp = zio_buf_alloc(sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t)); + vp = ABD_TO_BUF(vp_abd); bzero(vp, sizeof (vdev_phys_t)); /* @@ -771,7 +775,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); if (error != 0) { nvlist_free(label); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd, sizeof (vdev_phys_t)); /* EFAULT means nvlist_pack ran out of room */ return (error == EFAULT ? ENAMETOOLONG : EINVAL); } @@ -779,13 +783,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize uberblock template. */ - ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING); + ub = ABD_TO_BUF(ub_abd); bzero(ub, VDEV_UBERBLOCK_RING); *ub = spa->spa_uberblock; ub->ub_txg = 0; /* Initialize the 2nd padding area. 
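The vdev_file.c hunk above shows the full borrow/return matrix: reads borrow without a copy-in and return with a copy-out (abd_return_buf_copy()), while writes borrow with a copy-in (abd_borrow_buf_copy()) and return without one. A minimal sketch (not part of this patch; fill/drain are hypothetical stand-ins for vn_rdwr()) of that discipline:

#include <sys/abd.h>

static void
abd_read_via_buf(abd_t *data, uint64_t size,
    void (*fill)(void *buf, uint64_t size))
{
	void *buf = abd_borrow_buf(data, size);	/* no copy-in */
	fill(buf, size);
	abd_return_buf_copy(data, buf, size);	/* copy-out */
}

static void
abd_write_via_buf(abd_t *data, uint64_t size,
    void (*drain)(const void *buf, uint64_t size))
{
	void *buf = abd_borrow_buf_copy(data, size);	/* copy-in */
	drain(buf, size);
	abd_return_buf(data, buf, size);	/* no copy-out */
}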
*/ - pad2 = zio_buf_alloc(VDEV_PAD_SIZE); + pad2_abd = abd_alloc_linear(VDEV_PAD_SIZE); + pad2 = ABD_TO_BUF(pad2_abd); bzero(pad2, VDEV_PAD_SIZE); /* @@ -796,7 +802,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) for (l = 0; l < VDEV_LABELS; l++) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -805,11 +811,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * Zero out the 2nd padding area where it might have * left over data from previous filesystem format. */ - vdev_label_write(zio, vd, l, pad2, + vdev_label_write(zio, vd, l, pad2_abd, offsetof(vdev_label_t, vl_pad2), VDEV_PAD_SIZE, NULL, NULL, flags); - vdev_label_write(zio, vd, l, ub, + vdev_label_write(zio, vd, l, ub_abd, offsetof(vdev_label_t, vl_uberblock), VDEV_UBERBLOCK_RING, NULL, NULL, flags); } @@ -822,9 +828,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) } nvlist_free(label); - zio_buf_free(pad2, VDEV_PAD_SIZE); - zio_buf_free(ub, VDEV_UBERBLOCK_RING); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(pad2_abd, VDEV_PAD_SIZE); + abd_free(ub_abd, VDEV_UBERBLOCK_RING); + abd_free(vp_abd, sizeof (vdev_phys_t)); /* * If this vdev hasn't been previously identified as a spare, then we @@ -888,7 +894,7 @@ vdev_uberblock_load_done(zio_t *zio) vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; - uberblock_t *ub = zio->io_data; + uberblock_t *ub = ABD_TO_BUF(zio->io_data); struct ubl_cbdata *cbp = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); @@ -909,7 +915,7 @@ vdev_uberblock_load_done(zio_t *zio) mutex_exit(&rio->io_lock); } - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); } static void @@ -925,7 +931,7 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (l = 0; l < VDEV_LABELS; l++) { for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, - zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), + abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd)), VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_load_done, zio, flags); @@ -993,6 +999,7 @@ vdev_uberblock_sync_done(zio_t *zio) static void vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) { + abd_t *ub_abd; uberblock_t *ubbuf; int c, l, n; @@ -1007,17 +1014,18 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); - ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd)); + ubbuf = ABD_TO_BUF(ub_abd); bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); *ubbuf = *ub; for (l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ubbuf, + vdev_label_write(zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_sync_done, zio->io_private, flags | ZIO_FLAG_DONT_PROPAGATE); - zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); + abd_free(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); } /* Sync the uberblocks to all vdevs in svd[] */ @@ -1094,6 +1102,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) { nvlist_t *label; vdev_phys_t *vp; + abd_t *vp_abd; char *buf; size_t buflen; int c; @@ -1112,7 +1121,8 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) */ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof 
(vdev_phys_t)); + vp = ABD_TO_BUF(vp_abd); bzero(vp, sizeof (vdev_phys_t)); buf = vp->vp_nvlist; @@ -1120,7 +1130,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) { for (; l < VDEV_LABELS; l += 2) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), vdev_label_sync_done, zio->io_private, @@ -1128,7 +1138,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) } } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd, sizeof (vdev_phys_t)); nvlist_free(label); } diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 77c3d8d385e9..bbdf923f1692 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -262,13 +262,12 @@ vdev_mirror_scrub_done(zio_t *zio) while ((pio = zio_walk_parents(zio)) != NULL) { mutex_enter(&pio->io_lock); ASSERT3U(zio->io_size, >=, pio->io_size); - bcopy(zio->io_data, pio->io_data, pio->io_size); + abd_copy(pio->io_data, zio->io_data, pio->io_size); mutex_exit(&pio->io_lock); } mutex_exit(&zio->io_lock); } - - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mc->mc_error = zio->io_error; mc->mc_tried = 1; @@ -345,10 +344,16 @@ vdev_mirror_io_start(zio_t *zio) * data into zio->io_data in vdev_mirror_scrub_done. */ for (c = 0; c < mm->mm_children; c++) { + abd_t *tmp; mc = &mm->mm_child[c]; + if (ABD_IS_LINEAR(zio->io_data)) + tmp = abd_alloc_linear(zio->io_size); + else + tmp = abd_alloc_scatter(zio->io_size); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio_buf_alloc(zio->io_size), zio->io_size, + tmp, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_scrub_done, mc)); } diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 3fa4219f260e..6984cce610d1 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -448,12 +448,12 @@ vdev_queue_agg_io_done(zio_t *aio) if (aio->io_type == ZIO_TYPE_READ) { zio_t *pio; while ((pio = zio_walk_parents(aio)) != NULL) { - bcopy((char *)aio->io_data + (pio->io_offset - - aio->io_offset), pio->io_data, pio->io_size); + abd_copy_off(pio->io_data, aio->io_data, pio->io_size, + 0, pio->io_offset - aio->io_offset); } } - zio_buf_free(aio->io_data, aio->io_size); + abd_free(aio->io_data, aio->io_size); } /* @@ -591,7 +591,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) ASSERT3U(size, <=, zfs_vdev_aggregation_limit); aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, - zio_buf_alloc(size), size, first->io_type, zio->io_priority, + abd_alloc_scatter(size), size, first->io_type, zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; @@ -604,12 +604,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) if (dio->io_flags & ZIO_FLAG_NODATA) { ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); - bzero((char *)aio->io_data + (dio->io_offset - - aio->io_offset), dio->io_size); + abd_zero_off(aio->io_data, dio->io_size, + dio->io_offset - aio->io_offset); } else if (dio->io_type == ZIO_TYPE_WRITE) { - bcopy(dio->io_data, (char *)aio->io_data + - (dio->io_offset - aio->io_offset), - dio->io_size); + abd_copy_off(aio->io_data, dio->io_data, dio->io_size, + dio->io_offset - aio->io_offset, 0); } zio_add_child(dio, aio); From d0efe9d50654b89548df3752be12259a102128b5 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 
2015 16:45:53 +0800 Subject: [PATCH 13/26] Handle abd_t in vdev_raidz.c Signed-off-by: Chunwei Chen --- module/zfs/vdev_raidz.c | 500 ++++++++++++++++++++++++++-------------- 1 file changed, 330 insertions(+), 170 deletions(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 493b332c4405..bdf4171445cd 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -103,7 +103,7 @@ typedef struct raidz_col { uint64_t rc_devidx; /* child device index for I/O */ uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ - void *rc_data; /* I/O data */ + abd_t *rc_data; /* I/O data */ void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ @@ -120,7 +120,7 @@ typedef struct raidz_map { uint64_t rm_firstdatacol; /* First data column/parity count */ uint64_t rm_nskip; /* Skipped sectors for padding */ uint64_t rm_skipstart; /* Column index of padding start */ - void *rm_datacopy; /* rm_asize-buffer of copied data */ + abd_t *rm_datacopy; /* rm_asize-buffer of copied data */ uintptr_t rm_reports; /* # of referencing checksum reports */ uint8_t rm_freed; /* map no longer has referencing ZIO */ uint8_t rm_ecksuminjected; /* checksum error was injected */ @@ -258,7 +258,7 @@ vdev_raidz_map_free(raidz_map_t *rm) size_t size; for (c = 0; c < rm->rm_firstdatacol; c++) { - zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + abd_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); if (rm->rm_col[c].rc_gdata != NULL) zio_buf_free(rm->rm_col[c].rc_gdata, @@ -266,11 +266,13 @@ vdev_raidz_map_free(raidz_map_t *rm) } size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + abd_put(rm->rm_col[c].rc_data); size += rm->rm_col[c].rc_size; + } if (rm->rm_datacopy != NULL) - zio_buf_free(rm->rm_datacopy, size); + abd_free(rm->rm_datacopy, size); kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } @@ -307,7 +309,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) size_t x; const char *good = NULL; - const char *bad = rm->rm_col[c].rc_data; + char *bad; if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); @@ -321,8 +323,9 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) * data never changes for a given logical ZIO) */ if (rm->rm_col[0].rc_gdata == NULL) { - char *bad_parity[VDEV_RAIDZ_MAXPARITY]; + abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; char *buf; + int offset; /* * Set up the rm_col[]s to generate the parity for @@ -331,14 +334,19 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) */ for (x = 0; x < rm->rm_firstdatacol; x++) { bad_parity[x] = rm->rm_col[x].rc_data; - rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = + rm->rm_col[x].rc_gdata = zio_buf_alloc(rm->rm_col[x].rc_size); + rm->rm_col[x].rc_data = + abd_get_from_buf(rm->rm_col[x].rc_gdata, + rm->rm_col[x].rc_size); } /* fill in the data columns from good_data */ buf = (char *)good_data; for (; x < rm->rm_cols; x++) { - rm->rm_col[x].rc_data = buf; + abd_put(rm->rm_col[x].rc_data); + rm->rm_col[x].rc_data = abd_get_from_buf(buf, + rm->rm_col[x].rc_size); buf += rm->rm_col[x].rc_size; } @@ -348,13 +356,17 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) vdev_raidz_generate_parity(rm); /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) + for (x = 0; x < 
rm->rm_firstdatacol; x++) { + abd_put(rm->rm_col[x].rc_data); rm->rm_col[x].rc_data = bad_parity[x]; + } - buf = rm->rm_datacopy; + offset = 0; for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - rm->rm_col[x].rc_data = buf; - buf += rm->rm_col[x].rc_size; + abd_put(rm->rm_col[x].rc_data); + rm->rm_col[x].rc_data = abd_get_offset( + rm->rm_datacopy, offset); + offset += rm->rm_col[x].rc_size; } } @@ -368,8 +380,10 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) good += rm->rm_col[x].rc_size; } + bad = abd_borrow_buf_copy(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); /* we drop the ereport if it ends up that the data was good */ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); + abd_return_buf(rm->rm_col[c].rc_data, bad, rm->rm_col[c].rc_size); } /* @@ -382,7 +396,7 @@ static void vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; - caddr_t buf; + size_t offset; raidz_map_t *rm = zio->io_vsd; size_t size; @@ -412,17 +426,22 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) size += rm->rm_col[c].rc_size; - buf = rm->rm_datacopy = zio_buf_alloc(size); + if (ABD_IS_LINEAR(rm->rm_col[rm->rm_firstdatacol].rc_data)) + rm->rm_datacopy = abd_alloc_linear(size); + else + rm->rm_datacopy = abd_alloc_scatter(size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { raidz_col_t *col = &rm->rm_col[c]; + abd_t *tmp = abd_get_offset(rm->rm_datacopy, offset); - bcopy(col->rc_data, buf, col->rc_size); - col->rc_data = buf; + abd_copy(tmp, col->rc_data, col->rc_size); + abd_put(col->rc_data); + col->rc_data = tmp; - buf += col->rc_size; + offset += col->rc_size; } - ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); + ASSERT3U(offset, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { @@ -451,6 +470,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, /* The starting byte offset on each child vdev. 
*/ uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t off = 0; /* * "Quotient": The number of data sectors for this stripe on all but @@ -534,13 +554,16 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); + rm->rm_col[c].rc_data = + abd_alloc_linear(rm->rm_col[c].rc_size); - rm->rm_col[c].rc_data = zio->io_data; + rm->rm_col[c].rc_data = abd_get_offset(zio->io_data, 0); + off = rm->rm_col[c].rc_size; - for (c = c + 1; c < acols; c++) - rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + - rm->rm_col[c - 1].rc_size; + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_data = abd_get_offset(zio->io_data, off); + off += rm->rm_col[c].rc_size; + } /* * If all data stored spans all columns, there's a danger that parity @@ -582,29 +605,81 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, return (rm); } +struct pqr_struct { + uint64_t *p; + uint64_t *q; + uint64_t *r; +}; + +static int +vdev_raidz_p_func(const void *buf, uint64_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && !pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++) + *pqr->p ^= *src; + return (0); +} + +static int +vdev_raidz_pq_func(const void *buf, uint64_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + } + return (0); +} + +static int +vdev_raidz_pqr_func(const void *buf, uint64_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + VDEV_RAIDZ_64MUL_4(*pqr->r, mask); + *pqr->r ^= *src; + } + return (0); +} + static void vdev_raidz_generate_parity_p(raidz_map_t *rm) { - uint64_t *p, *src, pcount, ccount, i; + uint64_t *p; int c; - - pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + abd_t *src; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + p = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_P].rc_data); if (c == rm->rm_firstdatacol) { - ASSERT(ccount == pcount); - for (i = 0; i < ccount; i++, src++, p++) { - *p = *src; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); } else { - ASSERT(ccount <= pcount); - for (i = 0; i < ccount; i++, src++, p++) { - *p ^= *src; - } + struct pqr_struct pqr = { p, NULL, NULL }; + abd_iterate_rfunc(src, rm->rm_col[c].rc_size, + vdev_raidz_p_func, &pqr); } } } @@ -612,50 +687,43 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { - uint64_t *p, *q, *src, pcnt, ccnt, mask, i; + uint64_t *p, *q, pcnt, ccnt, mask, i; int c; + abd_t *src; - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 
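vdev_raidz_map_alloc() above now builds its columns in two regimes: parity columns get fresh linear buffers (their consumers index them directly through ABD_TO_BUF), while data columns become zero-copy views into the zio's ABD. A minimal sketch (not part of this patch; the flat arrays stand in for the real raidz_map_t layout) of that carve:

#include <sys/abd.h>

static void
carve_columns(abd_t *io_data, abd_t **col, const uint64_t *col_size,
    int firstdatacol, int acols)
{
	uint64_t off = 0;
	int c;

	for (c = 0; c < firstdatacol; c++)	/* parity: owned buffers */
		col[c] = abd_alloc_linear(col_size[c]);

	for (; c < acols; c++) {		/* data: borrowed views */
		col[c] = abd_get_offset(io_data, off);
		off += col_size[c];
	}
}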
rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + p = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_P].rc_data); + q = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_Q].rc_data); + + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccnt == pcnt || ccnt == 0); - for (i = 0; i < ccnt; i++, src++, p++, q++) { - *p = *src; - *q = *src; - } - for (; i < pcnt; i++, src++, p++, q++) { - *p = 0; - *q = 0; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + memcpy(q, p, rm->rm_col[c].rc_size); } else { - ASSERT(ccnt <= pcnt); - - /* - * Apply the algorithm described above by multiplying - * the previous result and adding in the new value. - */ - for (i = 0; i < ccnt; i++, src++, p++, q++) { - *p ^= *src; + struct pqr_struct pqr = { p, q, NULL }; + abd_iterate_rfunc(src, rm->rm_col[c].rc_size, + vdev_raidz_pq_func, &pqr); + } - VDEV_RAIDZ_64MUL_2(*q, mask); - *q ^= *src; + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; } - + } else { /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ - for (; i < pcnt; i++, q++) { - VDEV_RAIDZ_64MUL_2(*q, mask); + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); } } } @@ -664,10 +732,11 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) static void vdev_raidz_generate_parity_pqr(raidz_map_t *rm) { - uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; + uint64_t *p, *q, *r, pcnt, ccnt, mask, i; int c; + abd_t *src; - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == @@ -675,48 +744,36 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + p = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_P].rc_data); + q = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_Q].rc_data); + r = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_R].rc_data); - ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccnt == pcnt || ccnt == 0); - for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { - *p = *src; - *q = *src; - *r = *src; - } - for (; i < pcnt; i++, src++, p++, q++, r++) { - *p = 0; - *q = 0; - *r = 0; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + memcpy(q, p, rm->rm_col[c].rc_size); + memcpy(r, p, rm->rm_col[c].rc_size); } else { - ASSERT(ccnt <= pcnt); - - /* - * Apply the algorithm described above by multiplying - * the previous result and adding in the new value. - */ - for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { - *p ^= *src; - - VDEV_RAIDZ_64MUL_2(*q, mask); - *q ^= *src; + struct pqr_struct pqr = { p, q, r }; + abd_iterate_rfunc(src, rm->rm_col[c].rc_size, + vdev_raidz_pqr_func, &pqr); + } - VDEV_RAIDZ_64MUL_4(*r, mask); - *r ^= *src; + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; + r[i] = 0; } - + } else { /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. 
*/ - for (; i < pcnt; i++, q++, r++) { - VDEV_RAIDZ_64MUL_2(*q, mask); - VDEV_RAIDZ_64MUL_4(*r, mask); + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + VDEV_RAIDZ_64MUL_4(r[i], mask); } } } @@ -744,40 +801,126 @@ vdev_raidz_generate_parity(raidz_map_t *rm) } } +static int +vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, uint64_t dsize, + uint64_t ssize, void *private) +{ + uint64_t *dst = dbuf, *src = sbuf; + int i, cnt = dsize / sizeof (src[0]); + + ASSERT(dsize == ssize); + + for (i = 0; i < cnt; i++) { + dst[i] ^= src[i]; + } + + return (0); +} + +static int +vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, uint64_t dsize, + uint64_t ssize, void *private) +{ + uint64_t *dst = dbuf, *src = sbuf; + uint64_t mask; + int i, dcnt = dsize / sizeof (src[0]), scnt = ssize / sizeof (src[0]); + + ASSERT(dsize >= ssize); + + for (i = 0; i < scnt; i++, dst++, src++) { + VDEV_RAIDZ_64MUL_2(*dst, mask); + *dst ^= *src; + } + + for (; i < dcnt; i++, dst++) { + VDEV_RAIDZ_64MUL_2(*dst, mask); + } + return (0); +} + +struct reconst_q_struct { + uint64_t *q; + int exp; +}; + +static int +vdev_raidz_reconst_q_post_func(void *buf, uint64_t size, void *private) +{ + struct reconst_q_struct *rq = private; + uint64_t *dst = buf; + uint8_t *b; + int i, j, cnt = size / sizeof (dst[0]); + + for (i = 0; i < cnt; i++, dst++, rq->q++) { + *dst ^= *rq->q; + for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { + *b = vdev_raidz_exp2(*b, rq->exp); + } + } + return (0); +} + +struct reconst_pq_struct { + uint8_t *p; + uint8_t *q; + uint8_t *pxy; + uint8_t *qxy; + int aexp; + int bexp; +}; + +static int +vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, uint64_t xsize, + uint64_t ysize, void *private) +{ + struct reconst_pq_struct *rpq = private; + uint8_t *xd = xbuf, *yd = ybuf; + int i; + + ASSERT(xsize >= ysize); + + for (i = 0; i < xsize; i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, + xd++, yd++) { + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + + if (i < ysize) + *yd = *rpq->p ^ *rpq->pxy ^ *xd; + } + return (0); +} + static int vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) { - uint64_t *dst, *src, xcount, ccount, count, i; int x = tgts[0]; int c; + abd_t *dst, *src; ASSERT(ntgts == 1); ASSERT(x >= rm->rm_firstdatacol); ASSERT(x < rm->rm_cols); - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); - ASSERT(xcount > 0); + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); + ASSERT(rm->rm_col[x].rc_size > 0); src = rm->rm_col[VDEV_RAIDZ_P].rc_data; dst = rm->rm_col[x].rc_data; - for (i = 0; i < xcount; i++, dst++, src++) { - *dst = *src; - } + + abd_copy_from_buf(dst, ABD_TO_BUF(src), rm->rm_col[x].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + uint64_t size = MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); + src = rm->rm_col[c].rc_data; dst = rm->rm_col[x].rc_data; if (c == x) continue; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); - count = MIN(ccount, xcount); - - for (i = 0; i < count; i++, dst++, src++) { - *dst ^= *src; - } + abd_iterate_func2(dst, src, size, size, + vdev_raidz_reconst_p_func, NULL); } return (1 << VDEV_RAIDZ_P); @@ -786,44 +929,30 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) static int vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) { - uint64_t *dst, *src, xcount, ccount, count, mask, i; - uint8_t *b; int x = tgts[0]; - int c, j, 
exp; + int c, exp; + abd_t *dst, *src; + struct reconst_q_struct rq; ASSERT(ntgts == 1); - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); + src = rm->rm_col[c].rc_data; dst = rm->rm_col[x].rc_data; - if (c == x) - ccount = 0; - else - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); - - count = MIN(ccount, xcount); - if (c == rm->rm_firstdatacol) { - for (i = 0; i < count; i++, dst++, src++) { - *dst = *src; - } - for (; i < xcount; i++, dst++) { - *dst = 0; - } - + abd_copy(dst, src, size); + if (rm->rm_col[x].rc_size > size) + abd_zero_off(dst, rm->rm_col[x].rc_size - size, + size); } else { - for (i = 0; i < count; i++, dst++, src++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - *dst ^= *src; - } - - for (; i < xcount; i++, dst++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - } + abd_iterate_func2(dst, src, rm->rm_col[x].rc_size, + size, vdev_raidz_reconst_q_pre_func, NULL); } } @@ -831,12 +960,9 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) dst = rm->rm_col[x].rc_data; exp = 255 - (rm->rm_cols - 1 - x); - for (i = 0; i < xcount; i++, dst++, src++) { - *dst ^= *src; - for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { - *b = vdev_raidz_exp2(*b, exp); - } - } + rq = (struct reconst_q_struct) { ABD_TO_BUF(src), exp }; + abd_iterate_wfunc(dst, rm->rm_col[x].rc_size, + vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } @@ -844,11 +970,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) static int vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) { - uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; - void *pdata, *qdata; - uint64_t xsize, ysize, i; + uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; + abd_t *pdata, *qdata; + uint64_t xsize, ysize; int x = tgts[0]; int y = tgts[1]; + abd_t *xd, *yd; + struct reconst_pq_struct rpq; ASSERT(ntgts == 2); ASSERT(x < y); @@ -870,9 +998,9 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) ysize = rm->rm_col[y].rc_size; rm->rm_col[VDEV_RAIDZ_P].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size); rm->rm_col[VDEV_RAIDZ_Q].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size); rm->rm_col[x].rc_size = 0; rm->rm_col[y].rc_size = 0; @@ -881,10 +1009,10 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) rm->rm_col[x].rc_size = xsize; rm->rm_col[y].rc_size = ysize; - p = pdata; - q = qdata; - pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; - qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + p = ABD_TO_BUF(pdata); + q = ABD_TO_BUF(qdata); + pxy = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_P].rc_data); + qxy = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_Q].rc_data); xd = rm->rm_col[x].rc_data; yd = rm->rm_col[y].rc_data; @@ -910,17 +1038,13 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; - for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { - *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ - vdev_raidz_exp2(*q ^ *qxy, bexp); + rpq = (struct reconst_pq_struct) { p, q, pxy, qxy, aexp, bexp }; + abd_iterate_func2(xd, yd, xsize, ysize, + vdev_raidz_reconst_pq_func, &rpq); - if 
(i < ysize) - *yd = *p ^ *pxy ^ *xd; - } - - zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, + abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, rm->rm_col[VDEV_RAIDZ_P].rc_size); - zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, + abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, rm->rm_col[VDEV_RAIDZ_Q].rc_size); /* @@ -1245,7 +1369,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, c = used[i]; ASSERT3U(c, <, rm->rm_cols); - src = rm->rm_col[c].rc_data; + src = ABD_TO_BUF(rm->rm_col[c].rc_data); ccount = rm->rm_col[c].rc_size; for (j = 0; j < nmissing; j++) { cc = missing[j] + rm->rm_firstdatacol; @@ -1253,7 +1377,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, ASSERT3U(cc, <, rm->rm_cols); ASSERT3U(cc, !=, c); - dst[j] = rm->rm_col[cc].rc_data; + dst[j] = ABD_TO_BUF(rm->rm_col[cc].rc_data); dcount[j] = rm->rm_col[cc].rc_size; } @@ -1301,8 +1425,25 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; + abd_t **bufs = NULL; + int code = 0; + /* + * Matrix reconstruction can't use scatter buffers yet, so we allocate + * temporary linear abds. + */ + if (!ABD_IS_LINEAR(rm->rm_col[rm->rm_firstdatacol].rc_data)) { + bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + bufs[c] = col->rc_data; + col->rc_data = abd_alloc_linear(col->rc_size); + abd_copy(col->rc_data, bufs[c], col->rc_size); + } + } n = rm->rm_cols - rm->rm_firstdatacol; @@ -1389,6 +1530,20 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) kmem_free(p, psize); + /* + * Copy back from the temporary linear abds and free them. + */ + if (bufs) { + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + abd_copy(bufs[c], col->rc_data, col->rc_size); + abd_free(col->rc_data, col->rc_size); + col->rc_data = bufs[c]; + } + kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + } + return (code); } @@ -1661,6 +1816,7 @@ vdev_raidz_io_start(zio_t *zio) static void raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) { + void *buf; vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -1674,9 +1830,11 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; + buf = abd_borrow_buf_copy(rc->rc_data, rc->rc_size); zfs_ereport_post_checksum(zio->io_spa, vd, zio, - rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, + rc->rc_offset, rc->rc_size, buf, bad_data, &zbc); + abd_return_buf(rc->rc_data, buf, rc->rc_size); } } @@ -1718,7 +1876,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) if (!rc->rc_tried || rc->rc_error != 0) continue; orig[c] = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig[c], rc->rc_size); + abd_copy_to_buf(orig[c], rc->rc_data, rc->rc_size); } vdev_raidz_generate_parity(rm); @@ -1727,7 +1885,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; - if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { + if (bcmp(orig[c], ABD_TO_BUF(rc->rc_data), rc->rc_size) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -1835,7 +1993,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) ASSERT3S(c, >=, 0); ASSERT3S(c, <, rm->rm_cols); rc = &rm->rm_col[c]; - bcopy(rc->rc_data, orig[i], rc->rc_size); +
abd_copy_to_buf(orig[i], rc->rc_data, + rc->rc_size); } /* @@ -1866,7 +2025,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; - bcopy(orig[i], rc->rc_data, rc->rc_size); + abd_copy_from_buf(rc->rc_data, orig[i], + rc->rc_size); } do { From 3fc3174e0306f048bb47f4cfb39c441e3f61b7fa Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 14/26] Handle ABD in ztest and zdb Use the ABD API on related pointers and functions (b_data, db_data, zio_*(), etc.). Suggested-by: DHE Signed-off-by: Chunwei Chen --- cmd/zdb/zdb.c | 21 ++++++++++++--------- cmd/zdb/zdb_il.c | 5 ++++- cmd/ztest/ztest.c | 35 ++++++++++++++++++++------------- 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index b0d7170b92b2..c9c076c5718e 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1183,7 +1183,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, ASSERT(buf->b_data); /* recursively visit blocks below this */ - cbp = buf->b_data; + cbp = ABD_TO_BUF(buf->b_data); for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; @@ -1355,7 +1355,7 @@ dump_bptree(objset_t *os, uint64_t obj, char *name) return; VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; + bt = ABD_TO_BUF(db->db_data); zdb_nicenum(bt->bt_bytes, bytes); (void) printf("\n %s: %llu datasets, %s\n", name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); @@ -1805,7 +1805,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) if (error) fatal("dmu_bonus_hold(%llu) failed, errno %u", object, error); - bonus = db->db_data; + bonus = ABD_TO_BUF(db->db_data); bsize = db->db_size; dn = DB_DNODE((dmu_buf_impl_t *)db); } @@ -2028,7 +2028,7 @@ dump_config(spa_t *spa) spa->spa_config_object, FTAG, &db); if (error == 0) { - nvsize = *(uint64_t *)db->db_data; + nvsize = *(uint64_t *)ABD_TO_BUF(db->db_data); dmu_buf_rele(db, FTAG); (void) printf("\nMOS Configuration:\n"); @@ -2315,7 +2315,7 @@ zdb_blkptr_done(zio_t *zio) zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -2378,7 +2378,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); - void *data = zio_data_buf_alloc(size); + abd_t *data = abd_alloc_linear(size); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ @@ -3141,6 +3141,7 @@ zdb_read_block(char *thing, spa_t *spa) zio_t *zio; vdev_t *vd; void *pbuf, *lbuf, *buf; + abd_t *pbuf_abd; char *s, *p, *dup, *vdev, *flagstr; int i, error; @@ -3212,6 +3213,7 @@ zdb_read_block(char *thing, spa_t *spa) lsize = size; pbuf = umem_alloc_aligned(SPA_MAXBLOCKSIZE, 512, UMEM_NOFAIL); + pbuf_abd = abd_get_from_buf(pbuf, SPA_MAXBLOCKSIZE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); @@ -3239,15 +3241,15 @@ zdb_read_block(char *thing, spa_t *spa) /* * Treat this as a normal block read. */ - zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL, + zio_nowait(zio_read(zio, spa, bp, pbuf_abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O.
*/ - zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize, - ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, + zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf_abd, + psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL)); @@ -3321,6 +3323,7 @@ zdb_read_block(char *thing, spa_t *spa) zdb_dump_block(thing, buf, size, flags); out: + abd_put(pbuf_abd); umem_free(pbuf, SPA_MAXBLOCKSIZE); umem_free(lbuf, SPA_MAXBLOCKSIZE); free(dup); diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index b85ef7ddd97e..02e772c0c8e4 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -125,6 +125,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; char buf[SPA_MAXBLOCKSIZE]; + abd_t *abd; int verbose = MAX(dump_opt['d'], dump_opt['i']); int error; @@ -158,9 +159,11 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); + abd = abd_get_from_buf(buf, BP_GET_LSIZE(bp)); error = zio_wait(zio_read(NULL, zilog->zl_spa, - bp, buf, BP_GET_LSIZE(bp), NULL, NULL, + bp, abd, BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); + abd_put(abd); if (error) return; data = buf; diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 0602a7ec54bf..2a0e15765622 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -1305,19 +1305,23 @@ ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) } static void -ztest_pattern_set(void *buf, uint64_t size, uint64_t value) +ztest_pattern_set(abd_t *abd, uint64_t size, uint64_t value) { + void *buf = abd_borrow_buf(abd, size); uint64_t *ip = buf; uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); while (ip < ip_end) *ip++ = value; + + abd_return_buf_copy(abd, buf, size); } #ifndef NDEBUG static boolean_t -ztest_pattern_match(void *buf, uint64_t size, uint64_t value) +ztest_pattern_match(abd_t *abd, uint64_t size, uint64_t value) { + void *buf = abd_borrow_buf_copy(abd, size); uint64_t *ip = buf; uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); uint64_t diff = 0; @@ -1325,6 +1329,7 @@ ztest_pattern_match(void *buf, uint64_t size, uint64_t value) while (ip < ip_end) diff |= (value - *ip++); + abd_return_buf(abd, buf, size); return (diff == 0); } #endif @@ -1364,7 +1369,8 @@ ztest_bt_bonus(dmu_buf_t *db) dmu_object_info_from_db(db, &doi); ASSERT3U(doi.doi_bonus_size, <=, db->db_size); ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); - bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); + bt = (void *)((char *)ABD_TO_BUF(db->db_data) + doi.doi_bonus_size - + sizeof (*bt)); return (bt); } @@ -1726,7 +1732,7 @@ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap) if (abuf == NULL) { dmu_write(os, lr->lr_foid, offset, length, data, tx); } else { - bcopy(data, abuf->b_data, length); + abd_copy_from_buf(abuf->b_data, data, length); dmu_assign_arcbuf(db, offset, abuf, tx); } @@ -4121,16 +4127,19 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; if (i != 5) { - bcopy((caddr_t)bigbuf + (off - bigoff), - bigbuf_arcbufs[j]->b_data, chunksize); + abd_copy_from_buf(bigbuf_arcbufs[j]->b_data, + (caddr_t)bigbuf + (off - bigoff), + chunksize); } else { - bcopy((caddr_t)bigbuf + (off - bigoff), + abd_copy_from_buf( bigbuf_arcbufs[2 * j]->b_data, + 
(caddr_t)bigbuf + (off - bigoff), chunksize / 2); - bcopy((caddr_t)bigbuf + (off - bigoff) + - chunksize / 2, + + abd_copy_from_buf( bigbuf_arcbufs[2 * j + 1]->b_data, - chunksize / 2); + (caddr_t)bigbuf + (off - bigoff) + + chunksize / 2, chunksize / 2); } if (i == 1) { @@ -5181,7 +5190,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) enum zio_checksum checksum = spa_dedup_checksum(spa); dmu_buf_t *db; dmu_tx_t *tx; - void *buf; + abd_t *buf; blkptr_t blk; int copies = 2 * ZIO_DEDUPDITTO_MIN; int i; @@ -5262,14 +5271,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) * Damage the block. Dedup-ditto will save us when we read it later. */ psize = BP_GET_PSIZE(&blk); - buf = zio_buf_alloc(psize); + buf = abd_alloc_linear(psize); ztest_pattern_set(buf, psize, ~pattern); (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); - zio_buf_free(buf, psize); + abd_free(buf, psize); (void) rw_unlock(&ztest_name_lock); umem_free(od, sizeof (ztest_od_t)); From 94420251abff6eb28550e2099cc752d39566861d Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 11 Feb 2015 16:45:53 +0800 Subject: [PATCH 15/26] Enable ABD Signed-off-by: Chunwei Chen --- include/sys/abd.h | 83 +--------------------------------------- lib/libzpool/Makefile.am | 2 +- module/zfs/Makefile.in | 1 + 3 files changed, 3 insertions(+), 83 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index 561b43e18188..64b34510828e 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -39,7 +39,7 @@ #ifdef __cplusplus extern "C" { #endif -#if 0 + #define ARC_BUF_DATA_MAGIC 0xa7cb0fda #if defined(ZFS_DEBUG) && !defined(_KERNEL) @@ -182,87 +182,6 @@ do { \ abd_copy_from_buf(a, b, n); \ abd_return_buf(a, b, n); \ } while (0) -#else /* 0 */ -typedef void abd_t; -#define ABD_TO_BUF(abd) ((void *)abd) -#define ABD_IS_SCATTER(abd) (0) -#define ABD_IS_LINEAR(abd) (1) -#define ASSERT_ABD_SCATTER(abd) ((void)0) -#define ASSERT_ABD_LINEAR(abd) ((void)0) -void *zio_buf_alloc(size_t); -void zio_buf_free(void *, size_t); -static inline abd_t *abd_alloc_linear(size_t size) -{ - return ((abd_t *)zio_buf_alloc(size)); -} -static inline void abd_free(abd_t *abd, size_t size) -{ - zio_buf_free((void *)abd, size); -} -#define abd_alloc_scatter abd_alloc_linear -#define abd_get_offset(abd, off) ((void *)(abd)+(off)) -#define abd_get_from_buf(buf, size) (buf) -#define abd_put(abd) do { } while (0) - -#define abd_iterate_rfunc(a, n, f, p) \ - (void) f(a, n, p) - -#define abd_iterate_wfunc(a, n, f, p) \ - (void) f(a, n, p) - -#define abd_iterate_func2(a, b, an, bn, f, p) \ - (void) f(a, b, an, bn, p) - -#define abd_copy_off(a, b, n, aoff, boff) \ - (void) memcpy((void *)(a)+(aoff), (void *)(b)+(boff), n) - -#define abd_copy_from_buf_off(a, b, n, off) \ - (void) memcpy((void *)(a)+(off), b, n) - -#define abd_copy_to_buf_off(a, b, n, off) \ - (void) memcpy(a, (void *)(b)+(off), n) - -#define abd_cmp(a, b, n) \ - memcmp(a, b, n) - -#define abd_cmp_buf_off(a, b, n, off) \ - memcmp((void *)(a)+(off), b, n) - -#define abd_zero_off(a, n, off) \ - (void) memset((void *)(a)+(off), 0, n) - -#ifdef _KERNEL -#define abd_copy_to_user_off(a, b, n, off) \ - copy_to_user(a, (void *)(b)+(off), n) - -#define abd_copy_from_user_off(a, b, n, off) \ - copy_from_user((void *)(a)+(off), b, n) - -#define abd_uiomove_off(p, n, rw, uio, off) \ - uiomove((void *)(p)+(off), n, rw, uio) - -#define abd_uiocopy_off(p, n, rw, uio, c, off) \ - uiocopy((void *)(p)+(off), n, rw, uio, c) - 
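/*
 * Editorial sketch, not part of the patch series: the borrow/return
 * idiom used in the ztest hunks above, which the no-op macros around
 * this hunk stubbed out and which does real work once ABD is enabled.
 * The "_copy" suffix names the direction that moves data:
 * abd_borrow_buf_copy() fills the borrowed linear buffer from the ABD,
 * abd_return_buf_copy() writes it back. The function below is
 * hypothetical and only illustrates the pairing.
 */
static uint64_t
example_sum_abd(abd_t *abd, uint64_t size)
{
	uint64_t *buf = abd_borrow_buf_copy(abd, size);	/* copy in */
	uint64_t i, sum = 0;

	for (i = 0; i < size / sizeof (uint64_t); i++)
		sum += buf[i];

	abd_return_buf(abd, buf, size);	/* read-only use: no copy out */
	return (sum);
}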
-#define abd_bio_map_off(bio, a, n, off) \ - bio_map(bio, (void *)(a)+(off), n) - -#define abd_bio_nr_pages_off(a, n, off) \ - bio_nr_pages((void *)(a)+(off), n) -#endif /* _KERNEL */ - -#define abd_borrow_buf(a, n) \ - ((void *)a) - -#define abd_borrow_buf_copy(a, n) \ - ((void *)a) - -#define abd_return_buf(a, b, n) \ - do { } while (0) - -#define abd_return_buf_copy(a, b, n) \ - do { } while (0) -#endif /* 0 */ /* * Wrappers for zero off functions diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index caa64787a95b..60bd06b7d07b 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -20,6 +20,7 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zcommon/zfs_uio.c \ $(top_srcdir)/module/zcommon/zpool_prop.c \ $(top_srcdir)/module/zcommon/zprop_common.c \ + $(top_srcdir)/module/zfs/abd.c \ $(top_srcdir)/module/zfs/arc.c \ $(top_srcdir)/module/zfs/blkptr.c \ $(top_srcdir)/module/zfs/bplist.c \ @@ -110,7 +111,6 @@ libzpool_la_LIBADD += $(ZLIB) libzpool_la_LDFLAGS = -version-info 2:0:0 EXTRA_DIST = \ - $(top_srcdir)/module/zfs/abd.c \ $(top_srcdir)/module/zfs/vdev_disk.c \ $(top_srcdir)/module/zfs/zfs_acl.c \ $(top_srcdir)/module/zfs/zfs_ctldir.c \ diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 954841f33137..e24a53d5eaf6 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -4,6 +4,7 @@ EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@ obj-$(CONFIG_ZFS) := $(MODULE).o +$(MODULE)-objs += @top_srcdir@/module/zfs/abd.o $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o $(MODULE)-objs += @top_srcdir@/module/zfs/blkptr.o $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o From 8e99d78205c5184e237b77c90df537235d312cdf Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Tue, 17 Feb 2015 15:16:16 +0800 Subject: [PATCH 16/26] Try to fix build error --- include/sys/abd.h | 10 +++++++++- module/zfs/abd.c | 13 +++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index 64b34510828e..c95327c101db 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -42,10 +42,18 @@ extern "C" { #define ARC_BUF_DATA_MAGIC 0xa7cb0fda -#if defined(ZFS_DEBUG) && !defined(_KERNEL) +#ifndef _KERNEL + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#ifdef ZFS_DEBUG #define DEBUG_ABD #endif +#endif /* !_KERNEL */ + typedef struct arc_buf_data { #ifdef DEBUG_ABD char pad[PAGE_SIZE]; /* debug, coredumps when accessed */ diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 8d599e8d1b07..9c59839a0bc8 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -43,10 +43,6 @@ /* * page */ -#ifndef PAGE_SIZE -#define PAGE_SIZE 4096 -#endif - struct page; #define alloc_page(gfp) \ @@ -105,6 +101,8 @@ sg_next(struct scatterlist *sg) #define kmap(page) ((void *)page) #define kunmap(page) do { } while (0) +#define KM_USER0 (0) +#define KM_USER1 (1) #define zfs_kmap_atomic(page, type) ((void *)page) #define zfs_kunmap_atomic(addr, type) do { } while (0) #define pagefault_disable() do { } while (0) @@ -165,7 +163,7 @@ abd_miter_init_km(struct abd_miter *aiter, abd_t *abd, int rw, int km) aiter->nents = abd->abd_nents; aiter->rw = rw; #ifndef HAVE_1ARG_KMAP_ATOMIC - aiter->km_type = km; + aiter->km_type = (km ? KM_USER1 : KM_USER0); #endif } @@ -207,7 +205,7 @@ abd_miter_map_x(struct abd_miter *aiter, int atomic) if (atomic) paddr = zfs_kmap_atomic(sg_page(aiter->sg), - (aiter->km_type ? 
KM_USER1 : KM_USER0)); + aiter->km_type); else paddr = kmap(sg_page(aiter->sg)); } @@ -237,8 +235,7 @@ abd_miter_unmap_x(struct abd_miter *aiter, int atomic) if (atomic) { if (aiter->rw == ABD_MITER_W) flush_kernel_dcache_page(sg_page(aiter->sg)); - zfs_kunmap_atomic(paddr, - (aiter->km_type ? KM_USER1 : KM_USER0)); + zfs_kunmap_atomic(paddr, aiter->km_type); } else { kunmap(sg_page(aiter->sg)); } From e19172fd519a55d4784e1e7cf64b4adef25f4cfa Mon Sep 17 00:00:00 2001 From: kernelOfTruth Date: Mon, 23 Mar 2015 22:59:05 +0100 Subject: [PATCH 17/26] attempt to make DEBUG build (buildbots) happy: pad[PAGE_SIZE] to pad[4096] (how about archs other than x86, x86_64 ?) --- include/sys/abd.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index c95327c101db..d6d1dcc91ce2 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -48,15 +48,20 @@ extern "C" { #define PAGE_SIZE 4096 #endif +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + #ifdef ZFS_DEBUG #define DEBUG_ABD +#define PAGE_SHIFT 12 #endif #endif /* !_KERNEL */ typedef struct arc_buf_data { #ifdef DEBUG_ABD - char pad[PAGE_SIZE]; /* debug, coredumps when accessed */ + char pad[4096]; /* debug, coredumps when accessed */ #endif uint32_t abd_magic; /* ARC_BUF_DATA_MAGIC */ uint32_t abd_flags; From 70a292b883d31186c63cea81dfe753b934a51355 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Sun, 15 Feb 2015 22:28:32 -0600 Subject: [PATCH 18/26] Revert "fix l2arc compression buffers leak" This reverts commit 037763e44e0f6d7284e9328db988a89fdc975a4e. XXX - expand this comment as to why we're reverting it. conflicts due to #2129, in: static void arc_buf_free_on_write (abd_t) static void arc_buf_l2_cdata_free (related to arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size, abd_free); ) l2arc_release_cdata_buf (abd_free instead of zio_data_buf_free) --- module/zfs/arc.c | 65 ++++++++---------------------------------------- 1 file changed, 10 insertions(+), 55 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 8f48658ff370..1ffec5deaaaf 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -310,7 +310,6 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_cdata_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; @@ -399,7 +398,6 @@ static arc_stats_t arc_stats = { { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, - { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, @@ -1486,21 +1484,6 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) data, metadata, hits); } -static void -arc_buf_free_on_write(abd_t *data, size_t size, - void (*free_func)(abd_t *, size_t)) -{ - l2arc_data_free_t *df; - - df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); - df->l2df_data = data; - df->l2df_size = size; - df->l2df_func = free_func; - mutex_enter(&l2arc_free_on_write_mtx); - list_insert_head(l2arc_free_on_write, df); - mutex_exit(&l2arc_free_on_write_mtx); -} - /* * Free the arc data buffer. If it is an l2arc write in progress, * the buffer is placed on l2arc_free_on_write to be freed later. 
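/*
 * Editorial sketch, not part of the patch series: the deferred-free
 * pattern that this revert folds back into arc_buf_data_free() in the
 * hunk below. A buffer that is still the source of an in-flight L2ARC
 * write cannot be freed immediately, so a small descriptor recording
 * (data, size, free function) is queued on the l2arc_free_on_write
 * list and freed once the write has completed. The function name is
 * hypothetical; the fields and locking mirror the code being reverted.
 */
static void
example_defer_free(abd_t *data, size_t size,
    void (*free_func)(abd_t *, size_t))
{
	l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);

	df->l2df_data = data;
	df->l2df_size = size;
	df->l2df_func = free_func;
	mutex_enter(&l2arc_free_on_write_mtx);
	list_insert_head(l2arc_free_on_write, df);
	mutex_exit(&l2arc_free_on_write_mtx);
}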
@@ -1511,7 +1494,14 @@ arc_buf_data_free(arc_buf_t *buf, void (*free_func)(abd_t *, size_t)) arc_buf_hdr_t *hdr = buf->b_hdr; if (HDR_L2_WRITING(hdr)) { - arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); + l2arc_data_free_t *df; + df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df->l2df_data = buf->b_data; + df->l2df_size = hdr->b_size; + df->l2df_func = free_func; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { free_func(buf->b_data, hdr->b_size); @@ -1522,23 +1512,6 @@ arc_buf_data_free(arc_buf_t *buf, void (*free_func)(abd_t *, size_t)) * Free up buf->b_data and if 'remove' is set, then pull the * arc_buf_t off of the the arc_buf_hdr_t's list and free it. */ -static void -arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) -{ - l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; - - ASSERT(MUTEX_HELD(&l2arc_buflist_mtx)); - - if (l2hdr->b_tmp_cdata == NULL) - return; - - ASSERT(HDR_L2_WRITING(hdr)); - arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size, - abd_free); - ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); - l2hdr->b_tmp_cdata = NULL; -} - static void arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) { @@ -1634,7 +1607,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) if (l2hdr != NULL) { list_remove(l2hdr->b_dev->l2ad_buflist, hdr); - arc_buf_l2_cdata_free(hdr); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); vdev_space_update(l2hdr->b_dev->l2ad_vdev, @@ -3687,7 +3659,6 @@ arc_release(arc_buf_t *buf, void *tag) l2hdr = hdr->b_l2hdr; if (l2hdr) { mutex_enter(&l2arc_buflist_mtx); - arc_buf_l2_cdata_free(hdr); hdr->b_l2hdr = NULL; list_remove(l2hdr->b_dev->l2ad_buflist, hdr); } @@ -4929,11 +4900,6 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); bytes_evicted += abl2->b_asize; ab->b_l2hdr = NULL; - /* - * We are destroying l2hdr, so ensure that - * its compressed buffer, if any, is not leaked. - */ - ASSERT(abl2->b_tmp_cdata == NULL); kmem_cache_free(l2arc_hdr_cache, abl2); arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); @@ -5168,14 +5134,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, buf_data = l2hdr->b_tmp_cdata; buf_sz = l2hdr->b_asize; - /* - * If the data has not been compressed, then clear b_tmp_cdata - * to make sure that it points only to a temporary compression - * buffer. - */ - if (!L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress)) - l2hdr->b_tmp_cdata = NULL; - /* Compression may have squashed the buffer to zero length. */ if (buf_sz != 0) { uint64_t buf_p_sz; @@ -5378,18 +5336,15 @@ l2arc_release_cdata_buf(arc_buf_hdr_t *ab) { l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr; - ASSERT(L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress)); - if (l2hdr->b_compress != ZIO_COMPRESS_EMPTY) { + if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) { /* * If the data was compressed, then we've allocated a * temporary buffer for it, so now we need to release it. 
*/ ASSERT(l2hdr->b_tmp_cdata != NULL); abd_free(l2hdr->b_tmp_cdata, ab->b_size); - l2hdr->b_tmp_cdata = NULL; - } else { - ASSERT(l2hdr->b_tmp_cdata == NULL); } + l2hdr->b_tmp_cdata = NULL; } /* From 707d6dc6446d96e95531965016dabbb5be356511 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Sat, 24 Jan 2015 12:40:59 -0600 Subject: [PATCH 19/26] Revert "Add ddt, ddt_entry, and l2arc_hdr caches" This reverts commit ecf3d9b8e63e5659269e15db527380c65780f71a. Conflicts: module/zfs/arc.c module/zfs/ddt.c module/zfs/spa_misc.c --- include/sys/ddt.h | 2 -- module/zfs/arc.c | 17 +++++++---------- module/zfs/ddt.c | 37 ++++++++----------------------------- module/zfs/spa_misc.c | 2 -- 4 files changed, 15 insertions(+), 43 deletions(-) diff --git a/include/sys/ddt.h b/include/sys/ddt.h index b7977aae0357..77e1d4d5eb87 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -216,8 +216,6 @@ extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); extern void ddt_enter(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt); -extern void ddt_init(void); -extern void ddt_fini(void); extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 1ffec5deaaaf..5d2200445fc4 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -799,7 +799,6 @@ buf_hash_remove(arc_buf_hdr_t *buf) */ static kmem_cache_t *hdr_cache; static kmem_cache_t *buf_cache; -static kmem_cache_t *l2arc_hdr_cache; static void buf_fini(void) @@ -821,7 +820,6 @@ buf_fini(void) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_cache); kmem_cache_destroy(buf_cache); - kmem_cache_destroy(l2arc_hdr_cache); } /* @@ -923,8 +921,6 @@ buf_init(void) 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); - l2arc_hdr_cache = kmem_cache_create("l2arc_buf_hdr_t", L2HDR_SIZE, - 0, NULL, NULL, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) @@ -1611,7 +1607,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); vdev_space_update(l2hdr->b_dev->l2ad_vdev, -l2hdr->b_asize, 0, 0); - kmem_cache_free(l2arc_hdr_cache, l2hdr); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); if (hdr->b_state == arc_l2c_only) l2arc_hdr_stat_remove(); @@ -3756,7 +3752,8 @@ arc_release(arc_buf_t *buf, void *tag) ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); vdev_space_update(l2hdr->b_dev->l2ad_vdev, -l2hdr->b_asize, 0, 0); - kmem_cache_free(l2arc_hdr_cache, l2hdr); + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -buf_size); mutex_exit(&l2arc_buflist_mtx); @@ -4641,7 +4638,7 @@ l2arc_write_done(zio_t *zio) ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); bytes_dropped += abl2->b_asize; ab->b_l2hdr = NULL; - kmem_cache_free(l2arc_hdr_cache, abl2); + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } @@ -4900,7 +4897,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); bytes_evicted += abl2->b_asize; 
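/*
 * Editorial sketch, not part of the patch series: the allocation idiom
 * this revert restores in the surrounding hunks. Instead of a dedicated
 * kmem cache, the L2ARC header goes back to a zeroed one-off
 * allocation; KM_PUSHPAGE is used because these paths can be reached
 * from writeback, where a KM_SLEEP allocation that re-enters the
 * filesystem could deadlock. The wrapper name is hypothetical.
 */
static l2arc_buf_hdr_t *
example_alloc_l2hdr(void)
{
	return (kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_PUSHPAGE));
}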
ab->b_l2hdr = NULL; - kmem_cache_free(l2arc_hdr_cache, abl2); + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } @@ -5046,9 +5043,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Create and add a new L2ARC header. */ - l2hdr = kmem_cache_alloc(l2arc_hdr_cache, KM_SLEEP); + l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), + KM_PUSHPAGE); l2hdr->b_dev = dev; - l2hdr->b_daddr = 0; arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS); ab->b_flags |= ARC_L2_WRITING; diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 4d9fab6392b8..41edceae4058 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -37,9 +37,6 @@ #include #include -static kmem_cache_t *ddt_cache; -static kmem_cache_t *ddt_entry_cache; - /* * Enable/disable prefetching of dedup-ed blocks which are going to be freed. */ @@ -517,7 +514,7 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) { ddt_histogram_t *ddh_total; - ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_PUSHPAGE); ddt_get_dedup_histogram(spa, ddh_total); ddt_histogram_stat(dds_total, ddh_total); kmem_free(ddh_total, sizeof (ddt_histogram_t)); @@ -664,29 +661,12 @@ ddt_exit(ddt_t *ddt) mutex_exit(&ddt->ddt_lock); } -void -ddt_init(void) -{ - ddt_cache = kmem_cache_create("ddt_cache", - sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - ddt_entry_cache = kmem_cache_create("ddt_entry_cache", - sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -ddt_fini(void) -{ - kmem_cache_destroy(ddt_entry_cache); - kmem_cache_destroy(ddt_cache); -} - static ddt_entry_t * ddt_alloc(const ddt_key_t *ddk) { ddt_entry_t *dde; - dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); - bzero(dde, sizeof (ddt_entry_t)); + dde = kmem_zalloc(sizeof (ddt_entry_t), KM_PUSHPAGE); cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); dde->dde_key = *ddk; @@ -708,7 +688,7 @@ ddt_free(ddt_entry_t *dde) abd_free(dde->dde_repair_data, DDK_GET_PSIZE(&dde->dde_key)); cv_destroy(&dde->dde_cv); - kmem_cache_free(ddt_entry_cache, dde); + kmem_free(dde, sizeof (*dde)); } void @@ -833,8 +813,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) { ddt_t *ddt; - ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP); - bzero(ddt, sizeof (ddt_t)); + ddt = kmem_zalloc(sizeof (*ddt), KM_PUSHPAGE); mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&ddt->ddt_tree, ddt_entry_compare, @@ -856,7 +835,7 @@ ddt_table_free(ddt_t *ddt) avl_destroy(&ddt->ddt_tree); avl_destroy(&ddt->ddt_repair_tree); mutex_destroy(&ddt->ddt_lock); - kmem_cache_free(ddt_cache, ddt); + kmem_free(ddt, sizeof (*ddt)); } void @@ -936,20 +915,20 @@ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) return (B_TRUE); ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; - dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); + dde = kmem_alloc(sizeof (ddt_entry_t), KM_PUSHPAGE); ddt_key_fill(&(dde->dde_key), bp); for (type = 0; type < DDT_TYPES; type++) { for (class = 0; class <= max_class; class++) { if (ddt_object_lookup(ddt, type, class, dde) == 0) { - kmem_cache_free(ddt_entry_cache, dde); + kmem_free(dde, sizeof (ddt_entry_t)); return (B_TRUE); } } } - kmem_cache_free(ddt_entry_cache, dde); + kmem_free(dde, sizeof (ddt_entry_t)); return (B_FALSE); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index ec0c0195947c..d97f35dd60f3 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1738,7 +1738,6 @@ 
spa_init(int mode) refcount_init(); unique_init(); range_tree_init(); - ddt_init(); zio_init(); dmu_init(); zil_init(); @@ -1763,7 +1762,6 @@ spa_fini(void) zil_fini(); dmu_fini(); zio_fini(); - ddt_fini(); range_tree_fini(); unique_fini(); refcount_fini(); From 115a8c22e6616644afc1ebf15f29b79ccdf5116d Mon Sep 17 00:00:00 2001 From: George Wilson Date: Sat, 6 Dec 2014 09:24:32 -0800 Subject: [PATCH 20/26] Illumos 5369 - arc flags should be an enum 5369 arc flags should be an enum 5370 consistent arc_buf_hdr_t naming scheme Reviewed by: Matthew Ahrens Reviewed by: Alex Reece Reviewed by: Sebastien Roy Reviewed by: Richard Elling Approved by: Richard Lowe Conflicts due to #2129, in: arc.c static void arc_evict_ghost : (with additional argument "arc_buf_contents_t type" since Revert "Allow arc_evict_ghost() to only evict meta data" is left out) l2arc_release_cdata_buf(arc_buf_hdr_t *ab) : zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size); is changed to abd_free(l2hdr->b_tmp_cdata, hdr->b_size); fix commit 989fd514b1053d5443b4e6155af9c8d863f5f0f2 "Change ASSERT(!"...") to cmn_err(CE_PANIC, ...)" --- cmd/zdb/zdb.c | 2 +- include/sys/arc.h | 41 ++- include/sys/arc_impl.h | 2 +- module/zfs/arc.c | 691 +++++++++++++++++++------------------- module/zfs/dbuf.c | 11 +- module/zfs/dmu_diff.c | 2 +- module/zfs/dmu_objset.c | 6 +- module/zfs/dmu_send.c | 6 +- module/zfs/dmu_traverse.c | 12 +- module/zfs/dsl_scan.c | 8 +- module/zfs/spa_stats.c | 2 +- module/zfs/zil.c | 4 +- module/zfs/zio.c | 2 +- 13 files changed, 399 insertions(+), 390 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index c9c076c5718e..ab722e94338f 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1169,7 +1169,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, print_indirect(bp, zb, dnp); if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; diff --git a/include/sys/arc.h b/include/sys/arc.h index 8ffc893e0d28..04f3dc3ace8e 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -58,6 +58,36 @@ struct arc_prune { refcount_t p_refcnt; }; +typedef enum arc_flags +{ + /* + * Public flags that can be passed into the ARC by external consumers. + */ + ARC_FLAG_NONE = 1 << 0, /* No flags set */ + ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */ + ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */ + ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */ + ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */ + ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */ + ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */ + + /* + * Private ARC flags. These flags are private ARC only flags that + * will show up in b_flags in the arc_buf_hdr_t. These flags should + * only be set by ARC code.
+ */ + ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ + ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */ + ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */ + ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */ + ARC_FLAG_FREE_IN_PROGRESS = 1 << 13, /* about to be freed */ + ARC_FLAG_L2_WRITING = 1 << 14, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 15, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 16, /* head of write list */ +} arc_flags_t; + struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; @@ -72,15 +102,6 @@ typedef enum arc_buf_contents { ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES } arc_buf_contents_t; -/* - * These are the flags we pass into calls to the arc - */ -#define ARC_WAIT (1 << 1) /* perform I/O synchronously */ -#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */ -#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ -#define ARC_CACHED (1 << 4) /* I/O was already in cache */ -#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */ -#define ARC_L2COMPRESS (1 << 6) /* compress in L2ARC */ /* * The following breakdows of arc_size exist for kstat only. @@ -147,7 +168,7 @@ int arc_referenced(arc_buf_t *buf); int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, zio_priority_t priority, int flags, - uint32_t *arc_flags, const zbookmark_phys_t *zb); + arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index e7068ea188e3..1f8351a6784b 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -107,7 +107,7 @@ struct arc_buf_hdr { arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; - uint32_t b_flags; + arc_flags_t b_flags; uint32_t b_datacnt; arc_callback_t *b_acb; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 5d2200445fc4..6bc26bed3539 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -496,51 +496,26 @@ static kmutex_t arc_prune_mtx; static arc_buf_t *arc_eviction_list; static kmutex_t arc_eviction_mtx; static arc_buf_hdr_t arc_eviction_hdr; -static void arc_get_data_buf(arc_buf_t *buf); -static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); -static int arc_evict_needed(arc_buf_contents_t type); -static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type); -static void arc_buf_watch(arc_buf_t *buf); - -static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) -/* - * Private ARC flags. These flags are private ARC only flags that will show up - * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can - * be passed in as arc_flags in things like arc_read. However, these flags - * should never be passed and should only be set by ARC code. When adding new - * public flags, make sure not to smash the private ones. 
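/*
 * Editorial sketch, not part of the patch series: what a caller looks
 * like once the typed arc_flags_t replaces the bare ARC_* defines,
 * mirroring the zdb.c hunk above. arc_read() takes a pointer to the
 * flags so the ARC can pass information (e.g. ARC_FLAG_CACHED) back to
 * the caller. The wrapper below is illustrative only.
 */
static int
example_sync_read(spa_t *spa, const blkptr_t *bp, arc_buf_t **bufp,
    const zbookmark_phys_t *zb)
{
	arc_flags_t aflags = ARC_FLAG_WAIT;	/* was: uint32_t flags = ARC_WAIT; */

	return (arc_read(NULL, spa, bp, arc_getbuf_func, bufp,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb));
}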
- */ - -#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ -#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ -#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ -#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ -#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ -#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ -#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ -#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ -#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ -#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ - -#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) -#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) -#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) -#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) -#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) -#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) -#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) -#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) -#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ - (hdr)->b_l2hdr != NULL) -#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) -#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) -#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) +#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) +#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) +#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) +#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) +#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) +#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) +#define HDR_FREE_IN_PROGRESS(hdr) \ + ((hdr)->b_flags & ARC_FLAG_FREE_IN_PROGRESS) +#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) +#define HDR_L2_READING(hdr) \ + ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS && \ + (hdr)->b_l2hdr != NULL) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) /* * Other sizes @@ -657,14 +632,21 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; -static void l2arc_read_done(zio_t *zio); +static void arc_get_data_buf(arc_buf_t *); +static void arc_access(arc_buf_hdr_t *, kmutex_t *); +static int arc_evict_needed(arc_buf_contents_t); +static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, + arc_buf_contents_t type); +static void arc_buf_watch(arc_buf_t *); + +static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); +static void l2arc_read_done(zio_t *); static void l2arc_hdr_stat_add(void); static void l2arc_hdr_stat_remove(void); -static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); -static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, - enum zio_compress c); -static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); +static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *); +static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); +static void l2arc_release_cdata_buf(arc_buf_hdr_t *); static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) @@ -709,14 +691,14 @@ buf_hash_find(uint64_t spa, 
const blkptr_t *bp, kmutex_t **lockp) uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *buf; + arc_buf_hdr_t *hdr; mutex_enter(hash_lock); - for (buf = buf_hash_table.ht_table[idx]; buf != NULL; - buf = buf->b_hash_next) { - if (BUF_EQUAL(spa, dva, birth, buf)) { + for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; + hdr = hdr->b_hash_next) { + if (BUF_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; - return (buf); + return (hdr); } } mutex_exit(hash_lock); @@ -731,27 +713,27 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) * Otherwise returns NULL. */ static arc_buf_hdr_t * -buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) +buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { - uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *fbuf; + arc_buf_hdr_t *fhdr; uint32_t i; - ASSERT(!DVA_IS_EMPTY(&buf->b_dva)); - ASSERT(buf->b_birth != 0); - ASSERT(!HDR_IN_HASH_TABLE(buf)); + ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); + ASSERT(hdr->b_birth != 0); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); *lockp = hash_lock; mutex_enter(hash_lock); - for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; - fbuf = fbuf->b_hash_next, i++) { - if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) - return (fbuf); + for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; + fhdr = fhdr->b_hash_next, i++) { + if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) + return (fhdr); } - buf->b_hash_next = buf_hash_table.ht_table[idx]; - buf_hash_table.ht_table[idx] = buf; - buf->b_flags |= ARC_IN_HASH_TABLE; + hdr->b_hash_next = buf_hash_table.ht_table[idx]; + buf_hash_table.ht_table[idx] = hdr; + hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; /* collect some hash table performance data */ if (i > 0) { @@ -769,22 +751,22 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) } static void -buf_hash_remove(arc_buf_hdr_t *buf) +buf_hash_remove(arc_buf_hdr_t *hdr) { - arc_buf_hdr_t *fbuf, **bufp; - uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + arc_buf_hdr_t *fhdr, **hdrp; + uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); - ASSERT(HDR_IN_HASH_TABLE(buf)); + ASSERT(HDR_IN_HASH_TABLE(hdr)); - bufp = &buf_hash_table.ht_table[idx]; - while ((fbuf = *bufp) != buf) { - ASSERT(fbuf != NULL); - bufp = &fbuf->b_hash_next; + hdrp = &buf_hash_table.ht_table[idx]; + while ((fhdr = *hdrp) != hdr) { + ASSERT(fhdr != NULL); + hdrp = &fhdr->b_hash_next; } - *bufp = buf->b_hash_next; - buf->b_hash_next = NULL; - buf->b_flags &= ~ARC_IN_HASH_TABLE; + *hdrp = hdr->b_hash_next; + hdr->b_hash_next = NULL; + hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); @@ -830,14 +812,14 @@ buf_fini(void) static int hdr_cons(void *vbuf, void *unused, int kmflag) { - arc_buf_hdr_t *buf = vbuf; - - bzero(buf, sizeof (arc_buf_hdr_t)); - refcount_create(&buf->b_refcnt); - cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - list_link_init(&buf->b_arc_node); - list_link_init(&buf->b_l2node); + arc_buf_hdr_t *hdr = vbuf; + + bzero(hdr, sizeof (arc_buf_hdr_t)); + refcount_create(&hdr->b_refcnt); + cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL); + 
mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + list_link_init(&hdr->b_arc_node); + list_link_init(&hdr->b_l2node); arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); return (0); @@ -864,12 +846,12 @@ buf_cons(void *vbuf, void *unused, int kmflag) static void hdr_dest(void *vbuf, void *unused) { - arc_buf_hdr_t *buf = vbuf; + arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(buf)); - refcount_destroy(&buf->b_refcnt); - cv_destroy(&buf->b_cv); - mutex_destroy(&buf->b_freeze_lock); + ASSERT(BUF_EMPTY(hdr)); + refcount_destroy(&hdr->b_refcnt); + cv_destroy(&hdr->b_cv); + mutex_destroy(&hdr->b_freeze_lock); arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); } @@ -944,7 +926,7 @@ arc_cksum_verify(arc_buf_t *buf) mutex_enter(&buf->b_hdr->b_freeze_lock); if (buf->b_hdr->b_freeze_cksum == NULL || - (buf->b_hdr->b_flags & ARC_IO_ERROR)) { + (buf->b_hdr->b_flags & ARC_FLAG_IO_ERROR)) { mutex_exit(&buf->b_hdr->b_freeze_lock); return; } @@ -1034,7 +1016,7 @@ arc_buf_thaw(arc_buf_t *buf) if (zfs_flags & ZFS_DEBUG_MODIFY) { if (buf->b_hdr->b_state != arc_anon) panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + if (buf->b_hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } @@ -1069,54 +1051,54 @@ arc_buf_freeze(arc_buf_t *buf) } static void -add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { ASSERT(MUTEX_HELD(hash_lock)); - if ((refcount_add(&ab->b_refcnt, tag) == 1) && - (ab->b_state != arc_anon)) { - uint64_t delta = ab->b_size * ab->b_datacnt; - list_t *list = &ab->b_state->arcs_list[ab->b_type]; - uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; - - ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); - mutex_enter(&ab->b_state->arcs_mtx); - ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(list, ab); - if (GHOST_STATE(ab->b_state)) { - ASSERT0(ab->b_datacnt); - ASSERT3P(ab->b_buf, ==, NULL); - delta = ab->b_size; + if ((refcount_add(&hdr->b_refcnt, tag) == 1) && + (hdr->b_state != arc_anon)) { + uint64_t delta = hdr->b_size * hdr->b_datacnt; + list_t *list = &hdr->b_state->arcs_list[hdr->b_type]; + uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; + + ASSERT(!MUTEX_HELD(&hdr->b_state->arcs_mtx)); + mutex_enter(&hdr->b_state->arcs_mtx); + ASSERT(list_link_active(&hdr->b_arc_node)); + list_remove(list, hdr); + if (GHOST_STATE(hdr->b_state)) { + ASSERT0(hdr->b_datacnt); + ASSERT3P(hdr->b_buf, ==, NULL); + delta = hdr->b_size; } ASSERT(delta > 0); ASSERT3U(*size, >=, delta); atomic_add_64(size, -delta); - mutex_exit(&ab->b_state->arcs_mtx); + mutex_exit(&hdr->b_state->arcs_mtx); /* remove the prefetch flag if we get a reference */ - if (ab->b_flags & ARC_PREFETCH) - ab->b_flags &= ~ARC_PREFETCH; + if (hdr->b_flags & ARC_FLAG_PREFETCH) + hdr->b_flags &= ~ARC_FLAG_PREFETCH; } } static int -remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; - arc_state_t *state = ab->b_state; + arc_state_t *state = hdr->b_state; ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); - if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && + if (((cnt = refcount_remove(&hdr->b_refcnt, tag)) == 0) && (state != arc_anon)) { - uint64_t *size = &state->arcs_lsize[ab->b_type]; + uint64_t *size = &state->arcs_lsize[hdr->b_type]; ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 
mutex_enter(&state->arcs_mtx); - ASSERT(!list_link_active(&ab->b_arc_node)); - list_insert_head(&state->arcs_list[ab->b_type], ab); - ASSERT(ab->b_datacnt > 0); - atomic_add_64(size, ab->b_size * ab->b_datacnt); + ASSERT(!list_link_active(&hdr->b_arc_node)); + list_insert_head(&state->arcs_list[hdr->b_type], hdr); + ASSERT(hdr->b_datacnt > 0); + atomic_add_64(size, hdr->b_size * hdr->b_datacnt); mutex_exit(&state->arcs_mtx); } return (cnt); @@ -1175,19 +1157,20 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) * for the buffer must be held by the caller. */ static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, + kmutex_t *hash_lock) { - arc_state_t *old_state = ab->b_state; - int64_t refcnt = refcount_count(&ab->b_refcnt); + arc_state_t *old_state = hdr->b_state; + int64_t refcnt = refcount_count(&hdr->b_refcnt); uint64_t from_delta, to_delta; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || ab->b_datacnt > 0); - ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); - ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); + ASSERT(refcnt == 0 || hdr->b_datacnt > 0); + ASSERT(hdr->b_datacnt == 0 || !GHOST_STATE(new_state)); + ASSERT(hdr->b_datacnt <= 1 || old_state != arc_anon); - from_delta = to_delta = ab->b_datacnt * ab->b_size; + from_delta = to_delta = hdr->b_datacnt * hdr->b_size; /* * If this buffer is evictable, transfer it from the @@ -1196,22 +1179,22 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) if (refcnt == 0) { if (old_state != arc_anon) { int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); - uint64_t *size = &old_state->arcs_lsize[ab->b_type]; + uint64_t *size = &old_state->arcs_lsize[hdr->b_type]; if (use_mutex) mutex_enter(&old_state->arcs_mtx); - ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&old_state->arcs_list[ab->b_type], ab); + ASSERT(list_link_active(&hdr->b_arc_node)); + list_remove(&old_state->arcs_list[hdr->b_type], hdr); /* * If prefetching out of the ghost cache, * we will have a non-zero datacnt. 
*/ - if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { + if (GHOST_STATE(old_state) && hdr->b_datacnt == 0) { /* ghost elements have a ghost size */ - ASSERT(ab->b_buf == NULL); - from_delta = ab->b_size; + ASSERT(hdr->b_buf == NULL); + from_delta = hdr->b_size; } ASSERT3U(*size, >=, from_delta); atomic_add_64(size, -from_delta); @@ -1221,18 +1204,19 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } if (new_state != arc_anon) { int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); - uint64_t *size = &new_state->arcs_lsize[ab->b_type]; + uint64_t *size = &new_state->arcs_lsize[hdr->b_type]; if (use_mutex) mutex_enter(&new_state->arcs_mtx); - list_insert_head(&new_state->arcs_list[ab->b_type], ab); + list_insert_head(&new_state->arcs_list[hdr->b_type], + hdr); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT(ab->b_datacnt == 0); - ASSERT(ab->b_buf == NULL); - to_delta = ab->b_size; + ASSERT(hdr->b_datacnt == 0); + ASSERT(hdr->b_buf == NULL); + to_delta = hdr->b_size; } atomic_add_64(size, to_delta); @@ -1241,9 +1225,9 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } } - ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) - buf_hash_remove(ab); + ASSERT(!BUF_EMPTY(hdr)); + if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) + buf_hash_remove(hdr); /* adjust state sizes */ if (to_delta) @@ -1252,7 +1236,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT3U(old_state->arcs_size, >=, from_delta); atomic_add_64(&old_state->arcs_size, -from_delta); } - ab->b_state = new_state; + hdr->b_state = new_state; /* adjust l2arc hdr stats */ if (new_state == arc_l2c_only) @@ -1475,7 +1459,7 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits); } @@ -1673,7 +1657,7 @@ arc_buf_free(arc_buf_t *buf, void *tag) } else { ASSERT(buf == hdr->b_buf); ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_BUF_AVAILABLE; + hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } mutex_exit(hash_lock); } else if (HDR_IO_IN_PROGRESS(hdr)) { @@ -1725,7 +1709,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) } else if (no_callback) { ASSERT(hdr->b_buf == buf && buf->b_next == NULL); ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_BUF_AVAILABLE; + hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } ASSERT(no_callback || hdr->b_datacnt > 1 || refcount_is_zero(&hdr->b_refcnt)); @@ -1800,7 +1784,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, { arc_state_t *evicted_state; uint64_t bytes_evicted = 0, skipped = 0, missed = 0; - arc_buf_hdr_t *ab, *ab_prev = NULL; + arc_buf_hdr_t *hdr, *hdr_prev = NULL; list_t *list = &state->arcs_list[type]; kmutex_t *hash_lock; boolean_t have_lock; @@ -1816,24 +1800,24 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); - for (ab = list_tail(list); ab; ab = ab_prev) { - ab_prev = list_prev(list, ab); + for (hdr = list_tail(list); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(list, hdr); /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(ab) || - (spa && ab->b_spa != spa) || - (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && - ddi_get_lbolt() - 
ab->b_arc_access < + if (HDR_IO_IN_PROGRESS(hdr) || + (spa && hdr->b_spa != spa) || + (hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT) && + ddi_get_lbolt() - hdr->b_arc_access < zfs_arc_min_prefetch_lifespan)) { skipped++; continue; } /* "lookahead" for better eviction candidate */ - if (recycle && ab->b_size != bytes && - ab_prev && ab_prev->b_size == bytes) + if (recycle && hdr->b_size != bytes && + hdr_prev && hdr_prev->b_size == bytes) continue; /* ignore markers */ - if (ab->b_spa == 0) + if (hdr->b_spa == 0) continue; /* @@ -1846,34 +1830,34 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, * the hot code path, so don't sleep. */ if (!recycle && count++ > arc_evict_iterations) { - list_insert_after(list, ab, &marker); + list_insert_after(list, hdr, &marker); mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&state->arcs_mtx); kpreempt(KPREEMPT_SYNC); mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); - ab_prev = list_prev(list, &marker); + hdr_prev = list_prev(list, &marker); list_remove(list, &marker); count = 0; continue; } - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); have_lock = MUTEX_HELD(hash_lock); if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT0(refcount_count(&ab->b_refcnt)); - ASSERT(ab->b_datacnt > 0); - while (ab->b_buf) { - arc_buf_t *buf = ab->b_buf; + ASSERT0(refcount_count(&hdr->b_refcnt)); + ASSERT(hdr->b_datacnt > 0); + while (hdr->b_buf) { + arc_buf_t *buf = hdr->b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { missed += 1; break; } if (buf->b_data) { - bytes_evicted += ab->b_size; - if (recycle && ab->b_type == type && - ab->b_size == bytes && - !HDR_L2_WRITING(ab)) { + bytes_evicted += hdr->b_size; + if (recycle && hdr->b_type == type && + hdr->b_size == bytes && + !HDR_L2_WRITING(hdr)) { stolen = buf->b_data; recycle = FALSE; } @@ -1882,7 +1866,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, mutex_enter(&arc_eviction_mtx); arc_buf_destroy(buf, buf->b_data == stolen, FALSE); - ab->b_buf = buf->b_next; + hdr->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; @@ -1895,26 +1879,26 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, } } - if (ab->b_l2hdr) { + if (hdr->b_l2hdr) { ARCSTAT_INCR(arcstat_evict_l2_cached, - ab->b_size); + hdr->b_size); } else { - if (l2arc_write_eligible(ab->b_spa, ab)) { + if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, - ab->b_size); + hdr->b_size); } else { ARCSTAT_INCR( arcstat_evict_l2_ineligible, - ab->b_size); + hdr->b_size); } } - if (ab->b_datacnt == 0) { - arc_change_state(evicted_state, ab, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(ab)); - ab->b_flags |= ARC_IN_HASH_TABLE; - ab->b_flags &= ~ARC_BUF_AVAILABLE; - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + if (hdr->b_datacnt == 0) { + arc_change_state(evicted_state, hdr, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } if (!have_lock) mutex_exit(hash_lock); @@ -1965,7 +1949,7 @@ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { - arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t *hdr, *hdr_prev; arc_buf_hdr_t marker; list_t *list = &state->arcs_list[type]; kmutex_t *hash_lock; @@ -1977,18 +1961,18 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, 
bzero(&marker, sizeof (marker)); top: mutex_enter(&state->arcs_mtx); - for (ab = list_tail(list); ab; ab = ab_prev) { - ab_prev = list_prev(list, ab); - if (ab->b_type > ARC_BUFC_NUMTYPES) - panic("invalid ab=%p", (void *)ab); - if (spa && ab->b_spa != spa) + for (hdr = list_tail(list); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(list, hdr); + if (hdr->b_type > ARC_BUFC_NUMTYPES) + panic("invalid hdr=%p", (void *)hdr); + if (spa && hdr->b_spa != spa) continue; /* ignore markers */ - if (ab->b_spa == 0) + if (hdr->b_spa == 0) continue; - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); /* caller may be trying to modify this buffer, skip it */ if (MUTEX_HELD(hash_lock)) continue; @@ -2000,35 +1984,35 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, * before reacquiring the lock. */ if (count++ > arc_evict_iterations) { - list_insert_after(list, ab, &marker); + list_insert_after(list, hdr, &marker); mutex_exit(&state->arcs_mtx); kpreempt(KPREEMPT_SYNC); mutex_enter(&state->arcs_mtx); - ab_prev = list_prev(list, &marker); + hdr_prev = list_prev(list, &marker); list_remove(list, &marker); count = 0; continue; } if (mutex_tryenter(hash_lock)) { - ASSERT(!HDR_IO_IN_PROGRESS(ab)); - ASSERT(ab->b_buf == NULL); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(hdr->b_buf == NULL); ARCSTAT_BUMP(arcstat_deleted); - bytes_deleted += ab->b_size; + bytes_deleted += hdr->b_size; - if (ab->b_l2hdr != NULL) { + if (hdr->b_l2hdr != NULL) { /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ - arc_change_state(arc_l2c_only, ab, hash_lock); + arc_change_state(arc_l2c_only, hdr, hash_lock); mutex_exit(hash_lock); } else { - arc_change_state(arc_anon, ab, hash_lock); + arc_change_state(arc_anon, hdr, hash_lock); mutex_exit(hash_lock); - arc_hdr_destroy(ab); + arc_hdr_destroy(hdr); } - DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (bytes >= 0 && bytes_deleted >= bytes) break; } else if (bytes < 0) { @@ -2037,12 +2021,12 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, * hash lock to become available. Once its * available, restart from where we left off. */ - list_insert_after(list, ab, &marker); + list_insert_after(list, hdr, &marker); mutex_exit(&state->arcs_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); mutex_enter(&state->arcs_mtx); - ab_prev = list_prev(list, &marker); + hdr_prev = list_prev(list, &marker); list_remove(list, &marker); } else { bufs_skipped += 1; @@ -2747,7 +2731,8 @@ arc_get_data_buf(arc_buf_t *buf) * will end up on the mru list; so steal space from there. */ if (state == arc_mfu_ghost) - state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; + state = buf->b_hdr->b_flags & ARC_FLAG_PREFETCH ? + arc_mru : arc_mfu; else if (state == arc_mru_ghost) state = arc_mru; @@ -2834,25 +2819,25 @@ arc_get_data_buf(arc_buf_t *buf) * NOTE: the hash lock is dropped in this function. */ static void -arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) +arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { clock_t now; ASSERT(MUTEX_HELD(hash_lock)); - if (buf->b_state == arc_anon) { + if (hdr->b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. 
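		 * (Anonymous buffers are not yet in the hash table, so
		 * the first access after entering the cache always lands
		 * on the MRU list, never directly on the MFU list.)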
*/ - ASSERT(buf->b_arc_access == 0); - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); - arc_change_state(arc_mru, buf, hash_lock); + ASSERT(hdr->b_arc_access == 0); + hdr->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mru, hdr, hash_lock); - } else if (buf->b_state == arc_mru) { + } else if (hdr->b_state == arc_mru) { now = ddi_get_lbolt(); /* @@ -2863,15 +2848,15 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ - if ((buf->b_flags & ARC_PREFETCH) != 0) { - if (refcount_count(&buf->b_refcnt) == 0) { - ASSERT(list_link_active(&buf->b_arc_node)); + if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) { + if (refcount_count(&hdr->b_refcnt) == 0) { + ASSERT(list_link_active(&hdr->b_arc_node)); } else { - buf->b_flags &= ~ARC_PREFETCH; - atomic_inc_32(&buf->b_mru_hits); + hdr->b_flags &= ~ARC_FLAG_PREFETCH; + atomic_inc_32(&hdr->b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } - buf->b_arc_access = now; + hdr->b_arc_access = now; return; } @@ -2880,19 +2865,19 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * but it is still in the cache. Move it to the MFU * state. */ - if (ddi_time_after(now, buf->b_arc_access + ARC_MINTIME)) { + if (ddi_time_after(now, hdr->b_arc_access + ARC_MINTIME)) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ - buf->b_arc_access = now; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(arc_mfu, buf, hash_lock); + hdr->b_arc_access = now; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr, hash_lock); } - atomic_inc_32(&buf->b_mru_hits); + atomic_inc_32(&hdr->b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); - } else if (buf->b_state == arc_mru_ghost) { + } else if (hdr->b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but @@ -2900,22 +2885,22 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * MFU state. */ - if (buf->b_flags & ARC_PREFETCH) { + if (hdr->b_flags & ARC_FLAG_PREFETCH) { new_state = arc_mru; - if (refcount_count(&buf->b_refcnt) > 0) - buf->b_flags &= ~ARC_PREFETCH; - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + if (refcount_count(&hdr->b_refcnt) > 0) + hdr->b_flags &= ~ARC_FLAG_PREFETCH; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } - buf->b_arc_access = ddi_get_lbolt(); - arc_change_state(new_state, buf, hash_lock); + hdr->b_arc_access = ddi_get_lbolt(); + arc_change_state(new_state, hdr, hash_lock); - atomic_inc_32(&buf->b_mru_ghost_hits); + atomic_inc_32(&hdr->b_mru_ghost_hits); ARCSTAT_BUMP(arcstat_mru_ghost_hits); - } else if (buf->b_state == arc_mfu) { + } else if (hdr->b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. @@ -2925,14 +2910,14 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * If it was a prefetch, we will explicitly move it to * the head of the list now. 
*/ - if ((buf->b_flags & ARC_PREFETCH) != 0) { - ASSERT(refcount_count(&buf->b_refcnt) == 0); - ASSERT(list_link_active(&buf->b_arc_node)); + if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) { + ASSERT(refcount_count(&hdr->b_refcnt) == 0); + ASSERT(list_link_active(&hdr->b_arc_node)); } - atomic_inc_32(&buf->b_mfu_hits); + atomic_inc_32(&hdr->b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); - buf->b_arc_access = ddi_get_lbolt(); - } else if (buf->b_state == arc_mfu_ghost) { + hdr->b_arc_access = ddi_get_lbolt(); + } else if (hdr->b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has @@ -2940,31 +2925,31 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * MFU state. */ - if (buf->b_flags & ARC_PREFETCH) { + if (hdr->b_flags & ARC_FLAG_PREFETCH) { /* * This is a prefetch access... * move this block back to the MRU state. */ - ASSERT0(refcount_count(&buf->b_refcnt)); + ASSERT0(refcount_count(&hdr->b_refcnt)); new_state = arc_mru; } - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(new_state, buf, hash_lock); + hdr->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(new_state, hdr, hash_lock); - atomic_inc_32(&buf->b_mfu_ghost_hits); + atomic_inc_32(&hdr->b_mfu_ghost_hits); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); - } else if (buf->b_state == arc_l2c_only) { + } else if (hdr->b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. */ - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(arc_mfu, buf, hash_lock); + hdr->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr, hash_lock); } else { - cmn_err(CE_PANIC, "invalid arc state 0x%p", buf->b_state); + cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_state); } } @@ -3032,9 +3017,9 @@ arc_read_done(zio_t *zio) (found == hdr && HDR_L2_READING(hdr))); } - hdr->b_flags &= ~ARC_L2_EVICTED; - if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) - hdr->b_flags &= ~ARC_L2CACHE; + hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; + if (l2arc_noprefetch && (hdr->b_flags & ARC_FLAG_PREFETCH)) + hdr->b_flags &= ~ARC_FLAG_L2CACHE; /* byteswap if necessary */ callback_list = hdr->b_acb; @@ -3079,18 +3064,18 @@ arc_read_done(zio_t *zio) } } hdr->b_acb = NULL; - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; ASSERT(!HDR_BUF_AVAILABLE(hdr)); if (abuf == buf) { ASSERT(buf->b_efunc == NULL); ASSERT(hdr->b_datacnt == 1); - hdr->b_flags |= ARC_BUF_AVAILABLE; + hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); if (zio->io_error != 0) { - hdr->b_flags |= ARC_IO_ERROR; + hdr->b_flags |= ARC_FLAG_IO_ERROR; if (hdr->b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) @@ -3156,8 +3141,8 @@ arc_read_done(zio_t *zio) */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, - void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags, - const zbookmark_phys_t *zb) + void *private, zio_priority_t priority, int zio_flags, + arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; arc_buf_t *buf = NULL; @@ -3180,16 +3165,16 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (hdr != NULL && hdr->b_datacnt > 0) { - *arc_flags |= ARC_CACHED; + 
*arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { - if (*arc_flags & ARC_WAIT) { + if (*arc_flags & ARC_FLAG_WAIT) { cv_wait(&hdr->b_cv, hash_lock); mutex_exit(hash_lock); goto top; } - ASSERT(*arc_flags & ARC_NOWAIT); + ASSERT(*arc_flags & ARC_FLAG_NOWAIT); if (done) { arc_callback_t *acb = NULL; @@ -3227,24 +3212,24 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ASSERT(buf->b_data); if (HDR_BUF_AVAILABLE(hdr)) { ASSERT(buf->b_efunc == NULL); - hdr->b_flags &= ~ARC_BUF_AVAILABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; } else { buf = arc_buf_clone(buf); } - } else if (*arc_flags & ARC_PREFETCH && + } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_refcnt) == 0) { - hdr->b_flags |= ARC_PREFETCH; + hdr->b_flags |= ARC_FLAG_PREFETCH; } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; - if (*arc_flags & ARC_L2COMPRESS) - hdr->b_flags |= ARC_L2COMPRESS; + if (*arc_flags & ARC_FLAG_L2CACHE) + hdr->b_flags |= ARC_FLAG_L2CACHE; + if (*arc_flags & ARC_FLAG_L2COMPRESS) + hdr->b_flags |= ARC_FLAG_L2COMPRESS; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits); @@ -3294,18 +3279,19 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, (void) arc_buf_remove_ref(buf, private); goto top; /* restart the IO request */ } + /* if this is a prefetch, we don't have a reference */ - if (*arc_flags & ARC_PREFETCH) { + if (*arc_flags & ARC_FLAG_PREFETCH) { (void) remove_reference(hdr, hash_lock, private); - hdr->b_flags |= ARC_PREFETCH; + hdr->b_flags |= ARC_FLAG_PREFETCH; } - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; - if (*arc_flags & ARC_L2COMPRESS) - hdr->b_flags |= ARC_L2COMPRESS; + if (*arc_flags & ARC_FLAG_L2CACHE) + hdr->b_flags |= ARC_FLAG_L2CACHE; + if (*arc_flags & ARC_FLAG_L2COMPRESS) + hdr->b_flags |= ARC_FLAG_L2COMPRESS; if (BP_GET_LEVEL(bp) > 0) - hdr->b_flags |= ARC_INDIRECT; + hdr->b_flags |= ARC_FLAG_INDIRECT; } else { /* this block is in the ghost cache */ ASSERT(GHOST_STATE(hdr->b_state)); @@ -3314,14 +3300,14 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ASSERT(hdr->b_buf == NULL); /* if this is a prefetch, we don't have a reference */ - if (*arc_flags & ARC_PREFETCH) - hdr->b_flags |= ARC_PREFETCH; + if (*arc_flags & ARC_FLAG_PREFETCH) + hdr->b_flags |= ARC_FLAG_PREFETCH; else add_reference(hdr, hash_lock, private); - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; - if (*arc_flags & ARC_L2COMPRESS) - hdr->b_flags |= ARC_L2COMPRESS; + if (*arc_flags & ARC_FLAG_L2CACHE) + hdr->b_flags |= ARC_FLAG_L2CACHE; + if (*arc_flags & ARC_FLAG_L2COMPRESS) + hdr->b_flags |= ARC_FLAG_L2COMPRESS; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; @@ -3343,7 +3329,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ASSERT(hdr->b_acb == NULL); hdr->b_acb = acb; - hdr->b_flags |= ARC_IO_IN_PROGRESS; + hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; if (hdr->b_l2hdr != NULL && (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { @@ -3370,7 +3356,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, size, zbookmark_phys_t *, zb); 
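	/*
	 * Account for the miss, classified as demand vs. prefetch and
	 * data vs. metadata by the ARCSTAT_CONDSTAT below.
	 */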
ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, misses); @@ -3433,12 +3419,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); - if (*arc_flags & ARC_NOWAIT) { + if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); goto out; } - ASSERT(*arc_flags & ARC_WAIT); + ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) goto out; @@ -3464,12 +3450,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); - if (*arc_flags & ARC_WAIT) { + if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); goto out; } - ASSERT(*arc_flags & ARC_NOWAIT); + ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } @@ -3540,7 +3526,7 @@ arc_freed(spa_t *spa, const blkptr_t *bp) if (HDR_BUF_AVAILABLE(hdr)) { arc_buf_t *buf = hdr->b_buf; add_reference(hdr, hash_lock, FTAG); - hdr->b_flags &= ~ARC_BUF_AVAILABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; mutex_exit(hash_lock); arc_release(buf, FTAG); @@ -3607,7 +3593,7 @@ arc_clear_callback(arc_buf_t *buf) arc_buf_destroy(buf, FALSE, TRUE); } else { ASSERT(buf == hdr->b_buf); - hdr->b_flags |= ARC_BUF_AVAILABLE; + hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; mutex_exit(&buf->b_evict_lock); } @@ -3718,7 +3704,7 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_mfu_hits = 0; nhdr->b_mfu_ghost_hits = 0; nhdr->b_l2_hits = 0; - nhdr->b_flags = flags & ARC_L2_WRITING; + nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; nhdr->b_l2hdr = NULL; nhdr->b_datacnt = 1; nhdr->b_freeze_cksum = NULL; @@ -3809,7 +3795,7 @@ arc_write_ready(zio_t *zio) mutex_exit(&hdr->b_freeze_lock); } arc_cksum_compute(buf, B_FALSE); - hdr->b_flags |= ARC_IO_IN_PROGRESS; + hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; } /* @@ -3890,13 +3876,13 @@ arc_write_done(zio_t *zio) ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; /* if it's not anon, we are doing a scrub */ if (!exists && hdr->b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; } ASSERT(!refcount_is_zero(&hdr->b_refcnt)); @@ -3919,12 +3905,12 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(ready != NULL); ASSERT(done != NULL); ASSERT(!HDR_IO_ERROR(hdr)); - ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); + ASSERT((hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) == 0); ASSERT(hdr->b_acb == NULL); if (l2arc) - hdr->b_flags |= ARC_L2CACHE; + hdr->b_flags |= ARC_FLAG_L2CACHE; if (l2arc_compress) - hdr->b_flags |= ARC_L2COMPRESS; + hdr->b_flags |= ARC_FLAG_L2COMPRESS; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_physdone = physdone; @@ -4414,7 +4400,7 @@ arc_fini(void) */ static boolean_t -l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) +l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: @@ -4423,8 +4409,8 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). 
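 * The compound test below checks these four conditions in the same
 * order.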
*/ - if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || - HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) + if (hdr->b_spa != spa_guid || hdr->b_l2hdr != NULL || + HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); return (B_TRUE); @@ -4585,7 +4571,7 @@ l2arc_write_done(zio_t *zio) l2arc_write_callback_t *cb; l2arc_dev_t *dev; list_t *buflist; - arc_buf_hdr_t *head, *ab, *ab_prev; + arc_buf_hdr_t *head, *hdr, *hdr_prev; l2arc_buf_hdr_t *abl2; kmutex_t *hash_lock; int64_t bytes_dropped = 0; @@ -4609,17 +4595,17 @@ l2arc_write_done(zio_t *zio) /* * All writes completed, or an error was hit. */ - for (ab = list_prev(buflist, head); ab; ab = ab_prev) { - ab_prev = list_prev(buflist, ab); - abl2 = ab->b_l2hdr; + for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(buflist, hdr); + abl2 = hdr->b_l2hdr; /* * Release the temporary compressed buffer as soon as possible. */ if (abl2->b_compress != ZIO_COMPRESS_OFF) - l2arc_release_cdata_buf(ab); + l2arc_release_cdata_buf(hdr); - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * This buffer misses out. It may be in a stage @@ -4634,19 +4620,19 @@ l2arc_write_done(zio_t *zio) /* * Error - drop L2ARC entry. */ - list_remove(buflist, ab); + list_remove(buflist, hdr); ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); bytes_dropped += abl2->b_asize; - ab->b_l2hdr = NULL; + hdr->b_l2hdr = NULL; kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); } /* * Allow ARC to begin reads to this L2ARC entry. */ - ab->b_flags &= ~ARC_L2_WRITING; + hdr->b_flags &= ~ARC_FLAG_L2_WRITING; mutex_exit(hash_lock); } @@ -4793,7 +4779,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; l2arc_buf_hdr_t *abl2; - arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; int64_t bytes_evicted = 0; @@ -4825,10 +4811,10 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) top: mutex_enter(&l2arc_buflist_mtx); - for (ab = list_tail(buflist); ab; ab = ab_prev) { - ab_prev = list_prev(buflist, ab); + for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(buflist, hdr); - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. @@ -4840,19 +4826,19 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) goto top; } - if (HDR_L2_WRITE_HEAD(ab)) { + if (HDR_L2_WRITE_HEAD(hdr)) { /* * We hit a write head node. Leave it for * l2arc_write_done(). */ - list_remove(buflist, ab); + list_remove(buflist, hdr); mutex_exit(hash_lock); continue; } - if (!all && ab->b_l2hdr != NULL && - (ab->b_l2hdr->b_daddr > taddr || - ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { + if (!all && hdr->b_l2hdr != NULL && + (hdr->b_l2hdr->b_daddr > taddr || + hdr->b_l2hdr->b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. @@ -4861,7 +4847,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) break; } - if (HDR_FREE_IN_PROGRESS(ab)) { + if (HDR_FREE_IN_PROGRESS(hdr)) { /* * Already on the path to destruction. 
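			 * The header is being freed elsewhere; the
			 * eviction pass can simply leave it alone.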
*/ @@ -4869,45 +4855,45 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) continue; } - if (ab->b_state == arc_l2c_only) { - ASSERT(!HDR_L2_READING(ab)); + if (hdr->b_state == arc_l2c_only) { + ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_size. */ - arc_change_state(arc_anon, ab, hash_lock); - arc_hdr_destroy(ab); + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); } else { /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ - if (HDR_L2_READING(ab)) { + if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); - ab->b_flags |= ARC_L2_EVICTED; + hdr->b_flags |= ARC_FLAG_L2_EVICTED; } /* * Tell ARC this no longer exists in L2ARC. */ - if (ab->b_l2hdr != NULL) { - abl2 = ab->b_l2hdr; + if (hdr->b_l2hdr != NULL) { + abl2 = hdr->b_l2hdr; ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); bytes_evicted += abl2->b_asize; - ab->b_l2hdr = NULL; + hdr->b_l2hdr = NULL; kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); } - list_remove(buflist, ab); + list_remove(buflist, hdr); /* * This may have been leftover after a * failed write. */ - ab->b_flags &= ~ARC_L2_WRITING; + hdr->b_flags &= ~ARC_FLAG_L2_WRITING; } mutex_exit(hash_lock); } @@ -4920,7 +4906,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) /* * Find and write ARC buffers to the L2ARC device. * - * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid + * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. @@ -4932,7 +4918,7 @@ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, boolean_t *headroom_boost) { - arc_buf_hdr_t *ab, *ab_prev, *head; + arc_buf_hdr_t *hdr, *hdr_prev, *head; list_t *list; uint64_t write_asize, write_psize, write_sz, headroom, buf_compress_minsz; @@ -4954,7 +4940,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, write_sz = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); - head->b_flags |= ARC_L2_WRITE_HEAD; + head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; /* * We will want to try to compress buffers that are at least 2x the @@ -4978,25 +4964,25 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) - ab = list_head(list); + hdr = list_head(list); else - ab = list_tail(list); + hdr = list_tail(list); headroom = target_sz * l2arc_headroom; if (do_headroom_boost) headroom = (headroom * l2arc_headroom_boost) / 100; - for (; ab; ab = ab_prev) { + for (; hdr; hdr = hdr_prev) { l2arc_buf_hdr_t *l2hdr; kmutex_t *hash_lock; uint64_t buf_sz; if (arc_warm == B_FALSE) - ab_prev = list_next(list, ab); + hdr_prev = list_next(list, hdr); else - ab_prev = list_prev(list, ab); + hdr_prev = list_prev(list, hdr); - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * Skip this buffer rather than waiting. 
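			 * A missed tryenter just means the header is
			 * busy; it stays in the ARC and remains a
			 * candidate for a later write pass.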
@@ -5004,7 +4990,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - passed_sz += ab->b_size; + passed_sz += hdr->b_size; if (passed_sz > headroom) { /* * Searched too far. @@ -5013,12 +4999,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, break; } - if (!l2arc_write_eligible(guid, ab)) { + if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } - if ((write_sz + ab->b_size) > target_sz) { + if ((write_sz + hdr->b_size) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; @@ -5048,32 +5034,32 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, l2hdr->b_dev = dev; arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS); - ab->b_flags |= ARC_L2_WRITING; + hdr->b_flags |= ARC_FLAG_L2_WRITING; /* * Temporarily stash the data buffer in b_tmp_cdata. * The subsequent write step will pick it up from - * there. This is because can't access ab->b_buf + * there. This is because can't access hdr->b_buf * without holding the hash_lock, which we in turn * can't access without holding the ARC list locks * (which we want to avoid during compression/writing) */ l2hdr->b_compress = ZIO_COMPRESS_OFF; - l2hdr->b_asize = ab->b_size; - l2hdr->b_tmp_cdata = ab->b_buf->b_data; + l2hdr->b_asize = hdr->b_size; + l2hdr->b_tmp_cdata = hdr->b_buf->b_data; l2hdr->b_hits = 0; - buf_sz = ab->b_size; - ab->b_l2hdr = l2hdr; + buf_sz = hdr->b_size; + hdr->b_l2hdr = l2hdr; - list_insert_head(dev->l2ad_buflist, ab); + list_insert_head(dev->l2ad_buflist, hdr); /* * Compute and store the buffer cksum before * writing. On debug the cksum is verified first. */ - arc_cksum_verify(ab->b_buf); - arc_cksum_compute(ab->b_buf, B_TRUE); + arc_cksum_verify(hdr->b_buf); + arc_cksum_compute(hdr->b_buf, B_TRUE); mutex_exit(hash_lock); @@ -5099,21 +5085,22 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * and work backwards, retracing the course of the buffer selector * loop above. */ - for (ab = list_prev(dev->l2ad_buflist, head); ab; - ab = list_prev(dev->l2ad_buflist, ab)) { + for (hdr = list_prev(dev->l2ad_buflist, head); hdr; + hdr = list_prev(dev->l2ad_buflist, hdr)) { l2arc_buf_hdr_t *l2hdr; uint64_t buf_sz; /* * We shouldn't need to lock the buffer here, since we flagged - * it as ARC_L2_WRITING in the previous step, but we must take - * care to only access its L2 cache parameters. In particular, - * ab->b_buf may be invalid by now due to ARC eviction. + * it as ARC_FLAG_L2_WRITING in the previous step, but we must + * take care to only access its L2 cache parameters. In + * particular, hdr->b_buf may be invalid by now due to + * ARC eviction. */ - l2hdr = ab->b_l2hdr; + l2hdr = hdr->b_l2hdr; l2hdr->b_daddr = dev->l2ad_hand; - if (!l2arc_nocompress && (ab->b_flags & ARC_L2COMPRESS) && + if (!l2arc_nocompress && (hdr->b_flags & ARC_FLAG_L2COMPRESS) && l2hdr->b_asize >= buf_compress_minsz) { if (l2arc_compress_buf(l2hdr)) { /* @@ -5329,9 +5316,9 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) * done, we can dispose of it. */ static void -l2arc_release_cdata_buf(arc_buf_hdr_t *ab) +l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) { - l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr; + l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) { /* @@ -5339,7 +5326,7 @@ l2arc_release_cdata_buf(arc_buf_hdr_t *ab) * temporary buffer for it, so now we need to release it. 
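	 * (When no compression was applied, b_tmp_cdata merely aliases
	 * the ARC buffer's own b_data and must not be freed here.)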
*/ ASSERT(l2hdr->b_tmp_cdata != NULL); - abd_free(l2hdr->b_tmp_cdata, ab->b_size); + abd_free(l2hdr->b_tmp_cdata, hdr->b_size); } l2hdr->b_tmp_cdata = NULL; } diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 170e7b996469..6df9d94307ff 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -592,7 +592,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { dnode_t *dn; zbookmark_phys_t zb; - uint32_t aflags = ARC_NOWAIT; + uint32_t aflags = ARC_FLAG_NOWAIT; int err; DB_DNODE_ENTER(db); @@ -648,9 +648,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) - aflags |= ARC_L2CACHE; + aflags |= ARC_FLAG_L2CACHE; if (DBUF_IS_L2COMPRESSIBLE(db)) - aflags |= ARC_L2COMPRESS; + aflags |= ARC_FLAG_L2COMPRESS; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, @@ -662,7 +662,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); - if (aflags & ARC_CACHED) + if (aflags & ARC_FLAG_CACHED) *flags |= DB_RF_CACHED; return (SET_ERROR(err)); @@ -1949,7 +1949,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) { if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + arc_flags_t aflags = + ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index c7cbf55bdde6..32838bbb63db 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -129,7 +129,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } else if (zb->zb_level == 0) { dnode_phys_t *blk; arc_buf_t *abuf; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; int blksz = BP_GET_LSIZE(bp); int i; diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index ad8082d3dfef..a9afe5772d83 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -284,15 +284,15 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_spa = spa; os->os_rootbp = bp; if (!BP_IS_HOLE(os->os_rootbp)) { - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (DMU_OS_IS_L2CACHEABLE(os)) - aflags |= ARC_L2CACHE; + aflags |= ARC_FLAG_L2CACHE; if (DMU_OS_IS_L2COMPRESSIBLE(os)) - aflags |= ARC_L2COMPRESS; + aflags |= ARC_FLAG_L2COMPRESS; dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 4f3c07fd5ac2..4e4a0c379db1 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -491,7 +491,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, dnode_phys_t *blk; int i; int blksz = BP_GET_LSIZE(bp); - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, @@ -509,7 +509,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } (void) arc_buf_remove_ref(abuf, &abuf); } else if (type == DMU_OT_SA) { - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); @@ -527,7 +527,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, err = dump_write_embedded(dsp, zb->zb_object, zb->zb_blkid * blksz, blksz, bp); } else { /* it's a level-0 block of a regular object */ - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); void *buf; diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 5b053aa4d5a8..2989c1283da7 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -177,7 +177,7 @@ static void traverse_prefetch_metadata(traverse_data_t *td, const blkptr_t *bp, const zbookmark_phys_t *zb) { - uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) return; @@ -278,7 +278,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } if (BP_GET_LEVEL(bp) > 0) { - uint32_t flags = ARC_WAIT; + uint32_t flags = ARC_FLAG_WAIT; int32_t i; int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; zbookmark_phys_t *czb; @@ -312,7 +312,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, kmem_free(czb, sizeof (zbookmark_phys_t)); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - uint32_t flags = ARC_WAIT; + uint32_t flags = ARC_FLAG_WAIT; int32_t i; int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; @@ -335,7 +335,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, break; } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; dnode_phys_t *dnp; @@ -451,7 +451,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { prefetch_data_t *pfd = arg; - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; ASSERT(pfd->pd_blks_fetched >= 0); if (pfd->pd_cancel) @@ -542,7 +542,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, /* See comment on ZIL traversal in dsl_scan_visitds. 
*/ if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 34233b801b01..7938ef2b5524 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -571,7 +571,7 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, uint64_t objset, uint64_t object, uint64_t blkid) { zbookmark_phys_t czb; - uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; if (zfs_no_scrub_prefetch) return; @@ -636,7 +636,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, int err; if (BP_GET_LEVEL(bp) > 0) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; @@ -665,7 +665,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, } (void) arc_buf_remove_ref(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; int i, j; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; @@ -693,7 +693,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, (void) arc_buf_remove_ref(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c index 3e39dba2c2e6..2b8559b5d276 100644 --- a/module/zfs/spa_stats.c +++ b/module/zfs/spa_stats.c @@ -200,7 +200,7 @@ spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) if (zfs_read_history == 0 && ssh->size == 0) return; - if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED)) + if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED)) return; srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index d2ae8336c536..c363e7fae87e 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -204,7 +204,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, char **end) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; @@ -277,7 +277,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 87342090a0dc..ac88c2642cc4 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2212,7 +2212,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; From b0258ca7f2d3f1f312a320a23f06fe948f4c2a36 Mon Sep 17 00:00:00 2001 From: Chris Williamson Date: Mon, 29 Dec 2014 19:12:23 -0800 Subject: [PATCH 21/26] Illumos - 5408 managing ZFS cache devices requires lots of RAM 5408 managing ZFS cache devices requires lots of RAM Reviewed by: Christopher Siden Reviewed by: George Wilson Reviewed by: Matthew Ahrens Reviewed by: Don Brady Reviewed by: Josef 'Jeff' Sipek Approved by: Garrett D'Amore Ported 
by: Tim Chase

Porting notes:

Due to the restructuring of the ARC-related structures, this patch
conflicts with at least the following existing ZoL commits:

	6e1d7276c94cbd7c2e19f9232f6ba4bafa62dbe0
	Fix inaccurate arcstat_l2_hdr_size calculations

	The ARC_SPACE_HDRS constant no longer exists and has been
	somewhat equivalently replaced by HDR_L2ONLY_SIZE.

	e0b0ca983d6897bcddf05af2c0e5d01ff66f90db
	Add visibility in to cached dbufs

	The new layering of l{1,2}arc_buf_hdr_t within the arc_buf_hdr
	struct requires additional structure member names to be used
	when referencing the inner items.  Also, the presence of the L1
	or L2 inner member is indicated by flags using the new
	HDR_HAS_L{1,2}HDR macros.

Conflicts caused by #2129, alone or in combination with these changes,
were resolved as follows.

In cmd/ztest/ztest.c @ztest_dmu_read_write_zcopy:

	abd_copy_from_buf(bigbuf_arcbufs[j]->b_data,
	    (caddr_t)bigbuf + (off - bigoff), chunksize);

stays; only the if-statement above it is expanded to:

	if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {

In module/zfs/arc.c @l2arc_decompress_zio:

	bzero(hdr->b_buf->b_data, hdr->b_size);
to
	abd_zero(hdr->b_buf->b_data, hdr->b_size);

moved:

	struct l2arc_buf_hdr {
to
	typedef struct l2arc_buf_hdr {

keeping in mind for later:

	/* temporary buffer holder for in-flight compressed data */
	abd_t *b_tmp_cdata;

@l2arc_release_cdata_buf(arc_buf_hdr_t *hdr):

	abd_free(l2hdr->b_tmp_cdata, hdr->b_size);
to
	zio_data_buf_free(l2hdr->b_tmp_cdata, hdr->b_size);
to
	abd_free(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size);

@l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c):

	bzero(hdr->b_buf->b_data, hdr->b_size);
to
	bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);

	abd_zero(hdr->b_buf->b_data, hdr->b_size);
to
	abd_zero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);

and on the line below that:

	zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
to
	zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;

Merge conflict in @arc_cksum_equal(arc_buf_t *buf):

	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
to
	abd_fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	(no change)

@arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock):

	cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_state);
to
	cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state);

(+ fixing lines over 80 columns in include/sys/arc_impl.h)

	void *b_tmp_cdata;
to
	abd_t *b_tmp_cdata;
---
 cmd/ztest/ztest.c       |    9 +-
 include/sys/arc.h       |   27 +-
 include/sys/arc_impl.h  |   95 ++-
 include/sys/trace_arc.h |   54 +-
 module/zfs/arc.c        | 1398 +++++++++++++++++++++++----------------
 5 files changed, 949 insertions(+), 634 deletions(-)

diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 2a0e15765622..fdc42f4433ed 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -4057,7 +4057,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
	 * assign an arcbuf to a dbuf.
*/ for (j = 0; j < s; j++) { - if (i != 5) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { bigbuf_arcbufs[j] = dmu_request_arcbuf(bonus_db, chunksize); } else { @@ -4081,7 +4081,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); for (j = 0; j < s; j++) { - if (i != 5) { + if (i != 5 || + chunksize < (SPA_MINBLOCKSIZE * 2)) { dmu_return_arcbuf(bigbuf_arcbufs[j]); } else { dmu_return_arcbuf( @@ -4126,7 +4127,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) } for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; - if (i != 5) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { abd_copy_from_buf(bigbuf_arcbufs[j]->b_data, (caddr_t)bigbuf + (off - bigoff), chunksize); @@ -4146,7 +4147,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) VERIFY(dmu_buf_hold(os, bigobj, off, FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); } - if (i != 5) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { dmu_assign_arcbuf(bonus_db, off, bigbuf_arcbufs[j], tx); } else { diff --git a/include/sys/arc.h b/include/sys/arc.h index 04f3dc3ace8e..8802450bf88e 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -82,10 +82,29 @@ typedef enum arc_flags ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */ ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */ ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */ - ARC_FLAG_FREE_IN_PROGRESS = 1 << 13, /* about to be freed */ - ARC_FLAG_L2_WRITING = 1 << 14, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 15, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 16, /* head of write list */ + ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */ + /* indicates that the buffer contains metadata (otherwise, data) */ + ARC_FLAG_BUFC_METADATA = 1 << 16, + + /* Flags specifying whether optional hdr struct fields are defined */ + ARC_FLAG_HAS_L1HDR = 1 << 17, + ARC_FLAG_HAS_L2HDR = 1 << 18, + + /* + * The arc buffer's compression mode is stored in the top 7 bits of the + * flags field, so these dummy flags are included so that MDB can + * interpret the enum properly. 
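+	 *
+	 * A rough sketch of the encoding, using the same BF32_GET/BF32_SET
+	 * bitfield helpers that back the HDR_GET_COMPRESS/HDR_SET_COMPRESS
+	 * macros in arc.c:
+	 *
+	 *	enum zio_compress c = BF32_GET(hdr->b_flags, 24, 7);
+	 *	BF32_SET(hdr->b_flags, 24, 7, ZIO_COMPRESS_LZ4);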
+ */ + ARC_FLAG_COMPRESS_0 = 1 << 24, + ARC_FLAG_COMPRESS_1 = 1 << 25, + ARC_FLAG_COMPRESS_2 = 1 << 26, + ARC_FLAG_COMPRESS_3 = 1 << 27, + ARC_FLAG_COMPRESS_4 = 1 << 28, + ARC_FLAG_COMPRESS_5 = 1 << 29, + ARC_FLAG_COMPRESS_6 = 1 << 30 + } arc_flags_t; struct arc_buf { diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 1f8351a6784b..6e29bfdd73b3 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -74,8 +74,6 @@ typedef struct arc_state { arc_state_type_t arcs_state; } arc_state_t; -typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; - typedef struct arc_callback arc_callback_t; struct arc_callback { @@ -96,27 +94,45 @@ struct arc_write_callback { arc_buf_t *awcb_buf; }; -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - uint64_t b_cksum0; - +/* + * ARC buffers are separated into multiple structs as a memory saving measure: + * - Common fields struct, always defined, and embedded within it: + * - L2-only fields, always allocated but undefined when not in L2ARC + * - L1-only fields, only allocated when in L1ARC + * + * Buffer in L1 Buffer only in L2 + * +------------------------+ +------------------------+ + * | arc_buf_hdr_t | | arc_buf_hdr_t | + * | | | | + * | | | | + * | | | | + * +------------------------+ +------------------------+ + * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | + * | (undefined if L1-only) | | | + * +------------------------+ +------------------------+ + * | l1arc_buf_hdr_t | + * | | + * | | + * | | + * | | + * +------------------------+ + * + * Because it's possible for the L2ARC to become extremely large, we can wind + * up eating a lot of memory in L2ARC buffer headers, so the size of a header + * is minimized by only allocating the fields necessary for an L1-cached buffer + * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and + * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple + * words in pointers. arc_hdr_realloc() is used to switch a header between + * these two allocation states. 
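+ *
+ * Concretely, because the L1-only fields sit at the end of arc_buf_hdr_t,
+ * both allocation sizes fall directly out of the layout (these are the
+ * HDR_FULL_SIZE and HDR_L2ONLY_SIZE constants in arc.c):
+ *
+ *	full (L1-cached) header:  sizeof (arc_buf_hdr_t)
+ *	L2-only header:           offsetof(arc_buf_hdr_t, b_l1hdr)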
+ */ +typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; - arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; - arc_flags_t b_flags; uint32_t b_datacnt; - - arc_callback_t *b_acb; + /* for waiting on writes to complete */ kcondvar_t b_cv; - /* immutable */ - arc_buf_contents_t b_type; - uint64_t b_size; - uint64_t b_spa; /* protected by arc state mutex */ arc_state_t *b_state; @@ -133,9 +149,10 @@ struct arc_buf_hdr { /* self protecting */ refcount_t b_refcnt; - l2arc_buf_hdr_t *b_l2hdr; - list_node_t b_l2node; -}; + arc_callback_t *b_acb; + /* temporary buffer holder for in-flight compressed data */ + abd_t *b_tmp_cdata; +} l1arc_buf_hdr_t; typedef struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ @@ -146,15 +163,51 @@ typedef struct l2arc_dev { uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ - list_t *l2ad_buflist; /* buffer list */ + kmutex_t l2ad_mtx; /* lock for buffer list */ + list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ } l2arc_dev_t; +typedef struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + /* real alloc'd buffer size depending on b_compress applied */ + uint32_t b_hits; + int32_t b_asize; + + list_node_t b_l2node; +} l2arc_buf_hdr_t; + typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ } l2arc_write_callback_t; +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + /* + * Even though this checksum is only set/verified when a buffer is in + * the L1 cache, it needs to be in the set of common fields because it + * must be preserved from the time before a buffer is written out to + * L2ARC until after it is read back in. + */ + zio_cksum_t *b_freeze_cksum; + + arc_buf_hdr_t *b_hash_next; + arc_flags_t b_flags; + + /* immutable */ + int32_t b_size; + uint64_t b_spa; + + /* L2ARC fields. Undefined when not in L2ARC. */ + l2arc_buf_hdr_t b_l2hdr; + /* L1ARC fields. 
Undefined when in l2arc_only state */ + l1arc_buf_hdr_t b_l1hdr; +}; #ifdef __cplusplus } #endif diff --git a/include/sys/trace_arc.h b/include/sys/trace_arc.h index 8b885eff73a7..b9df228eae33 100644 --- a/include/sys/trace_arc.h +++ b/include/sys/trace_arc.h @@ -45,7 +45,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, TP_STRUCT__entry( __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) - __field(uint64_t, hdr_cksum0) __field(uint32_t, hdr_flags) __field(uint32_t, hdr_datacnt) __field(arc_buf_contents_t, hdr_type) @@ -64,27 +63,25 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __entry->hdr_dva_word[0] = ab->b_dva.dva_word[0]; __entry->hdr_dva_word[1] = ab->b_dva.dva_word[1]; __entry->hdr_birth = ab->b_birth; - __entry->hdr_cksum0 = ab->b_cksum0; __entry->hdr_flags = ab->b_flags; - __entry->hdr_datacnt = ab->b_datacnt; - __entry->hdr_type = ab->b_type; + __entry->hdr_datacnt = ab->b_l1hdr.b_datacnt; __entry->hdr_size = ab->b_size; __entry->hdr_spa = ab->b_spa; - __entry->hdr_state_type = ab->b_state->arcs_state; - __entry->hdr_access = ab->b_arc_access; - __entry->hdr_mru_hits = ab->b_mru_hits; - __entry->hdr_mru_ghost_hits = ab->b_mru_ghost_hits; - __entry->hdr_mfu_hits = ab->b_mfu_hits; - __entry->hdr_mfu_ghost_hits = ab->b_mfu_ghost_hits; - __entry->hdr_l2_hits = ab->b_l2_hits; - __entry->hdr_refcount = ab->b_refcnt.rc_count; + __entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state; + __entry->hdr_access = ab->b_l1hdr.b_arc_access; + __entry->hdr_mru_hits = ab->b_l1hdr.b_mru_hits; + __entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits; + __entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits; + __entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits; + __entry->hdr_l2_hits = ab->b_l1hdr.b_l2_hits; + __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count; ), - TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx " + TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " "flags 0x%x datacnt %u type %u size %llu spa %llu " "state_type %u access %lu mru_hits %u mru_ghost_hits %u " "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], - __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags, + __entry->hdr_birth, __entry->hdr_flags, __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, @@ -261,7 +258,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, TP_STRUCT__entry( __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) - __field(uint64_t, hdr_cksum0) __field(uint32_t, hdr_flags) __field(uint32_t, hdr_datacnt) __field(arc_buf_contents_t, hdr_type) @@ -292,20 +288,18 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0]; __entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1]; __entry->hdr_birth = hdr->b_birth; - __entry->hdr_cksum0 = hdr->b_cksum0; __entry->hdr_flags = hdr->b_flags; - __entry->hdr_datacnt = hdr->b_datacnt; - __entry->hdr_type = hdr->b_type; + __entry->hdr_datacnt = hdr->b_l1hdr.b_datacnt; __entry->hdr_size = hdr->b_size; __entry->hdr_spa = hdr->b_spa; - __entry->hdr_state_type = hdr->b_state->arcs_state; - __entry->hdr_access = hdr->b_arc_access; - __entry->hdr_mru_hits = hdr->b_mru_hits; - __entry->hdr_mru_ghost_hits = hdr->b_mru_ghost_hits; - __entry->hdr_mfu_hits = hdr->b_mfu_hits; - __entry->hdr_mfu_ghost_hits = hdr->b_mfu_ghost_hits; - __entry->hdr_l2_hits = hdr->b_l2_hits; - __entry->hdr_refcount = hdr->b_refcnt.rc_count; + __entry->hdr_state_type = 
hdr->b_l1hdr.b_state->arcs_state; + __entry->hdr_access = hdr->b_l1hdr.b_arc_access; + __entry->hdr_mru_hits = hdr->b_l1hdr.b_mru_hits; + __entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; + __entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits; + __entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; + __entry->hdr_l2_hits = hdr->b_l1hdr.b_l2_hits; + __entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count; __entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0]; __entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1]; @@ -325,8 +319,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->zb_level = zb->zb_level; __entry->zb_blkid = zb->zb_blkid; ), - TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx " - "flags 0x%x datacnt %u type %u size %llu spa %llu state_type %u " + TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " + "flags 0x%x datacnt %u size %llu spa %llu state_type %u " "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u " "mfu_ghost_hits %u l2_hits %u refcount %lli } " "bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 " @@ -334,8 +328,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, "lsize %llu } zb { objset %llu object %llu level %lli " "blkid %llu }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], - __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags, - __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, + __entry->hdr_birth, __entry->hdr_flags, + __entry->hdr_datacnt, __entry->hdr_size, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 6bc26bed3539..92c79aa352ea 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -118,7 +118,7 @@ * Note that the majority of the performance stats are manipulated * with atomic operations. 
* - * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: + * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction @@ -309,6 +309,7 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_writes_hdr_miss; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; @@ -397,6 +398,7 @@ static arc_stats_t arc_stats = { { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, @@ -507,22 +509,38 @@ static arc_buf_hdr_t arc_eviction_hdr; #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) -#define HDR_FREE_IN_PROGRESS(hdr) \ - ((hdr)->b_flags & ARC_FLAG_FREE_IN_PROGRESS) + #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) +#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) #define HDR_L2_READING(hdr) \ - ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS && \ - (hdr)->b_l2hdr != NULL) + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_ISTYPE_METADATA(hdr) \ + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) +#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) + +#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) +#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) + +/* For storing compression mode in b_flags */ +#define HDR_COMPRESS_OFFSET 24 +#define HDR_COMPRESS_NBITS 7 + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \ + HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \ + HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp)) + /* * Other sizes */ -#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) -#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) +#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines @@ -592,7 +610,6 @@ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ -static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ @@ -607,19 +624,6 @@ typedef struct l2arc_read_callback { enum zio_compress l2rcb_compress; /* applied compress */ } l2arc_read_callback_t; -struct l2arc_buf_hdr { - /* protected by arc_buf_hdr mutex */ - l2arc_dev_t *b_dev; /* L2ARC device */ - uint64_t b_daddr; /* disk address, offset byte */ - /* compression applied to buffer data */ - enum zio_compress b_compress; - /* real 
alloc'd buffer size depending on b_compress applied */ - uint32_t b_hits; - uint64_t b_asize; - /* temporary buffer holder for in-flight compressed data */ - abd_t *b_tmp_cdata; -}; - typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_data; @@ -639,12 +643,13 @@ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type); static void arc_buf_watch(arc_buf_t *); +static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); +static uint32_t arc_bufc_to_flags(arc_buf_contents_t); + static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); -static void l2arc_hdr_stat_add(void); -static void l2arc_hdr_stat_remove(void); -static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *); +static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); static void l2arc_release_cdata_buf(arc_buf_hdr_t *); @@ -667,8 +672,7 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) #define BUF_EMPTY(buf) \ ((buf)->b_dva.dva_word[0] == 0 && \ - (buf)->b_dva.dva_word[1] == 0 && \ - (buf)->b_cksum0 == 0) + (buf)->b_dva.dva_word[1] == 0) #define BUF_EQUAL(spa, dva, birth, buf) \ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ @@ -681,7 +685,6 @@ buf_discard_identity(arc_buf_hdr_t *hdr) hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; - hdr->b_cksum0 = 0; } static arc_buf_hdr_t * @@ -711,6 +714,7 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. + * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) @@ -723,8 +727,14 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); - *lockp = hash_lock; - mutex_enter(hash_lock); + + if (lockp != NULL) { + *lockp = hash_lock; + mutex_enter(hash_lock); + } else { + ASSERT(MUTEX_HELD(hash_lock)); + } + for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) @@ -779,7 +789,8 @@ buf_hash_remove(arc_buf_hdr_t *hdr) /* * Global data structures and functions for the buf kmem cache. 
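 *
 * With the L1/L2 header split there are now two header caches; as a
 * rough sketch of the size relationship (per the macros defined above):
 *
 *	HDR_FULL_SIZE   == sizeof (arc_buf_hdr_t)
 *	HDR_L2ONLY_SIZE == offsetof(arc_buf_hdr_t, b_l1hdr)
 *
 * i.e. an L2-only header is simply truncated before b_l1hdr.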
*/ -static kmem_cache_t *hdr_cache; +static kmem_cache_t *hdr_full_cache; +static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void @@ -800,7 +811,8 @@ buf_fini(void) #endif for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); - kmem_cache_destroy(hdr_cache); + kmem_cache_destroy(hdr_full_cache); + kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } @@ -810,17 +822,29 @@ buf_fini(void) */ /* ARGSUSED */ static int -hdr_cons(void *vbuf, void *unused, int kmflag) +hdr_full_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *hdr = vbuf; + + bzero(hdr, HDR_FULL_SIZE); + cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&hdr->b_l1hdr.b_refcnt); + mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + list_link_init(&hdr->b_l1hdr.b_arc_node); + list_link_init(&hdr->b_l2hdr.b_l2node); + arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); + + return (0); +} + +/* ARGSUSED */ +static int +hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; - bzero(hdr, sizeof (arc_buf_hdr_t)); - refcount_create(&hdr->b_refcnt); - cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - list_link_init(&hdr->b_arc_node); - list_link_init(&hdr->b_l2node); - arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); + bzero(hdr, HDR_L2ONLY_SIZE); + arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } @@ -844,15 +868,25 @@ buf_cons(void *vbuf, void *unused, int kmflag) */ /* ARGSUSED */ static void -hdr_dest(void *vbuf, void *unused) +hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(BUF_EMPTY(hdr)); - refcount_destroy(&hdr->b_refcnt); - cv_destroy(&hdr->b_cv); - mutex_destroy(&hdr->b_freeze_lock); - arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); + cv_destroy(&hdr->b_l1hdr.b_cv); + refcount_destroy(&hdr->b_l1hdr.b_refcnt); + mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); + arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); +} + +/* ARGSUSED */ +static void +hdr_l2only_dest(void *vbuf, void *unused) +{ + ASSERTV(arc_buf_hdr_t *hdr = vbuf); + + ASSERT(BUF_EMPTY(hdr)); + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } /* ARGSUSED */ @@ -899,8 +933,11 @@ buf_init(void) goto retry; } - hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), - 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0); + hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, + 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); + hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", + HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, + NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); @@ -914,6 +951,81 @@ buf_init(void) } } +/* + * Transition between the two allocation states for the arc_buf_hdr struct. + * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without + * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller + * version is used when a cache buffer is only in the L2ARC in order to reduce + * memory usage. 
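+ *
+ * For illustration, the two directions (as used by arc_evict_ghost()
+ * and arc_read() below) look like:
+ *
+ *	hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache);
+ *	hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache);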
+ */ +static arc_buf_hdr_t * +arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) +{ + arc_buf_hdr_t *nhdr; + l2arc_dev_t *dev; + + ASSERT(HDR_HAS_L2HDR(hdr)); + ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || + (old == hdr_l2only_cache && new == hdr_full_cache)); + + dev = hdr->b_l2hdr.b_dev; + nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); + + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); + buf_hash_remove(hdr); + + bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + if (new == hdr_full_cache) { + nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; + /* + * arc_access and arc_change_state need to be aware that a + * header has just come out of L2ARC, so we set its state to + * l2c_only even though it's about to change. + */ + nhdr->b_l1hdr.b_state = arc_l2c_only; + } else { + ASSERT(hdr->b_l1hdr.b_buf == NULL); + ASSERT0(hdr->b_l1hdr.b_datacnt); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + /* + * We might be removing the L1hdr of a buffer which was just + * written out to L2ARC. If such a buffer is compressed then we + * need to free its b_tmp_cdata before destroying the header. + */ + if (hdr->b_l1hdr.b_tmp_cdata != NULL && + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) + l2arc_release_cdata_buf(hdr); + nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; + } + /* + * The header has been reallocated so we need to re-insert it into any + * lists it was on. + */ + (void) buf_hash_insert(nhdr, NULL); + + ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); + + mutex_enter(&dev->l2ad_mtx); + + /* + * We must place the realloc'ed header back into the list at + * the same spot. Otherwise, if it's placed earlier in the list, + * l2arc_write_buffers() could find it during the function's + * write phase, and try to write it out to the l2arc. + */ + list_insert_after(&dev->l2ad_buflist, hdr, nhdr); + list_remove(&dev->l2ad_buflist, hdr); + + mutex_exit(&dev->l2ad_mtx); + + buf_discard_identity(hdr); + hdr->b_freeze_cksum = NULL; + kmem_cache_free(old, hdr); + + return (nhdr); +} + + #define ARC_MINTIME (hz>>4) /* 62 ms */ static void @@ -924,16 +1036,15 @@ arc_cksum_verify(arc_buf_t *buf) if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum == NULL || - (buf->b_hdr->b_flags & ARC_FLAG_IO_ERROR)) { - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } abd_fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) panic("buffer modified while frozen!"); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); } static int @@ -942,10 +1053,10 @@ arc_cksum_equal(arc_buf_t *buf) zio_cksum_t zc; int equal; - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); abd_fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return (equal); } @@ -956,16 +1067,16 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } 
buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); abd_fletcher_2_native(buf->b_data, buf->b_hdr->b_size, buf->b_hdr->b_freeze_cksum); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); } @@ -1010,24 +1121,50 @@ arc_buf_watch(arc_buf_t *buf) #endif } +static arc_buf_contents_t +arc_buf_type(arc_buf_hdr_t *hdr) +{ + if (HDR_ISTYPE_METADATA(hdr)) { + return (ARC_BUFC_METADATA); + } else { + return (ARC_BUFC_DATA); + } +} + +static uint32_t +arc_bufc_to_flags(arc_buf_contents_t type) +{ + switch (type) { + case ARC_BUFC_DATA: + /* metadata field is 0 if buffer contains normal data */ + return (0); + case ARC_BUFC_METADATA: + return (ARC_FLAG_BUFC_METADATA); + default: + break; + } + panic("undefined ARC buffer type!"); + return ((uint32_t)-1); +} + void arc_buf_thaw(arc_buf_t *buf) { if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_state != arc_anon) + if (buf->b_hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) + if (HDR_IO_IN_PROGRESS(buf->b_hdr)) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); buf->b_hdr->b_freeze_cksum = NULL; } - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); arc_buf_unwatch(buf); } @@ -1044,7 +1181,7 @@ arc_buf_freeze(arc_buf_t *buf) mutex_enter(hash_lock); ASSERT(buf->b_hdr->b_freeze_cksum != NULL || - buf->b_hdr->b_state == arc_anon); + buf->b_hdr->b_l1hdr.b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); mutex_exit(hash_lock); @@ -1053,30 +1190,37 @@ arc_buf_freeze(arc_buf_t *buf) static void add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { + arc_state_t *state; + + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(MUTEX_HELD(hash_lock)); - if ((refcount_add(&hdr->b_refcnt, tag) == 1) && - (hdr->b_state != arc_anon)) { - uint64_t delta = hdr->b_size * hdr->b_datacnt; - list_t *list = &hdr->b_state->arcs_list[hdr->b_type]; - uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; - - ASSERT(!MUTEX_HELD(&hdr->b_state->arcs_mtx)); - mutex_enter(&hdr->b_state->arcs_mtx); - ASSERT(list_link_active(&hdr->b_arc_node)); - list_remove(list, hdr); - if (GHOST_STATE(hdr->b_state)) { - ASSERT0(hdr->b_datacnt); - ASSERT3P(hdr->b_buf, ==, NULL); - delta = hdr->b_size; + state = hdr->b_l1hdr.b_state; + + if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && + (state != arc_anon)) { + /* We don't use the L2-only state list. 
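+		 * Its buffer lists are always empty (see the ASSERT at
+		 * the end of arc_change_state()), so there is no list
+		 * or lsize accounting to update here.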
*/ + if (state != arc_l2c_only) { + uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; + list_t *list = &state->arcs_list[arc_buf_type(hdr)]; + uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; + + ASSERT(!MUTEX_HELD(&state->arcs_mtx)); + mutex_enter(&state->arcs_mtx); + ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); + list_remove(list, hdr); + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_datacnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + delta = hdr->b_size; + } + ASSERT(delta > 0); + ASSERT3U(*size, >=, delta); + atomic_add_64(size, -delta); + mutex_exit(&state->arcs_mtx); } - ASSERT(delta > 0); - ASSERT3U(*size, >=, delta); - atomic_add_64(size, -delta); - mutex_exit(&hdr->b_state->arcs_mtx); /* remove the prefetch flag if we get a reference */ - if (hdr->b_flags & ARC_FLAG_PREFETCH) - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + hdr->b_flags &= ~ARC_FLAG_PREFETCH; } } @@ -1084,21 +1228,27 @@ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; - arc_state_t *state = hdr->b_state; + arc_state_t *state = hdr->b_l1hdr.b_state; + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); - if (((cnt = refcount_remove(&hdr->b_refcnt, tag)) == 0) && + /* + * arc_l2c_only counts as a ghost state so we don't need to explicitly + * check to prevent usage of the arc_l2c_only list. + */ + if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - uint64_t *size = &state->arcs_lsize[hdr->b_type]; + uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; ASSERT(!MUTEX_HELD(&state->arcs_mtx)); mutex_enter(&state->arcs_mtx); - ASSERT(!list_link_active(&hdr->b_arc_node)); - list_insert_head(&state->arcs_list[hdr->b_type], hdr); - ASSERT(hdr->b_datacnt > 0); - atomic_add_64(size, hdr->b_size * hdr->b_datacnt); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr); + ASSERT(hdr->b_l1hdr.b_datacnt > 0); + atomic_add_64(size, hdr->b_size * + hdr->b_l1hdr.b_datacnt); mutex_exit(&state->arcs_mtx); } return (cnt); @@ -1115,31 +1265,45 @@ void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { arc_buf_hdr_t *hdr = ab->b_hdr; - arc_state_t *state = hdr->b_state; + l1arc_buf_hdr_t *l1hdr = NULL; + l2arc_buf_hdr_t *l2hdr = NULL; + arc_state_t *state = NULL; + + if (HDR_HAS_L1HDR(hdr)) { + l1hdr = &hdr->b_l1hdr; + state = l1hdr->b_state; + } + if (HDR_HAS_L2HDR(hdr)) + l2hdr = &hdr->b_l2hdr; memset(abi, 0, sizeof (arc_buf_info_t)); abi->abi_flags = hdr->b_flags; - abi->abi_datacnt = hdr->b_datacnt; + + if (l1hdr) { + abi->abi_datacnt = l1hdr->b_datacnt; + abi->abi_access = l1hdr->b_arc_access; + abi->abi_mru_hits = l1hdr->b_mru_hits; + abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; + abi->abi_mfu_hits = l1hdr->b_mfu_hits; + abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; + abi->abi_holds = refcount_count(&l1hdr->b_refcnt); + } + + if (l2hdr) { + abi->abi_l2arc_dattr = l2hdr->b_daddr; + abi->abi_l2arc_asize = l2hdr->b_asize; + abi->abi_l2arc_compress = HDR_GET_COMPRESS(hdr); + abi->abi_l2arc_hits = l2hdr->b_hits; + } + abi->abi_state_type = state ? 
state->arcs_state : ARC_STATE_ANON; - abi->abi_state_contents = hdr->b_type; + abi->abi_state_contents = arc_buf_type(hdr); abi->abi_state_index = -1; abi->abi_size = hdr->b_size; - abi->abi_access = hdr->b_arc_access; - abi->abi_mru_hits = hdr->b_mru_hits; - abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits; - abi->abi_mfu_hits = hdr->b_mfu_hits; - abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits; - abi->abi_holds = refcount_count(&hdr->b_refcnt); - - if (hdr->b_l2hdr) { - abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr; - abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize; - abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress; - abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits; - } - if (state && state_index && list_link_active(&hdr->b_arc_node)) { - list_t *list = &state->arcs_list[hdr->b_type]; + if (l1hdr && state && state_index && + list_link_active(&l1hdr->b_arc_node)) { + list_t *list = &state->arcs_list[arc_buf_type(hdr)]; arc_buf_hdr_t *h; mutex_enter(&state->arcs_mtx); @@ -1160,40 +1324,60 @@ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { - arc_state_t *old_state = hdr->b_state; - int64_t refcnt = refcount_count(&hdr->b_refcnt); + arc_state_t *old_state; + int64_t refcnt; + uint32_t datacnt; uint64_t from_delta, to_delta; + arc_buf_contents_t buftype = arc_buf_type(hdr); + + /* + * We almost always have an L1 hdr here, since we call arc_hdr_realloc() + * in arc_read() when bringing a buffer out of the L2ARC. However, the + * L1 hdr doesn't always exist when we change state to arc_anon before + * destroying a header, in which case reallocating to add the L1 hdr is + * pointless. + */ + if (HDR_HAS_L1HDR(hdr)) { + old_state = hdr->b_l1hdr.b_state; + refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); + datacnt = hdr->b_l1hdr.b_datacnt; + } else { + old_state = arc_l2c_only; + refcnt = 0; + datacnt = 0; + } ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || hdr->b_datacnt > 0); - ASSERT(hdr->b_datacnt == 0 || !GHOST_STATE(new_state)); - ASSERT(hdr->b_datacnt <= 1 || old_state != arc_anon); + ASSERT(refcnt == 0 || datacnt > 0); + ASSERT(!GHOST_STATE(new_state) || datacnt == 0); + ASSERT(old_state != arc_anon || datacnt <= 1); - from_delta = to_delta = hdr->b_datacnt * hdr->b_size; + from_delta = to_delta = datacnt * hdr->b_size; /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { - if (old_state != arc_anon) { + if (old_state != arc_anon && old_state != arc_l2c_only) { int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); - uint64_t *size = &old_state->arcs_lsize[hdr->b_type]; + uint64_t *size = &old_state->arcs_lsize[buftype]; if (use_mutex) mutex_enter(&old_state->arcs_mtx); - ASSERT(list_link_active(&hdr->b_arc_node)); - list_remove(&old_state->arcs_list[hdr->b_type], hdr); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); + list_remove(&old_state->arcs_list[buftype], hdr); /* * If prefetching out of the ghost cache, * we will have a non-zero datacnt. 
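 * In that case the buffer holds real data again, so from_delta
 * keeps the datacnt-based value computed above; only the
 * datacnt == 0 case below charges the ghost size.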
*/ - if (GHOST_STATE(old_state) && hdr->b_datacnt == 0) { + if (GHOST_STATE(old_state) && datacnt == 0) { /* ghost elements have a ghost size */ - ASSERT(hdr->b_buf == NULL); + ASSERT(hdr->b_l1hdr.b_buf == NULL); from_delta = hdr->b_size; } ASSERT3U(*size, >=, from_delta); @@ -1202,20 +1386,26 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (use_mutex) mutex_exit(&old_state->arcs_mtx); } - if (new_state != arc_anon) { + if (new_state != arc_anon && new_state != arc_l2c_only) { int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); - uint64_t *size = &new_state->arcs_lsize[hdr->b_type]; + uint64_t *size = &new_state->arcs_lsize[buftype]; + /* + * An L1 header always exists here, since if we're + * moving to some L1-cached state (i.e. not l2c_only or + * anonymous), we realloc the header to add an L1hdr + * beforehand. + */ + ASSERT(HDR_HAS_L1HDR(hdr)); if (use_mutex) mutex_enter(&new_state->arcs_mtx); - list_insert_head(&new_state->arcs_list[hdr->b_type], - hdr); + list_insert_head(&new_state->arcs_list[buftype], hdr); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT(hdr->b_datacnt == 0); - ASSERT(hdr->b_buf == NULL); + ASSERT0(datacnt); + ASSERT(hdr->b_l1hdr.b_buf == NULL); to_delta = hdr->b_size; } atomic_add_64(size, to_delta); @@ -1229,20 +1419,22 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); - /* adjust state sizes */ - if (to_delta) + /* adjust state sizes (ignore arc_l2c_only) */ + if (to_delta && new_state != arc_l2c_only) atomic_add_64(&new_state->arcs_size, to_delta); - if (from_delta) { + if (from_delta && old_state != arc_l2c_only) { ASSERT3U(old_state->arcs_size, >=, from_delta); atomic_add_64(&old_state->arcs_size, -from_delta); } - hdr->b_state = new_state; + if (HDR_HAS_L1HDR(hdr)) + hdr->b_l1hdr.b_state = new_state; - /* adjust l2arc hdr stats */ - if (new_state == arc_l2c_only) - l2arc_hdr_stat_add(); - else if (old_state == arc_l2c_only) - l2arc_hdr_stat_remove(); + /* + * L2 headers should never be on the L2 state list since they don't + * have L1 headers allocated. 
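+	 * L2-only headers are instead tracked on their device's
+	 * l2ad_buflist.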
+ */ + ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && + list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void @@ -1320,30 +1512,36 @@ arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type) arc_buf_t *buf; VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(BUF_EMPTY(hdr)); + ASSERT3P(hdr->b_freeze_cksum, ==, NULL); hdr->b_size = size; - hdr->b_type = type; hdr->b_spa = spa_load_guid(spa); - hdr->b_state = arc_anon; - hdr->b_arc_access = 0; - hdr->b_mru_hits = 0; - hdr->b_mru_ghost_hits = 0; - hdr->b_mfu_hits = 0; - hdr->b_mfu_ghost_hits = 0; - hdr->b_l2_hits = 0; + hdr->b_l1hdr.b_mru_hits = 0; + hdr->b_l1hdr.b_mru_ghost_hits = 0; + hdr->b_l1hdr.b_mfu_hits = 0; + hdr->b_l1hdr.b_mfu_ghost_hits = 0; + hdr->b_l1hdr.b_l2_hits = 0; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; - hdr->b_buf = buf; + + hdr->b_flags = arc_bufc_to_flags(type); + hdr->b_flags |= ARC_FLAG_HAS_L1HDR; + + hdr->b_l1hdr.b_buf = buf; + hdr->b_l1hdr.b_state = arc_anon; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_datacnt = 1; + arc_get_data_buf(buf); - hdr->b_datacnt = 1; - hdr->b_flags = 0; - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - (void) refcount_add(&hdr->b_refcnt, tag); + + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); return (buf); } @@ -1376,8 +1574,9 @@ arc_return_buf(arc_buf_t *buf, void *tag) arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); - (void) refcount_add(&hdr->b_refcnt, tag); - (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); atomic_add_64(&arc_loaned_bytes, -hdr->b_size); } @@ -1386,12 +1585,12 @@ arc_return_buf(arc_buf_t *buf, void *tag) void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr; + arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); - hdr = buf->b_hdr; - (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); - (void) refcount_remove(&hdr->b_refcnt, tag); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); buf->b_efunc = NULL; buf->b_private = NULL; @@ -1405,15 +1604,16 @@ arc_buf_clone(arc_buf_t *from) arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; - ASSERT(hdr->b_state != arc_anon); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; - buf->b_next = hdr->b_buf; - hdr->b_buf = buf; + buf->b_next = hdr->b_l1hdr.b_buf; + hdr->b_l1hdr.b_buf = buf; arc_get_data_buf(buf); abd_copy(buf->b_data, from->b_data, size); @@ -1423,11 +1623,11 @@ arc_buf_clone(arc_buf_t *from) * then track the size and number of duplicates. These stats will be * updated as duplicate buffers are created and destroyed. 
*/ - if (hdr->b_type == ARC_BUFC_DATA) { + if (HDR_ISTYPE_DATA(hdr)) { ARCSTAT_BUMP(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); } - hdr->b_datacnt += 1; + hdr->b_l1hdr.b_datacnt += 1; return (buf); } @@ -1450,17 +1650,20 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); mutex_exit(&buf->b_evict_lock); - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); + add_reference(hdr, hash_lock, tag); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } @@ -1498,10 +1701,10 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) arc_buf_t **bufp; /* free up data associated with the buf */ - if (buf->b_data) { - arc_state_t *state = buf->b_hdr->b_state; + if (buf->b_data != NULL) { + arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = buf->b_hdr->b_type; + arc_buf_contents_t type = arc_buf_type(buf->b_hdr); arc_cksum_verify(buf); arc_buf_unwatch(buf); @@ -1516,11 +1719,12 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) arc_space_return(size, ARC_SPACE_DATA); } } - if (list_link_active(&buf->b_hdr->b_arc_node)) { + if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { uint64_t *cnt = &state->arcs_lsize[type]; - ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); - ASSERT(state != arc_anon); + ASSERT(refcount_is_zero( + &buf->b_hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); ASSERT3U(*cnt, >=, size); atomic_add_64(cnt, -size); @@ -1533,13 +1737,13 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) * If we're destroying a duplicate buffer make sure * that the appropriate statistics are updated. 
*/ - if (buf->b_hdr->b_datacnt > 1 && - buf->b_hdr->b_type == ARC_BUFC_DATA) { + if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && + HDR_ISTYPE_DATA(buf->b_hdr)) { ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); } - ASSERT(buf->b_hdr->b_datacnt > 0); - buf->b_hdr->b_datacnt -= 1; + ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); + buf->b_hdr->b_l1hdr.b_datacnt -= 1; } /* only remove the buf if requested */ @@ -1547,7 +1751,8 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) return; /* remove the buf from the hdr list */ - for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) + for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; + bufp = &(*bufp)->b_next) continue; *bufp = buf->b_next; buf->b_next = NULL; @@ -1562,84 +1767,82 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { - l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; - - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - ASSERT3P(hdr->b_state, ==, arc_anon); + if (HDR_HAS_L1HDR(hdr)) { + ASSERT(hdr->b_l1hdr.b_buf == NULL || + hdr->b_l1hdr.b_datacnt > 0); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + + if (HDR_HAS_L2HDR(hdr)) { + l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; + boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx); - if (l2hdr != NULL) { - boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); - /* - * To prevent arc_free() and l2arc_evict() from - * attempting to free the same buffer at the same time, - * a FREE_IN_PROGRESS flag is given to arc_free() to - * give it priority. l2arc_evict() can't destroy this - * header while we are waiting on l2arc_buflist_mtx. - * - * The hdr may be removed from l2ad_buflist before we - * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
- */ if (!buflist_held) { - mutex_enter(&l2arc_buflist_mtx); - l2hdr = hdr->b_l2hdr; + mutex_enter(&l2hdr->b_dev->l2ad_mtx); + l2hdr = &hdr->b_l2hdr; } - if (l2hdr != NULL) { - list_remove(l2hdr->b_dev->l2ad_buflist, hdr); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); - vdev_space_update(l2hdr->b_dev->l2ad_vdev, - -l2hdr->b_asize, 0, 0); - kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - if (hdr->b_state == arc_l2c_only) - l2arc_hdr_stat_remove(); - hdr->b_l2hdr = NULL; - } + list_remove(&l2hdr->b_dev->l2ad_buflist, hdr); + + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); if (!buflist_held) - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&l2hdr->b_dev->l2ad_mtx); + + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; } - if (!BUF_EMPTY(hdr)) { - ASSERT(!HDR_IN_HASH_TABLE(hdr)); + if (!BUF_EMPTY(hdr)) buf_discard_identity(hdr); - } - while (hdr->b_buf) { - arc_buf_t *buf = hdr->b_buf; - - if (buf->b_efunc) { - mutex_enter(&arc_eviction_mtx); - mutex_enter(&buf->b_evict_lock); - ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_buf, FALSE, FALSE); - hdr->b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_eviction_mtx); - } else { - arc_buf_destroy(hdr->b_buf, FALSE, TRUE); - } - } + if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } - ASSERT(!list_link_active(&hdr->b_arc_node)); + if (HDR_HAS_L1HDR(hdr)) { + while (hdr->b_l1hdr.b_buf) { + arc_buf_t *buf = hdr->b_l1hdr.b_buf; + + if (buf->b_efunc != NULL) { + mutex_enter(&arc_eviction_mtx); + mutex_enter(&buf->b_evict_lock); + ASSERT(buf->b_hdr != NULL); + arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, + FALSE); + hdr->b_l1hdr.b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + mutex_exit(&buf->b_evict_lock); + mutex_exit(&arc_eviction_mtx); + } else { + arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, + TRUE); + } + } + } + ASSERT3P(hdr->b_hash_next, ==, NULL); - ASSERT3P(hdr->b_acb, ==, NULL); - kmem_cache_free(hdr_cache, hdr); + if (HDR_HAS_L1HDR(hdr)) { + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + kmem_cache_free(hdr_full_cache, hdr); + } else { + kmem_cache_free(hdr_l2only_cache, hdr); + } } void arc_buf_free(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - int hashed = hdr->b_state != arc_anon; + int hashed = hdr->b_l1hdr.b_state != arc_anon; ASSERT(buf->b_efunc == NULL); ASSERT(buf->b_data != NULL); @@ -1652,10 +1855,10 @@ arc_buf_free(arc_buf_t *buf, void *tag) ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_datacnt > 1) { + if (hdr->b_l1hdr.b_datacnt > 1) { arc_buf_destroy(buf, FALSE, TRUE); } else { - ASSERT(buf == hdr->b_buf); + ASSERT(buf == hdr->b_l1hdr.b_buf); ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } @@ -1669,7 +1872,7 @@ arc_buf_free(arc_buf_t *buf, void *tag) */ mutex_enter(&arc_eviction_mtx); (void) remove_reference(hdr, NULL, tag); - ASSERT(refcount_is_zero(&hdr->b_refcnt)); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); mutex_exit(&arc_eviction_mtx); if (destroy_hdr) @@ -1689,8 +1892,8 @@ 
arc_buf_remove_ref(arc_buf_t *buf, void* tag)
 	kmutex_t *hash_lock = NULL;
 	boolean_t no_callback = (buf->b_efunc == NULL);
 
-	if (hdr->b_state == arc_anon) {
-		ASSERT(hdr->b_datacnt == 1);
+	if (hdr->b_l1hdr.b_state == arc_anon) {
+		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
 		arc_buf_free(buf, tag);
 		return (no_callback);
 	}
@@ -1698,21 +1901,22 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
 	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 	hdr = buf->b_hdr;
+	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
-	ASSERT(hdr->b_state != arc_anon);
+	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
 	ASSERT(buf->b_data != NULL);
 
 	(void) remove_reference(hdr, hash_lock, tag);
-	if (hdr->b_datacnt > 1) {
+	if (hdr->b_l1hdr.b_datacnt > 1) {
 		if (no_callback)
 			arc_buf_destroy(buf, FALSE, TRUE);
 	} else if (no_callback) {
-		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+		ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
 		ASSERT(buf->b_efunc == NULL);
 		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
 	}
-	ASSERT(no_callback || hdr->b_datacnt > 1 ||
-	    refcount_is_zero(&hdr->b_refcnt));
+	ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
+	    refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 	mutex_exit(hash_lock);
 	return (no_callback);
 }
@@ -1758,7 +1962,7 @@ arc_buf_eviction_needed(arc_buf_t *buf)
 		return (B_TRUE);
 	}
 
-	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
+	if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
 		evict_needed = B_TRUE;
 
 	mutex_exit(&buf->b_evict_lock);
@@ -1797,16 +2001,27 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
 	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
 
 top:
-	mutex_enter(&state->arcs_mtx);
+	/*
+	 * The ghost list lock must be acquired first in order to prevent
+	 * a 3-party deadlock:
+	 *
+	 * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
+	 *   l2ad_mtx in arc_hdr_realloc
+	 * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
+	 * - arc_evict acquires arc_*->arcs_mtx, followed by
+	 *   arc_*_ghost->arcs_mtx and forms a deadlock cycle.
+	 *
+	 * This situation is avoided by acquiring the ghost list lock first.
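+	 *
+	 * An illustrative sketch of the resulting order, as taken below:
+	 *
+	 *	mutex_enter(&evicted_state->arcs_mtx);	(ghost list first)
+	 *	mutex_enter(&state->arcs_mtx);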
+ */ mutex_enter(&evicted_state->arcs_mtx); + mutex_enter(&state->arcs_mtx); for (hdr = list_tail(list); hdr; hdr = hdr_prev) { hdr_prev = list_prev(list, hdr); /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || - (spa && hdr->b_spa != spa) || - (hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT) && - ddi_get_lbolt() - hdr->b_arc_access < + ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && + ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < zfs_arc_min_prefetch_lifespan)) { skipped++; continue; @@ -1831,11 +2046,11 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, */ if (!recycle && count++ > arc_evict_iterations) { list_insert_after(list, hdr, &marker); - mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&state->arcs_mtx); + mutex_exit(&evicted_state->arcs_mtx); kpreempt(KPREEMPT_SYNC); - mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); + mutex_enter(&state->arcs_mtx); hdr_prev = list_prev(list, &marker); list_remove(list, &marker); count = 0; @@ -1845,28 +2060,29 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, hash_lock = HDR_LOCK(hdr); have_lock = MUTEX_HELD(hash_lock); if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT0(refcount_count(&hdr->b_refcnt)); - ASSERT(hdr->b_datacnt > 0); - while (hdr->b_buf) { - arc_buf_t *buf = hdr->b_buf; + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); + while (hdr->b_l1hdr.b_buf) { + arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { missed += 1; break; } - if (buf->b_data) { + if (buf->b_data != NULL) { bytes_evicted += hdr->b_size; - if (recycle && hdr->b_type == type && + if (recycle && + arc_buf_type(hdr) == type && hdr->b_size == bytes && !HDR_L2_WRITING(hdr)) { stolen = buf->b_data; recycle = FALSE; } } - if (buf->b_efunc) { + if (buf->b_efunc != NULL) { mutex_enter(&arc_eviction_mtx); arc_buf_destroy(buf, buf->b_data == stolen, FALSE); - hdr->b_buf = buf->b_next; + hdr->b_l1hdr.b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; @@ -1879,7 +2095,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, } } - if (hdr->b_l2hdr) { + if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); } else { @@ -1893,7 +2109,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, } } - if (hdr->b_datacnt == 0) { + if (hdr->b_l1hdr.b_datacnt == 0) { arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; @@ -1909,8 +2125,8 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, } } - mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&state->arcs_mtx); + mutex_exit(&evicted_state->arcs_mtx); if (list == &state->arcs_list[ARC_BUFC_DATA] && (bytes < 0 || bytes_evicted < bytes)) { @@ -1963,7 +2179,7 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, mutex_enter(&state->arcs_mtx); for (hdr = list_tail(list); hdr; hdr = hdr_prev) { hdr_prev = list_prev(list, hdr); - if (hdr->b_type > ARC_BUFC_NUMTYPES) + if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES) panic("invalid hdr=%p", (void *)hdr); if (spa && hdr->b_spa != spa) continue; @@ -1995,16 +2211,23 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, } if (mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_buf == NULL); + ASSERT(!HDR_HAS_L1HDR(hdr) || 
+ hdr->b_l1hdr.b_buf == NULL); ARCSTAT_BUMP(arcstat_deleted); bytes_deleted += hdr->b_size; - if (hdr->b_l2hdr != NULL) { + if (HDR_HAS_L2HDR(hdr)) { /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr, hash_lock); + /* + * dropping from L1+L2 cached to L2-only, + * realloc to remove the L1 header. + */ + hdr = arc_hdr_realloc(hdr, hdr_full_cache, + hdr_l2only_cache); mutex_exit(hash_lock); } else { arc_change_state(arc_anon, hdr, hash_lock); @@ -2269,27 +2492,27 @@ arc_flush(spa_t *spa) { uint64_t guid = 0; - if (spa) + if (spa != NULL) guid = spa_load_guid(spa); while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa) + if (spa != NULL) break; } while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa) + if (spa != NULL) break; } while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa) + if (spa != NULL) break; } while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa) + if (spa != NULL) break; } @@ -2361,8 +2584,8 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) } } - kmem_cache_reap_now(buf_cache); - kmem_cache_reap_now(hdr_cache); + kmem_cache_reap_now(hdr_full_cache); + kmem_cache_reap_now(hdr_l2only_cache); } /* @@ -2702,9 +2925,9 @@ arc_evict_needed(arc_buf_contents_t type) static void arc_get_data_buf(arc_buf_t *buf) { - arc_state_t *state = buf->b_hdr->b_state; + arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = buf->b_hdr->b_type; + arc_buf_contents_t type = arc_buf_type(buf->b_hdr); arc_buf_contents_t evict = ARC_BUFC_DATA; boolean_t recycle = TRUE; @@ -2731,8 +2954,7 @@ arc_get_data_buf(arc_buf_t *buf) * will end up on the mru list; so steal space from there. */ if (state == arc_mfu_ghost) - state = buf->b_hdr->b_flags & ARC_FLAG_PREFETCH ? - arc_mru : arc_mfu; + state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu; else if (state == arc_mru_ghost) state = arc_mru; @@ -2795,20 +3017,21 @@ arc_get_data_buf(arc_buf_t *buf) * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. 
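 * (A ghost state charges only hdr->b_size, and that is accounted
 * in arc_change_state(), not here.)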
*/ - if (!GHOST_STATE(buf->b_hdr->b_state)) { + if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { arc_buf_hdr_t *hdr = buf->b_hdr; - atomic_add_64(&hdr->b_state->arcs_size, size); - if (list_link_active(&hdr->b_arc_node)) { - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - atomic_add_64(&hdr->b_state->arcs_lsize[type], size); + atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); + if (list_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], + size); } /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ if (!zfs_arc_p_aggressive_disable && - arc_size < arc_c && hdr->b_state == arc_anon && + arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && arc_anon->arcs_size + arc_mru->arcs_size > arc_p) arc_p = MIN(arc_c, arc_p + size); } @@ -2824,20 +3047,21 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) clock_t now; ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(HDR_HAS_L1HDR(hdr)); - if (hdr->b_state == arc_anon) { + if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ - ASSERT(hdr->b_arc_access == 0); - hdr->b_arc_access = ddi_get_lbolt(); + ASSERT0(hdr->b_l1hdr.b_arc_access); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr, hash_lock); - } else if (hdr->b_state == arc_mru) { + } else if (hdr->b_l1hdr.b_state == arc_mru) { now = ddi_get_lbolt(); /* @@ -2848,15 +3072,16 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ - if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) { - if (refcount_count(&hdr->b_refcnt) == 0) { - ASSERT(list_link_active(&hdr->b_arc_node)); + if (HDR_PREFETCH(hdr)) { + if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { + ASSERT(list_link_active( + &hdr->b_l1hdr.b_arc_node)); } else { hdr->b_flags &= ~ARC_FLAG_PREFETCH; - atomic_inc_32(&hdr->b_mru_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } - hdr->b_arc_access = now; + hdr->b_l1hdr.b_arc_access = now; return; } @@ -2865,19 +3090,20 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * but it is still in the cache. Move it to the MFU * state. */ - if (ddi_time_after(now, hdr->b_arc_access + ARC_MINTIME)) { + if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + + ARC_MINTIME)) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ - hdr->b_arc_access = now; + hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } - atomic_inc_32(&hdr->b_mru_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); - } else if (hdr->b_state == arc_mru_ghost) { + } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but @@ -2885,9 +3111,9 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * MFU state. 
*/ - if (hdr->b_flags & ARC_FLAG_PREFETCH) { + if (HDR_PREFETCH(hdr)) { new_state = arc_mru; - if (refcount_count(&hdr->b_refcnt) > 0) + if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) hdr->b_flags &= ~ARC_FLAG_PREFETCH; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { @@ -2895,12 +3121,12 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } - hdr->b_arc_access = ddi_get_lbolt(); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, hdr, hash_lock); - atomic_inc_32(&hdr->b_mru_ghost_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits); ARCSTAT_BUMP(arcstat_mru_ghost_hits); - } else if (hdr->b_state == arc_mfu) { + } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. @@ -2910,14 +3136,14 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * If it was a prefetch, we will explicitly move it to * the head of the list now. */ - if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) { - ASSERT(refcount_count(&hdr->b_refcnt) == 0); - ASSERT(list_link_active(&hdr->b_arc_node)); + if ((HDR_PREFETCH(hdr)) != 0) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); } - atomic_inc_32(&hdr->b_mfu_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); - hdr->b_arc_access = ddi_get_lbolt(); - } else if (hdr->b_state == arc_mfu_ghost) { + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has @@ -2925,31 +3151,32 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * MFU state. */ - if (hdr->b_flags & ARC_FLAG_PREFETCH) { + if (HDR_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. */ - ASSERT0(refcount_count(&hdr->b_refcnt)); + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); new_state = arc_mru; } - hdr->b_arc_access = ddi_get_lbolt(); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(new_state, hdr, hash_lock); - atomic_inc_32(&hdr->b_mfu_ghost_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); - } else if (hdr->b_state == arc_l2c_only) { + } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. 
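 * An L1 header is already attached at this point: arc_read()
 * calls arc_hdr_realloc() before arc_access() when bringing a
 * buffer out of the L2-only state.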
*/ - hdr->b_arc_access = ddi_get_lbolt(); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } else { - cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_state); + cmn_err(CE_PANIC, "invalid arc state 0x%p", + hdr->b_l1hdr.b_state); } } @@ -3018,11 +3245,11 @@ arc_read_done(zio_t *zio) } hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; - if (l2arc_noprefetch && (hdr->b_flags & ARC_FLAG_PREFETCH)) + if (l2arc_noprefetch && HDR_PREFETCH(hdr)) hdr->b_flags &= ~ARC_FLAG_L2CACHE; /* byteswap if necessary */ - callback_list = hdr->b_acb; + callback_list = hdr->b_l1hdr.b_acb; ASSERT(callback_list != NULL); if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { dmu_object_byteswap_t bswap = @@ -3041,7 +3268,8 @@ arc_read_done(zio_t *zio) arc_cksum_compute(buf, B_FALSE); arc_buf_watch(buf); - if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { + if (hash_lock && zio->io_error == 0 && + hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already @@ -3063,24 +3291,25 @@ arc_read_done(zio_t *zio) abuf = NULL; } } - hdr->b_acb = NULL; + hdr->b_l1hdr.b_acb = NULL; hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; ASSERT(!HDR_BUF_AVAILABLE(hdr)); if (abuf == buf) { ASSERT(buf->b_efunc == NULL); - ASSERT(hdr->b_datacnt == 1); + ASSERT(hdr->b_l1hdr.b_datacnt == 1); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } - ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || + callback_list != NULL); if (zio->io_error != 0) { hdr->b_flags |= ARC_FLAG_IO_ERROR; - if (hdr->b_state != arc_anon) + if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); - freeable = refcount_is_zero(&hdr->b_refcnt); + freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* @@ -3088,9 +3317,9 @@ arc_read_done(zio_t *zio) * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ - cv_broadcast(&hdr->b_cv); + cv_broadcast(&hdr->b_l1hdr.b_cv); - if (hash_lock) { + if (hash_lock != NULL) { mutex_exit(hash_lock); } else { /* @@ -3099,8 +3328,8 @@ arc_read_done(zio_t *zio) * moved to the anonymous state (so that it won't show up * in the cache). 
*/ - ASSERT3P(hdr->b_state, ==, arc_anon); - freeable = refcount_is_zero(&hdr->b_refcnt); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* execute each callback and free its structure */ @@ -3163,14 +3392,14 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && hdr->b_datacnt > 0) { + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { if (*arc_flags & ARC_FLAG_WAIT) { - cv_wait(&hdr->b_cv, hash_lock); + cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } @@ -3188,8 +3417,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, spa, NULL, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); - acb->acb_next = hdr->b_acb; - hdr->b_acb = acb; + acb->acb_next = hdr->b_l1hdr.b_acb; + hdr->b_l1hdr.b_acb = acb; add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); goto out; @@ -3198,7 +3427,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, goto out; } - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); if (done) { add_reference(hdr, hash_lock, private); @@ -3207,7 +3437,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, * copy of the data so that we will be guaranteed * that arc_release() will always succeed. */ - buf = hdr->b_buf; + buf = hdr->b_l1hdr.b_buf; ASSERT(buf); ASSERT(buf->b_data); if (HDR_BUF_AVAILABLE(hdr)) { @@ -3218,7 +3448,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, } } else if (*arc_flags & ARC_FLAG_PREFETCH && - refcount_count(&hdr->b_refcnt) == 0) { + refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { hdr->b_flags |= ARC_FLAG_PREFETCH; } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); @@ -3229,8 +3459,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr->b_flags |= ARC_FLAG_L2COMPRESS; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); if (done) @@ -3242,7 +3472,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, uint64_t addr = 0; boolean_t devw = B_FALSE; enum zio_compress b_compress = ZIO_COMPRESS_OFF; - uint64_t b_asize = 0; + int32_t b_asize = 0; /* * Gracefully handle a damaged logical block size as a @@ -3269,7 +3499,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); - hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { @@ -3293,11 +3522,20 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (BP_GET_LEVEL(bp) > 0) hdr->b_flags |= ARC_FLAG_INDIRECT; } else { - /* this block is in the ghost cache */ - ASSERT(GHOST_STATE(hdr->b_state)); + /* + * This block is in the ghost cache. If it was L2-only + * (and thus didn't have an L1 hdr), we realloc the + * header to add an L1 hdr. 
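+			 * arc_hdr_realloc() re-inserts the new header
+			 * into the hash table and keeps its position
+			 * on the l2ad_buflist.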
+ */ + if (!HDR_HAS_L1HDR(hdr)) { + hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, + hdr_full_cache); + } + + ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT0(refcount_count(&hdr->b_refcnt)); - ASSERT(hdr->b_buf == NULL); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(hdr->b_l1hdr.b_buf == NULL); /* if this is a prefetch, we don't have a reference */ if (*arc_flags & ARC_FLAG_PREFETCH) @@ -3314,29 +3552,29 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; - hdr->b_buf = buf; - ASSERT(hdr->b_datacnt == 0); - hdr->b_datacnt = 1; + hdr->b_l1hdr.b_buf = buf; + ASSERT0(hdr->b_l1hdr.b_datacnt); + hdr->b_l1hdr.b_datacnt = 1; arc_get_data_buf(buf); arc_access(hdr, hash_lock); } - ASSERT(!GHOST_STATE(hdr->b_state)); + ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - ASSERT(hdr->b_acb == NULL); - hdr->b_acb = acb; + ASSERT(hdr->b_l1hdr.b_acb == NULL); + hdr->b_l1hdr.b_acb = acb; hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; - if (hdr->b_l2hdr != NULL && - (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { - devw = hdr->b_l2hdr->b_dev->l2ad_writing; - addr = hdr->b_l2hdr->b_daddr; - b_compress = hdr->b_l2hdr->b_compress; - b_asize = hdr->b_l2hdr->b_asize; + if (HDR_HAS_L2HDR(hdr) && + (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { + devw = hdr->b_l2hdr.b_dev->l2ad_writing; + addr = hdr->b_l2hdr.b_daddr; + b_compress = HDR_GET_COMPRESS(hdr); + b_asize = hdr->b_l2hdr.b_asize; /* * Lock out device removal. */ @@ -3356,8 +3594,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, size, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { @@ -3370,14 +3608,14 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, * also have invalidated the vdev. * 5. This isn't prefetch and l2arc_noprefetch is set. 
*/ - if (hdr->b_l2hdr != NULL && + if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); - atomic_inc_32(&hdr->b_l2hdr->b_hits); + atomic_inc_32(&hdr->b_l2hdr.b_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); @@ -3499,8 +3737,9 @@ void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) { ASSERT(buf->b_hdr != NULL); - ASSERT(buf->b_hdr->b_state != arc_anon); - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); + ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || + func == NULL); ASSERT(buf->b_efunc == NULL); ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); @@ -3524,7 +3763,7 @@ arc_freed(spa_t *spa, const blkptr_t *bp) if (hdr == NULL) return; if (HDR_BUF_AVAILABLE(hdr)) { - arc_buf_t *buf = hdr->b_buf; + arc_buf_t *buf = hdr->b_l1hdr.b_buf; add_reference(hdr, hash_lock, FTAG); hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; mutex_exit(hash_lock); @@ -3582,17 +3821,19 @@ arc_clear_callback(arc_buf_t *buf) hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, + hdr->b_l1hdr.b_datacnt); + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); buf->b_efunc = NULL; buf->b_private = NULL; - if (hdr->b_datacnt > 1) { + if (hdr->b_l1hdr.b_datacnt > 1) { mutex_exit(&buf->b_evict_lock); arc_buf_destroy(buf, FALSE, TRUE); } else { - ASSERT(buf == hdr->b_buf); + ASSERT(buf == hdr->b_l1hdr.b_buf); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; mutex_exit(&buf->b_evict_lock); } @@ -3611,10 +3852,9 @@ arc_clear_callback(arc_buf_t *buf) void arc_release(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock = NULL; - l2arc_buf_hdr_t *l2hdr; - uint64_t buf_size = 0; + kmutex_t *hash_lock; + arc_state_t *state; + arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if it's DMU metadata (level > @@ -3623,56 +3863,89 @@ arc_release(arc_buf_t *buf, void *tag) */ mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - /* this buffer is not on any list */ - ASSERT(refcount_count(&hdr->b_refcnt) > 0); + /* + * We don't grab the hash lock prior to this check, because if + * the buffer's header is in the arc_anon state, it won't be + * linked into the hash table. 
+ */ + if (hdr->b_l1hdr.b_state == arc_anon) { + mutex_exit(&buf->b_evict_lock); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + ASSERT(!HDR_HAS_L2HDR(hdr)); + ASSERT(BUF_EMPTY(hdr)); - if (hdr->b_state == arc_anon) { - /* this buffer is already released */ - ASSERT(buf->b_efunc == NULL); - } else { - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); + ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + + ASSERT3P(buf->b_efunc, ==, NULL); + ASSERT3P(buf->b_private, ==, NULL); + + hdr->b_l1hdr.b_arc_access = 0; + arc_buf_thaw(buf); + + return; } - l2hdr = hdr->b_l2hdr; - if (l2hdr) { - mutex_enter(&l2arc_buflist_mtx); - hdr->b_l2hdr = NULL; - list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + /* + * This assignment is only valid as long as the hash_lock is + * held, we must be careful not to reference state or the + * b_state field after dropping the lock. + */ + state = hdr->b_l1hdr.b_state; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + ASSERT3P(state, !=, arc_anon); + + /* this buffer is not on any list */ + ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); + + if (HDR_HAS_L2HDR(hdr)) { + ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + + mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); + list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr); + mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); + + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; } - buf_size = hdr->b_size; /* * Do we have more than one buf? */ - if (hdr->b_datacnt > 1) { + if (hdr->b_l1hdr.b_datacnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; uint64_t spa = hdr->b_spa; - arc_buf_contents_t type = hdr->b_type; + arc_buf_contents_t type = arc_buf_type(hdr); uint32_t flags = hdr->b_flags; - ASSERT(hdr->b_buf != buf || buf->b_next != NULL); + ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. */ (void) remove_reference(hdr, hash_lock, tag); - bufp = &hdr->b_buf; + bufp = &hdr->b_l1hdr.b_buf; while (*bufp != buf) bufp = &(*bufp)->b_next; *bufp = buf->b_next; buf->b_next = NULL; - ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); - atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); - if (refcount_is_zero(&hdr->b_refcnt)) { - uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; + ASSERT3P(state, !=, arc_l2c_only); + ASSERT3U(state->arcs_size, >=, hdr->b_size); + atomic_add_64(&state->arcs_size, -hdr->b_size); + if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + uint64_t *size; + + ASSERT3P(state, !=, arc_l2c_only); + size = &state->arcs_lsize[type]; ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } @@ -3681,69 +3954,60 @@ arc_release(arc_buf_t *buf, void *tag) * We're releasing a duplicate user data buffer, update * our statistics accordingly. 
*/ - if (hdr->b_type == ARC_BUFC_DATA) { + if (HDR_ISTYPE_DATA(hdr)) { ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, -hdr->b_size); } - hdr->b_datacnt -= 1; + hdr->b_l1hdr.b_datacnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); nhdr->b_size = blksz; nhdr->b_spa = spa; - nhdr->b_type = type; - nhdr->b_buf = buf; - nhdr->b_state = arc_anon; - nhdr->b_arc_access = 0; - nhdr->b_mru_hits = 0; - nhdr->b_mru_ghost_hits = 0; - nhdr->b_mfu_hits = 0; - nhdr->b_mfu_ghost_hits = 0; - nhdr->b_l2_hits = 0; + + nhdr->b_l1hdr.b_mru_hits = 0; + nhdr->b_l1hdr.b_mru_ghost_hits = 0; + nhdr->b_l1hdr.b_mfu_hits = 0; + nhdr->b_l1hdr.b_mfu_ghost_hits = 0; + nhdr->b_l1hdr.b_l2_hits = 0; nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; - nhdr->b_l2hdr = NULL; - nhdr->b_datacnt = 1; + nhdr->b_flags |= arc_bufc_to_flags(type); + nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; + + nhdr->b_l1hdr.b_buf = buf; + nhdr->b_l1hdr.b_datacnt = 1; + nhdr->b_l1hdr.b_state = arc_anon; + nhdr->b_l1hdr.b_arc_access = 0; nhdr->b_freeze_cksum = NULL; - (void) refcount_add(&nhdr->b_refcnt, tag); + + (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); atomic_add_64(&arc_anon->arcs_size, blksz); } else { mutex_exit(&buf->b_evict_lock); - ASSERT(refcount_count(&hdr->b_refcnt) == 1); - ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); + /* protected by hash lock */ + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - if (hdr->b_state != arc_anon) - arc_change_state(arc_anon, hdr, hash_lock); - hdr->b_arc_access = 0; - hdr->b_mru_hits = 0; - hdr->b_mru_ghost_hits = 0; - hdr->b_mfu_hits = 0; - hdr->b_mfu_ghost_hits = 0; - hdr->b_l2_hits = 0; - if (hash_lock) - mutex_exit(hash_lock); + hdr->b_l1hdr.b_mru_hits = 0; + hdr->b_l1hdr.b_mru_ghost_hits = 0; + hdr->b_l1hdr.b_mfu_hits = 0; + hdr->b_l1hdr.b_mfu_ghost_hits = 0; + hdr->b_l1hdr.b_l2_hits = 0; + arc_change_state(arc_anon, hdr, hash_lock); + hdr->b_l1hdr.b_arc_access = 0; + mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } buf->b_efunc = NULL; buf->b_private = NULL; - - if (l2hdr) { - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); - vdev_space_update(l2hdr->b_dev->l2ad_vdev, - -l2hdr->b_asize, 0, 0); - list_remove(l2hdr->b_dev->l2ad_buflist, hdr); - kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - ARCSTAT_INCR(arcstat_l2_size, -buf_size); - mutex_exit(&l2arc_buflist_mtx); - } } int @@ -3752,7 +4016,8 @@ arc_released(arc_buf_t *buf) int released; mutex_enter(&buf->b_evict_lock); - released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + released = (buf->b_data != NULL && + buf->b_hdr->b_l1hdr.b_state == arc_anon); mutex_exit(&buf->b_evict_lock); return (released); } @@ -3764,7 +4029,7 @@ arc_referenced(arc_buf_t *buf) int referenced; mutex_enter(&buf->b_evict_lock); - referenced = (refcount_count(&buf->b_hdr->b_refcnt)); + referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); mutex_exit(&buf->b_evict_lock); return (referenced); } @@ -3777,7 +4042,9 @@ arc_write_ready(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); + 
ASSERT(hdr->b_l1hdr.b_datacnt > 0); callback->awcb_ready(zio, buf, callback->awcb_private); /* @@ -3787,12 +4054,12 @@ arc_write_ready(zio_t *zio) * accounting for any re-write attempt. */ if (HDR_IO_IN_PROGRESS(hdr)) { - mutex_enter(&hdr->b_freeze_lock); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } - mutex_exit(&hdr->b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } arc_cksum_compute(buf, B_FALSE); hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; @@ -3817,7 +4084,7 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(hdr->b_acb == NULL); + ASSERT(hdr->b_l1hdr.b_acb == NULL); if (zio->io_error == 0) { if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { @@ -3825,7 +4092,6 @@ arc_write_done(zio_t *zio) } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); - hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; } } else { ASSERT(BUF_EMPTY(hdr)); @@ -3846,7 +4112,7 @@ arc_write_done(zio_t *zio) arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); - if (exists) { + if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove @@ -3856,7 +4122,8 @@ arc_write_done(zio_t *zio) if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); - ASSERT(refcount_is_zero(&exists->b_refcnt)); + ASSERT(refcount_is_zero( + &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); mutex_exit(hash_lock); arc_hdr_destroy(exists); @@ -3870,22 +4137,22 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_datacnt == 1); - ASSERT(hdr->b_state == arc_anon); + ASSERT(hdr->b_l1hdr.b_datacnt == 1); + ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; /* if it's not anon, we are doing a scrub */ - if (!exists && hdr->b_state == arc_anon) + if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; } - ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); kmem_free(callback, sizeof (arc_write_callback_t)); @@ -3905,8 +4172,9 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(ready != NULL); ASSERT(done != NULL); ASSERT(!HDR_IO_ERROR(hdr)); - ASSERT((hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) == 0); - ASSERT(hdr->b_acb == NULL); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT(hdr->b_l1hdr.b_datacnt > 0); if (l2arc) hdr->b_flags |= ARC_FLAG_L2CACHE; if (l2arc_compress) @@ -4121,25 +4389,35 @@ arc_init(void) mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + 
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; @@ -4250,7 +4528,7 @@ arc_fini(void) buf_fini(); - ASSERT(arc_loaned_bytes == 0); + ASSERT0(arc_loaned_bytes); } /* @@ -4409,7 +4687,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ - if (hdr->b_spa != spa_guid || hdr->b_l2hdr != NULL || + if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); @@ -4462,20 +4740,6 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) return (next); } -static void -l2arc_hdr_stat_add(void) -{ - ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE); - ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); -} - -static void -l2arc_hdr_stat_remove(void) -{ - ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE); - ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); -} - /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. @@ -4572,7 +4836,6 @@ l2arc_write_done(zio_t *zio) l2arc_dev_t *dev; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; - l2arc_buf_hdr_t *abl2; kmutex_t *hash_lock; int64_t bytes_dropped = 0; @@ -4582,7 +4845,7 @@ l2arc_write_done(zio_t *zio) ASSERT(dev != NULL); head = cb->l2wcb_head; ASSERT(head != NULL); - buflist = dev->l2ad_buflist; + buflist = &dev->l2ad_buflist; ASSERT(buflist != NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); @@ -4590,42 +4853,43 @@ l2arc_write_done(zio_t *zio) if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); - mutex_enter(&l2arc_buflist_mtx); + mutex_enter(&dev->l2ad_mtx); /* * All writes completed, or an error was hit. */ for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); - abl2 = hdr->b_l2hdr; - - /* - * Release the temporary compressed buffer as soon as possible. - */ - if (abl2->b_compress != ZIO_COMPRESS_OFF) - l2arc_release_cdata_buf(hdr); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * This buffer misses out. 
It may be in a stage - * of eviction. Its ARC_L2_WRITING flag will be + * of eviction. Its ARC_FLAG_L2_WRITING flag will be * left set, denying reads to this buffer. */ ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); continue; } + /* + * It's possible that this buffer got evicted from the L1 cache + * before we grabbed the vdev + hash locks, in which case + * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated. + * Only free the buffer if we still have an L1 hdr. + */ + if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL && + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) + l2arc_release_cdata_buf(hdr); + if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); - ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); - bytes_dropped += abl2->b_asize; - hdr->b_l2hdr = NULL; - kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + + ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); } @@ -4639,8 +4903,9 @@ l2arc_write_done(zio_t *zio) atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); - kmem_cache_free(hdr_cache, head); - mutex_exit(&l2arc_buflist_mtx); + ASSERT(!HDR_HAS_L1HDR(head)); + kmem_cache_free(hdr_l2only_cache, head); + mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); @@ -4778,16 +5043,12 @@ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; - l2arc_buf_hdr_t *abl2; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; int64_t bytes_evicted = 0; - buflist = dev->l2ad_buflist; - - if (buflist == NULL) - return; + buflist = &dev->l2ad_buflist; if (!all && dev->l2ad_first) { /* @@ -4810,7 +5071,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) uint64_t, taddr, boolean_t, all); top: - mutex_enter(&l2arc_buflist_mtx); + mutex_enter(&dev->l2ad_mtx); for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); @@ -4820,7 +5081,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; @@ -4836,9 +5097,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) continue; } - if (!all && hdr->b_l2hdr != NULL && - (hdr->b_l2hdr->b_daddr > taddr || - hdr->b_l2hdr->b_daddr < dev->l2ad_hand)) { + if (!all && HDR_HAS_L2HDR(hdr) && + (hdr->b_l2hdr.b_daddr > taddr || + hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. @@ -4847,15 +5108,8 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) break; } - if (HDR_FREE_IN_PROGRESS(hdr)) { - /* - * Already on the path to destruction. - */ - mutex_exit(hash_lock); - continue; - } - - if (hdr->b_state == arc_l2c_only) { + ASSERT(HDR_HAS_L2HDR(hdr)); + if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. 
@@ -4865,6 +5119,8 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } else { + ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); + ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write @@ -4878,26 +5134,18 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) /* * Tell ARC this no longer exists in L2ARC. */ - if (hdr->b_l2hdr != NULL) { - abl2 = hdr->b_l2hdr; - ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); - bytes_evicted += abl2->b_asize; - hdr->b_l2hdr = NULL; - kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - } + /* Tell ARC this no longer exists in L2ARC. */ + ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; list_remove(buflist, hdr); - /* - * This may have been leftover after a - * failed write. - */ + /* This may have been leftover after a failed write. */ hdr->b_flags &= ~ARC_FLAG_L2_WRITING; } mutex_exit(hash_lock); } - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0); dev->l2ad_evict = taddr; @@ -4939,8 +5187,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, pio = NULL; write_sz = write_asize = write_psize = 0; full = B_FALSE; - head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; + head->b_flags |= ARC_FLAG_HAS_L2HDR; /* * We will want to try to compress buffers that are at least 2x the @@ -4951,7 +5200,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Copy buffers for L2ARC writing. */ - mutex_enter(&l2arc_buflist_mtx); + mutex_enter(&dev->l2ad_mtx); for (try = 0; try <= 3; try++) { uint64_t passed_sz = 0; @@ -4973,7 +5222,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { - l2arc_buf_hdr_t *l2hdr; kmutex_t *hash_lock; uint64_t buf_sz; @@ -5016,7 +5264,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * l2arc_write_done() can find where the * write buffers begin without searching. */ - list_insert_head(dev->l2ad_buflist, head); + list_insert_head(&dev->l2ad_buflist, head); cb = kmem_alloc(sizeof (l2arc_write_callback_t), KM_SLEEP); @@ -5029,37 +5277,33 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Create and add a new L2ARC header. */ - l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), - KM_PUSHPAGE); - l2hdr->b_dev = dev; - arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS); - + hdr->b_l2hdr.b_dev = dev; + arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); hdr->b_flags |= ARC_FLAG_L2_WRITING; - /* * Temporarily stash the data buffer in b_tmp_cdata. * The subsequent write step will pick it up from - * there. This is because can't access hdr->b_buf + * there. 
This is because can't access b_l1hdr.b_buf * without holding the hash_lock, which we in turn * can't access without holding the ARC list locks * (which we want to avoid during compression/writing) */ - l2hdr->b_compress = ZIO_COMPRESS_OFF; - l2hdr->b_asize = hdr->b_size; - l2hdr->b_tmp_cdata = hdr->b_buf->b_data; - l2hdr->b_hits = 0; + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); + hdr->b_l2hdr.b_asize = hdr->b_size; + hdr->b_l2hdr.b_hits = 0; + hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; buf_sz = hdr->b_size; - hdr->b_l2hdr = l2hdr; + hdr->b_flags |= ARC_FLAG_HAS_L2HDR; - list_insert_head(dev->l2ad_buflist, hdr); + list_insert_head(&dev->l2ad_buflist, hdr); /* * Compute and store the buffer cksum before * writing. On debug the cksum is verified first. */ - arc_cksum_verify(hdr->b_buf); - arc_cksum_compute(hdr->b_buf, B_TRUE); + arc_cksum_verify(hdr->b_l1hdr.b_buf); + arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); mutex_exit(hash_lock); @@ -5075,8 +5319,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_sz); - mutex_exit(&l2arc_buflist_mtx); - kmem_cache_free(hdr_cache, head); + mutex_exit(&dev->l2ad_mtx); + ASSERT(!HDR_HAS_L1HDR(head)); + kmem_cache_free(hdr_l2only_cache, head); return (0); } @@ -5085,24 +5330,22 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * and work backwards, retracing the course of the buffer selector * loop above. */ - for (hdr = list_prev(dev->l2ad_buflist, head); hdr; - hdr = list_prev(dev->l2ad_buflist, hdr)) { - l2arc_buf_hdr_t *l2hdr; + for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; + hdr = list_prev(&dev->l2ad_buflist, hdr)) { uint64_t buf_sz; /* * We shouldn't need to lock the buffer here, since we flagged * it as ARC_FLAG_L2_WRITING in the previous step, but we must * take care to only access its L2 cache parameters. In - * particular, hdr->b_buf may be invalid by now due to + * particular, hdr->l1hdr.b_buf may be invalid by now due to * ARC eviction. */ - l2hdr = hdr->b_l2hdr; - l2hdr->b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - if (!l2arc_nocompress && (hdr->b_flags & ARC_FLAG_L2COMPRESS) && - l2hdr->b_asize >= buf_compress_minsz) { - if (l2arc_compress_buf(l2hdr)) { + if ((!l2arc_nocompress && HDR_L2COMPRESS(hdr)) && + hdr->b_l2hdr.b_asize >= buf_compress_minsz) { + if (l2arc_compress_buf(hdr)) { /* * If compression succeeded, enable headroom * boost on the next scan cycle. @@ -5115,8 +5358,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * Pick up the buffer data we had previously stashed away * (and now potentially also compressed). */ - buf_data = l2hdr->b_tmp_cdata; - buf_sz = l2hdr->b_asize; + buf_data = hdr->b_l1hdr.b_tmp_cdata; + buf_sz = hdr->b_l2hdr.b_asize; /* Compression may have squashed the buffer to zero length. */ if (buf_sz != 0) { @@ -5141,7 +5384,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, } } - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&dev->l2ad_mtx); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); @@ -5169,7 +5412,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Compresses an L2ARC buffer. - * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its + * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its * size in l2hdr->b_asize. 
This routine tries to compress the data and * depending on the compression result there are three possible outcomes: * *) The buffer was incompressible. The original l2hdr contents were left @@ -5187,24 +5430,31 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * buffer was incompressible). */ static boolean_t -l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) +l2arc_compress_buf(arc_buf_hdr_t *hdr) { abd_t *cdata; void *ddata; size_t csize, len, rounded; + l2arc_buf_hdr_t *l2hdr; - ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); - ASSERT(l2hdr->b_tmp_cdata != NULL); + ASSERT(HDR_HAS_L2HDR(hdr)); + + l2hdr = &hdr->b_l2hdr; + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF); + ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); len = l2hdr->b_asize; cdata = abd_alloc_linear(len); + ASSERT3P(cdata, !=, NULL); - ddata = abd_borrow_buf_copy(l2hdr->b_tmp_cdata, l2hdr->b_asize); + ddata = abd_borrow_buf_copy(hdr->b_l1hdr.b_tmp_cdata, l2hdr->b_asize); csize = zio_compress_data(ZIO_COMPRESS_LZ4, ddata, ABD_TO_BUF(cdata), l2hdr->b_asize); - abd_return_buf(l2hdr->b_tmp_cdata, ddata, l2hdr->b_asize); + abd_return_buf(hdr->b_l1hdr.b_tmp_cdata, ddata, l2hdr->b_asize); rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); if (rounded > csize) { @@ -5215,9 +5465,9 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) if (csize == 0) { /* zero block, indicate that there's nothing to write */ abd_free(cdata, len); - l2hdr->b_compress = ZIO_COMPRESS_EMPTY; + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY); l2hdr->b_asize = 0; - l2hdr->b_tmp_cdata = NULL; + hdr->b_l1hdr.b_tmp_cdata = NULL; ARCSTAT_BUMP(arcstat_l2_compress_zeros); return (B_TRUE); } else if (csize > 0 && csize < len) { @@ -5225,9 +5475,9 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) * Compression succeeded, we'll keep the cdata around for * writing and release it afterwards. */ - l2hdr->b_compress = ZIO_COMPRESS_LZ4; + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4); l2hdr->b_asize = csize; - l2hdr->b_tmp_cdata = cdata; + hdr->b_l1hdr.b_tmp_cdata = cdata; ARCSTAT_BUMP(arcstat_l2_compress_successes); return (B_TRUE); } else { @@ -5275,9 +5525,9 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) * need to fill its io_data after we're done restoring the * buffer's contents. */ - ASSERT(hdr->b_buf != NULL); - abd_zero(hdr->b_buf->b_data, hdr->b_size); - zio->io_data = zio->io_orig_data = hdr->b_buf->b_data; + ASSERT(hdr->b_l1hdr.b_buf != NULL); + abd_zero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); + zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; } else { void *ddata; ASSERT(zio->io_data != NULL); @@ -5318,17 +5568,17 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) static void l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) { - l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; - - if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) { + ASSERT(HDR_HAS_L1HDR(hdr)); + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) { /* * If the data was compressed, then we've allocated a * temporary buffer for it, so now we need to release it. 
	 */
-		ASSERT(l2hdr->b_tmp_cdata != NULL);
-		abd_free(l2hdr->b_tmp_cdata, hdr->b_size);
+		ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
+		abd_free(hdr->b_l1hdr.b_tmp_cdata,
+		    hdr->b_size);
 	}
-	l2hdr->b_tmp_cdata = NULL;
+	hdr->b_l1hdr.b_tmp_cdata = NULL;
 }

 /*
@@ -5470,13 +5720,13 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 	adddev->l2ad_writing = B_FALSE;
 	list_link_init(&adddev->l2ad_node);
+	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
 	/*
 	 * This is a list of all ARC buffers that are still valid on the
 	 * device.
 	 */
-	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
-	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
-	    offsetof(arc_buf_hdr_t, b_l2node));
+	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));

 	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);

@@ -5522,8 +5772,8 @@ l2arc_remove_vdev(vdev_t *vd)
 	 * Clear all buflists and ARC references.  L2ARC device flush.
 	 */
 	l2arc_evict(remdev, 0, B_TRUE);
-	list_destroy(remdev->l2ad_buflist);
-	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
+	list_destroy(&remdev->l2ad_buflist);
+	mutex_destroy(&remdev->l2ad_mtx);
 	kmem_free(remdev, sizeof (l2arc_dev_t));
 }

@@ -5538,7 +5788,6 @@ l2arc_init(void)
 	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_FSTRANS, NULL);
 	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);

 	l2arc_dev_list = &L2ARC_dev_list;
@@ -5563,7 +5812,6 @@ l2arc_fini(void)
 	mutex_destroy(&l2arc_feed_thr_lock);
 	cv_destroy(&l2arc_feed_thr_cv);
 	mutex_destroy(&l2arc_dev_mtx);
-	mutex_destroy(&l2arc_buflist_mtx);
 	mutex_destroy(&l2arc_free_on_write_mtx);

 	list_destroy(l2arc_dev_list);

From 21308417f78bc2ef02e9175add89893166e5eba4 Mon Sep 17 00:00:00 2001
From: Prakash Surya
Date: Mon, 12 Jan 2015 19:52:19 -0800
Subject: [PATCH 22/26] 5497 lock contention on arcs_mtx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed by: George Wilson
Reviewed by: Matthew Ahrens
Reviewed by: Richard Elling
Approved by: Dan McDonald

Porting notes:

This patch from illumos either duplicates or obviates a number of
ZoL-specific patches which have been added over time. As part of this
porting effort, several functions were copied in toto from illumos,
effectively undoing some of the previous ZoL-specific patches:

... trying to remember which ones ...
arc_adapt() ?
arc_adjust_impl()
arc_adjust_meta()
arc_adjust_type()

Other notes:

The illumos 5368 patch (ARC should cache more metadata), which was
never picked up by ZoL, is mostly undone by this patch.

Since ZoL relies on the kernel asynchronously calling the shrinker to
actually reap memory, the shrinker wakes up arc_reclaim_waiters_cv every
time it runs.

Similarly, and in order to operate in a manner similar to illumos, the
arc_adapt_thread does the same, analogously to the arc_reclaim_thread
in illumos.

Notable ZoL commits which conflict with this patch, or whose effects
are either duplicated or undone by it:

302f753 - Integrate ARC more tightly with Linux
39e055c - Adjust arc_p based on "bytes" in arc_shrink
f521ce1 - Allow "arc_p" to drop to zero or grow to "arc_c"
77765b5 - Remove "arc_meta_used" from arc_adjust calculation
94520ca - Prune metadata from ghost lists in arc_adjust_meta

arcstat.py needs to be updated to deal with new/missing stats. I have
removed "recycle_miss" from arcstat.py so that it continues to work.

Added minimal multilist trace support.
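To make the new locking scheme concrete, the sketch below shows how a
consumer is expected to drive the multilist interface added by this
patch (the full API appears in the include/sys/multilist.h hunk further
down). The my_obj_t type, my_obj_index_func(), and the pointer-hash
distribution are hypothetical, chosen only to illustrate the pattern;
the ARC itself links arc_buf_hdr_t structures through
b_l1hdr.b_arc_node and by default uses one sublist per online CPU
(zfs_arc_num_sublists_per_state).

/* Hypothetical element type; only the multilist_node_t linkage matters. */
typedef struct my_obj {
	uint64_t		mo_id;
	multilist_node_t	mo_node;
} my_obj_t;

/* Spread objects across the sublists, e.g. by hashing the pointer. */
static unsigned int
my_obj_index_func(multilist_t *ml, void *obj)
{
	return (((unsigned int)((uintptr_t)obj >> 7)) %
	    multilist_get_num_sublists(ml));
}

static multilist_t my_ml;

static void
my_ml_example(void)
{
	multilist_sublist_t *mls;
	my_obj_t *obj, *iter;

	/* one sublist, each with its own internal lock, per online CPU */
	multilist_create(&my_ml, sizeof (my_obj_t),
	    offsetof(my_obj_t, mo_node), num_online_cpus(),
	    my_obj_index_func);

	obj = kmem_zalloc(sizeof (my_obj_t), KM_SLEEP);
	multilist_link_init(&obj->mo_node);

	/* insert and remove take only the chosen sublist's lock */
	multilist_insert(&my_ml, obj);

	/* a scanner locks and walks one sublist at a time */
	mls = multilist_sublist_lock(&my_ml, 0);
	for (iter = multilist_sublist_head(mls); iter != NULL;
	    iter = multilist_sublist_next(mls, iter)) {
		/* examine iter; the other sublists remain unlocked */
	}
	multilist_sublist_unlock(mls);

	multilist_remove(&my_ml, obj);
	kmem_free(obj, sizeof (my_obj_t));
	multilist_destroy(&my_ml);
}

This is the pattern the rewritten eviction code relies on: while
arc_evict_state() holds one sublist lock, insertions hashed to the
other sublists proceed unhindered, rather than every thread
serializing on a single per-state arcs_mtx.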
I have removed "recycle_miss" from it in order that it still work. Added minimal multilist trace support. --------------------------------------------------- @l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, ‣ abd_t *buf_data; ‣ kmutex_t *list_lock = NULL; • to ‣ abd_t *buf_data; • #3115: void *buf_data; • #2119: abd_t *buf_data; for later, where applicable (#2129): ‣ zio_buf_free • equals ‣ abd_free ‣ zio_data_buf_free • equals ‣ abd_free ‣ zio_buf_alloc(size); • buf->b_data = zio_buf_alloc(size); • equals ‣ abd_alloc_linear(size); • buf->b_data = abd_alloc_linear(size); ‣ zio_data_buf_alloc(size); • buf->b_data = zio_data_buf_alloc(size); • equals ‣ abd_alloc_scatter(size); • buf->b_data = abd_alloc_scatter(size); @ arc_buf_add_ref(arc_buf_t *buf, void* tag) ‣ re-introduction of • arc_buf_free_on_write(void *data, size_t size, • removed due to "Revert "fix l2arc compression buffers leak"", https://github.com/dweeezil/zfs/commit/ba5ac5ce451bb9a2df15ee5a8ca9e08a38a90276 fix "warning: assignment from incompatible pointer type" error of • @ arc_buf_add_ref(arc_buf_t *buf, void* tag) • void (*free_func)(abd_t *, size_t)) (#2129) • arc_buf_free_on_write(void *data, size_t size, • void (*free_func)(void *, size_t)) ∘ to • arc_buf_free_on_write(abd_t *data, size_t size, • void (*free_func)(abd_t *, size_t)) reverting the zfs_arc_meta_adjust_restarts changes introduced by ‣ https://github.com/zfsonlinux/zfs/commit/bc88866657979c5658441e201e19df365c67ddfe ‣ Fix arc_adjust_meta() behavior ‣ thus removing the "zfs_arc_meta_adjust_restarts" parameter from code & documentation conflicts in removal & replacement due to omitting "Revert "Allow arc_evict_ghost() to only evict meta data" in ‣ @ arc_adjust(void) • if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { ‣ @ arc_adjust_meta(void) • if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) { ‣ @static arc_buf_hdr_t arc_eviction_hdr; • static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, • arc_buf_contents_t type); ‣ @ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, • arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, ‣ ... 
‣ see: https://github.com/dweeezil/zfs/commit/730d690da8774ff7539521c65facf68d0aa881be for the changes of the revert --- cmd/arcstat/arcstat.py | 2 - include/sys/Makefile.am | 2 + include/sys/arc.h | 8 +- include/sys/arc_impl.h | 22 +- include/sys/multilist.h | 106 ++ include/sys/trace_multilist.h | 76 ++ include/sys/zfs_context.h | 1 + lib/libzpool/Makefile.am | 1 + man/man5/zfs-module-parameters.5 | 14 - module/zfs/Makefile.in | 1 + module/zfs/arc.c | 1962 ++++++++++++++++++------------ module/zfs/dsl_pool.c | 9 +- module/zfs/multilist.c | 373 ++++++ module/zfs/trace.c | 3 + module/zfs/zio_inject.c | 6 +- 15 files changed, 1795 insertions(+), 791 deletions(-) create mode 100644 include/sys/multilist.h create mode 100644 include/sys/trace_multilist.h create mode 100644 module/zfs/multilist.c diff --git a/cmd/arcstat/arcstat.py b/cmd/arcstat/arcstat.py index b516cf285fdc..bbf43100aa4e 100755 --- a/cmd/arcstat/arcstat.py +++ b/cmd/arcstat/arcstat.py @@ -82,7 +82,6 @@ "mrug": [4, 1000, "MRU Ghost List hits per second"], "eskip": [5, 1000, "evict_skip per second"], "mtxmis": [6, 1000, "mutex_miss per second"], - "rmis": [4, 1000, "recycle_miss per second"], "dread": [5, 1000, "Demand accesses per second"], "pread": [5, 1000, "Prefetch accesses per second"], "l2hits": [6, 1000, "L2ARC hits per second"], @@ -406,7 +405,6 @@ def calculate(): v["mrug"] = d["mru_ghost_hits"] / sint v["mfug"] = d["mfu_ghost_hits"] / sint v["eskip"] = d["evict_skip"] / sint - v["rmis"] = d["recycle_miss"] / sint v["mtxmis"] = d["mutex_miss"] / sint if l2exist: diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 8b3fd524c555..72e02952bef1 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -34,6 +34,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/efi_partition.h \ $(top_srcdir)/include/sys/metaslab.h \ $(top_srcdir)/include/sys/metaslab_impl.h \ + $(top_srcdir)/include/sys/multilist.h \ $(top_srcdir)/include/sys/nvpair.h \ $(top_srcdir)/include/sys/nvpair_impl.h \ $(top_srcdir)/include/sys/range_tree.h \ @@ -54,6 +55,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/trace_dbuf.h \ $(top_srcdir)/include/sys/trace_dmu.h \ $(top_srcdir)/include/sys/trace_dnode.h \ + $(top_srcdir)/include/sys/trace_multilist.h \ $(top_srcdir)/include/sys/trace_txg.h \ $(top_srcdir)/include/sys/trace_zil.h \ $(top_srcdir)/include/sys/trace_zrlock.h \ diff --git a/include/sys/arc.h b/include/sys/arc.h index 8802450bf88e..5f1a164f39a2 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -39,6 +39,12 @@ extern "C" { #include #include +/* + * Used by arc_flush() to inform arc_evict_state() that it should evict + * all available buffers from the arc state being passed in. 
+ */ +#define ARC_EVICT_ALL -1ULL + typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef struct arc_prune arc_prune_t; @@ -201,7 +207,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); boolean_t arc_clear_callback(arc_buf_t *buf); -void arc_flush(spa_t *spa); +void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(uint64_t reserve, uint64_t txg); diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 6e29bfdd73b3..650b9a44af8c 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -67,10 +67,22 @@ extern "C" { */ typedef struct arc_state { - list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ - uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ - uint64_t arcs_size; /* total amount of data in this state */ - kmutex_t arcs_mtx; + /* + * list of evictable buffers + */ + multilist_t arcs_list[ARC_BUFC_NUMTYPES]; + /* + * total amount of evictable data in this state + */ + uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; + /* + * total amount of data in this state; this includes: evictable, + * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. + */ + uint64_t arcs_size; + /* + * supports the "dbufs" kstat + */ arc_state_type_t arcs_state; } arc_state_t; @@ -136,7 +148,7 @@ typedef struct l1arc_buf_hdr { /* protected by arc state mutex */ arc_state_t *b_state; - list_node_t b_arc_node; + multilist_node_t b_arc_node; /* updated atomically */ clock_t b_arc_access; diff --git a/include/sys/multilist.h b/include/sys/multilist.h new file mode 100644 index 000000000000..5ebb7fe1a1d5 --- /dev/null +++ b/include/sys/multilist.h @@ -0,0 +1,106 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + */ + +#ifndef _SYS_MULTILIST_H +#define _SYS_MULTILIST_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef list_node_t multilist_node_t; +typedef struct multilist multilist_t; +typedef struct multilist_sublist multilist_sublist_t; +typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *); + +struct multilist_sublist { + /* + * The mutex used internally to implement thread safe insertions + * and removals to this individual sublist. It can also be locked + * by a consumer using multilist_sublist_{lock,unlock}, which is + * useful if a consumer needs to traverse the list in a thread + * safe manner. + */ + kmutex_t mls_lock; + /* + * The actual list object containing all objects in this sublist. + */ + list_t mls_list; + /* + * Pad to cache line (64 bytes), in an effort to try and prevent + * cache line contention. + */ + uint8_t mls_pad[24]; +}; + +struct multilist { + /* + * This is used to get to the multilist_node_t structure given + * the void *object contained on the list. + */ + size_t ml_offset; + /* + * The number of sublists used internally by this multilist. + */ + uint64_t ml_num_sublists; + /* + * The array of pointers to the actual sublists. 
+ */ + multilist_sublist_t *ml_sublists; + /* + * Pointer to function which determines the sublist to use + * when inserting and removing objects from this multilist. + * Please see the comment above multilist_create for details. + */ + multilist_sublist_index_func_t *ml_index_func; +}; + +void multilist_destroy(multilist_t *); +void multilist_create(multilist_t *, size_t, size_t, unsigned int, + multilist_sublist_index_func_t *); + +void multilist_insert(multilist_t *, void *); +void multilist_remove(multilist_t *, void *); +int multilist_is_empty(multilist_t *); + +unsigned int multilist_get_num_sublists(multilist_t *); +unsigned int multilist_get_random_index(multilist_t *); + +multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); +void multilist_sublist_unlock(multilist_sublist_t *); + +void multilist_sublist_insert_head(multilist_sublist_t *, void *); +void multilist_sublist_insert_tail(multilist_sublist_t *, void *); +void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); +void multilist_sublist_remove(multilist_sublist_t *, void *); + +void *multilist_sublist_head(multilist_sublist_t *); +void *multilist_sublist_tail(multilist_sublist_t *); +void *multilist_sublist_next(multilist_sublist_t *, void *); +void *multilist_sublist_prev(multilist_sublist_t *, void *); + +void multilist_link_init(multilist_node_t *); +int multilist_link_active(multilist_node_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MULTILIST_H */ diff --git a/include/sys/trace_multilist.h b/include/sys/trace_multilist.h new file mode 100644 index 000000000000..11d2f2701ac4 --- /dev/null +++ b/include/sys/trace_multilist.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#if !defined(_TRACE_MULTILIST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MULTILIST_H + +#include +#include + +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * multilist_t *, ..., + * unsigned int, ..., + * void *, ...); + */ + +DECLARE_EVENT_CLASS(zfs_multilist_insert_remove_class, + TP_PROTO(multilist_t *ml, unsigned sublist_idx, void *obj), + TP_ARGS(ml, sublist_idx, obj), + TP_STRUCT__entry( + __field(size_t, ml_offset) + __field(uint64_t, ml_num_sublists) + + __field(unsigned int, sublist_idx) + ), + TP_fast_assign( + __entry->ml_offset = ml->ml_offset; + __entry->ml_num_sublists = ml->ml_num_sublists; + + __entry->sublist_idx = sublist_idx; + ), + TP_printk("ml { offset %ld numsublists %llu sublistidx %u } ", + __entry->ml_offset, __entry->ml_num_sublists, __entry->sublist_idx) +); + +#define DEFINE_MULTILIST_INSERT_REMOVE_EVENT(name) \ +DEFINE_EVENT(zfs_multilist_insert_remove_class, name, \ + TP_PROTO(multilist_t *ml, unsigned int sublist_idx, void *obj), \ + TP_ARGS(ml, sublist_idx, obj)) +DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__insert); +DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__remove); + +#endif /* _TRACE_MULTILIST_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_multilist +#include + +#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index b8eff58bc615..cc534f2c0801 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -610,6 +610,7 @@ extern void delay(clock_t ticks); } while (0); #define max_ncpus 64 +#define num_online_cpus() (sysconf(_SC_NPROCESSORS_ONLN)) #define minclsyspri 60 #define maxclsyspri 99 diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 60bd06b7d07b..07c4787409a0 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -56,6 +56,7 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zfs/lzjb.c \ $(top_srcdir)/module/zfs/lz4.c \ $(top_srcdir)/module/zfs/metaslab.c \ + $(top_srcdir)/module/zfs/multilist.c \ $(top_srcdir)/module/zfs/range_tree.c \ $(top_srcdir)/module/zfs/refcount.c \ $(top_srcdir)/module/zfs/rrwlock.c \ diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index fe31e292a792..4b3dc3666db0 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -411,20 +411,6 @@ pruning the inode and dentry caches. Default value: \fB10,000\fR. .RE -.sp -.ne 2 -.na -\fBzfs_arc_meta_adjust_restarts\fR (ulong) -.ad -.RS 12n -The number of restart passes to make while scanning the ARC attempting -the free buffers in order to stay below the \fBzfs_arc_meta_limit\fR. -This value should not need to be tuned but is available to facilitate -performance analysis. -.sp -Default value: \fB4096\fR. 
-.RE - .sp .ne 2 .na diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index e24a53d5eaf6..c33976b93bbd 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -38,6 +38,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/gzip.o $(MODULE)-objs += @top_srcdir@/module/zfs/lzjb.o $(MODULE)-objs += @top_srcdir@/module/zfs/lz4.o $(MODULE)-objs += @top_srcdir@/module/zfs/metaslab.o +$(MODULE)-objs += @top_srcdir@/module/zfs/multilist.o $(MODULE)-objs += @top_srcdir@/module/zfs/range_tree.o $(MODULE)-objs += @top_srcdir@/module/zfs/refcount.o $(MODULE)-objs += @top_srcdir@/module/zfs/rrwlock.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 92c79aa352ea..0d4a226fc1c4 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -135,6 +135,7 @@ #include #include #include +#include #include #ifdef _KERNEL #include @@ -155,9 +156,14 @@ boolean_t arc_watch = B_FALSE; #endif -static kmutex_t arc_reclaim_thr_lock; -static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ -static uint8_t arc_thread_exit; +static kmutex_t arc_reclaim_lock; +static kcondvar_t arc_reclaim_thread_cv; +static boolean_t arc_reclaim_thread_exit; +static kcondvar_t arc_reclaim_waiters_cv; + +static kmutex_t arc_user_evicts_lock; +static kcondvar_t arc_user_evicts_cv; +static boolean_t arc_user_evicts_thread_exit; /* number of objects to prune from caches when arc_meta_limit is reached */ int zfs_arc_meta_prune = 10000; @@ -168,14 +174,27 @@ typedef enum arc_reclaim_strategy { } arc_reclaim_strategy_t; /* - * The number of iterations through arc_evict_*() before we - * drop & reacquire the lock. + * The number of headers to evict in arc_evict_state_impl() before + * dropping the sublist lock and evicting from another sublist. A lower + * value means we're more likely to evict the "correct" header (i.e. the + * oldest header in the arc state), but comes with higher overhead + * (i.e. more invocations of arc_evict_state_impl()). */ -int arc_evict_iterations = 100; +int zfs_arc_evict_batch_limit = 10; + +/* + * The number of sublists used for each of the arc state lists. If this + * is not set to a suitable value by the user, it will be configured to + * the number of CPUs on the system in arc_init(). + */ +int zfs_arc_num_sublists_per_state = 0; /* number of seconds before growing cache again */ int zfs_arc_grow_retry = 5; +/* shift of arc_c for calculating overflow limit in arc_get_data_buf */ +int zfs_arc_overflow_shift = 8; + /* disable anon data aggressively growing arc_p */ int zfs_arc_p_aggressive_disable = 1; @@ -200,6 +219,12 @@ int zfs_disable_dup_eviction = 0; /* average block used to size buf_hash_table */ int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ +/* + * minimum lifespan of a prefetch block in clock ticks + * (initialized in arc_init()) + */ +static int arc_min_prefetch_lifespan; + /* * If this percent of memory is free, don't throttle. */ @@ -221,11 +246,7 @@ static boolean_t arc_warm; unsigned long zfs_arc_max = 0; unsigned long zfs_arc_min = 0; unsigned long zfs_arc_meta_limit = 0; - -/* - * Limit the number of restarts in arc_adjust_meta() - */ -unsigned long zfs_arc_meta_adjust_restarts = 4096; +unsigned long zfs_arc_meta_min = 0; /* The 6 states: */ static arc_state_t ARC_anon; @@ -251,7 +272,6 @@ typedef struct arc_stats { kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_deleted; - kstat_named_t arcstat_recycle_miss; /* * Number of buffers that could not be evicted because the hash lock * was held by another thread. 
The lock may not necessarily be held @@ -265,9 +285,15 @@ typedef struct arc_stats { * not from the spa we're trying to evict from. */ kstat_named_t arcstat_evict_skip; + /* + * Number of times arc_evict_state() was unable to evict enough + * buffers to reach it's target amount. + */ + kstat_named_t arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_ineligible; + kstat_named_t arcstat_evict_l2_skip; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; @@ -306,11 +332,12 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; - kstat_named_t arcstat_l2_writes_hdr_miss; + kstat_named_t arcstat_l2_writes_lock_retry; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_cdata_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; @@ -333,6 +360,7 @@ typedef struct arc_stats { kstat_named_t arcstat_meta_used; kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_meta_max; + kstat_named_t arcstat_meta_min; } arc_stats_t; static arc_stats_t arc_stats = { @@ -351,12 +379,13 @@ static arc_stats_t arc_stats = { { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, - { "recycle_miss", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, + { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, + { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, @@ -395,11 +424,12 @@ static arc_stats_t arc_stats = { { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, - { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, + { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, + { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, @@ -422,6 +452,7 @@ static arc_stats_t arc_stats = { { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, + { "arc_meta_min", KSTAT_DATA_UINT64 }, }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -487,6 +518,7 @@ static arc_state_t *arc_l2c_only; #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ @@ -496,7 +528,6 @@ static arc_state_t *arc_l2c_only; static list_t arc_prune_list; static kmutex_t arc_prune_mtx; static arc_buf_t *arc_eviction_list; -static kmutex_t 
arc_eviction_mtx; static arc_buf_hdr_t arc_eviction_hdr; #define GHOST_STATE(state) \ @@ -638,9 +669,7 @@ static uint8_t l2arc_thread_exit; static void arc_get_data_buf(arc_buf_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); -static int arc_evict_needed(arc_buf_contents_t); -static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type); +static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); @@ -832,6 +861,7 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag) mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); list_link_init(&hdr->b_l1hdr.b_arc_node); list_link_init(&hdr->b_l2hdr.b_l2node); + multilist_link_init(&hdr->b_l1hdr.b_arc_node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); @@ -876,6 +906,7 @@ hdr_full_dest(void *vbuf, void *unused) cv_destroy(&hdr->b_l1hdr.b_cv); refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } @@ -983,18 +1014,31 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; + + /* Verify previous threads set to NULL before freeing */ + ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL); } else { ASSERT(hdr->b_l1hdr.b_buf == NULL); ASSERT0(hdr->b_l1hdr.b_datacnt); - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + /* - * We might be removing the L1hdr of a buffer which was just - * written out to L2ARC. If such a buffer is compressed then we - * need to free its b_tmp_cdata before destroying the header. + * If we've reached here, We must have been called from + * arc_evict_hdr(), as such we should have already been + * removed from any ghost list we were previously on + * (which protects us from racing with arc_evict_state), + * thus no locking is needed during this check. */ - if (hdr->b_l1hdr.b_tmp_cdata != NULL && - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) - l2arc_release_cdata_buf(hdr); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + + /* + * A buffer must not be moved into the arc_l2c_only + * state if it's not finished being written out to the + * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field + * might try to be accessed, even though it was removed. + */ + VERIFY(!HDR_L2_WRITING(hdr)); + VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); + nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; } /* @@ -1201,14 +1245,13 @@ add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) (state != arc_anon)) { /* We don't use the L2-only state list. 
*/ if (state != arc_l2c_only) { + arc_buf_contents_t type = arc_buf_type(hdr); uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; - list_t *list = &state->arcs_list[arc_buf_type(hdr)]; - uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; + multilist_t *list = &state->arcs_list[type]; + uint64_t *size = &state->arcs_lsize[type]; + + multilist_remove(list, hdr); - ASSERT(!MUTEX_HELD(&state->arcs_mtx)); - mutex_enter(&state->arcs_mtx); - ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); - list_remove(list, hdr); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_datacnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); @@ -1217,7 +1260,6 @@ add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) ASSERT(delta > 0); ASSERT3U(*size, >=, delta); atomic_add_64(size, -delta); - mutex_exit(&state->arcs_mtx); } /* remove the prefetch flag if we get a reference */ hdr->b_flags &= ~ARC_FLAG_PREFETCH; @@ -1240,16 +1282,15 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) */ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; + arc_buf_contents_t type = arc_buf_type(hdr); + multilist_t *list = &state->arcs_list[type]; + uint64_t *size = &state->arcs_lsize[type]; + + multilist_insert(list, hdr); - ASSERT(!MUTEX_HELD(&state->arcs_mtx)); - mutex_enter(&state->arcs_mtx); - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); - list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr); ASSERT(hdr->b_l1hdr.b_datacnt > 0); atomic_add_64(size, hdr->b_size * hdr->b_l1hdr.b_datacnt); - mutex_exit(&state->arcs_mtx); } return (cnt); } @@ -1301,9 +1342,10 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) abi->abi_state_index = -1; abi->abi_size = hdr->b_size; +#if 0 /* XXX tsc */ if (l1hdr && state && state_index && list_link_active(&l1hdr->b_arc_node)) { - list_t *list = &state->arcs_list[arc_buf_type(hdr)]; + multilist_t *list = &state->arcs_list[arc_buf_type(hdr)]; arc_buf_hdr_t *h; mutex_enter(&state->arcs_mtx); @@ -1314,10 +1356,11 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) } mutex_exit(&state->arcs_mtx); } +#endif } /* - * Move the supplied buffer to the indicated state. The mutex + * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void @@ -1361,15 +1404,10 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { - int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); uint64_t *size = &old_state->arcs_lsize[buftype]; - if (use_mutex) - mutex_enter(&old_state->arcs_mtx); - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); - list_remove(&old_state->arcs_list[buftype], hdr); + multilist_remove(&old_state->arcs_list[buftype], hdr); /* * If prefetching out of the ghost cache, @@ -1382,12 +1420,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, } ASSERT3U(*size, >=, from_delta); atomic_add_64(size, -from_delta); - - if (use_mutex) - mutex_exit(&old_state->arcs_mtx); } if (new_state != arc_anon && new_state != arc_l2c_only) { - int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); uint64_t *size = &new_state->arcs_lsize[buftype]; /* @@ -1397,10 +1431,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * beforehand. 
*/ ASSERT(HDR_HAS_L1HDR(hdr)); - if (use_mutex) - mutex_enter(&new_state->arcs_mtx); - - list_insert_head(&new_state->arcs_list[buftype], hdr); + multilist_insert(&new_state->arcs_list[buftype], hdr); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { @@ -1409,9 +1440,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, to_delta = hdr->b_size; } atomic_add_64(size, to_delta); - - if (use_mutex) - mutex_exit(&new_state->arcs_mtx); } } @@ -1433,8 +1461,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. */ - ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && - list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); + ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && + multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void @@ -1537,6 +1565,7 @@ arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type) hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_datacnt = 1; + hdr->b_l1hdr.b_tmp_cdata = NULL; arc_get_data_buf(buf); @@ -1667,6 +1696,21 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) data, metadata, hits); } +static void +arc_buf_free_on_write(abd_t *data, size_t size, + void (*free_func)(abd_t *, size_t)) +{ + l2arc_data_free_t *df; + + df = kmem_alloc(sizeof (*df), KM_SLEEP); + df->l2df_data = data; + df->l2df_size = size; + df->l2df_func = free_func; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); +} + /* * Free the arc data buffer. If an l2arc write is in progress, * the buffer is placed on l2arc_free_on_write to be freed later. @@ -1677,26 +1721,74 @@ arc_buf_data_free(arc_buf_t *buf, void (*free_func)(abd_t *, size_t)) arc_buf_hdr_t *hdr = buf->b_hdr; if (HDR_L2_WRITING(hdr)) { - l2arc_data_free_t *df; - df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); - df->l2df_data = buf->b_data; - df->l2df_size = hdr->b_size; - df->l2df_func = free_func; - mutex_enter(&l2arc_free_on_write_mtx); - list_insert_head(l2arc_free_on_write, df); - mutex_exit(&l2arc_free_on_write_mtx); + arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { free_func(buf->b_data, hdr->b_size); } } +static void +arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L2HDR(hdr)); + ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); + + /* + * The b_tmp_cdata field is linked off of the b_l1hdr, so if + * that doesn't exist, the header is in the arc_l2c_only state, + * and there isn't anything to free (it's already been freed). + */ + if (!HDR_HAS_L1HDR(hdr)) + return; + + /* + * The header isn't being written to the l2arc device, thus it + * shouldn't have a b_tmp_cdata to free. + */ + if (!HDR_L2_WRITING(hdr)) { + ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); + return; + } + + /* + * The header does not have compression enabled. This can be due + * to the buffer not being compressible, or because we're + * freeing the buffer before the second phase of + * l2arc_write_buffers() has started (which does the compression + * step). In either case, b_tmp_cdata does not point to a + * separately compressed buffer, so there's nothing to free (it + * points to the same buffer as the arc_buf_t's b_data field).
+ */ + if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + hdr->b_l1hdr.b_tmp_cdata = NULL; + return; + } + + /* + * There's nothing to free since the buffer was all zeros and + * compressed to a zero length buffer. + */ + if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) { + ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); + return; + } + + ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr))); + + arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, + hdr->b_size, abd_free); + + ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); + hdr->b_l1hdr.b_tmp_cdata = NULL; +} + /* * Free up buf->b_data and if 'remove' is set, then pull the * arc_buf_t off of the arc_buf_hdr_t's list and free it. */ static void -arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) +arc_buf_destroy(arc_buf_t *buf, boolean_t remove) { arc_buf_t **bufp; @@ -1709,17 +1801,17 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) arc_cksum_verify(buf); arc_buf_unwatch(buf); - if (!recycle) { - if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf, abd_free); - arc_space_return(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf, abd_free); - arc_space_return(size, ARC_SPACE_DATA); - } + if (type == ARC_BUFC_METADATA) { + arc_buf_data_free(buf, abd_free); + arc_space_return(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + arc_buf_data_free(buf, abd_free); + arc_space_return(size, ARC_SPACE_DATA); } - if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { uint64_t *cnt = &state->arcs_lsize[type]; ASSERT(refcount_is_zero( @@ -1787,6 +1879,12 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) list_remove(&l2hdr->b_dev->l2ad_buflist, hdr); + /* + * We don't want to leak the b_tmp_cdata buffer that was + * allocated in l2arc_write_buffers() + */ + arc_buf_l2_cdata_free(hdr); + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); @@ -1810,27 +1908,26 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (buf->b_efunc != NULL) { - mutex_enter(&arc_eviction_mtx); + mutex_enter(&arc_user_evicts_lock); mutex_enter(&buf->b_evict_lock); ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, - FALSE); + arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); hdr->b_l1hdr.b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_eviction_mtx); + cv_signal(&arc_user_evicts_cv); + mutex_exit(&arc_user_evicts_lock); } else { - arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, - TRUE); + arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); } } } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); kmem_cache_free(hdr_full_cache, hdr); } else { @@ -1856,7 +1953,7 @@ arc_buf_free(arc_buf_t *buf, void *tag) (void) remove_reference(hdr, hash_lock, tag); if (hdr->b_l1hdr.b_datacnt > 1) { - arc_buf_destroy(buf, FALSE, TRUE); + arc_buf_destroy(buf, TRUE); } else { ASSERT(buf == hdr->b_l1hdr.b_buf); ASSERT(buf->b_efunc == NULL); @@ -1870,16 +1967,16 @@ arc_buf_free(arc_buf_t *buf, void *tag) * this buffer unless the write completes before we finish * decrementing the reference count.
*/ - mutex_enter(&arc_eviction_mtx); + mutex_enter(&arc_user_evicts_lock); (void) remove_reference(hdr, NULL, tag); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); - mutex_exit(&arc_eviction_mtx); + mutex_exit(&arc_user_evicts_lock); if (destroy_hdr) arc_hdr_destroy(hdr); } else { if (remove_reference(hdr, NULL, tag) > 0) - arc_buf_destroy(buf, FALSE, TRUE); + arc_buf_destroy(buf, TRUE); else arc_hdr_destroy(hdr); } @@ -1909,7 +2006,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) (void) remove_reference(hdr, hash_lock, tag); if (hdr->b_l1hdr.b_datacnt > 1) { if (no_callback) - arc_buf_destroy(buf, FALSE, TRUE); + arc_buf_destroy(buf, TRUE); } else if (no_callback) { ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); ASSERT(buf->b_efunc == NULL); @@ -1970,410 +2067,678 @@ arc_buf_eviction_needed(arc_buf_t *buf) } /* - * Evict buffers from list until we've removed the specified number of - * bytes. Move the removed buffers to the appropriate evict state. - * If the recycle flag is set, then attempt to "recycle" a buffer: - * - look for a buffer to evict that is `bytes' long. - * - return the data block from this buffer rather than freeing it. - * This flag is used by callers that are trying to make space for a - * new buffer in a full arc cache. + * Evict the arc_buf_hdr that is provided as a parameter. The resultant + * state of the header is dependent on its state prior to entering this + * function. The following transitions are possible: * - * This function makes a "best effort". It skips over any buffers - * it can't get a hash_lock on, and so may not catch all candidates. - * It may also return without evicting as much space as requested. + * - arc_mru -> arc_mru_ghost + * - arc_mfu -> arc_mfu_ghost + * - arc_mru_ghost -> arc_l2c_only + * - arc_mru_ghost -> deleted + * - arc_mfu_ghost -> arc_l2c_only + * - arc_mfu_ghost -> deleted */ -static abd_t * -arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, - arc_buf_contents_t type) +static int64_t +arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { - arc_state_t *evicted_state; - uint64_t bytes_evicted = 0, skipped = 0, missed = 0; - arc_buf_hdr_t *hdr, *hdr_prev = NULL; - list_t *list = &state->arcs_list[type]; - kmutex_t *hash_lock; - boolean_t have_lock; - abd_t *stolen = NULL; - arc_buf_hdr_t marker = {{{ 0 }}}; - int count = 0; - - ASSERT(state == arc_mru || state == arc_mfu); - - evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; + arc_state_t *evicted_state, *state; + int64_t bytes_evicted = 0; -top: - /* - * The ghost list lock must be acquired first in order to prevent - * a 3 party deadlock: - * - * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by - * l2ad_mtx in arc_hdr_realloc - * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx - * - arc_evict acquires arc_*_ghost->arcs_mtx, followed by - * arc_*_ghost->arcs_mtx and forms a deadlock cycle. - * - * This situation is avoided by acquiring the ghost list lock first.
- */ - mutex_enter(&evicted_state->arcs_mtx); - mutex_enter(&state->arcs_mtx); - - for (hdr = list_tail(list); hdr; hdr = hdr_prev) { - hdr_prev = list_prev(list, hdr); - /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(hdr) || - ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && - ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < - zfs_arc_min_prefetch_lifespan)) { - skipped++; - continue; - } - /* "lookahead" for better eviction candidate */ - if (recycle && hdr->b_size != bytes && - hdr_prev && hdr_prev->b_size == bytes) - continue; + ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(HDR_HAS_L1HDR(hdr)); - /* ignore markers */ - if (hdr->b_spa == 0) - continue; + state = hdr->b_l1hdr.b_state; + if (GHOST_STATE(state)) { + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(hdr->b_l1hdr.b_buf == NULL); /* - * It may take a long time to evict all the bufs requested. - * To avoid blocking all arc activity, periodically drop - * the arcs_mtx and give other threads a chance to run - * before reacquiring the lock. - * - * If we are looking for a buffer to recycle, we are in - * the hot code path, so don't sleep. + * l2arc_write_buffers() relies on a header's L1 portion + * (i.e. its b_tmp_cdata field) during its write phase. + * Thus, we cannot push a header onto the arc_l2c_only + * state (removing its L1 piece) until the header is + * done being written to the l2arc. */ - if (!recycle && count++ > arc_evict_iterations) { - list_insert_after(list, hdr, &marker); - mutex_exit(&state->arcs_mtx); - mutex_exit(&evicted_state->arcs_mtx); - kpreempt(KPREEMPT_SYNC); - mutex_enter(&evicted_state->arcs_mtx); - mutex_enter(&state->arcs_mtx); - hdr_prev = list_prev(list, &marker); - list_remove(list, &marker); - count = 0; - continue; + if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { + ARCSTAT_BUMP(arcstat_evict_l2_skip); + return (bytes_evicted); } - hash_lock = HDR_LOCK(hdr); - have_lock = MUTEX_HELD(hash_lock); - if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); - while (hdr->b_l1hdr.b_buf) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - if (!mutex_tryenter(&buf->b_evict_lock)) { - missed += 1; - break; - } - if (buf->b_data != NULL) { - bytes_evicted += hdr->b_size; - if (recycle && - arc_buf_type(hdr) == type && - hdr->b_size == bytes && - !HDR_L2_WRITING(hdr)) { - stolen = buf->b_data; - recycle = FALSE; - } - } - if (buf->b_efunc != NULL) { - mutex_enter(&arc_eviction_mtx); - arc_buf_destroy(buf, - buf->b_data == stolen, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&arc_eviction_mtx); - mutex_exit(&buf->b_evict_lock); - } else { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, - buf->b_data == stolen, TRUE); - } - } + ARCSTAT_BUMP(arcstat_deleted); + bytes_evicted += hdr->b_size; - if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_cached, - hdr->b_size); - } else { - if (l2arc_write_eligible(hdr->b_spa, hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_eligible, - hdr->b_size); - } else { - ARCSTAT_INCR( - arcstat_evict_l2_ineligible, - hdr->b_size); - } - } + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); - if (hdr->b_l1hdr.b_datacnt == 0) { - arc_change_state(evicted_state, hdr, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); - } - if (!have_lock) - 
mutex_exit(hash_lock); - if (bytes >= 0 && bytes_evicted >= bytes) - break; + if (HDR_HAS_L2HDR(hdr)) { + /* + * This buffer is cached on the 2nd Level ARC; + * don't destroy the header. + */ + arc_change_state(arc_l2c_only, hdr, hash_lock); + /* + * dropping from L1+L2 cached to L2-only, + * realloc to remove the L1 header. + */ + hdr = arc_hdr_realloc(hdr, hdr_full_cache, + hdr_l2only_cache); } else { - missed += 1; + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); } + return (bytes_evicted); } - mutex_exit(&state->arcs_mtx); - mutex_exit(&evicted_state->arcs_mtx); + ASSERT(state == arc_mru || state == arc_mfu); + evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; - if (list == &state->arcs_list[ARC_BUFC_DATA] && - (bytes < 0 || bytes_evicted < bytes)) { - /* Prevent second pass from recycling metadata into data */ - recycle = FALSE; - type = ARC_BUFC_METADATA; - list = &state->arcs_list[type]; - goto top; + /* prefetch buffers have a minimum lifespan */ + if (HDR_IO_IN_PROGRESS(hdr) || + ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && + ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < + arc_min_prefetch_lifespan)) { + ARCSTAT_BUMP(arcstat_evict_skip); + return (bytes_evicted); } - if (bytes_evicted < bytes) - dprintf("only evicted %lld bytes from %x\n", - (longlong_t)bytes_evicted, state->arcs_state); - - if (skipped) - ARCSTAT_INCR(arcstat_evict_skip, skipped); + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); + while (hdr->b_l1hdr.b_buf) { + arc_buf_t *buf = hdr->b_l1hdr.b_buf; + if (!mutex_tryenter(&buf->b_evict_lock)) { + ARCSTAT_BUMP(arcstat_mutex_miss); + break; + } + if (buf->b_data != NULL) + bytes_evicted += hdr->b_size; + if (buf->b_efunc != NULL) { + mutex_enter(&arc_user_evicts_lock); + arc_buf_destroy(buf, FALSE); + hdr->b_l1hdr.b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + cv_signal(&arc_user_evicts_cv); + mutex_exit(&arc_user_evicts_lock); + mutex_exit(&buf->b_evict_lock); + } else { + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy(buf, TRUE); + } + } - if (missed) - ARCSTAT_INCR(arcstat_mutex_miss, missed); + if (HDR_HAS_L2HDR(hdr)) { + ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); + } else { + if (l2arc_write_eligible(hdr->b_spa, hdr)) + ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); + else + ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); + } - /* - * Note: we have just evicted some data into the ghost state, - * potentially putting the ghost size over the desired size. Rather - * that evicting from the ghost list in this hot code path, leave - * this chore to the arc_reclaim_thread(). - */ + if (hdr->b_l1hdr.b_datacnt == 0) { + arc_change_state(evicted_state, hdr, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); + } - return (stolen); + return (bytes_evicted); } -/* - * Remove buffers from list until we've removed the specified number of - * bytes. Destroy the buffers that are removed. 
- */ -static void -arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type) +static uint64_t +arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, + uint64_t spa, int64_t bytes) { - arc_buf_hdr_t *hdr, *hdr_prev; - arc_buf_hdr_t marker; - list_t *list = &state->arcs_list[type]; + multilist_sublist_t *mls; + uint64_t bytes_evicted = 0; + arc_buf_hdr_t *hdr; kmutex_t *hash_lock; - uint64_t bytes_deleted = 0; - uint64_t bufs_skipped = 0; - int count = 0; + int evict_count = 0; - ASSERT(GHOST_STATE(state)); - bzero(&marker, sizeof (marker)); -top: - mutex_enter(&state->arcs_mtx); - for (hdr = list_tail(list); hdr; hdr = hdr_prev) { - hdr_prev = list_prev(list, hdr); - if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES) - panic("invalid hdr=%p", (void *)hdr); - if (spa && hdr->b_spa != spa) - continue; + ASSERT3P(marker, !=, NULL); + ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL)); + + mls = multilist_sublist_lock(ml, idx); + + for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; + hdr = multilist_sublist_prev(mls, marker)) { + if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || + (evict_count >= zfs_arc_evict_batch_limit)) + break; - /* ignore markers */ + /* + * To keep our iteration location, move the marker + * forward. Since we're not holding hdr's hash lock, we + * must be very careful and not remove 'hdr' from the + * sublist. Otherwise, other consumers might mistake the + * 'hdr' as not being on a sublist when they call the + * multilist_link_active() function (they all rely on + * the hash lock protecting concurrent insertions and + * removals). multilist_sublist_move_forward() was + * specifically implemented to ensure this is the case + * (only 'marker' will be removed and re-inserted). + */ + multilist_sublist_move_forward(mls, marker); + + /* + * The only case where the b_spa field should ever be + * zero is for the marker headers inserted by + * arc_evict_state(). It's possible for multiple threads + * to be calling arc_evict_state() concurrently (e.g. + * dsl_pool_close() and zio_inject_fault()), so we must + * skip any markers we see from these other threads. + */ if (hdr->b_spa == 0) continue; - hash_lock = HDR_LOCK(hdr); - /* caller may be trying to modify this buffer, skip it */ - if (MUTEX_HELD(hash_lock)) + /* we're only interested in evicting buffers of a certain spa */ + if (spa != 0 && hdr->b_spa != spa) { + ARCSTAT_BUMP(arcstat_evict_skip); continue; + } + + hash_lock = HDR_LOCK(hdr); /* - * It may take a long time to evict all the bufs requested. - * To avoid blocking all arc activity, periodically drop - * the arcs_mtx and give other threads a chance to run - * before reacquiring the lock. + * We aren't calling this function from any code path + * that would already be holding a hash lock, so we're + * asserting on this assumption to be defensive in case + * this ever changes. Without this check, it would be + * possible to incorrectly increment arcstat_mutex_miss + * below (e.g. if the code changed such that we called + * this function with a hash lock held).
*/ - if (count++ > arc_evict_iterations) { - list_insert_after(list, hdr, &marker); - mutex_exit(&state->arcs_mtx); - kpreempt(KPREEMPT_SYNC); - mutex_enter(&state->arcs_mtx); - hdr_prev = list_prev(list, &marker); - list_remove(list, &marker); - count = 0; - continue; - } + ASSERT(!MUTEX_HELD(hash_lock)); + if (mutex_tryenter(hash_lock)) { - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(!HDR_HAS_L1HDR(hdr) || - hdr->b_l1hdr.b_buf == NULL); - ARCSTAT_BUMP(arcstat_deleted); - bytes_deleted += hdr->b_size; + uint64_t evicted = arc_evict_hdr(hdr, hash_lock); + mutex_exit(hash_lock); - if (HDR_HAS_L2HDR(hdr)) { - /* - * This buffer is cached on the 2nd Level ARC; - * don't destroy the header. - */ - arc_change_state(arc_l2c_only, hdr, hash_lock); - /* - * dropping from L1+L2 cached to L2-only, - * realloc to remove the L1 header. - */ - hdr = arc_hdr_realloc(hdr, hdr_full_cache, - hdr_l2only_cache); - mutex_exit(hash_lock); - } else { - arc_change_state(arc_anon, hdr, hash_lock); - mutex_exit(hash_lock); - arc_hdr_destroy(hdr); - } + bytes_evicted += evicted; - DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); - if (bytes >= 0 && bytes_deleted >= bytes) - break; - } else if (bytes < 0) { /* - * Insert a list marker and then wait for the - * hash lock to become available. Once its - * available, restart from where we left off. + * If evicted is zero, arc_evict_hdr() must have + * decided to skip this header; don't increment + * evict_count in this case. */ - list_insert_after(list, hdr, &marker); - mutex_exit(&state->arcs_mtx); - mutex_enter(hash_lock); - mutex_exit(hash_lock); - mutex_enter(&state->arcs_mtx); - hdr_prev = list_prev(list, &marker); - list_remove(list, &marker); + if (evicted != 0) + evict_count++; + + /* + * If arc_size isn't overflowing, signal any + * threads that might happen to be waiting. + * + * For each header evicted, we wake up a single + * thread. If we used cv_broadcast, we could + * wake up "too many" threads causing arc_size + * to significantly overflow arc_c; since + * arc_get_data_buf() doesn't check for overflow + * when it's woken up (it doesn't because it's + * possible for the ARC to be overflowing while + * full of un-evictable buffers, and the + * function should proceed in this case). + * + * If threads are left sleeping, due to not + * using cv_broadcast, they will be woken up + * just before arc_reclaim_thread() sleeps. + */ + mutex_enter(&arc_reclaim_lock); + if (!arc_is_overflowing()) + cv_signal(&arc_reclaim_waiters_cv); + mutex_exit(&arc_reclaim_lock); } else { - bufs_skipped += 1; + ARCSTAT_BUMP(arcstat_mutex_miss); } } - mutex_exit(&state->arcs_mtx); - if (list == &state->arcs_list[ARC_BUFC_DATA] && - (bytes < 0 || bytes_deleted < bytes)) { - list = &state->arcs_list[ARC_BUFC_METADATA]; - goto top; + multilist_sublist_unlock(mls); + + return (bytes_evicted); +} + +/* + * Evict buffers from the given arc state, until we've removed the + * specified number of bytes. Move the removed buffers to the + * appropriate evict state. + * + * This function makes a "best effort". It skips over any buffers + * it can't get a hash_lock on, and so may not catch all candidates. + * It may also return without evicting as much space as requested. + * + * If bytes is specified using the special value ARC_EVICT_ALL, this + * will evict all available (i.e. unlocked and evictable) buffers from + * the given arc state, which is used by arc_flush().
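 + * + * For example (an illustrative call only, not code taken from this file), evicting up to one megabyte of MRU data buffers across all pools would look like: + * + *	(void) arc_evict_state(arc_mru, 0, 1024 * 1024, ARC_BUFC_DATA);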
+ */ +static uint64_t +arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, + arc_buf_contents_t type) +{ + uint64_t total_evicted = 0; + multilist_t *ml = &state->arcs_list[type]; + int num_sublists; + arc_buf_hdr_t **markers; + int i; + + ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL)); + + num_sublists = multilist_get_num_sublists(ml); + + /* + * If we've tried to evict from each sublist, made some + * progress, but still have not hit the target number of bytes + * to evict, we want to keep trying. The markers allow us to + * pick up where we left off for each individual sublist, rather + * than starting from the tail each time. + */ + markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); + for (i = 0; i < num_sublists; i++) { + multilist_sublist_t *mls; + + markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); + + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_adjust_type() and + * arc_evict_state_impl(). + */ + markers[i]->b_spa = 0; + + mls = multilist_sublist_lock(ml, i); + multilist_sublist_insert_tail(mls, markers[i]); + multilist_sublist_unlock(mls); + } + + /* + * While we haven't hit our target number of bytes to evict, or + * we're evicting all available buffers. + */ + while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { + /* + * Start eviction using a randomly selected sublist, + * this is to try and evenly balance eviction across all + * sublists. Always starting at the same sublist + * (e.g. index 0) would cause evictions to favor certain + * sublists over others. + */ + int sublist_idx = multilist_get_random_index(ml); + uint64_t scan_evicted = 0; + + for (i = 0; i < num_sublists; i++) { + uint64_t bytes_remaining; + uint64_t bytes_evicted; + + if (bytes == ARC_EVICT_ALL) + bytes_remaining = ARC_EVICT_ALL; + else if (total_evicted < bytes) + bytes_remaining = bytes - total_evicted; + else + break; + + bytes_evicted = arc_evict_state_impl(ml, sublist_idx, + markers[sublist_idx], spa, bytes_remaining); + + scan_evicted += bytes_evicted; + total_evicted += bytes_evicted; + + /* we've reached the end, wrap to the beginning */ + if (++sublist_idx >= num_sublists) + sublist_idx = 0; + } + + /* + * If we didn't evict anything during this scan, we have + * no reason to believe we'll evict more during another + * scan, so break the loop. + */ + if (scan_evicted == 0) { + /* This isn't possible, let's make that obvious */ + ASSERT3S(bytes, !=, 0); + + /* + * When bytes is ARC_EVICT_ALL, the only way to + * break the loop is when scan_evicted is zero. + * In that case, we actually have evicted enough, + * so we don't want to increment the kstat. + */ + if (bytes != ARC_EVICT_ALL) { + ASSERT3S(total_evicted, <, bytes); + ARCSTAT_BUMP(arcstat_evict_not_enough); + } + + break; + } + } + + for (i = 0; i < num_sublists; i++) { + multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_remove(mls, markers[i]); + multilist_sublist_unlock(mls); + + kmem_cache_free(hdr_full_cache, markers[i]); + } + kmem_free(markers, sizeof (*markers) * num_sublists); + + return (total_evicted); +} + +/* + * Flush all "evictable" data of the given type from the arc state + * specified. This will not evict any "active" buffers (i.e. referenced). + * + * When 'retry' is set to FALSE, the function will make a single pass + * over the state and evict any buffers that it can. Since it doesn't + * continually retry the eviction, it might end up leaving some buffers + * in the ARC due to lock misses. 
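 + * + * For instance (illustrative only), a single best-effort sweep of all evictable MRU data buffers would be: + * + *	(void) arc_flush_state(arc_mru, 0, ARC_BUFC_DATA, B_FALSE);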
+ * + * When 'retry' is set to TRUE, the function will continually retry the + * eviction until *all* evictable buffers have been removed from the + * state. As a result, if concurrent insertions into the state are + * allowed (e.g. if the ARC isn't shutting down), this function might + * wind up in an infinite loop, continually trying to evict buffers. + */ +static uint64_t +arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, + boolean_t retry) +{ + uint64_t evicted = 0; + + while (state->arcs_lsize[type] != 0) { + evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); + + if (!retry) + break; + } + + return (evicted); +} + +/* + * Evict the specified number of bytes from the state specified, + * restricting eviction to the spa and type given. This function + * prevents us from trying to evict more from a state's list than + * is "evictable", and it skips eviction altogether when passed a + * negative value for "bytes". In contrast, arc_evict_state() will + * evict everything it can, when passed a negative value for "bytes". + */ +static uint64_t +arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, + arc_buf_contents_t type) +{ + int64_t delta; + + if (bytes > 0 && state->arcs_lsize[type] > 0) { + delta = MIN(state->arcs_lsize[type], bytes); + return (arc_evict_state(state, spa, delta, type)); + } + + return (0); +} + +/* + * Evict metadata buffers from the cache, such that arc_meta_used is + * capped by the arc_meta_limit tunable. + */ +static uint64_t +arc_adjust_meta(void) +{ + uint64_t total_evicted = 0; + int64_t target; + + /* + * If we're over the meta limit, we want to evict enough + * metadata to get back under the meta limit. We don't want to + * evict so much that we drop the MRU below arc_p, though. If + * we're over the meta limit more than we're over arc_p, we + * evict some from the MRU here, and some from the MFU below. + */ + target = MIN((int64_t)(arc_meta_used - arc_meta_limit), + (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p)); + + total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + + /* + * Similar to the above, we want to evict enough bytes to get us + * below the meta limit, but not so much as to drop us below the + * space allotted to the MFU (which is defined as arc_c - arc_p). + */ + target = MIN((int64_t)(arc_meta_used - arc_meta_limit), + (int64_t)(arc_mfu->arcs_size - (arc_c - arc_p))); + + total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + + return (total_evicted); +} + +/* + * Return the type of the oldest buffer in the given arc state. + * + * This function will select a random sublist of type ARC_BUFC_DATA and + * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist + * is compared, and the type which contains the "older" buffer will be + * returned. + */ +static arc_buf_contents_t +arc_adjust_type(arc_state_t *state) +{ + multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; + multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; + int data_idx = multilist_get_random_index(data_ml); + int meta_idx = multilist_get_random_index(meta_ml); + multilist_sublist_t *data_mls; + multilist_sublist_t *meta_mls; + arc_buf_contents_t type; + arc_buf_hdr_t *data_hdr; + arc_buf_hdr_t *meta_hdr; + + /* + * We keep the sublist lock until we're finished, to prevent + * the headers from being destroyed via arc_evict_state().
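 + * + * Note that while both sublist locks are held at once below, they belong to two different multilists (data and metadata), and arc_evict_state_impl() only ever holds a single sublist lock at a time; so taking both here should not introduce a lock ordering cycle with the eviction path.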
+ */ + data_mls = multilist_sublist_lock(data_ml, data_idx); + meta_mls = multilist_sublist_lock(meta_ml, meta_idx); + + /* + * These two loops are to ensure we skip any markers that + * might be at the tail of the lists due to arc_evict_state(). + */ + + for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; + data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { + if (data_hdr->b_spa != 0) + break; + } + + for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; + meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { + if (meta_hdr->b_spa != 0) + break; } - if (bufs_skipped) { - ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); - ASSERT(bytes >= 0); + if (data_hdr == NULL && meta_hdr == NULL) { + type = ARC_BUFC_DATA; + } else if (data_hdr == NULL) { + ASSERT3P(meta_hdr, !=, NULL); + type = ARC_BUFC_METADATA; + } else if (meta_hdr == NULL) { + ASSERT3P(data_hdr, !=, NULL); + type = ARC_BUFC_DATA; + } else { + ASSERT3P(data_hdr, !=, NULL); + ASSERT3P(meta_hdr, !=, NULL); + + /* The headers can't be on the sublist without an L1 header */ + ASSERT(HDR_HAS_L1HDR(data_hdr)); + ASSERT(HDR_HAS_L1HDR(meta_hdr)); + + if (data_hdr->b_l1hdr.b_arc_access < + meta_hdr->b_l1hdr.b_arc_access) { + type = ARC_BUFC_DATA; + } else { + type = ARC_BUFC_METADATA; + } } - if (bytes_deleted < bytes) - dprintf("only deleted %lld bytes from %p\n", - (longlong_t)bytes_deleted, state); + multilist_sublist_unlock(meta_mls); + multilist_sublist_unlock(data_mls); + + return (type); } -static void +/* + * Evict buffers from the cache, such that arc_size is capped by arc_c. + */ +static uint64_t arc_adjust(void) { - int64_t adjustment, delta; + uint64_t total_evicted = 0; + uint64_t bytes; + int64_t target; /* - * Adjust MRU size + * If we're over arc_meta_limit, we want to correct that before + * potentially evicting data buffers below. */ + total_evicted += arc_adjust_meta(); - adjustment = MIN((int64_t)(arc_size - arc_c), - (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p)); - - if (adjustment > 0 && arc_mru->arcs_size > 0) { - delta = MIN(arc_mru->arcs_size, adjustment); - (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); - } + /* + * Adjust MRU size + * + * If we're over the target cache size, we want to evict enough + * from the list to get back to our target size. We don't want + * to evict too much from the MRU, such that it drops below + * arc_p. So, if we're over our target cache size more than + * the MRU is over arc_p, we'll evict enough to get back to + * arc_p here, and then evict more from the MFU below. + */ + target = MIN((int64_t)(arc_size - arc_c), + (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - + arc_p)); /* - * Adjust MFU size + * If we're below arc_meta_min, always prefer to evict data. + * Otherwise, try to satisfy the requested number of bytes to + * evict from the type which contains older buffers; in an + * effort to keep newer buffers in the cache regardless of their + * type. If we cannot satisfy the number of bytes from this + * type, spill over into the next type. */ + if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && + arc_meta_used > arc_meta_min) { + bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + total_evicted += bytes; + + /* + * If we couldn't evict our target number of bytes from + * metadata, we try to get the rest from data. 
+ */ + target -= bytes; - adjustment = arc_size - arc_c; + total_evicted += + arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); + } else { + bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; + + /* + * If we couldn't evict our target number of bytes from + * data, we try to get the rest from metadata. + */ + target -= bytes; - if (adjustment > 0 && arc_mfu->arcs_size > 0) { - delta = MIN(arc_mfu->arcs_size, adjustment); - (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); + total_evicted += + arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); } /* - * Adjust ghost lists + * Adjust MFU size + * + * Now that we've tried to evict enough from the MRU to get its + * size back to arc_p, if we're still above the target cache + * size, we evict the rest from the MFU. */ + target = arc_size - arc_c; - adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; + if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && + arc_meta_used > arc_meta_min) { + bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + total_evicted += bytes; - if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { - delta = MIN(arc_mru_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_DATA); - } + /* + * If we couldn't evict our target number of bytes from + * metadata, we try to get the rest from data. + */ + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); + } else { + bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; - adjustment = - arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; + /* + * If we couldn't evict our target number of bytes from + * data, we try to get the rest from metadata. + */ + target -= bytes; - if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { - delta = MIN(arc_mfu_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_DATA); + total_evicted += + arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); } -/* - * Request that arc user drop references so that N bytes can be released - * from the cache. This provides a mechanism to ensure the arc can honor - * the arc_meta_limit and reclaim buffers which are pinned in the cache - * by higher layers. (i.e. the zpl) - */ -static void -arc_do_user_prune(int64_t adjustment) -{ - arc_prune_func_t *func; - void *private; - arc_prune_t *cp, *np; + /* + * Adjust ghost lists + * + * In addition to the above, the ARC also defines target values + * for the ghost lists. The sum of the mru list and mru ghost + * list should never exceed the target size of the cache, and + * the sum of the mru list, mfu list, mru ghost list, and mfu + * ghost list should never exceed twice the target size of the + * cache. The following logic enforces these limits on the ghost + * caches, and evicts from them as needed.
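 + * + * For example (illustrative numbers only): with arc_c at 1000 MB, an mru list of 600 MB, and an mru ghost list of 700 MB, the first target below works out to 600 + 700 - 1000 = 300 MB, so 300 MB is evicted from the mru ghost list (data buffers first, then metadata).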
+ */ + target = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; - mutex_enter(&arc_prune_mtx); + bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; - cp = list_head(&arc_prune_list); - while (cp != NULL) { - func = cp->p_pfunc; - private = cp->p_private; - np = list_next(&arc_prune_list, cp); - refcount_add(&cp->p_refcnt, func); - mutex_exit(&arc_prune_mtx); + target -= bytes; - if (func != NULL) - func(adjustment, private); + total_evicted += + arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); - mutex_enter(&arc_prune_mtx); + /* + * We assume the sum of the mru list and mfu list is less than + * or equal to arc_c (we enforced this above), which means we + * can use the simpler of the two equations below: + * + * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c + * mru ghost + mfu ghost <= arc_c + */ + target = arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; - /* User removed prune callback concurrently with execution */ - if (refcount_remove(&cp->p_refcnt, func) == 0) { - ASSERT(!list_link_active(&cp->p_node)); - refcount_destroy(&cp->p_refcnt); - kmem_free(cp, sizeof (*cp)); - } + bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; - cp = np; - } + target -= bytes; - ARCSTAT_BUMP(arcstat_prune); - mutex_exit(&arc_prune_mtx); + total_evicted += + arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); + + return (total_evicted); } static void arc_do_user_evicts(void) { - mutex_enter(&arc_eviction_mtx); + mutex_enter(&arc_user_evicts_lock); while (arc_eviction_list != NULL) { arc_buf_t *buf = arc_eviction_list; arc_eviction_list = buf->b_next; mutex_enter(&buf->b_evict_lock); buf->b_hdr = NULL; mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_eviction_mtx); + mutex_exit(&arc_user_evicts_lock); if (buf->b_efunc != NULL) VERIFY0(buf->b_efunc(buf->b_private)); @@ -2381,147 +2746,39 @@ arc_do_user_evicts(void) buf->b_efunc = NULL; buf->b_private = NULL; kmem_cache_free(buf_cache, buf); - mutex_enter(&arc_eviction_mtx); + mutex_enter(&arc_user_evicts_lock); } - mutex_exit(&arc_eviction_mtx); + mutex_exit(&arc_user_evicts_lock); } -/* - * The goal of this function is to evict enough meta data buffers from the - * ARC in order to enforce the arc_meta_limit. Achieving this is slightly - * more complicated than it appears because it is common for data buffers - * to have holds on meta data buffers. In addition, dnode meta data buffers - * will be held by the dnodes in the block preventing them from being freed. - * This means we can't simply traverse the ARC and expect to always find - * enough unheld meta data buffer to release. - * - * Therefore, this function has been updated to make alternating passes - * over the ARC releasing data buffers and then newly unheld meta data - * buffers. This ensures forward progress is maintained and arc_meta_used - * will decrease. Normally this is sufficient, but if required the ARC - * will call the registered prune callbacks causing dentry and inodes to - * be dropped from the VFS cache. This will make dnode meta data buffers - * available for reclaim. - */ -static void -arc_adjust_meta(void) +void +arc_flush(spa_t *spa, boolean_t retry) { - int64_t adjustmnt, delta, prune = 0; - arc_buf_contents_t type = ARC_BUFC_DATA; - unsigned long restarts = zfs_arc_meta_adjust_restarts; - -restart: - /* - * This slightly differs than the way we evict from the mru in - * arc_adjust because we don't have a "target" value (i.e. no - * "meta" arc_p). 
As a result, I think we can completely - * cannibalize the metadata in the MRU before we evict the - * metadata from the MFU. I think we probably need to implement a - * "metadata arc_p" value to do this properly. - */ - adjustmnt = arc_meta_used - arc_meta_limit; - - if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) { - delta = MIN(arc_mru->arcs_lsize[type], adjustmnt); - arc_evict(arc_mru, 0, delta, FALSE, type); - adjustmnt -= delta; - } - - /* - * We can't afford to recalculate adjustmnt here. If we do, - * new metadata buffers can sneak into the MRU or ANON lists, - * thus penalize the MFU metadata. Although the fudge factor is - * small, it has been empirically shown to be significant for - * certain workloads (e.g. creating many empty directories). As - * such, we use the original calculation for adjustmnt, and - * simply decrement the amount of data evicted from the MRU. - */ - - if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) { - delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt); - arc_evict(arc_mfu, 0, delta, FALSE, type); - } - - adjustmnt = arc_meta_used - arc_meta_limit; - - if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { - delta = MIN(adjustmnt, - arc_mru_ghost->arcs_lsize[type]); - arc_evict_ghost(arc_mru_ghost, 0, delta, type); - adjustmnt -= delta; - } - - if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) { - delta = MIN(adjustmnt, - arc_mfu_ghost->arcs_lsize[type]); - arc_evict_ghost(arc_mfu_ghost, 0, delta, type); - } + uint64_t guid = 0; /* - * If after attempting to make the requested adjustment to the ARC - * the meta limit is still being exceeded then request that the - * higher layers drop some cached objects which have holds on ARC - * meta buffers. Requests to the upper layers will be made with - * increasingly large scan sizes until the ARC is below the limit. + * If retry is TRUE, a spa must not be specified since we have + * no good way to determine if all of a spa's buffers have been + * evicted from an arc state. */ - if (arc_meta_used > arc_meta_limit) { - if (type == ARC_BUFC_DATA) { - type = ARC_BUFC_METADATA; - } else { - type = ARC_BUFC_DATA; - - if (zfs_arc_meta_prune) { - prune += zfs_arc_meta_prune; - arc_do_user_prune(prune); - } - } - - if (restarts > 0) { - restarts--; - goto restart; - } - } -} - -/* - * Flush all *evictable* data from the cache for the given spa. - * NOTE: this will not touch "active" (i.e. referenced) data. 
- */ -void -arc_flush(spa_t *spa) -{ - uint64_t guid = 0; + ASSERT(!retry || spa == 0); if (spa != NULL) guid = spa_load_guid(spa); - while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa != NULL) - break; - } - while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa != NULL) - break; - } - while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa != NULL) - break; - } - while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa != NULL) - break; - } + (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); + + (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); - arc_evict_ghost(arc_mru_ghost, guid, -1, ARC_BUFC_DATA); - arc_evict_ghost(arc_mfu_ghost, guid, -1, ARC_BUFC_DATA); + (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); + + (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); - mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); - mutex_exit(&arc_reclaim_thr_lock); ASSERT(spa || arc_eviction_list == NULL); } @@ -2554,7 +2811,7 @@ arc_shrink(uint64_t bytes) } if (arc_size > arc_c) - arc_adjust(); + (void) arc_adjust(); } static void @@ -2584,6 +2841,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) } } + kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); } @@ -2594,19 +2852,39 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) * reclamation has been entirely delegated to the arc_shrinker_func() * which is registered with the VM. To reflect this change in behavior * the arc_reclaim thread has been renamed to arc_adapt. + * + * The following comment from arc_reclaim_thread() in illumos is still + * applicable: + * + * Threads can block in arc_get_data_buf() waiting for this thread to evict + * enough data and signal them to proceed. When this happens, the threads in + * arc_get_data_buf() are sleeping while holding the hash lock for their + * particular arc header. Thus, we must be careful to never sleep on a + * hash lock in this thread. This is to prevent the following deadlock: + * + * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", + * waiting for the reclaim thread to signal it. + * + * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, + * fails, and goes to sleep forever. + * + * This possible deadlock is avoided by always acquiring a hash lock + * using mutex_tryenter() from arc_reclaim_thread(). 
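 + * + * Schematically (a simplified sketch; the real logic lives in arc_get_data_buf() and arc_evict_state_impl()): + * + *	arc_get_data_buf():		(may hold a hash lock) + *		cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); + * + *	arc_evict_state_impl():		(never blocks on a hash lock) + *		if (!mutex_tryenter(hash_lock)) + *			ARCSTAT_BUMP(arcstat_mutex_miss);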
*/ static void arc_adapt_thread(void) { callb_cpr_t cpr; + uint64_t arc_evicted; - CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); + CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); - mutex_enter(&arc_reclaim_thr_lock); - while (arc_thread_exit == 0) { + mutex_enter(&arc_reclaim_lock); + while (arc_reclaim_thread_exit == 0) { #ifndef _KERNEL arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; + mutex_exit(&arc_reclaim_lock); if (spa_get_random(100) == 0) { if (arc_no_grow) { @@ -2628,6 +2906,8 @@ arc_adapt_thread(void) arc_kmem_reap_now(last_reclaim, 0); arc_warm = B_TRUE; } +#else /* _KERNEL */ + mutex_exit(&arc_reclaim_lock); #endif /* !_KERNEL */ /* No recent memory pressure, allow the ARC to grow. */ @@ -2635,18 +2915,23 @@ arc_adapt_thread(void) ddi_time_after_eq(ddi_get_lbolt(), arc_grow_time)) arc_no_grow = FALSE; - arc_adjust_meta(); + arc_evicted = arc_adjust(); - arc_adjust(); + /* + * We're either no longer overflowing, or we + * can't evict anything more, so we should wake + * up any threads before we go to sleep. + */ + if (arc_size <= arc_c || arc_evicted == 0) + cv_broadcast(&arc_reclaim_waiters_cv); - if (arc_eviction_list != NULL) - arc_do_user_evicts(); + mutex_enter(&arc_reclaim_lock); /* block until needed, or one second, whichever is shorter */ CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv, - &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); - CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); + (void) cv_timedwait_interruptible(&arc_reclaim_thread_cv, + &arc_reclaim_lock, (ddi_get_lbolt() + hz)); + CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); /* Allow the module options to be changed */ @@ -2664,14 +2949,56 @@ arc_adapt_thread(void) zfs_arc_meta_limit <= arc_c_max && zfs_arc_meta_limit != arc_meta_limit) arc_meta_limit = zfs_arc_meta_limit; + } + + arc_reclaim_thread_exit = 0; + cv_broadcast(&arc_reclaim_thread_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ + thread_exit(); +} + +static void +arc_user_evicts_thread(void) +{ + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); + + mutex_enter(&arc_user_evicts_lock); + while (!arc_user_evicts_thread_exit) { + mutex_exit(&arc_user_evicts_lock); + arc_do_user_evicts(); + /* + * This is necessary in order for the mdb ::arc dcmd to + * show up-to-date information. Since the ::arc dcmd + * does not call the kstat's update function, without + * this call, the command may show stale stats for the + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even + * with this change, the data might be up to 1 second + * out of date, but that should suffice. The arc_state_t + * structures can be queried directly if more accurate + * information is needed. + */ + if (arc_ksp != NULL) + arc_ksp->ks_update(arc_ksp, KSTAT_READ); + mutex_enter(&arc_user_evicts_lock); + + /* + * Block until signaled, or after one second (we need to + * call the arc's kstat update function regularly).
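 + * (cv_timedwait_interruptible() returns either when signaled, e.g. by the cv_signal(&arc_user_evicts_cv) in arc_hdr_destroy(), or when the one-second timeout expires.)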
+ */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_interruptible(&arc_user_evicts_cv, + &arc_user_evicts_lock, ddi_get_lbolt() + hz); + CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock); } - arc_thread_exit = 0; - cv_broadcast(&arc_reclaim_thr_cv); - CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ + arc_user_evicts_thread_exit = FALSE; + cv_broadcast(&arc_user_evicts_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */ thread_exit(); } @@ -2772,9 +3099,11 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) return (SHRINK_STOP); /* Reclaim in progress */ - if (mutex_tryenter(&arc_reclaim_thr_lock) == 0) + if (mutex_tryenter(&arc_reclaim_lock) == 0) return (SHRINK_STOP); + mutex_exit(&arc_reclaim_lock); + /* * Evict the requested number of pages by shrinking arc_c the * requested amount. If there is nothing left to evict just @@ -2793,6 +3122,11 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) pages = SHRINK_STOP; } + /* + * We've reaped what we can, wake up threads. + */ + cv_broadcast(&arc_reclaim_waiters_cv); + /* * When direct reclaim is observed it usually indicates a rapid * increase in memory pressure. This occurs because the kswapd @@ -2808,8 +3142,6 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) ARCSTAT_BUMP(arcstat_memory_direct_count); } - mutex_exit(&arc_reclaim_thr_lock); - return (pages); } SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func); @@ -2884,43 +3216,25 @@ arc_adapt(int bytes, arc_state_t *state) } /* - * Check if the cache has reached its limits and eviction is required - * prior to insert. + * Check if arc_size has grown past our upper threshold, determined by + * zfs_arc_overflow_shift. */ -static int -arc_evict_needed(arc_buf_contents_t type) +static boolean_t +arc_is_overflowing(void) { - if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) - return (1); - - if (arc_no_grow) - return (1); + /* Always allow at least one block of overflow */ + uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, + arc_c >> zfs_arc_overflow_shift); - return (arc_size > arc_c); + return (arc_size >= arc_c + overflow); } /* - * The buffer, supplied as the first argument, needs a data block. - * So, if we are at cache max, determine which cache should be victimized. - * We have the following cases: - * - * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> - * In this situation if we're out of space, but the resident size of the MFU is - * under the limit, victimize the MFU cache to satisfy this insertion request. - * - * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> - * Here, we've used up all of the available space for the MRU, so we need to - * evict from our own cache instead. Evict from the set of resident MRU - * entries. - * - * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> - * c minus p represents the MFU space in the cache, since p is the size of the - * cache that is dedicated to the MRU. In this situation there's still space on - * the MFU side, so the MRU side needs to be victimized. - * - * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> - * MFU's resident set is consuming more space than it has been allotted. In - * this situation, we must victimize our own cache, the MFU, for this insertion. + * The buffer, supplied as the first argument, needs a data block. If we + * are hitting the hard limit for the cache size, we must sleep, waiting + * for the eviction thread to catch up. 
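(The "hard limit" here is the overflow threshold computed by + * arc_is_overflowing() above: arc_c plus MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift); as an illustration, with a 4 GB arc_c and a zfs_arc_overflow_shift of 8, that allows roughly 16 MB of overflow.) + * 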
If we're past the target size + * but below the hard limit, we'll only signal the reclaim thread and + * continue on. */ static void arc_get_data_buf(arc_buf_t *buf) @@ -2928,91 +3242,54 @@ arc_get_data_buf(arc_buf_t *buf) arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; uint64_t size = buf->b_hdr->b_size; arc_buf_contents_t type = arc_buf_type(buf->b_hdr); - arc_buf_contents_t evict = ARC_BUFC_DATA; - boolean_t recycle = TRUE; arc_adapt(size, state); /* - * We have not yet reached cache maximum size, - * just allocate a new buffer. + * If arc_size is currently overflowing, and has grown past our + * upper limit, we must be adding data faster than the evict + * thread can evict. Thus, to ensure we don't compound the + * problem by adding more data and forcing arc_size to grow even + * further past its target size, we halt and wait for the + * eviction thread to catch up. + * + * It's also possible that the reclaim thread is unable to evict + * enough buffers to get arc_size below the overflow limit (e.g. + * due to buffers being un-evictable, or hash lock collisions). + * In this case, we want to proceed regardless of whether we're + * overflowing; thus we don't use a while loop here. */ - if (!arc_evict_needed(type)) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = abd_alloc_linear(size); - arc_space_consume(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - buf->b_data = abd_alloc_scatter(size); - arc_space_consume(size, ARC_SPACE_DATA); + if (arc_is_overflowing()) { + mutex_enter(&arc_reclaim_lock); + + /* + * Now that we've acquired the lock, we may no longer be + * over the overflow limit, let's check. + * + * We're ignoring the case of spurious wakeups. If that + * were to happen, it'd let this thread consume an ARC + * buffer before it should have (i.e. before we're under + * the overflow limit and were signaled by the reclaim + * thread). As long as that is a rare occurrence, it + * shouldn't cause any harm. + */ + if (arc_is_overflowing()) { + cv_signal(&arc_reclaim_thread_cv); + cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); } - goto out; - } - /* - * If we are prefetching from the mfu ghost list, this buffer - * will end up on the mru list; so steal space from there. - */ - if (state == arc_mfu_ghost) - state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu; - else if (state == arc_mru_ghost) - state = arc_mru; - - if (state == arc_mru || state == arc_anon) { - uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; - state = (arc_mfu->arcs_lsize[type] >= size && - arc_p > mru_used) ? arc_mfu : arc_mru; - } else { - /* MFU cases */ - uint64_t mfu_space = arc_c - arc_p; - state = (arc_mru->arcs_lsize[type] >= size && - mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; + mutex_exit(&arc_reclaim_lock); } - /* - * Evict data buffers prior to metadata buffers, unless we're - * over the metadata limit and adding a metadata buffer. - */ if (type == ARC_BUFC_METADATA) { - if (arc_meta_used >= arc_meta_limit) - evict = ARC_BUFC_METADATA; - else - /* - * In this case, we're evicting data while - * adding metadata. Thus, to prevent recycling a - * data buffer into a metadata buffer, recycling - * is disabled in the following arc_evict call.
- */ - recycle = FALSE; + buf->b_data = abd_alloc_linear(size); + arc_space_consume(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = abd_alloc_scatter(size); + arc_space_consume(size, ARC_SPACE_DATA); } - if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = abd_alloc_linear(size); - arc_space_consume(size, ARC_SPACE_META); - - /* - * If we are unable to recycle an existing meta buffer - * signal the reclaim thread. It will notify users - * via the prune callback to drop references. The - * prune callback in run in the context of the reclaim - * thread to avoid deadlocking on the hash_lock. - * Of course, only do this when recycle is true. - */ - if (recycle) - cv_signal(&arc_reclaim_thr_cv); - } else { - ASSERT(type == ARC_BUFC_DATA); - buf->b_data = abd_alloc_scatter(size); - arc_space_consume(size, ARC_SPACE_DATA); - } - - /* Only bump this if we tried to recycle and failed */ - if (recycle) - ARCSTAT_BUMP(arcstat_recycle_miss); - } - ASSERT(buf->b_data != NULL); -out: /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. @@ -3021,7 +3298,17 @@ arc_get_data_buf(arc_buf_t *buf) arc_buf_hdr_t *hdr = buf->b_hdr; atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); - if (list_link_active(&hdr->b_l1hdr.b_arc_node)) { + + /* + * If this is reached via arc_read, the link is + * protected by the hash lock. If reached via + * arc_buf_alloc, the header should not be accessed by + * any other thread. And, if reached via arc_read_done, + * the hash lock will protect it if it's found in the + * hash table; otherwise no other thread should be + * trying to [add|remove]_reference it. + */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], size); @@ -3030,8 +3317,7 @@ arc_get_data_buf(arc_buf_t *buf) * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ - if (!zfs_arc_p_aggressive_disable && - arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && + if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && arc_anon->arcs_size + arc_mru->arcs_size > arc_p) arc_p = MIN(arc_c, arc_p + size); } @@ -3074,7 +3360,8 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) */ if (HDR_PREFETCH(hdr)) { if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - ASSERT(list_link_active( + /* link protected by hash lock */ + ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { hdr->b_flags &= ~ARC_FLAG_PREFETCH; @@ -3138,7 +3425,8 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) */ if ((HDR_PREFETCH(hdr)) != 0) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); + /* link protected by hash_lock */ + ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); } atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); @@ -3535,7 +3823,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_buf == NULL); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* if this is a prefetch, we don't have a reference */ if (*arc_flags & ARC_FLAG_PREFETCH) @@ -3831,7 +4119,7 @@ arc_clear_callback(arc_buf_t *buf) if (hdr->b_l1hdr.b_datacnt > 1) { 
mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, FALSE, TRUE); + arc_buf_destroy(buf, TRUE); } else { ASSERT(buf == hdr->b_l1hdr.b_buf); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; @@ -3864,6 +4152,8 @@ arc_release(arc_buf_t *buf, void *tag) mutex_enter(&buf->b_evict_lock); + ASSERT(HDR_HAS_L1HDR(hdr)); + /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be @@ -3910,6 +4200,13 @@ arc_release(arc_buf_t *buf, void *tag) mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr); + + /* + * We don't want to leak the b_tmp_cdata buffer that was + * allocated in l2arc_write_buffers() + */ + arc_buf_l2_cdata_free(hdr); + mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; @@ -3982,6 +4279,7 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_l1hdr.b_datacnt = 1; nhdr->b_l1hdr.b_state = arc_anon; nhdr->b_l1hdr.b_arc_access = 0; + nhdr->b_l1hdr.b_tmp_cdata = NULL; nhdr->b_freeze_cksum = NULL; (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); @@ -3991,8 +4289,8 @@ arc_release(arc_buf_t *buf, void *tag) } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); - /* protected by hash lock */ - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + /* protected by hash lock, or hdr is on arc_anon */ + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; @@ -4315,11 +4613,50 @@ arc_kstat_update(kstat_t *ksp, int rw) return (0); } +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. This is needed due to how the ARC eviction + * code is laid out; arc_evict_state() assumes ARC buffers are evenly + * distributed between all sublists and uses this assumption when + * deciding which sublist to evict from and how much to evict from it. + */ +unsigned int +arc_state_multilist_index_func(multilist_t *ml, void *obj) +{ + arc_buf_hdr_t *hdr = obj; + + /* + * We rely on b_dva to generate evenly distributed index + * numbers using buf_hash below. So, as an added precaution, + * let's make sure we never add empty buffers to the arc lists. + */ + ASSERT(!BUF_EMPTY(hdr)); + + /* + * The assumption here, is the hash value for a given + * arc_buf_hdr_t will remain constant throughout it's lifetime + * (i.e. it's b_spa, b_dva, and b_birth fields don't change). + * Thus, we don't need to store the header's sublist index + * on insertion, as this index can be recalculated on removal. + * + * Also, the low order bits of the hash value are thought to be + * distributed evenly. Otherwise, in the case that the multilist + * has a power of two number of sublists, each sublists' usage + * would not be evenly distributed. 
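/*
 * The index-function contract in miniature, assuming a stable hash of
 * immutable identity fields is available. demo_obj_t and
 * demo_index_func() are hypothetical; the real implementation hashes
 * (b_spa, b_dva, b_birth) via buf_hash().
 */
typedef struct demo_obj {
	uint64_t	do_id;	/* immutable for the object's lifetime */
	multilist_node_t do_node;
} demo_obj_t;

static unsigned int
demo_index_func(multilist_t *ml, void *obj)
{
	demo_obj_t *o = obj;

	/*
	 * Stable for a given object, in [0, num_sublists), and with
	 * well-mixed low-order bits (Knuth multiplicative hashing).
	 */
	return ((unsigned int)((o->do_id * 2654435761ULL) %
	    multilist_get_num_sublists(ml)));
}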
+ */ + return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % + multilist_get_num_sublists(ml)); +} + void arc_init(void) { - mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); + cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); + + mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL); /* Convert seconds to clock ticks */ zfs_arc_min_prefetch_lifespan = 1 * hz; @@ -4367,6 +4704,9 @@ arc_init(void) if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) arc_meta_limit = zfs_arc_meta_limit; + if (zfs_arc_num_sublists_per_state < 1) + zfs_arc_num_sublists_per_state = num_online_cpus(); + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -4381,43 +4721,46 @@ arc_init(void) arc_l2c_only = &ARC_l2c_only; arc_size = 0; - mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - - list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, 
arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; @@ -4428,12 +4771,12 @@ arc_init(void) buf_init(); - arc_thread_exit = 0; + arc_reclaim_thread_exit = FALSE; + arc_user_evicts_thread_exit = FALSE; list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); arc_eviction_list = NULL; mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, @@ -4448,6 +4791,9 @@ arc_init(void) (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0, TS_RUN, minclsyspri); + (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); + arc_dead = FALSE; arc_warm = B_FALSE; @@ -4476,17 +4822,36 @@ arc_fini(void) { arc_prune_t *p; - mutex_enter(&arc_reclaim_thr_lock); #ifdef _KERNEL spl_unregister_shrinker(&arc_shrinker); #endif /* _KERNEL */ - arc_thread_exit = 1; - while (arc_thread_exit != 0) - cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); - mutex_exit(&arc_reclaim_thr_lock); + mutex_enter(&arc_reclaim_lock); + arc_reclaim_thread_exit = TRUE; + /* + * The reclaim thread will set arc_reclaim_thread_exit back to + * FALSE when it is finished exiting; we're waiting for that. + */ + while (arc_reclaim_thread_exit) { + cv_signal(&arc_reclaim_thread_cv); + cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); + } + mutex_exit(&arc_reclaim_lock); + + mutex_enter(&arc_user_evicts_lock); + arc_user_evicts_thread_exit = TRUE; + /* + * The user evicts thread will set arc_user_evicts_thread_exit + * to FALSE when it is finished exiting; we're waiting for that. 
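/*
 * Both shutdowns above follow the same handshake; here it is in
 * isolation, with hypothetical shut_* names. The waiter sets the exit
 * flag and signals; the worker acknowledges by clearing the flag and
 * broadcasting on its way out, so the waiter loops until the flag
 * reads FALSE.
 */
static kmutex_t shut_lock;
static kcondvar_t shut_cv;
static boolean_t shut_exit;

static void
shut_worker(void)
{
	mutex_enter(&shut_lock);
	while (!shut_exit)
		cv_wait(&shut_cv, &shut_lock);

	shut_exit = FALSE;	/* acknowledge the shutdown request */
	cv_broadcast(&shut_cv);
	mutex_exit(&shut_lock);
	thread_exit();
}

static void
shut_fini(void)
{
	mutex_enter(&shut_lock);
	shut_exit = TRUE;
	while (shut_exit) {
		cv_signal(&shut_cv);
		cv_wait(&shut_cv, &shut_lock);
	}
	mutex_exit(&shut_lock);
}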
+ */ + while (arc_user_evicts_thread_exit) { + cv_signal(&arc_user_evicts_cv); + cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock); + } + mutex_exit(&arc_user_evicts_lock); - arc_flush(NULL); + /* Use TRUE to ensure *all* buffers are evicted */ + arc_flush(NULL, TRUE); arc_dead = TRUE; @@ -4506,25 +4871,23 @@ arc_fini(void) list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); - mutex_destroy(&arc_eviction_mtx); - mutex_destroy(&arc_reclaim_thr_lock); - cv_destroy(&arc_reclaim_thr_cv); - - list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - - mutex_destroy(&arc_anon->arcs_mtx); - mutex_destroy(&arc_mru->arcs_mtx); - mutex_destroy(&arc_mru_ghost->arcs_mtx); - mutex_destroy(&arc_mfu->arcs_mtx); - mutex_destroy(&arc_mfu_ghost->arcs_mtx); - mutex_destroy(&arc_l2c_only->arcs_mtx); + mutex_destroy(&arc_reclaim_lock); + cv_destroy(&arc_reclaim_thread_cv); + cv_destroy(&arc_reclaim_waiters_cv); + + mutex_destroy(&arc_user_evicts_lock); + cv_destroy(&arc_user_evicts_cv); + + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); buf_fini(); @@ -4853,34 +5216,62 @@ l2arc_write_done(zio_t *zio) if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); - mutex_enter(&dev->l2ad_mtx); - /* * All writes completed, or an error was hit. */ +top: + mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); + + /* + * We cannot use mutex_enter or else we can deadlock + * with l2arc_write_buffers (due to swapping the order + * the hash lock and l2ad_mtx are taken). + */ if (!mutex_tryenter(hash_lock)) { /* - * This buffer misses out. It may be in a stage - * of eviction. Its ARC_FLAG_L2_WRITING flag will be - * left set, denying reads to this buffer. + * Missed the hash lock. We must retry so we + * don't leave the ARC_FLAG_L2_WRITING bit set. */ - ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); - continue; + ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); + + /* + * We don't want to rescan the headers we've + * already marked as having been written out, so + * we reinsert the head node so we can pick up + * where we left off. + */ + list_remove(buflist, head); + list_insert_after(buflist, hdr, head); + + mutex_exit(&dev->l2ad_mtx); + + /* + * We wait for the hash lock to become available + * to try and prevent busy waiting, and increase + * the chance we'll be able to acquire the lock + * the next time around. 
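/*
 * The retry idiom described in the two comments above, condensed into
 * one skeleton; this mirrors the hunk that follows rather than adding
 * new behavior, and assumes dev, buflist, head, hdr and hdr_prev as in
 * the surrounding function. The lock order elsewhere is
 * hash_lock -> l2ad_mtx, so with l2ad_mtx held we may only *try* the
 * hash lock; on a miss we drop l2ad_mtx, park the head marker so the
 * scan can resume where it left off, block once on the contended lock
 * instead of spinning, and restart.
 */
top:
	mutex_enter(&dev->l2ad_mtx);
	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
		kmutex_t *hash_lock = HDR_LOCK(hdr);

		hdr_prev = list_prev(buflist, hdr);
		if (!mutex_tryenter(hash_lock)) {
			list_remove(buflist, head);
			list_insert_after(buflist, hdr, head);
			mutex_exit(&dev->l2ad_mtx);
			mutex_enter(hash_lock);		/* wait, don't spin */
			mutex_exit(hash_lock);
			goto top;
		}
		/* ... per-header processing ... */
		mutex_exit(hash_lock);
	}
	mutex_exit(&dev->l2ad_mtx);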
+ */ + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; } /* - * It's possible that this buffer got evicted from the L1 cache - * before we grabbed the vdev + hash locks, in which case - * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated. - * Only free the buffer if we still have an L1 hdr. + * We could not have been moved into the arc_l2c_only + * state while in-flight due to our ARC_FLAG_L2_WRITING + * bit being set. Let's just ensure that's being enforced. + */ + ASSERT(HDR_HAS_L1HDR(hdr)); + + /* + * We may have allocated a buffer for L2ARC compression, + * we must release it to avoid leaking this data. */ - if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL && - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) - l2arc_release_cdata_buf(hdr); + l2arc_release_cdata_buf(hdr); if (zio->io_error != 0) { /* @@ -4894,7 +5285,8 @@ l2arc_write_done(zio_t *zio) } /* - * Allow ARC to begin reads to this L2ARC entry. + * Allow ARC to begin reads and ghost list evictions to + * this L2ARC entry. */ hdr->b_flags &= ~ARC_FLAG_L2_WRITING; @@ -5002,35 +5394,37 @@ l2arc_read_done(zio_t *zio) * the data lists. This function returns a locked list, and also returns * the lock pointer. */ -static list_t * -l2arc_list_locked(int list_num, kmutex_t **lock) +static multilist_sublist_t * +l2arc_sublist_lock(int list_num) { - list_t *list = NULL; + multilist_t *ml = NULL; + unsigned int idx; ASSERT(list_num >= 0 && list_num <= 3); switch (list_num) { case 0: - list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; - *lock = &arc_mfu->arcs_mtx; + ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: - list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; - *lock = &arc_mru->arcs_mtx; + ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: - list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; - *lock = &arc_mfu->arcs_mtx; + ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: - list = &arc_mru->arcs_list[ARC_BUFC_DATA]; - *lock = &arc_mru->arcs_mtx; + ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; } - ASSERT(!(MUTEX_HELD(*lock))); - mutex_enter(*lock); - return (list); + /* + * Return a randomly-selected sublist. This is acceptable + * because the caller feeds only a little bit of data for each + * call (8MB). Subsequent calls will result in different + * sublists being selected. + */ + idx = multilist_get_random_index(ml); + return (multilist_sublist_lock(ml, idx)); } /* @@ -5076,6 +5470,12 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); + + /* + * We cannot use mutex_enter or else we can deadlock + * with l2arc_write_buffers (due to swapping the order + * the hash lock and l2ad_mtx are taken). + */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. @@ -5140,8 +5540,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; list_remove(buflist, hdr); - /* This may have been leftover after a failed write. 
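/*
 * Why a random sublist suffices for the L2ARC feed, in sketch form:
 * each feed cycle consumes only a small slice of one sublist (on the
 * order of 8MB), and independent random picks spread successive cycles
 * across all sublists, so no sublist is systematically favored.
 * feed_lock_any() is a hypothetical wrapper around the real calls.
 */
static multilist_sublist_t *
feed_lock_any(multilist_t *ml)
{
	unsigned int idx = multilist_get_random_index(ml);

	return (multilist_sublist_lock(ml, idx));
}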
*/ - hdr->b_flags &= ~ARC_FLAG_L2_WRITING; + /* Ensure this header has finished being written */ + ASSERT(!HDR_L2_WRITING(hdr)); + ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); } mutex_exit(hash_lock); } @@ -5167,11 +5568,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, boolean_t *headroom_boost) { arc_buf_hdr_t *hdr, *hdr_prev, *head; - list_t *list; uint64_t write_asize, write_psize, write_sz, headroom, buf_compress_minsz; abd_t *buf_data; - kmutex_t *list_lock = NULL; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; @@ -5200,12 +5599,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Copy buffers for L2ARC writing. */ - mutex_enter(&dev->l2ad_mtx); for (try = 0; try <= 3; try++) { + multilist_sublist_t *mls = l2arc_sublist_lock(try); uint64_t passed_sz = 0; - list = l2arc_list_locked(try, &list_lock); - /* * L2ARC fast warmup. * @@ -5213,9 +5610,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) - hdr = list_head(list); + hdr = multilist_sublist_head(mls); else - hdr = list_tail(list); + hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; if (do_headroom_boost) @@ -5226,9 +5623,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, uint64_t buf_sz; if (arc_warm == B_FALSE) - hdr_prev = list_next(list, hdr); + hdr_prev = multilist_sublist_next(mls, hdr); else - hdr_prev = list_prev(list, hdr); + hdr_prev = multilist_sublist_prev(mls, hdr); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { @@ -5264,7 +5661,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * l2arc_write_done() can find where the * write buffers begin without searching. */ + mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); + mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc(sizeof (l2arc_write_callback_t), KM_SLEEP); @@ -5296,7 +5695,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, buf_sz = hdr->b_size; hdr->b_flags |= ARC_FLAG_HAS_L2HDR; + mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); /* * Compute and store the buffer cksum before @@ -5310,7 +5711,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, write_sz += buf_sz; } - mutex_exit(list_lock); + multilist_sublist_unlock(mls); if (full == B_TRUE) break; @@ -5319,12 +5720,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_sz); - mutex_exit(&dev->l2ad_mtx); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); return (0); } + mutex_enter(&dev->l2ad_mtx); + /* * Now start writing the buffers. We're starting at the write head * and work backwards, retracing the course of the buffer selector @@ -5334,6 +5736,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, hdr = list_prev(&dev->l2ad_buflist, hdr)) { uint64_t buf_sz; + /* + * We rely on the L1 portion of the header below, so + * it's invalid for this header to have been evicted out + * of the ghost cache, prior to being written out. The + * ARC_FLAG_L2_WRITING bit ensures this won't happen. 
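/*
 * A hedged summary of the ARC_FLAG_L2_WRITING protocol the assertion
 * above leans on; no new code, this restates what the patch does:
 *
 *   1. l2arc_write_buffers(), hash lock held: the header is put on
 *      l2ad_buflist and ARC_FLAG_L2_WRITING is set.
 *   2. While the flag is set, the header cannot be demoted to
 *      arc_l2c_only, so its L1 portion (and b_tmp_cdata) stays valid
 *      for the duration of the write.
 *   3. l2arc_write_done() clears the flag, re-allowing reads and
 *      ghost-list evictions.
 */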
+ */ + ASSERT(HDR_HAS_L1HDR(hdr)); + /* * We shouldn't need to lock the buffer here, since we flagged * it as ARC_FLAG_L2_WRITING in the previous step, but we must @@ -5568,8 +5978,26 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) static void l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) { + enum zio_compress comp = HDR_GET_COMPRESS(hdr); + ASSERT(HDR_HAS_L1HDR(hdr)); - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) { + ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp)); + + if (comp == ZIO_COMPRESS_OFF) { + /* + * In this case, b_tmp_cdata points to the same buffer + * as the arc_buf_t's b_data field. We don't want to + * free it, since the arc_buf_t will handle that. + */ + hdr->b_l1hdr.b_tmp_cdata = NULL; + } else if (comp == ZIO_COMPRESS_EMPTY) { + /* + * In this case, b_tmp_cdata was compressed to an empty + * buffer, thus there's nothing to free and b_tmp_cdata + * should have been set to NULL in l2arc_write_buffers(). + */ + ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); + } else { /* * If the data was compressed, then we've allocated a * temporary buffer for it, so now we need to release it. @@ -5577,8 +6005,9 @@ l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); abd_free(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size); + hdr->b_l1hdr.b_tmp_cdata = NULL; } - hdr->b_l1hdr.b_tmp_cdata = NULL; + } /* @@ -5861,13 +6290,12 @@ MODULE_PARM_DESC(zfs_arc_max, "Max arc size"); module_param(zfs_arc_meta_limit, ulong, 0644); MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size"); +module_param(zfs_arc_meta_min, ulong, 0644); +MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata"); + module_param(zfs_arc_meta_prune, int, 0644); MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune"); -module_param(zfs_arc_meta_adjust_restarts, ulong, 0644); -MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts, - "Limit number of restarts in arc_adjust_meta"); - module_param(zfs_arc_grow_retry, int, 0644); MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index b54c03bc33fe..bee9f4cab392 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -316,7 +316,14 @@ dsl_pool_close(dsl_pool_t *dp) txg_list_destroy(&dp->dp_sync_tasks); txg_list_destroy(&dp->dp_dirty_dirs); - arc_flush(dp->dp_spa); + /* + * We can't set retry to TRUE since we're explicitly specifying + * a spa to flush. This is good enough; any missed buffers for + * this spa won't cause trouble, and they'll eventually fall + * out of the ARC just like any other unused buffer. + */ + arc_flush(dp->dp_spa, FALSE); + txg_fini(dp); dsl_scan_fini(dp); rrw_destroy(&dp->dp_config_rwlock); diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c new file mode 100644 index 000000000000..7527f9626c1f --- /dev/null +++ b/module/zfs/multilist.c @@ -0,0 +1,373 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. 
+ */
+
+#include
+#include
+#include
+
+/* needed for spa_get_random() */
+#include
+
+/*
+ * Given the object contained on the list, return a pointer to the
+ * object's multilist_node_t structure it contains.
+ */
+static multilist_node_t *
+multilist_d2l(multilist_t *ml, void *obj)
+{
+	return ((multilist_node_t *)((char *)obj + ml->ml_offset));
+}
+
+/*
+ * Initialize a new multilist using the parameters specified.
+ *
+ * - 'size' denotes the size of the structure containing the
+ *   multilist_node_t.
+ * - 'offset' denotes the byte offset of the multilist_node_t within
+ *   the structure that contains it.
+ * - 'num' specifies the number of internal sublists to create.
+ * - 'index_func' is used to determine which sublist to insert into
+ *   when the multilist_insert() function is called, as well as which
+ *   sublist to remove from when multilist_remove() is called. The
+ *   requirements this function must meet are the following:
+ *
+ *    - It must always return the same value when called on the same
+ *      object (to ensure the object is removed from the list it was
+ *      inserted into).
+ *
+ *    - It must return a value in the range [0, number of sublists).
+ *      The multilist_get_num_sublists() function may be used to
+ *      determine the number of sublists in the multilist.
+ *
+ *   Also, in order to reduce internal contention between the sublists
+ *   during insertion and removal, this function should choose evenly
+ *   between all available sublists when inserting. This isn't a hard
+ *   requirement, but a general rule of thumb in order to garner the
+ *   best multi-threaded performance out of the data structure.
+ */
+void
+multilist_create(multilist_t *ml, size_t size, size_t offset, unsigned int num,
+    multilist_sublist_index_func_t *index_func)
+{
+	int i;
+
+	ASSERT3P(ml, !=, NULL);
+	ASSERT3U(size, >, 0);
+	ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
+	ASSERT3U(num, >, 0);
+	ASSERT3P(index_func, !=, NULL);
+
+	ml->ml_offset = offset;
+	ml->ml_num_sublists = num;
+	ml->ml_index_func = index_func;
+
+	ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
+	    ml->ml_num_sublists, KM_SLEEP);
+
+	ASSERT3P(ml->ml_sublists, !=, NULL);
+
+	for (i = 0; i < ml->ml_num_sublists; i++) {
+		multilist_sublist_t *mls = &ml->ml_sublists[i];
+		mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL);
+		list_create(&mls->mls_list, size, offset);
+	}
+}
+
+/*
+ * Destroy the given multilist object, and free up any memory it holds.
+ */
+void
+multilist_destroy(multilist_t *ml)
+{
+	int i;
+
+	ASSERT(multilist_is_empty(ml));
+
+	for (i = 0; i < ml->ml_num_sublists; i++) {
+		multilist_sublist_t *mls = &ml->ml_sublists[i];
+
+		ASSERT(list_is_empty(&mls->mls_list));
+
+		list_destroy(&mls->mls_list);
+		mutex_destroy(&mls->mls_lock);
+	}
+
+	ASSERT3P(ml->ml_sublists, !=, NULL);
+	kmem_free(ml->ml_sublists,
+	    sizeof (multilist_sublist_t) * ml->ml_num_sublists);
+
+	ml->ml_num_sublists = 0;
+	ml->ml_offset = 0;
+}
+
+/*
+ * Insert the given object into the multilist.
+ *
+ * This function will insert the object specified into the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The sublist locks are automatically acquired if not already held, to
+ * ensure consistency when inserting and removing from multiple threads.
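/*
 * The conditional-locking idiom documented above, shown on its own.
 * It assumes, as the comment in multilist_insert() notes, that
 * MUTEX_HELD() is true iff the current thread owns the lock.
 * demo_with_sublist_locked() is a hypothetical helper.
 */
static void
demo_with_sublist_locked(multilist_sublist_t *mls,
    void (*fn)(multilist_sublist_t *, void *), void *obj)
{
	boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);

	if (need_lock)
		mutex_enter(&mls->mls_lock);

	fn(mls, obj);

	if (need_lock)
		mutex_exit(&mls->mls_lock);
}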
+ */ +void +multilist_insert(multilist_t *ml, void *obj) +{ + unsigned int sublist_idx = ml->ml_index_func(ml, obj); + multilist_sublist_t *mls; + boolean_t need_lock; + + DTRACE_PROBE3(multilist__insert, multilist_t *, ml, + unsigned int, sublist_idx, void *, obj); + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + + mls = &ml->ml_sublists[sublist_idx]; + + /* + * Note: Callers may already hold the sublist lock by calling + * multilist_sublist_lock(). Here we rely on MUTEX_HELD() + * returning TRUE if and only if the current thread holds the + * lock. While it's a little ugly to make the lock recursive in + * this way, it works and allows the calling code to be much + * simpler -- otherwise it would have to pass around a flag + * indicating that it already has the lock. + */ + need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + ASSERT(!multilist_link_active(multilist_d2l(ml, obj))); + + multilist_sublist_insert_head(mls, obj); + + if (need_lock) + mutex_exit(&mls->mls_lock); +} + +/* + * Remove the given object from the multilist. + * + * This function will remove the object specified from the sublist + * determined using the function given at multilist creation time. + * + * The necessary sublist locks are automatically acquired, to ensure + * consistency when inserting and removing from multiple threads. + */ +void +multilist_remove(multilist_t *ml, void *obj) +{ + unsigned int sublist_idx = ml->ml_index_func(ml, obj); + multilist_sublist_t *mls; + boolean_t need_lock; + + DTRACE_PROBE3(multilist__remove, multilist_t *, ml, + unsigned int, sublist_idx, void *, obj); + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + + mls = &ml->ml_sublists[sublist_idx]; + /* See comment in multilist_insert(). */ + need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + ASSERT(multilist_link_active(multilist_d2l(ml, obj))); + + multilist_sublist_remove(mls, obj); + + if (need_lock) + mutex_exit(&mls->mls_lock); +} + +/* + * Check to see if this multilist object is empty. + * + * This will return TRUE if it finds all of the sublists of this + * multilist to be empty, and FALSE otherwise. Each sublist lock will be + * automatically acquired as necessary. + * + * If concurrent insertions and removals are occurring, the semantics + * of this function become a little fuzzy. Instead of locking all + * sublists for the entire call time of the function, each sublist is + * only locked as it is individually checked for emptiness. Thus, it's + * possible for this function to return TRUE with non-empty sublists at + * the time the function returns. This would be due to another thread + * inserting into a given sublist, after that specific sublist was check + * and deemed empty, but before all sublists have been checked. + */ +int +multilist_is_empty(multilist_t *ml) +{ + int i; + + for (i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + /* See comment in multilist_insert(). 
*/ + boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + if (!list_is_empty(&mls->mls_list)) { + if (need_lock) + mutex_exit(&mls->mls_lock); + + return (FALSE); + } + + if (need_lock) + mutex_exit(&mls->mls_lock); + } + + return (TRUE); +} + +/* Return the number of sublists composing this multilist */ +unsigned int +multilist_get_num_sublists(multilist_t *ml) +{ + return (ml->ml_num_sublists); +} + +/* Return a randomly selected, valid sublist index for this multilist */ +unsigned int +multilist_get_random_index(multilist_t *ml) +{ + return (spa_get_random(ml->ml_num_sublists)); +} + +/* Lock and return the sublist specified at the given index */ +multilist_sublist_t * +multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) +{ + multilist_sublist_t *mls; + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + mls = &ml->ml_sublists[sublist_idx]; + mutex_enter(&mls->mls_lock); + + return (mls); +} + +void +multilist_sublist_unlock(multilist_sublist_t *mls) +{ + mutex_exit(&mls->mls_lock); +} + +/* + * We're allowing any object to be inserted into this specific sublist, + * but this can lead to trouble if multilist_remove() is called to + * remove this object. Specifically, if calling ml_index_func on this + * object returns an index for sublist different than what is passed as + * a parameter here, any call to multilist_remove() with this newly + * inserted object is undefined! (the call to multilist_remove() will + * remove the object from a list that it isn't contained in) + */ +void +multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_head(&mls->mls_list, obj); +} + +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_tail(&mls->mls_list, obj); +} + +/* + * Move the object one element forward in the list. + * + * This function will move the given object forward in the list (towards + * the head) by one object. So, in essence, it will swap its position in + * the list with its "prev" pointer. If the given object is already at the + * head of the list, it cannot be moved forward any more than it already + * is, so no action is taken. + * + * NOTE: This function **must not** remove any object from the list other + * than the object given as the parameter. This is relied upon in + * arc_evict_state_impl(). 
+ */ +void +multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj) +{ + void *prev = list_prev(&mls->mls_list, obj); + + ASSERT(MUTEX_HELD(&mls->mls_lock)); + ASSERT(!list_is_empty(&mls->mls_list)); + + /* 'obj' must be at the head of the list, nothing to do */ + if (prev == NULL) + return; + + list_remove(&mls->mls_list, obj); + list_insert_before(&mls->mls_list, prev, obj); +} + +void +multilist_sublist_remove(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_remove(&mls->mls_list, obj); +} + +void * +multilist_sublist_head(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_head(&mls->mls_list)); +} + +void * +multilist_sublist_tail(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_tail(&mls->mls_list)); +} + +void * +multilist_sublist_next(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_next(&mls->mls_list, obj)); +} + +void * +multilist_sublist_prev(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_prev(&mls->mls_list, obj)); +} + +void +multilist_link_init(multilist_node_t *link) +{ + list_link_init(link); +} + +int +multilist_link_active(multilist_node_t *link) +{ + return (list_link_active(link)); +} diff --git a/module/zfs/trace.c b/module/zfs/trace.c index 470cf18bff30..0c9990e8547b 100644 --- a/module/zfs/trace.c +++ b/module/zfs/trace.c @@ -23,6 +23,7 @@ * (and only one) C file, so this dummy file exists for that purpose. */ +#include #include #include #include @@ -31,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index c168f3b47f2e..e629c43b94c0 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -439,7 +439,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) * fault injection isn't a performance critical path. */ if (flags & ZINJECT_FLUSH_ARC) - arc_flush(NULL); + /* + * We must use FALSE to ensure arc_flush returns, since + * we're not preventing concurrent ARC insertions. 
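/*
 * The three arc_flush() call sites in this series, side by side, since
 * the retry flag's meaning is easy to miss:
 *
 *	arc_fini:	arc_flush(NULL, TRUE);	all pools; loop until
 *						everything is gone (no new
 *						inserts are possible).
 *	dsl_pool_close:	arc_flush(spa, FALSE);	one pool; stragglers age
 *						out of the ARC normally.
 *	zio_inject:	arc_flush(NULL, FALSE);	concurrent inserts remain
 *						possible, so TRUE might
 *						never terminate.
 */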
+	arc_flush(NULL, FALSE);
 
 	return (0);
 }

From 87b8ecc37e24f700a16a1a18f67fba5576f952ee Mon Sep 17 00:00:00 2001
From: Chunwei Chen
Date: Thu, 30 Apr 2015 15:20:56 -0500
Subject: [PATCH 23/26] Apply fix for spinlock found in
 https://github.com/zfsonlinux/zfs/issues/3349, written by tuxoko

---
 module/zfs/zfs_znode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index 0c0091f0445e..1655c7e27560 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -950,6 +950,7 @@ zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp)
 			mutex_exit(&zp->z_lock);
 			sa_buf_rele(db, NULL);
 			ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
+			cond_resched();
 			goto again;
 		}
 		*zpp = zp;

From c99dfad8b839e9560349caada1e08135c82e41e7 Mon Sep 17 00:00:00 2001
From: Prakash Surya
Date: Fri, 15 May 2015 23:31:29 +0200
Subject: [PATCH 24/26] [test port] 5701 zpool list reports incorrect "alloc"
 value for cache devices

Reviewed by: Matthew Ahrens
Reviewed by: George Wilson
Reviewed by: Alek Pinchuk
Approved by: Dan McDonald

Additional changes and notes for ZFS on Linux:

The changes related to
@struct l2arc_dev {
are in include/sys/arc_impl.h.

Changes for module/zfs/arc.c:

Removed a duplicated, superfluous comment from
@l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
/*
* Tell ARC this no longer exists in L2ARC.
*/

In @arc_hdr_destroy(arc_buf_hdr_t *hdr), the call
arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
also exists in
https://github.com/dweeezil/zfs/commit/ab37b291d4cf5ac90e1395f9fada0be2d4aeab95
(pull request #3115), so it is neither uncommented nor removed here,
since doing so might hamper functionality or data integrity.

Comments?
---
 include/sys/arc_impl.h |   2 +-
 module/zfs/arc.c       | 177 +++++++++++++++++++++++++++++++----------
 2 files changed, 138 insertions(+), 41 deletions(-)

diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index 650b9a44af8c..539da98aeafe 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -172,12 +172,12 @@ typedef struct l2arc_dev {
 	uint64_t		l2ad_hand;	/* next write location */
 	uint64_t		l2ad_start;	/* first addr on device */
 	uint64_t		l2ad_end;	/* last addr on device */
-	uint64_t		l2ad_evict;	/* last addr eviction reached */
 	boolean_t		l2ad_first;	/* first sweep through */
 	boolean_t		l2ad_writing;	/* currently writing */
 	kmutex_t		l2ad_mtx;	/* lock for buffer list */
 	list_t			l2ad_buflist;	/* buffer list */
 	list_node_t		l2ad_node;	/* device list node */
+	refcount_t		l2ad_alloc;	/* allocated bytes */
 } l2arc_dev_t;
 
 typedef struct l2arc_buf_hdr {
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 70b7a263bd76..367f73aad3c2 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -619,6 +619,16 @@ uint64_t zfs_crc64_table[256];
 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
 
+/*
+ * Used to distinguish headers that are being processed by
+ * l2arc_write_buffers(), but have yet to be assigned an l2arc disk
+ * address. This can happen when the header is added to the l2arc's list
+ * of buffers to write in the first stage of l2arc_write_buffers(), but
+ * has not yet been written out, which happens in the second stage of
+ * l2arc_write_buffers().
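/*
 * How the sentinel defined just below separates the two stages, as a
 * hedged sketch; the stage-2 assignment shown here is assumed from the
 * surrounding description rather than quoted from a hunk:
 *
 *	stage 1, header listed but no I/O issued yet:
 *		hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
 *	stage 2, write issued, stats and refcounts charged:
 *		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
 *
 * Teardown can then decide whether there is anything to undo:
 *
 *	if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET) {
 *		// stage 2 was reached: reverse stats and refcounts
 *	} else {
 *		// stage 1 only: nothing was charged yet
 *	}
 */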
+ */ +#define L2ARC_ADDR_UNSET ((uint64_t)(-1)) + #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -1006,6 +1016,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) buf_hash_remove(hdr); bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + if (new == hdr_full_cache) { nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; /* @@ -1062,6 +1073,20 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) mutex_exit(&dev->l2ad_mtx); + /* + * Since we're using the pointer address as the tag when + * incrementing and decrementing the l2ad_alloc refcount, we + * must remove the old pointer (that we're about to destroy) and + * add the new pointer to the refcount. Otherwise we'd remove + * the wrong pointer address when calling arc_hdr_destroy() later. + */ + + (void) refcount_remove_many(&dev->l2ad_alloc, + hdr->b_l2hdr.b_asize, hdr); + + (void) refcount_add_many(&dev->l2ad_alloc, + nhdr->b_l2hdr.b_asize, nhdr); + buf_discard_identity(hdr); hdr->b_freeze_cksum = NULL; kmem_cache_free(old, hdr); @@ -1856,6 +1881,57 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t remove) kmem_cache_free(buf_cache, buf); } +static void +arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) +{ + l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; + l2arc_dev_t *dev = l2hdr->b_dev; + + ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); + ASSERT(HDR_HAS_L2HDR(hdr)); + + list_remove(&dev->l2ad_buflist, hdr); + + /* + * We don't want to leak the b_tmp_cdata buffer that was + * allocated in l2arc_write_buffers() + */ + arc_buf_l2_cdata_free(hdr); + + /* + * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then + * this header is being processed by l2arc_write_buffers() (i.e. + * it's in the first stage of l2arc_write_buffers()). + * Re-affirming that truth here, just to serve as a reminder. If + * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or + * may not have its HDR_L2_WRITING flag set. (the write may have + * completed, in which case HDR_L2_WRITING will be false and the + * b_daddr field will point to the address of the buffer on disk). + */ + IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); + + /* + * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with + * l2arc_write_buffers(). Since we've just removed this header + * from the l2arc buffer list, this header will never reach the + * second stage of l2arc_write_buffers(), which increments the + * accounting stats for this header. Thus, we must be careful + * not to decrement them for this header either. 
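/*
 * A minimal illustration of tag-tracked refcounts as applied to
 * l2ad_alloc above, assuming dev, hdr and nhdr as in the surrounding
 * hunks. The header pointer is the tag, so when a header is
 * reallocated the old tag must be swapped for the new one, or the
 * eventual remove in arc_hdr_destroy() would name a tag that was never
 * added. The asize value here is hypothetical.
 */
	uint64_t asize = 8192;

	/* write path: charge the device, tagged by the header pointer */
	(void) refcount_add_many(&dev->l2ad_alloc, asize, hdr);

	/* header reallocation: same bytes, new tag */
	(void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr);
	(void) refcount_add_many(&dev->l2ad_alloc, asize, nhdr);

	/* destruction: must name the tag that was last added */
	(void) refcount_remove_many(&dev->l2ad_alloc, asize, nhdr);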
+ */ + if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { + ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + + vdev_space_update(dev->l2ad_vdev, + -l2hdr->b_asize, 0, 0); + + (void) refcount_remove_many(&dev->l2ad_alloc, + l2hdr->b_asize, hdr); + } + + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; +} + static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { @@ -1869,30 +1945,28 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (HDR_HAS_L2HDR(hdr)) { - l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; - boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx); - - if (!buflist_held) { - mutex_enter(&l2hdr->b_dev->l2ad_mtx); - l2hdr = &hdr->b_l2hdr; - } + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); - list_remove(&l2hdr->b_dev->l2ad_buflist, hdr); + if (!buflist_held) + mutex_enter(&dev->l2ad_mtx); /* - * We don't want to leak the b_tmp_cdata buffer that was - * allocated in l2arc_write_buffers() + * Even though we checked this conditional above, we + * need to check this again now that we have the + * l2ad_mtx. This is because we could be racing with + * another thread calling l2arc_evict() which might have + * destroyed this header's L2 portion as we were waiting + * to acquire the l2ad_mtx. If that happens, we don't + * want to re-destroy the header's L2 portion. */ - arc_buf_l2_cdata_free(hdr); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); + if (HDR_HAS_L2HDR(hdr)) + arc_hdr_l2hdr_destroy(hdr); if (!buflist_held) - mutex_exit(&l2hdr->b_dev->l2ad_mtx); - - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + mutex_exit(&dev->l2ad_mtx); } if (!BUF_EMPTY(hdr)) @@ -4201,21 +4275,20 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); - list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr); /* - * We don't want to leak the b_tmp_cdata buffer that was - * allocated in l2arc_write_buffers() + * We have to recheck this conditional again now that + * we're holding the l2ad_mtx to prevent a race with + * another thread which might be concurrently calling + * l2arc_evict(). In that case, l2arc_evict() might have + * destroyed the header's L2 portion as we were waiting + * to acquire the l2ad_mtx. */ - arc_buf_l2_cdata_free(hdr); + if (HDR_HAS_L2HDR(hdr)) + arc_hdr_l2hdr_destroy(hdr); mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); - - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; } /* @@ -5288,6 +5361,10 @@ l2arc_write_done(zio_t *zio) ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + + bytes_dropped += hdr->b_l2hdr.b_asize; + (void) refcount_remove_many(&dev->l2ad_alloc, + hdr->b_l2hdr.b_asize, hdr); } /* @@ -5446,7 +5523,6 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; - int64_t bytes_evicted = 0; buflist = &dev->l2ad_buflist; @@ -5537,25 +5613,15 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) hdr->b_flags |= ARC_FLAG_L2_EVICTED; } - /* - * Tell ARC this no longer exists in L2ARC. - */ - /* Tell ARC this no longer exists in L2ARC. 
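/*
 * The double-check both hunks above perform, reduced to its skeleton.
 * The first test is cheap and lock-free; the second is authoritative,
 * because l2arc_evict() may have destroyed the L2 portion while this
 * thread was blocked on l2ad_mtx.
 */
	if (HDR_HAS_L2HDR(hdr)) {
		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

		mutex_enter(&dev->l2ad_mtx);
		if (HDR_HAS_L2HDR(hdr))	/* re-check: evict may have won */
			arc_hdr_l2hdr_destroy(hdr);
		mutex_exit(&dev->l2ad_mtx);
	}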
*/ - ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; - list_remove(buflist, hdr); - /* Ensure this header has finished being written */ ASSERT(!HDR_L2_WRITING(hdr)); ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); + + arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); - - vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0); - dev->l2ad_evict = taddr; } /* @@ -5698,6 +5764,29 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, hdr->b_l2hdr.b_hits = 0; hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; + /* + * Explicitly set the b_daddr field to a known + * value which means "invalid address". This + * enables us to differentiate which stage of + * l2arc_write_buffers() the particular header + * is in (e.g. this loop, or the one below). + * ARC_FLAG_L2_WRITING is not enough to make + * this distinction, and we need to know in + * order to do proper l2arc vdev accounting in + * arc_release() and arc_hdr_destroy(). + * + * Note, we can't use a new flag to distinguish + * the two stages because we don't hold the + * header's hash_lock below, in the second stage + * of this function. Thus, we can't simply + * change the b_flags field to denote that the + * IO has been sent. We can change the b_daddr + * field of the L2 portion, though, since we'll + * be holding the l2ad_mtx; which is why we're + * using it to denote the header's state change. + */ + hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; + buf_sz = hdr->b_size; hdr->b_flags |= ARC_FLAG_HAS_L2HDR; @@ -5777,6 +5866,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, buf_data = hdr->b_l1hdr.b_tmp_cdata; buf_sz = hdr->b_l2hdr.b_asize; + /* + * We need to do this regardless if buf_sz is zero or + * not, otherwise, when this l2hdr is evicted we'll + * remove a reference that was never added. + */ + (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); + /* Compression may have squashed the buffer to zero length. */ if (buf_sz != 0) { uint64_t buf_p_sz; @@ -5791,6 +5887,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, (void) zio_nowait(wzio); write_asize += buf_sz; + /* * Keep the clock hand suitably device-aligned. 
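/*
 * A worked example of the device alignment above, assuming a
 * hypothetical ashift=12 (4KB sector) cache vdev:
 *
 *	buf_sz   = 1536;				compressed psize
 *	buf_p_sz = vdev_psize_to_asize(vd, buf_sz);	= 4096
 *	dev->l2ad_hand += buf_p_sz;
 *
 * The hand advances by the allocated size, not the physical size, so
 * the next buffer starts on a sector boundary; patch 26 below extends
 * the same asize arithmetic to the buffer-selection accounting.
 */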
*/ @@ -5815,7 +5912,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { dev->l2ad_hand = dev->l2ad_start; - dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; } @@ -6153,7 +6249,6 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev->l2ad_start = VDEV_LABEL_START_SIZE; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; - adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; list_link_init(&adddev->l2ad_node); @@ -6167,6 +6262,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); + refcount_create(&adddev->l2ad_alloc); /* * Add device to global list @@ -6212,6 +6308,7 @@ l2arc_remove_vdev(vdev_t *vd) l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); mutex_destroy(&remdev->l2ad_mtx); + refcount_destroy(&remdev->l2ad_alloc); kmem_free(remdev, sizeof (l2arc_dev_t)); } From 19b64084510e2924e5144f6c24b2f2265d36f1af Mon Sep 17 00:00:00 2001 From: kernelOfTruth Date: Sat, 16 May 2015 00:16:58 +0200 Subject: [PATCH 25/26] [test] commenting out @arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) line 1912 // IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); --- module/zfs/arc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 367f73aad3c2..81f1335155b8 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1908,7 +1908,7 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) * completed, in which case HDR_L2_WRITING will be false and the * b_daddr field will point to the address of the buffer on disk). */ - IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); +// IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); /* * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with From a7683f54ad10f2d99e12a3b2e56d9c60feb51a9c Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Thu, 28 May 2015 20:39:29 +0200 Subject: [PATCH 26/26] [RFC, discussion, feedback] account for ashift when choosing buffers to be written to l2arc device If we don't account for that, then we might end up overwriting disk area of buffers that have not been evicted yet, because l2arc_evict operates in terms of disk addresses. The discrepancy between the write size calculation and the actual increment to l2ad_hand was introduced in commit e14bb3258d05c1b1077e2db7cf77088924e56919 Also, consistently use asize / a_sz for the allocated size, psize / p_sz for the physical size. Where the latter accounts for possible size reduction because of compression, whereas the former accounts for possible size expansion because of alignment requirements. The code still assumes that either underlying storage subsystems or hardware is able to do read-modify-write when an L2ARC buffer size is not a multiple of a disk's block size. This is true for 4KB sector disks that provide 512B sector emulation, but may not be true in general. In other words, we currently do not have any code to make sure that an L2ARC buffer, whether compressed or not, which is used for physical I/O has a suitable size. 
modified to account for the changes of https://github.com/zfsonlinux/zfs/pull/2129 (ABD) , https://github.com/zfsonlinux/zfs/pull/3115 (Illumos - 5408 managing ZFS cache devices requires lots of RAM) and Illumos - 5701 zpool list reports incorrect "alloc" value for cache devices --- module/zfs/arc.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 81f1335155b8..13e1dec4757a 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5693,6 +5693,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; uint64_t buf_sz; + uint64_t buf_a_sz; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); @@ -5721,7 +5722,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - if ((write_sz + hdr->b_size) > target_sz) { + /* + * Assume that the buffer is not going to be compressed + * and could take more space on disk because of a larger + * disk block size. + */ + buf_sz = hdr->b_size; + buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); + + if ((write_asize + buf_a_sz) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; @@ -5787,7 +5796,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, */ hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; - buf_sz = hdr->b_size; hdr->b_flags |= ARC_FLAG_HAS_L2HDR; mutex_enter(&dev->l2ad_mtx); @@ -5804,6 +5812,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, mutex_exit(hash_lock); write_sz += buf_sz; + write_asize += buf_a_sz; } multilist_sublist_unlock(mls); @@ -5827,6 +5836,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * and work backwards, retracing the course of the buffer selector * loop above. */ + write_asize = 0; for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; hdr = list_prev(&dev->l2ad_buflist, hdr)) { uint64_t buf_sz; @@ -5875,7 +5885,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* Compression may have squashed the buffer to zero length. */ if (buf_sz != 0) { - uint64_t buf_p_sz; + uint64_t buf_a_sz; wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, @@ -5886,14 +5896,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, zio_t *, wzio); (void) zio_nowait(wzio); - write_asize += buf_sz; - + write_psize += buf_sz; /* * Keep the clock hand suitably device-aligned. */ - buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); - write_psize += buf_p_sz; - dev->l2ad_hand += buf_p_sz; + buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); + write_asize += buf_a_sz; + dev->l2ad_hand += buf_a_sz; } } @@ -5903,8 +5912,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); - ARCSTAT_INCR(arcstat_l2_asize, write_asize); - vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); + ARCSTAT_INCR(arcstat_l2_asize, write_psize); + vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); /* * Bump device hand to the device start if it is approaching the end.
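/*
 * The three per-buffer sizes this patch keeps distinct, with
 * hypothetical numbers for an ashift=12 cache device and a 16KB buffer
 * that compresses to 3KB:
 *
 *	write_sz	accumulates b_size:		16384 (logical)
 *	write_psize	accumulates the compressed
 *			b_asize:			 3072 (physical)
 *	write_asize	accumulates
 *			vdev_psize_to_asize(vd, 3072):	 4096 (allocated)
 *
 * arcstat_l2_asize and vdev_space_update() account in physical bytes,
 * while buffer selection and the l2ad_hand advance in allocated bytes,
 * so l2arc_evict()'s address arithmetic can no longer overrun buffers
 * that have not been evicted yet.
 */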