From 1375829f53afcce515de43300175edf230d0a6eb Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Wed, 30 Nov 2022 16:03:32 -0500
Subject: [PATCH] Implement uncached prefetch.

Before this change the primarycache property was handled only at the
dbuf layer, controlling the dbuf cache and, through a hack, ARC
evictions.  Since the speculative prefetcher is implemented at the ARC
level, it had to be disabled for uncacheable buffers, because otherwise
falsely prefetched and never-read buffers would stay cached in ARC.

This change gives the ARC knowledge of uncacheable buffers.  It is
passed to arc_read() and arc_write() and stored in the ARC header.
When remove_reference() drops the last reference on the ARC header, it
can either destroy the header immediately, or, if the header is marked
as prefetch, put it into the new arc_uncached state.  That state is
scanned every second for stale buffers that were never demand-read,
which are then evicted.

To handle short or misaligned reads, this change tracks at the dbuf
layer buffers that were read from the beginning but not to the end.
It is assumed that such buffers may receive further reads, so they are
kept in the dbuf cache.  If a subsequent read reaches the end of such
a buffer, it is evicted immediately.  Otherwise it follows the regular
dbuf cache eviction and is removed from ARC at the same moment.  Since
the dbuf layer does not know the actual file size, this logic is not
applied to the last buffer of a dnode, which is always evicted
immediately, as before.

Since uncacheable buffers should no longer stay in ARC for long, this
patch also tries to optimize I/O by allocating ARC physical buffers as
linear to allow buffer sharing.  This avoids one of two memory copies
of uncompressed data on both reads and writes, at the cost of somewhat
higher KVA usage during prefetch.  When decompression is needed the
sharing is impossible, but the extra memory copy is still avoided,
since decompression requires a linear buffer anyway.

With prefetch enabled and the memory copy avoided, this change improves
sequential single-threaded read speed from a wide NVMe pool from 2049
to 3932 MiB/s.  During writes, the profiler shows a 22% reduction in
unhalted CPU cycles at the same throughput of 3653 MiB/s.

Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
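The lifecycle decision this patch adds at the last-reference drop can be
summarized in the following sketch.  It is illustrative only, condensed
from the remove_reference() hunk in module/zfs/arc.c below, and is not
itself part of the diff:

    /*
     * On the last reference drop, an anonymous header is destroyed at
     * once; an uncached header that has already been demand-read is
     * destroyed as well; only a still-outstanding uncached prefetch is
     * parked on the arc_uncached list, which the eviction thread scans
     * once per second.
     */
    if ((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) != 0)
            return (cnt);
    if (state == arc_anon) {
            arc_hdr_destroy(hdr);
            return (0);
    }
    if (state == arc_uncached && !HDR_PREFETCH(hdr)) {
            arc_change_state(arc_anon, hdr);
            arc_hdr_destroy(hdr);
            return (0);
    }
    multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
    arc_evictable_space_increment(hdr, state);
    return (0);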
---
 include/os/linux/zfs/sys/trace_arc.h |   2 +
 include/sys/arc.h                    |   6 +-
 include/sys/arc_impl.h               |  19 +++
 include/sys/dbuf.h                   |   5 +
 include/sys/dnode.h                  |   8 +-
 module/os/freebsd/zfs/sysctl_os.c    |  17 ++-
 module/zfs/arc.c                     | 199 ++++++++++++++++++---------
 module/zfs/dbuf.c                    |  99 ++++++-------
 module/zfs/dmu.c                     |  18 ++-
 module/zfs/dmu_objset.c              |   2 +-
 module/zfs/dmu_tx.c                  |   7 +-
 11 files changed, 243 insertions(+), 139 deletions(-)

diff --git a/include/os/linux/zfs/sys/trace_arc.h b/include/os/linux/zfs/sys/trace_arc.h
index a1595c76540c..c494f48bb48b 100644
--- a/include/os/linux/zfs/sys/trace_arc.h
+++ b/include/os/linux/zfs/sys/trace_arc.h
@@ -108,6 +108,7 @@ DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu);
+DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__uncached);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss);
@@ -392,6 +393,7 @@ DEFINE_DTRACE_PROBE1(arc__evict);
 DEFINE_DTRACE_PROBE1(arc__delete);
 DEFINE_DTRACE_PROBE1(new_state__mru);
 DEFINE_DTRACE_PROBE1(new_state__mfu);
+DEFINE_DTRACE_PROBE1(new_state__uncached);
 DEFINE_DTRACE_PROBE1(arc__async__upgrade__sync);
 DEFINE_DTRACE_PROBE1(l2arc__hit);
 DEFINE_DTRACE_PROBE1(l2arc__miss);
diff --git a/include/sys/arc.h b/include/sys/arc.h
index b2f8f20e16e5..09a66c31d5f0 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -115,6 +115,7 @@ typedef enum arc_flags
     ARC_FLAG_PREFETCH           = 1 << 2,   /* I/O is a prefetch */
     ARC_FLAG_CACHED             = 1 << 3,   /* I/O was in cache */
     ARC_FLAG_L2CACHE            = 1 << 4,   /* cache in L2ARC */
+    ARC_FLAG_UNCACHED           = 1 << 5,   /* evict after use */
     ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6,   /* long min lifespan */
 
     /*
@@ -228,6 +229,7 @@ typedef enum arc_state_type {
     ARC_STATE_MFU,
     ARC_STATE_MFU_GHOST,
     ARC_STATE_L2C_ONLY,
+    ARC_STATE_UNCACHED,
     ARC_STATE_NUMTYPES
 } arc_state_type_t;
 
@@ -301,8 +303,8 @@ int arc_referenced(arc_buf_t *buf);
 int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_read_done_func_t *done, void *priv, zio_priority_t priority,
     int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
-zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
-    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+    arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp,
     arc_write_done_func_t *ready, arc_write_done_func_t *child_ready,
     arc_write_done_func_t *physdone, arc_write_done_func_t *done,
     void *priv, zio_priority_t priority, int zio_flags,
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index 9c77284ffe75..161f909d84ab 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -46,6 +46,7 @@ extern "C" {
  * ARC_mru_ghost	- recently used, no longer in cache
  * ARC_mfu		- frequently used, currently cached
  * ARC_mfu_ghost	- frequently used, no longer in cache
+ * ARC_uncached	- uncacheable prefetch, to be evicted
  * ARC_l2c_only	- exists in L2ARC but not other states
  * When there are no active references to the buffer, they are
  * linked onto a list in one of these arc states.  These are
@@ -542,6 +543,7 @@ typedef struct arc_stats {
     kstat_named_t arcstat_mru_ghost_hits;
     kstat_named_t arcstat_mfu_hits;
     kstat_named_t arcstat_mfu_ghost_hits;
+    kstat_named_t arcstat_uncached_hits;
     kstat_named_t arcstat_deleted;
     /*
      * Number of buffers that could not be evicted because the hash lock
@@ -744,6 +746,21 @@ typedef struct arc_stats {
      * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
      */
     kstat_named_t arcstat_mfu_ghost_evictable_metadata;
+    /*
+     * Total number of bytes that are going to be evicted from ARC due to
+     * ARC_FLAG_UNCACHED being set.
+     */
+    kstat_named_t arcstat_uncached_size;
+    /*
+     * Number of data bytes that are going to be evicted from ARC due to
+     * ARC_FLAG_UNCACHED being set.
+     */
+    kstat_named_t arcstat_uncached_evictable_data;
+    /*
+     * Number of metadata bytes that are going to be evicted from ARC
+     * due to ARC_FLAG_UNCACHED being set.
+     */
+    kstat_named_t arcstat_uncached_evictable_metadata;
     kstat_named_t arcstat_l2_hits;
     kstat_named_t arcstat_l2_misses;
     /*
@@ -900,6 +917,7 @@ typedef struct arc_sums {
     wmsum_t arcstat_mru_ghost_hits;
     wmsum_t arcstat_mfu_hits;
     wmsum_t arcstat_mfu_ghost_hits;
+    wmsum_t arcstat_uncached_hits;
     wmsum_t arcstat_deleted;
     wmsum_t arcstat_mutex_miss;
     wmsum_t arcstat_access_skip;
@@ -1006,6 +1024,7 @@ typedef struct arc_evict_waiter {
 #define	arc_mfu		(&ARC_mfu)
 #define	arc_mfu_ghost	(&ARC_mfu_ghost)
 #define	arc_l2c_only	(&ARC_l2c_only)
+#define	arc_uncached	(&ARC_uncached)
 
 extern taskq_t *arc_prune_taskq;
 extern arc_stats_t arc_stats;
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 6215a2c129a2..a1ce76b1c763 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -55,6 +55,8 @@ extern "C" {
 #define	DB_RF_NEVERWAIT		(1 << 4)
 #define	DB_RF_CACHED		(1 << 5)
 #define	DB_RF_NO_DECRYPT	(1 << 6)
+#define	DB_RF_PARTIAL_FIRST	(1 << 7)
+#define	DB_RF_PARTIAL_MORE	(1 << 8)
 
 /*
  * The simplified state transition diagram for dbufs looks like:
@@ -321,6 +323,9 @@ typedef struct dmu_buf_impl {
     uint8_t db_pending_evict;
 
     uint8_t db_dirtycnt;
+
+    /* The buffer was partially read.  More reads may follow. */
+    uint8_t db_partial_read;
 } dmu_buf_impl_t;
 
 #define	DBUF_HASH_MUTEX(h, idx) \
diff --git a/include/sys/dnode.h b/include/sys/dnode.h
index 9553988e7ddd..2d741ea36bd0 100644
--- a/include/sys/dnode.h
+++ b/include/sys/dnode.h
@@ -457,15 +457,11 @@ void dnode_free_interior_slots(dnode_t *dn);
 #define	DNODE_IS_DIRTY(_dn)						\
     ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa))
 
-#define	DNODE_IS_CACHEABLE(_dn)						\
+#define	DNODE_LEVEL_IS_CACHEABLE(_dn, _level)				\
     ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
-    (DMU_OT_IS_METADATA((_dn)->dn_type) &&				\
+    (((_level) > 0 || DMU_OT_IS_METADATA((_dn)->dn_type)) &&		\
     (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))
 
-#define	DNODE_META_IS_CACHEABLE(_dn)					\
-    ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
-    (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
-
 /*
  * Used for dnodestats kstat.
 */
diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c
index 48af1eaf8ea7..05d58ad74e53 100644
--- a/module/os/freebsd/zfs/sysctl_os.c
+++ b/module/os/freebsd/zfs/sysctl_os.c
@@ -366,10 +366,10 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
     &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
     &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
-    "size of anonymous state");
+    "size of metadata in anonymous state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
     &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
-    "size of anonymous state");
+    "size of data in anonymous state");
 /* END CSTYLED */
 
 extern arc_state_t ARC_mru;
@@ -424,6 +424,19 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
     "size of data in mfu ghost state");
 /* END CSTYLED */
 
+extern arc_state_t ARC_uncached;
+
+/* BEGIN CSTYLED */
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_size, CTLFLAG_RD,
+    &ARC_uncached.arcs_size.rc_count, 0, "size of uncached state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD,
+    &ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in uncached state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD,
+    &ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in uncached state");
+/* END CSTYLED */
+
 extern arc_state_t ARC_l2c_only;
 
 /* BEGIN CSTYLED */
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 1521caa6edc6..0d8aff9470fc 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -333,6 +333,7 @@ static int arc_state_evict_marker_count;
 
 static kmutex_t arc_evict_lock;
 static boolean_t arc_evict_needed = B_FALSE;
+static clock_t arc_last_uncached_flush;
 
 /*
  * Count of bytes evicted since boot.
@@ -473,13 +474,14 @@ static uint_t zfs_arc_lotsfree_percent = 10;
  */
 static int zfs_arc_prune_task_threads = 1;
 
-/* The 6 states: */
+/* The 7 states: */
 arc_state_t ARC_anon;
 arc_state_t ARC_mru;
 arc_state_t ARC_mru_ghost;
 arc_state_t ARC_mfu;
 arc_state_t ARC_mfu_ghost;
 arc_state_t ARC_l2c_only;
+arc_state_t ARC_uncached;
 
 arc_stats_t arc_stats = {
     { "hits",				KSTAT_DATA_UINT64 },
@@ -501,6 +503,7 @@ arc_stats_t arc_stats = {
     { "mru_ghost_hits",			KSTAT_DATA_UINT64 },
     { "mfu_hits",			KSTAT_DATA_UINT64 },
     { "mfu_ghost_hits",			KSTAT_DATA_UINT64 },
+    { "uncached_hits",			KSTAT_DATA_UINT64 },
     { "deleted",			KSTAT_DATA_UINT64 },
     { "mutex_miss",			KSTAT_DATA_UINT64 },
     { "access_skip",			KSTAT_DATA_UINT64 },
@@ -549,6 +552,9 @@ arc_stats_t arc_stats = {
     { "mfu_ghost_size",			KSTAT_DATA_UINT64 },
     { "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
    { "mfu_ghost_evictable_metadata",	KSTAT_DATA_UINT64 },
+    { "uncached_size",			KSTAT_DATA_UINT64 },
+    { "uncached_evictable_data",	KSTAT_DATA_UINT64 },
+    { "uncached_evictable_metadata",	KSTAT_DATA_UINT64 },
     { "l2_hits",			KSTAT_DATA_UINT64 },
     { "l2_misses",			KSTAT_DATA_UINT64 },
     { "l2_prefetch_asize",		KSTAT_DATA_UINT64 },
@@ -702,6 +708,7 @@ taskq_t *arc_prune_taskq;
     ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
 
 #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
+#define	HDR_UNCACHED(hdr)	((hdr)->b_flags & ARC_FLAG_UNCACHED)
 #define	HDR_L2_READING(hdr)	\
     (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
     ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
@@ -854,6 +861,7 @@ enum arc_hdr_alloc_flags {
     ARC_HDR_ALLOC_RDATA	= 0x1,
     ARC_HDR_DO_ADAPT	= 0x2,
     ARC_HDR_USE_RESERVE	= 0x4,
+    ARC_HDR_ALLOC_LINEAR	= 0x8,
 };
 
 
@@ -866,8 +874,10 @@ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size,
     const void *tag);
 static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t);
 static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int);
+static void arc_hdr_destroy(arc_buf_hdr_t *);
 static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t);
 static void arc_buf_watch(arc_buf_t *);
+static void arc_change_state(arc_state_t *, arc_buf_hdr_t *);
 
 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
@@ -2321,18 +2331,23 @@ remove_reference(arc_buf_hdr_t *hdr, const void *tag)
     ASSERT(HDR_HAS_L1HDR(hdr));
     ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr)));
-    ASSERT(!GHOST_STATE(state));
+    ASSERT(!GHOST_STATE(state));	/* arc_l2c_only counts as a ghost. */
 
-    /*
-     * arc_l2c_only counts as a ghost state so we don't need to explicitly
-     * check to prevent usage of the arc_l2c_only list.
-     */
-    if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
-        (state != arc_anon)) {
-        multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
-        arc_evictable_space_increment(hdr, state);
+    if ((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) != 0)
+        return (cnt);
+
+    if (state == arc_anon) {
+        arc_hdr_destroy(hdr);
+        return (0);
     }
-    return (cnt);
+    if (state == arc_uncached && !HDR_PREFETCH(hdr)) {
+        arc_change_state(arc_anon, hdr);
+        arc_hdr_destroy(hdr);
+        return (0);
+    }
+    multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
+    arc_evictable_space_increment(hdr, state);
+    return (0);
 }
 
 /*
@@ -2439,8 +2454,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
     if (refcnt == 0) {
         if (old_state != arc_anon && old_state != arc_l2c_only) {
             ASSERT(HDR_HAS_L1HDR(hdr));
-            multilist_remove(&old_state->arcs_list[buftype], hdr);
-            arc_evictable_space_decrement(hdr, old_state);
+            /* remove_reference() saves on insert. */
+            if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+                multilist_remove(&old_state->arcs_list[buftype],
+                    hdr);
+                arc_evictable_space_decrement(hdr, old_state);
+            }
         }
         if (new_state != arc_anon && new_state != arc_l2c_only) {
             /*
@@ -3845,7 +3864,6 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag)
         ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
         VERIFY0(remove_reference(hdr, tag));
-        arc_hdr_destroy(hdr);
         return;
     }
 
@@ -3858,8 +3876,8 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag)
     ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
     ASSERT3P(buf->b_data, !=, NULL);
 
-    (void) remove_reference(hdr, tag);
     arc_buf_destroy_impl(buf);
+    (void) remove_reference(hdr, tag);
 
     mutex_exit(hash_lock);
 }
@@ -3874,6 +3892,7 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag)
  *    - arc_mru_ghost -> deleted
  *    - arc_mfu_ghost -> arc_l2c_only
  *    - arc_mfu_ghost -> deleted
+ *    - arc_uncached -> deleted
  *
  * Return total size of evicted data buffers for eviction progress tracking.
 * When evicting from ghost states return logical buffer size to make eviction
@@ -3941,8 +3960,9 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted)
         return (bytes_evicted);
     }
 
-    ASSERT(state == arc_mru || state == arc_mfu);
-    evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+    ASSERT(state == arc_mru || state == arc_mfu || state == arc_uncached);
+    evicted_state = (state == arc_uncached) ? arc_anon :
+        ((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost);
 
     /* prefetch buffers have a minimum lifespan */
     if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
@@ -4013,9 +4033,13 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted)
             arc_hdr_free_abd(hdr, B_TRUE);
 
         arc_change_state(evicted_state, hdr);
-        ASSERT(HDR_IN_HASH_TABLE(hdr));
-        arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
         DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
+        if (evicted_state == arc_anon) {
+            arc_hdr_destroy(hdr);
+            *real_evicted += HDR_FULL_SIZE;
+        } else {
+            ASSERT(HDR_IN_HASH_TABLE(hdr));
+        }
     }
 
     return (bytes_evicted);
@@ -4779,6 +4803,9 @@ arc_flush(spa_t *spa, boolean_t retry)
 
     (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
     (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
+
+    (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_DATA, retry);
+    (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry);
 }
 
 void
@@ -4910,7 +4937,16 @@ arc_evict_cb_check(void *arg, zthr_t *zthr)
      * which is held before this function is called, and is held by
      * arc_wait_for_eviction() when it calls zthr_wakeup().
     */
-    return (arc_evict_needed);
+    if (arc_evict_needed)
+        return (B_TRUE);
+
+    /*
+     * If we have buffers in uncached state, evict them periodically.
+     */
+    return ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) +
+        zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]) &&
+        ddi_get_lbolt() - arc_last_uncached_flush >
+        MSEC_TO_TICK(arc_min_prefetch_ms / 2)));
 }
 
 /*
@@ -4925,8 +4961,14 @@ arc_evict_cb(void *arg, zthr_t *zthr)
     uint64_t evicted = 0;
     fstrans_cookie_t cookie = spl_fstrans_mark();
 
-    /* Evict from cache */
-    evicted = arc_evict();
+    /* Always try to evict from uncached state. */
+    arc_last_uncached_flush = ddi_get_lbolt();
+    evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_DATA, B_FALSE);
+    evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_METADATA, B_FALSE);
+
+    /* Evict from other states only if told to. */
+    if (arc_evict_needed)
+        evicted += arc_evict();
 
     /*
      * If evicted is zero, we couldn't evict anything
@@ -5205,12 +5247,10 @@ arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag,
     arc_buf_contents_t type = arc_buf_type(hdr);
 
     arc_get_data_impl(hdr, size, tag, alloc_flags);
-    if (type == ARC_BUFC_METADATA) {
-        return (abd_alloc(size, B_TRUE));
-    } else {
-        ASSERT(type == ARC_BUFC_DATA);
-        return (abd_alloc(size, B_FALSE));
-    }
+    if (alloc_flags & ARC_HDR_ALLOC_LINEAR)
+        return (abd_alloc_linear(size, type == ARC_BUFC_METADATA));
+    else
+        return (abd_alloc(size, type == ARC_BUFC_METADATA));
 }
 
 static void *
@@ -5475,16 +5515,22 @@ arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit)
     clock_t now = ddi_get_lbolt();
 
     if (hdr->b_l1hdr.b_state == arc_anon) {
+        arc_state_t *new_state;
         /*
-         * This buffer is not in the cache, and does not
-         * appear in our "ghost" list.  Add the new buffer
-         * to the MRU state.
+         * This buffer is not in the cache, and does not appear in
+         * our "ghost" lists.  Add it to the MRU or uncached state.
         */
         ASSERT0(hdr->b_l1hdr.b_arc_access);
         hdr->b_l1hdr.b_arc_access = now;
-        DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
-        arc_change_state(arc_mru, hdr);
-
+        if (HDR_UNCACHED(hdr)) {
+            new_state = arc_uncached;
+            DTRACE_PROBE1(new_state__uncached, arc_buf_hdr_t *,
+                hdr);
+        } else {
+            new_state = arc_mru;
+            DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+        }
+        arc_change_state(new_state, hdr);
     } else if (hdr->b_l1hdr.b_state == arc_mru) {
         /*
          * This buffer has been accessed once recently and either
@@ -5556,6 +5602,14 @@ arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit)
         hdr->b_l1hdr.b_arc_access = now;
         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
         arc_change_state(arc_mfu, hdr);
+    } else if (hdr->b_l1hdr.b_state == arc_uncached) {
+        /*
+         * This buffer is uncacheable, but we got a hit.  Probably
+         * a demand read after prefetch.  Nothing more to do here.
+         */
+        if (!HDR_IO_IN_PROGRESS(hdr))
+            ARCSTAT_BUMP(arcstat_uncached_hits);
+        hdr->b_l1hdr.b_arc_access = now;
     } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
         /*
          * This buffer is on the 2nd Level ARC and was not accessed
@@ -5603,7 +5657,8 @@ arc_buf_access(arc_buf_t *buf)
     mutex_exit(&buf->b_evict_lock);
 
     ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
-        hdr->b_l1hdr.b_state == arc_mfu);
+        hdr->b_l1hdr.b_state == arc_mfu ||
+        hdr->b_l1hdr.b_state == arc_uncached);
 
     DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
     arc_access(hdr, 0, B_TRUE);
@@ -5671,7 +5726,6 @@ arc_read_done(zio_t *zio)
     kmutex_t	*hash_lock = NULL;
     arc_callback_t	*callback_list;
     arc_callback_t	*acb;
-    boolean_t	freeable = B_FALSE;
 
     /*
      * The hdr was inserted into hash-table and removed from lists
@@ -5821,9 +5875,6 @@ arc_read_done(zio_t *zio)
     */
     ASSERT(callback_cnt < 2 || hash_lock != NULL);
 
-    arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
-    (void) remove_reference(hdr, hdr); /* For IO_IN_PROGRESS. */
-
     if (zio->io_error == 0) {
         arc_hdr_verify(hdr, zio->io_bp);
     } else {
@@ -5832,7 +5883,6 @@ arc_read_done(zio_t *zio)
         arc_change_state(arc_anon, hdr);
         if (HDR_IN_HASH_TABLE(hdr))
             buf_hash_remove(hdr);
-        freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
     }
 
     /*
@@ -5842,18 +5892,11 @@ arc_read_done(zio_t *zio)
     */
     cv_broadcast(&hdr->b_l1hdr.b_cv);
 
-    if (hash_lock != NULL) {
+    arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+    (void) remove_reference(hdr, hdr);
+
+    if (hash_lock != NULL)
         mutex_exit(hash_lock);
-    } else {
-        /*
-         * This block was freed while we waited for the read to
-         * complete.  It has been removed from the hash table and
-         * moved to the anonymous state (so that it won't show up
-         * in the cache).
-         */
-        ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
-        freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
-    }
 
     /* execute each callback and free its structure */
     while ((acb = callback_list) != NULL) {
@@ -5889,9 +5932,6 @@ arc_read_done(zio_t *zio)
             kmem_free(acb, sizeof (arc_callback_t));
         }
     }
-
-    if (freeable)
-        arc_hdr_destroy(hdr);
 }
 
 /*
@@ -6072,7 +6112,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
         }
 
         ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
-            hdr->b_l1hdr.b_state == arc_mfu);
+            hdr->b_l1hdr.b_state == arc_mfu ||
+            hdr->b_l1hdr.b_state == arc_uncached);
 
         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
         arc_access(hdr, *arc_flags, B_TRUE);
@@ -6099,9 +6140,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
                 }
             }
             if (rc != 0) {
-                (void) remove_reference(hdr, private);
                 arc_buf_destroy_impl(buf);
                 buf = NULL;
+                (void) remove_reference(hdr, private);
             }
 
             /* assert any errors weren't due to unloaded keys */
@@ -6191,6 +6232,11 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
                 goto top;
             }
         }
+        if (*arc_flags & ARC_FLAG_UNCACHED) {
+            arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
+            if (!encrypted_read)
+                alloc_flags |= ARC_HDR_ALLOC_LINEAR;
+        }
 
         /*
          * Call arc_adapt() explicitly before arc_access() to allow
@@ -6624,7 +6670,7 @@ arc_release(arc_buf_t *buf, const void *tag)
         VERIFY3U(hdr->b_type, ==, type);
         ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
 
-        (void) remove_reference(hdr, tag);
+        VERIFY3S(remove_reference(hdr, tag), >, 0);
 
         if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
             ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
@@ -6897,7 +6943,8 @@ arc_write_ready(zio_t *zio)
         arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA |
             ARC_HDR_USE_RESERVE);
         abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
-    } else if (!abd_size_alloc_linear(arc_buf_size(buf)) ||
+    } else if (!(HDR_UNCACHED(hdr) ||
+        abd_size_alloc_linear(arc_buf_size(buf))) ||
         !arc_can_share(hdr, buf)) {
         /*
          * Ideally, we would always copy the io_abd into b_pabd, but the
@@ -7025,17 +7072,16 @@ arc_write_done(zio_t *zio)
             }
         }
         arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
-        (void) remove_reference(hdr, hdr); /* For IO_IN_PROGRESS. */
+        VERIFY3S(remove_reference(hdr, hdr), >, 0);
         /* if it's not anon, we are doing a scrub */
         if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
             arc_access(hdr, 0, B_FALSE);
         mutex_exit(hash_lock);
     } else {
         arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
-        (void) remove_reference(hdr, hdr); /* For IO_IN_PROGRESS. */
+        VERIFY3S(remove_reference(hdr, hdr), >, 0);
     }
 
-    ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
     callback->awcb_done(zio, buf, callback->awcb_private);
 
     abd_free(zio->io_abd);
@@ -7044,7 +7090,7 @@ zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
-    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc,
+    blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc,
     const zio_prop_t *zp, arc_write_done_func_t *ready,
     arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
     arc_write_done_func_t *done, void *private, zio_priority_t priority,
@@ -7061,7 +7107,9 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
     ASSERT(!HDR_IO_IN_PROGRESS(hdr));
     ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
     ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
-    if (l2arc)
+    if (uncached)
+        arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
+    else if (l2arc)
         arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 
     if (ARC_BUF_ENCRYPTED(buf)) {
@@ -7283,6 +7331,8 @@ arc_kstat_update(kstat_t *ksp, int rw)
             wmsum_value(&arc_sums.arcstat_mfu_hits);
         as->arcstat_mfu_ghost_hits.value.ui64 =
             wmsum_value(&arc_sums.arcstat_mfu_ghost_hits);
+        as->arcstat_uncached_hits.value.ui64 =
+            wmsum_value(&arc_sums.arcstat_uncached_hits);
         as->arcstat_deleted.value.ui64 =
             wmsum_value(&arc_sums.arcstat_deleted);
         as->arcstat_mutex_miss.value.ui64 =
@@ -7352,6 +7402,10 @@ arc_kstat_update(kstat_t *ksp, int rw)
             &as->arcstat_mfu_ghost_size,
             &as->arcstat_mfu_ghost_evictable_data,
             &as->arcstat_mfu_ghost_evictable_metadata);
+        arc_kstat_update_state(arc_uncached,
+            &as->arcstat_uncached_size,
+            &as->arcstat_uncached_evictable_data,
+            &as->arcstat_uncached_evictable_metadata);
 
         as->arcstat_dnode_size.value.ui64 =
             aggsum_value(&arc_sums.arcstat_dnode_size);
@@ -7660,6 +7714,10 @@ arc_state_init(void)
         arc_state_multilist_index_func, &num_sublists);
     arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
         arc_state_multilist_index_func, &num_sublists);
+    arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_METADATA],
+        arc_state_multilist_index_func, &num_sublists);
+    arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_DATA],
+        arc_state_multilist_index_func, &num_sublists);
 
     /*
      * L2 headers should never be on the L2 state list since they don't
@@ -7689,6 +7747,8 @@ arc_state_init(void)
     zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
     zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
     zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
+    zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]);
+    zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]);
 
     zfs_refcount_create(&arc_anon->arcs_size);
     zfs_refcount_create(&arc_mru->arcs_size);
@@ -7696,6 +7756,7 @@ arc_state_init(void)
     zfs_refcount_create(&arc_mfu->arcs_size);
     zfs_refcount_create(&arc_mfu_ghost->arcs_size);
     zfs_refcount_create(&arc_l2c_only->arcs_size);
+    zfs_refcount_create(&arc_uncached->arcs_size);
 
     wmsum_init(&arc_sums.arcstat_hits, 0);
     wmsum_init(&arc_sums.arcstat_iohits, 0);
@@ -7716,6 +7777,7 @@ arc_state_init(void)
     wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0);
     wmsum_init(&arc_sums.arcstat_mfu_hits, 0);
     wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0);
+    wmsum_init(&arc_sums.arcstat_uncached_hits, 0);
     wmsum_init(&arc_sums.arcstat_deleted, 0);
     wmsum_init(&arc_sums.arcstat_mutex_miss, 0);
     wmsum_init(&arc_sums.arcstat_access_skip, 0);
@@ -7800,6 +7862,7 @@ arc_state_init(void)
     arc_mfu->arcs_state = ARC_STATE_MFU;
     arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
     arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
+    arc_uncached->arcs_state = ARC_STATE_UNCACHED;
 }
 
 static void
@@ -7817,6 +7880,8 @@ arc_state_fini(void)
     zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
     zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
     zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
+    zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]);
+    zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]);
 
     zfs_refcount_destroy(&arc_anon->arcs_size);
     zfs_refcount_destroy(&arc_mru->arcs_size);
@@ -7824,6 +7889,7 @@ arc_state_fini(void)
     zfs_refcount_destroy(&arc_mfu->arcs_size);
     zfs_refcount_destroy(&arc_mfu_ghost->arcs_size);
     zfs_refcount_destroy(&arc_l2c_only->arcs_size);
+    zfs_refcount_destroy(&arc_uncached->arcs_size);
 
     multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
     multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
@@ -7835,6 +7901,8 @@ arc_state_fini(void)
     multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
     multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
     multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
+    multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]);
+    multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]);
 
     wmsum_fini(&arc_sums.arcstat_hits);
     wmsum_fini(&arc_sums.arcstat_iohits);
@@ -7855,6 +7923,7 @@ arc_state_fini(void)
     wmsum_fini(&arc_sums.arcstat_mru_ghost_hits);
     wmsum_fini(&arc_sums.arcstat_mfu_hits);
    wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits);
+    wmsum_fini(&arc_sums.arcstat_uncached_hits);
     wmsum_fini(&arc_sums.arcstat_deleted);
     wmsum_fini(&arc_sums.arcstat_mutex_miss);
     wmsum_fini(&arc_sums.arcstat_access_skip);
@@ -8039,8 +8108,8 @@ arc_init(void)
 
     arc_state_evict_markers =
         arc_state_alloc_markers(arc_state_evict_marker_count);
-    arc_evict_zthr = zthr_create("arc_evict",
-        arc_evict_cb_check, arc_evict_cb, NULL, defclsyspri);
+    arc_evict_zthr = zthr_create_timer("arc_evict",
+        arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1), defclsyspri);
     arc_reap_zthr = zthr_create_timer("arc_reap",
         arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri);
 
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 5e1ed83863bb..efaa13317be0 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1608,7 +1608,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
     DTRACE_SET_STATE(db, "read issued");
     mutex_exit(&db->db_mtx);
 
-    if (dbuf_is_l2cacheable(db))
+    if (!DBUF_IS_CACHEABLE(db))
+        aflags |= ARC_FLAG_UNCACHED;
+    else if (dbuf_is_l2cacheable(db))
         aflags |= ARC_FLAG_L2CACHE;
 
     dbuf_add_ref(db, NULL);
@@ -1736,10 +1738,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
     dn = DB_DNODE(db);
 
     prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
-        (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
-        DBUF_IS_CACHEABLE(db);
+        (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL;
 
     mutex_enter(&db->db_mtx);
+    if (flags & DB_RF_PARTIAL_FIRST)
+        db->db_partial_read = B_TRUE;
+    else if (!(flags & DB_RF_PARTIAL_MORE))
+        db->db_partial_read = B_FALSE;
     if (db->db_state == DB_CACHED) {
         /*
          * Ensure that this block's dnode has been decrypted if
@@ -3463,8 +3468,9 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
     dpa->dpa_cb = cb;
     dpa->dpa_arg = arg;
 
-    /* flag if L2ARC eligible, l2arc_noprefetch then decides */
-    if (dnode_level_is_l2cacheable(&bp, dn, level))
+    if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))
+        dpa->dpa_aflags |= ARC_FLAG_UNCACHED;
+    else if (dnode_level_is_l2cacheable(&bp, dn, level))
         dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
 
     /*
@@ -3853,59 +3859,38 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
              * This dbuf has anonymous data associated with it.
              */
            dbuf_destroy(db);
-        } else {
-            boolean_t do_arc_evict = B_FALSE;
-            blkptr_t bp;
-            spa_t *spa = dmu_objset_spa(db->db_objset);
-
-            if (!DBUF_IS_CACHEABLE(db) &&
-                db->db_blkptr != NULL &&
-                !BP_IS_HOLE(db->db_blkptr) &&
-                !BP_IS_EMBEDDED(db->db_blkptr)) {
-                do_arc_evict = B_TRUE;
-                bp = *db->db_blkptr;
-            }
-
-            if (!DBUF_IS_CACHEABLE(db) ||
-                db->db_pending_evict) {
-                dbuf_destroy(db);
-            } else if (!multilist_link_active(&db->db_cache_link)) {
-                ASSERT3U(db->db_caching_status, ==,
-                    DB_NO_CACHE);
-
-                dbuf_cached_state_t dcs =
-                    dbuf_include_in_metadata_cache(db) ?
-                    DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
-                db->db_caching_status = dcs;
-
-                multilist_insert(&dbuf_caches[dcs].cache, db);
-                uint64_t db_size = db->db.db_size;
-                size = zfs_refcount_add_many(
-                    &dbuf_caches[dcs].size, db_size, db);
-                uint8_t db_level = db->db_level;
-                mutex_exit(&db->db_mtx);
-
-                if (dcs == DB_DBUF_METADATA_CACHE) {
-                    DBUF_STAT_BUMP(metadata_cache_count);
-                    DBUF_STAT_MAX(
-                        metadata_cache_size_bytes_max,
-                        size);
-                } else {
-                    DBUF_STAT_BUMP(cache_count);
-                    DBUF_STAT_MAX(cache_size_bytes_max,
-                        size);
-                    DBUF_STAT_BUMP(cache_levels[db_level]);
-                    DBUF_STAT_INCR(
-                        cache_levels_bytes[db_level],
-                        db_size);
-                }
+        } else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
+            db->db_pending_evict) {
+            dbuf_destroy(db);
+        } else if (!multilist_link_active(&db->db_cache_link)) {
+            ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+
+            dbuf_cached_state_t dcs =
+                dbuf_include_in_metadata_cache(db) ?
+                DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+            db->db_caching_status = dcs;
+
+            multilist_insert(&dbuf_caches[dcs].cache, db);
+            uint64_t db_size = db->db.db_size;
+            size = zfs_refcount_add_many(
+                &dbuf_caches[dcs].size, db_size, db);
+            uint8_t db_level = db->db_level;
+            mutex_exit(&db->db_mtx);
 
-                if (dcs == DB_DBUF_CACHE && !evicting)
-                    dbuf_evict_notify(size);
+            if (dcs == DB_DBUF_METADATA_CACHE) {
+                DBUF_STAT_BUMP(metadata_cache_count);
+                DBUF_STAT_MAX(metadata_cache_size_bytes_max,
+                    size);
+            } else {
+                DBUF_STAT_BUMP(cache_count);
+                DBUF_STAT_MAX(cache_size_bytes_max, size);
+                DBUF_STAT_BUMP(cache_levels[db_level]);
+                DBUF_STAT_INCR(cache_levels_bytes[db_level],
+                    db_size);
             }
 
-            if (do_arc_evict)
-                arc_freed(spa, &bp);
+            if (dcs == DB_DBUF_CACHE && !evicting)
+                dbuf_evict_notify(size);
         }
     } else {
         mutex_exit(&db->db_mtx);
@@ -5083,8 +5068,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
         children_ready_cb = dbuf_write_children_ready;
 
     dr->dr_zio = arc_write(pio, os->os_spa, txg,
-        &dr->dr_bp_copy, data, dbuf_is_l2cacheable(db),
-        &zp, dbuf_write_ready,
+        &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
+        dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
         children_ready_cb, dbuf_write_physdone,
         dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
         ZIO_FLAG_MUSTSUCCEED, &zb);
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 45304e7ddf7a..d6a9f813c270 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -549,14 +549,14 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
         ZIO_FLAG_CANFAIL);
     blkid = dbuf_whichblock(dn, 0, offset);
     if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
-        DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
+        length <= zfetch_array_rd_sz) {
         /*
          * Prepare the zfetch before initiating the demand reads, so
         * that if multiple threads block on same indirect block, we
         * base predictions on the original less racy request order.
         */
-        zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
-            read && DNODE_IS_CACHEABLE(dn), B_TRUE);
+        zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read,
+            B_TRUE);
     }
     for (i = 0; i < nblks; i++) {
         dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
@@ -579,6 +579,14 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
          * state will not yet be CACHED.
          */
         if (read) {
+            if (i == nblks - 1 && blkid + i < dn->dn_maxblkid &&
+                offset + length < db->db.db_offset +
+                db->db.db_size) {
+                if (offset <= db->db.db_offset)
+                    dbuf_flags |= DB_RF_PARTIAL_FIRST;
+                else
+                    dbuf_flags |= DB_RF_PARTIAL_MORE;
+            }
             (void) dbuf_read(db, zio, dbuf_flags);
             if (db->db_state != DB_CACHED)
                 missed = B_TRUE;
@@ -1850,8 +1858,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
     dsa->dsa_zgd = zgd;
     dsa->dsa_tx = NULL;
 
-    zio_nowait(arc_write(pio, os->os_spa, txg,
-        zgd->zgd_bp, dr->dt.dl.dr_data, dbuf_is_l2cacheable(db),
+    zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
+        dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
         &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
         ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
 
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index c17c829a04d8..5083b1763412 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1694,7 +1694,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
     }
 
     zio = arc_write(pio, os->os_spa, tx->tx_txg,
-        blkptr_copy, os->os_phys_buf, dmu_os_is_l2cacheable(os),
+        blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os),
         &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
         os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 28f64369d8dd..a7a554691a78 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -214,7 +214,12 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
     rw_exit(&dn->dn_struct_rwlock);
     if (db == NULL)
         return (SET_ERROR(EIO));
-    err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
+    /*
+     * PARTIAL_FIRST allows caching for uncacheable blocks.  It will
+     * be cleared after dmu_buf_will_dirty() calls dbuf_read() again.
+     */
+    err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH |
+        (level == 0 ? DB_RF_PARTIAL_FIRST : 0));
     dbuf_rele(db, FTAG);
     return (err);
 }
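For review, the partial-read tracking described in the commit message
reduces to a small state machine over db_partial_read.  The following
commented restatement of the dmu.c and dbuf.c hunks above is
illustrative only; the conditions are the patch's, the comments are
added:

    /* dmu.c: classify how the request overlaps its last dbuf. */
    if (i == nblks - 1 &&                   /* last dbuf of the request */
        blkid + i < dn->dn_maxblkid &&      /* not the dnode's last block */
        offset + length <                   /* request stops short of */
        db->db.db_offset + db->db.db_size) {        /* the buffer's tail */
            if (offset <= db->db.db_offset)         /* covered the head */
                    dbuf_flags |= DB_RF_PARTIAL_FIRST;
            else                                    /* a follow-up read */
                    dbuf_flags |= DB_RF_PARTIAL_MORE;
    }

    /*
     * dbuf.c: track the buffer.  A read carrying neither flag (one that
     * reached the buffer's end, or an unrelated access) clears
     * db_partial_read, so the next release destroys an uncacheable dbuf
     * instead of parking it in the dbuf cache.
     */
    if (flags & DB_RF_PARTIAL_FIRST)
            db->db_partial_read = B_TRUE;
    else if (!(flags & DB_RF_PARTIAL_MORE))
            db->db_partial_read = B_FALSE;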