Implement uncached prefetch.

Before this change the primarycache property was handled only at the dbuf layer, controlling the dbuf cache and, through a hack, ARC evictions. Since the speculative prefetcher is implemented at the ARC level, it had to be disabled for uncacheable buffers, because otherwise falsely prefetched and never read buffers would stay cached in ARC. This change gives ARC knowledge about uncacheable buffers. It is passed to arc_read() and arc_write() and stored in the ARC header. When remove_reference() drops the last reference on the ARC header, it can either immediately destroy it or, if it is marked as prefetch, put it into the new arc_uncached state. That state is scanned every second, looking for stale buffers that were not demand read (buffers that are demand read are evicted immediately). To handle cases of short or misaligned reads, this change tracks at the dbuf layer buffers that were read from the beginning, but not to the end. It is assumed that such buffers may receive further reads, and so they are stored in the dbuf cache. If one of the following reads reaches the end of such a buffer, it is immediately evicted. Otherwise it will follow regular dbuf cache eviction and will be evicted from ARC at the same moment. Since the dbuf layer does not know the actual file size, this logic is not applied to the last buffers of dnodes, which are always evicted the same as before. Since uncacheable buffers should no longer stay in ARC for long, this patch also tries to optimize I/O by allocating ARC physical buffers as linear to allow buffer sharing. This avoids one of the two memory copies for uncompressed data for both reads and writes, at the cost of somewhat higher KVA usage in case of prefetch. When decompression is needed, sharing is impossible, but this still avoids an extra memory copy, since decompression still requires a linear buffer. With the combination of enabled prefetch and the avoided memory copy, this change improves sequential single-threaded read speed from a wide NVMe pool from 2049 to 3932 MiB/s.
During writes, the profiler shows a 22% reduction of unhalted CPU cycles at the same throughput of 3653 MiB/s. Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc.
openzfs · Dec 22, 2022 · f9647b9 · f9647b9
1 parent 2407d0f
commit f9647b9
Show file tree

Hide file tree

Showing 10 changed files with 237 additions and 138 deletions.
diff --git a/include/os/linux/zfs/sys/trace_arc.h b/include/os/linux/zfs/sys/trace_arc.h
@@ -108,6 +108,7 @@ DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu);
+DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__uncached);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss);
@@ -392,6 +393,7 @@ DEFINE_DTRACE_PROBE1(arc__evict);
 DEFINE_DTRACE_PROBE1(arc__delete);
 DEFINE_DTRACE_PROBE1(new_state__mru);
 DEFINE_DTRACE_PROBE1(new_state__mfu);
+DEFINE_DTRACE_PROBE1(new_state__uncached);
 DEFINE_DTRACE_PROBE1(arc__async__upgrade__sync);
 DEFINE_DTRACE_PROBE1(l2arc__hit);
 DEFINE_DTRACE_PROBE1(l2arc__miss);

diff --git a/include/sys/arc.h b/include/sys/arc.h
@@ -115,6 +115,7 @@ typedef enum arc_flags
 	ARC_FLAG_PREFETCH		= 1 << 2,	/* I/O is a prefetch */
 	ARC_FLAG_CACHED			= 1 << 3,	/* I/O was in cache */
 	ARC_FLAG_L2CACHE		= 1 << 4,	/* cache in L2ARC */
+	ARC_FLAG_UNCACHED		= 1 << 5,	/* evict after use */
 	ARC_FLAG_PRESCIENT_PREFETCH	= 1 << 6,	/* long min lifespan */
 
 	/*
@@ -228,6 +229,7 @@ typedef enum arc_state_type {
 	ARC_STATE_MFU,
 	ARC_STATE_MFU_GHOST,
 	ARC_STATE_L2C_ONLY,
+	ARC_STATE_UNCACHED,
 	ARC_STATE_NUMTYPES
 } arc_state_type_t;
 
@@ -301,8 +303,8 @@ int arc_referenced(arc_buf_t *buf);
 int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_read_done_func_t *done, void *priv, zio_priority_t priority,
     int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
-zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
-    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+    arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp,
     arc_write_done_func_t *ready, arc_write_done_func_t *child_ready,
     arc_write_done_func_t *physdone, arc_write_done_func_t *done,
     void *priv, zio_priority_t priority, int zio_flags,

diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
@@ -46,6 +46,7 @@ extern "C" {
  *	ARC_mru_ghost	- recently used, no longer in cache
  *	ARC_mfu		- frequently used, currently cached
  *	ARC_mfu_ghost	- frequently used, no longer in cache
+ *	ARC_uncached	- uncacheable prefetch, to be evicted
  *	ARC_l2c_only	- exists in L2ARC but not other states
  * When there are no active references to the buffer, they are
  * are linked onto a list in one of these arc states.  These are
@@ -542,6 +543,7 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_mru_ghost_hits;
 	kstat_named_t arcstat_mfu_hits;
 	kstat_named_t arcstat_mfu_ghost_hits;
+	kstat_named_t arcstat_uncached_hits;
 	kstat_named_t arcstat_deleted;
 	/*
 	 * Number of buffers that could not be evicted because the hash lock
@@ -744,6 +746,21 @@ typedef struct arc_stats {
 	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
 	 */
 	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
+	/*
+	 * Total number of bytes that are going to be evicted from ARC due to
+	 * ARC_FLAG_UNCACHED being set.
+	 */
+	kstat_named_t arcstat_uncached_size;
+	/*
+	 * Number of data bytes that are going to be evicted from ARC due to
+	 * ARC_FLAG_UNCACHED being set.
+	 */
+	kstat_named_t arcstat_uncached_evictable_data;
+	/*
+	 * Number of metadata bytes that are going to be evicted from ARC
+	 * due to ARC_FLAG_UNCACHED being set.
+	 */
+	kstat_named_t arcstat_uncached_evictable_metadata;
 	kstat_named_t arcstat_l2_hits;
 	kstat_named_t arcstat_l2_misses;
 	/*
@@ -900,6 +917,7 @@ typedef struct arc_sums {
 	wmsum_t arcstat_mru_ghost_hits;
 	wmsum_t arcstat_mfu_hits;
 	wmsum_t arcstat_mfu_ghost_hits;
+	wmsum_t arcstat_uncached_hits;
 	wmsum_t arcstat_deleted;
 	wmsum_t arcstat_mutex_miss;
 	wmsum_t arcstat_access_skip;
@@ -1006,6 +1024,7 @@ typedef struct arc_evict_waiter {
 #define	arc_mfu		(&ARC_mfu)
 #define	arc_mfu_ghost	(&ARC_mfu_ghost)
 #define	arc_l2c_only	(&ARC_l2c_only)
+#define	arc_uncached	(&ARC_uncached)
 
 extern taskq_t *arc_prune_taskq;
 extern arc_stats_t arc_stats;

diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
@@ -55,6 +55,8 @@ extern "C" {
 #define	DB_RF_NEVERWAIT		(1 << 4)
 #define	DB_RF_CACHED		(1 << 5)
 #define	DB_RF_NO_DECRYPT	(1 << 6)
+#define	DB_RF_PARTIAL_FIRST	(1 << 7)
+#define	DB_RF_PARTIAL_MORE	(1 << 8)
 
 /*
  * The simplified state transition diagram for dbufs looks like:
@@ -321,6 +323,9 @@ typedef struct dmu_buf_impl {
 	uint8_t db_pending_evict;
 
 	uint8_t db_dirtycnt;
+
+	/* The buffer was partially read.  More reads may follow. */
+	uint8_t db_partial_read;
 } dmu_buf_impl_t;
 
 #define	DBUF_HASH_MUTEX(h, idx) \

diff --git a/include/sys/dnode.h b/include/sys/dnode.h
@@ -457,15 +457,11 @@ void dnode_free_interior_slots(dnode_t *dn);
 #define	DNODE_IS_DIRTY(_dn)						\
 	((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa))
 
-#define	DNODE_IS_CACHEABLE(_dn)						\
+#define	DNODE_LEVEL_IS_CACHEABLE(_dn, _level)				\
 	((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
-	(DMU_OT_IS_METADATA((_dn)->dn_type) &&				\
+	(((_level) > 0 || DMU_OT_IS_METADATA((_dn)->dn_type)) &&	\
 	(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))
 
-#define	DNODE_META_IS_CACHEABLE(_dn)					\
-	((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
-	(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
-
 /*
  * Used for dnodestats kstat.
  */

diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c
@@ -366,10 +366,10 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
 	&ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
 	&ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
-	"size of anonymous state");
+	"size of metadata in anonymous state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
 	&ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
-	"size of anonymous state");
+	"size of data in anonymous state");
 /* END CSTYLED */
 
 extern arc_state_t ARC_mru;
@@ -424,6 +424,19 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
 	"size of data in mfu ghost state");
 /* END CSTYLED */
 
+extern arc_state_t ARC_uncached;
+
+/* BEGIN CSTYLED */
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_size, CTLFLAG_RD,
+	&ARC_uncached.arcs_size.rc_count, 0, "size of uncached state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD,
+	&ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+	"size of metadata in uncached state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD,
+	&ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+	"size of data in uncached state");
+/* END CSTYLED */
+
 extern arc_state_t ARC_l2c_only;
 
 /* BEGIN CSTYLED */