From 0689f76c08c5e553ff25ac43a852b56c430bb61e Mon Sep 17 00:00:00 2001 From: Adam Leventhal Date: Fri, 1 Mar 2013 15:46:07 -0800 Subject: [PATCH] 3582 zfs_delay() should support a variable resolution 3584 DTrace sdt probes for ZFS txg states Reviewed by: Matthew Ahrens Reviewed by: George Wilson Reviewed by: Christopher Siden Reviewed by: Dan McDonald Reviewed by: Richard Elling Approved by: Garrett D'Amore --- .../cmd/cmd-inet/usr.sbin/ifconfig/revarp.c | 2 -- usr/src/lib/libzpool/common/kernel.c | 35 +++++++++++++++++++ usr/src/lib/libzpool/common/sys/zfs_context.h | 2 ++ usr/src/uts/common/conf/param.c | 8 ++--- usr/src/uts/common/fs/zfs/dsl_dir.c | 3 +- usr/src/uts/common/fs/zfs/dsl_pool.c | 18 ++++++---- usr/src/uts/common/fs/zfs/dsl_scan.c | 8 ++--- usr/src/uts/common/fs/zfs/spa_misc.c | 6 ++-- usr/src/uts/common/fs/zfs/sys/txg.h | 9 ++--- usr/src/uts/common/fs/zfs/sys/txg_impl.h | 12 ++++--- usr/src/uts/common/fs/zfs/txg.c | 27 ++++++++------ usr/src/uts/common/os/condvar.c | 2 -- usr/src/uts/common/sys/condvar.h | 2 ++ usr/src/uts/common/sys/time.h | 3 ++ 14 files changed, 94 insertions(+), 43 deletions(-) diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c index aba479494273..83279f7db278 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/revarp.c @@ -35,8 +35,6 @@ #define IPADDRL sizeof (struct in_addr) #define RARPRETRIES 5 -#define MSEC2NSEC(msec) ((msec) * 1000000) -#define NSEC2MSEC(nsec) ((nsec) / 1000000) /* * The following value (8) is determined to work reliably in switched 10/100MB diff --git a/usr/src/lib/libzpool/common/kernel.c b/usr/src/lib/libzpool/common/kernel.c index 96280941a646..4dd614f7c1bc 100644 --- a/usr/src/lib/libzpool/common/kernel.c +++ b/usr/src/lib/libzpool/common/kernel.c @@ -329,6 +329,41 @@ cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) return (1); } +/*ARGSUSED*/ +clock_t 
+cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, + int flag) +{ + int error; + timestruc_t ts; + hrtime_t delta; + + ASSERT(flag == 0); + +top: + delta = tim; /* tim is a relative timeout (flag == 0) */ + if (delta <= 0) + return (-1); + + ts.tv_sec = delta / NANOSEC; + ts.tv_nsec = delta % NANOSEC; + + ASSERT(mutex_owner(mp) == curthread); + mp->m_owner = NULL; + error = cond_reltimedwait(cv, &mp->m_lock, &ts); + mp->m_owner = curthread; + + if (error == ETIME) + return (-1); + + if (error == EINTR) + goto top; + + ASSERT(error == 0); + + return (1); +} + void cv_signal(kcondvar_t *cv) { diff --git a/usr/src/lib/libzpool/common/sys/zfs_context.h b/usr/src/lib/libzpool/common/sys/zfs_context.h index 7802da4e2b00..a46c657a1d70 100644 --- a/usr/src/lib/libzpool/common/sys/zfs_context.h +++ b/usr/src/lib/libzpool/common/sys/zfs_context.h @@ -254,6 +254,8 @@ extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg); extern void cv_destroy(kcondvar_t *cv); extern void cv_wait(kcondvar_t *cv, kmutex_t *mp); extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); +extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag); extern void cv_signal(kcondvar_t *cv); extern void cv_broadcast(kcondvar_t *cv); diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c index d72cfb0b8f1e..a01b8797560c 100644 --- a/usr/src/uts/common/conf/param.c +++ b/usr/src/uts/common/conf/param.c @@ -695,10 +695,10 @@ param_init(void) * should re-evaluate their usage and specify the appropriate * resolution. 
*/ - time_res[TR_NANOSEC] = SEC; - time_res[TR_MICROSEC] = MILLISEC; - time_res[TR_MILLISEC] = MICROSEC; - time_res[TR_SEC] = NANOSEC; + time_res[TR_NANOSEC] = NANOSEC / NANOSEC; + time_res[TR_MICROSEC] = NANOSEC / MICROSEC; + time_res[TR_MILLISEC] = NANOSEC / MILLISEC; + time_res[TR_SEC] = NANOSEC / SEC; time_res[TR_CLOCK_TICK] = nsec_per_tick; } diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 1e7ba6d6cbef..c53c3c8e877c 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -738,7 +738,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx); } else { if (err == EAGAIN) { - txg_delay(dd->dd_pool, tx->tx_txg, 1); + txg_delay(dd->dd_pool, tx->tx_txg, + MSEC2NSEC(10), MSEC2NSEC(10)); err = ERESTART; } dsl_pool_memory_pressure(dd->dd_pool); diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 6af631679e9e..c83068a00101 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -58,6 +58,9 @@ kmutex_t zfs_write_limit_lock; static pgcnt_t old_physmem = 0; +hrtime_t zfs_throttle_delay = MSEC2NSEC(10); +hrtime_t zfs_throttle_resolution = MSEC2NSEC(10); + int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) { @@ -511,12 +514,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * Weight the throughput calculation towards the current value: * thru = 3/4 old_thru + 1/4 new_thru * - * Note: write_time is in nanosecs, so write_time/MICROSEC - * yields millisecs + * Note: write_time is in nanosecs while dp_throughput is expressed in + * bytes per millisecond. 
*/ ASSERT(zfs_write_limit_min > 0); - if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) { - uint64_t throughput = data_written / (write_time / MICROSEC); + if (data_written > zfs_write_limit_min / 8 && + write_time > MSEC2NSEC(1)) { + uint64_t throughput = data_written / NSEC2MSEC(write_time); if (dp->dp_throughput) dp->dp_throughput = throughput / 4 + @@ -614,8 +618,10 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) * the caller 1 clock tick. This will slow down the "fill" * rate until the sync process can catch up with us. */ - if (reserved && reserved > (write_limit - (write_limit >> 3))) - txg_delay(dp, tx->tx_txg, 1); + if (reserved && reserved > (write_limit - (write_limit >> 3))) { + txg_delay(dp, tx->tx_txg, zfs_throttle_delay, + zfs_throttle_resolution); + } return (0); } diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c index 3de3c6e4d7f9..167f3825e5c6 100644 --- a/usr/src/uts/common/fs/zfs/dsl_scan.c +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c @@ -403,7 +403,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb) zfs_resilver_min_time_ms : zfs_scan_min_time_ms; elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || - (elapsed_nanosecs / MICROSEC > mintime && + (NSEC2MSEC(elapsed_nanosecs) > mintime && txg_sync_waiting(scn->scn_dp)) || spa_shutting_down(scn->scn_dp->dp_spa)) { if (zb) { @@ -1308,7 +1308,7 @@ dsl_scan_free_should_pause(dsl_scan_t *scn) elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || - (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && + (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms && txg_sync_waiting(scn->scn_dp)) || spa_shutting_down(scn->scn_dp->dp_spa)); } @@ -1433,7 +1433,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) "free_bpobj/bptree txg %llu", (longlong_t)scn->scn_visited_this_txg, (longlong_t) - 
(gethrtime() - scn->scn_sync_start_time) / MICROSEC, + NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), (longlong_t)tx->tx_txg); scn->scn_visited_this_txg = 0; /* @@ -1481,7 +1481,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) zfs_dbgmsg("visited %llu blocks in %llums", (longlong_t)scn->scn_visited_this_txg, - (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC); + (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time)); if (!scn->scn_pausing) { /* finished with scan. */ diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 733d2609e55e..1663abbb5eac 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -499,8 +499,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) hdlr.cyh_arg = spa; hdlr.cyh_level = CY_LOW_LEVEL; - spa->spa_deadman_synctime = zfs_deadman_synctime * - zfs_txg_synctime_ms * MICROSEC; + spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime * + zfs_txg_synctime_ms); /* * This determines how often we need to check for hung I/Os after @@ -508,7 +508,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) * an expensive operation we don't want to check too frequently. * Instead wait for 5 synctimes before checking again. 
*/ - when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC; + when.cyt_interval = MSEC2NSEC(5 * zfs_txg_synctime_ms); when.cyt_when = CY_INFINITY; mutex_enter(&cpu_lock); spa->spa_deadman_cycid = cyclic_add(&hdlr, &when); diff --git a/usr/src/uts/common/fs/zfs/sys/txg.h b/usr/src/uts/common/fs/zfs/sys/txg.h index 2df33f0fb0a7..1529e5ac6da4 100644 --- a/usr/src/uts/common/fs/zfs/sys/txg.h +++ b/usr/src/uts/common/fs/zfs/sys/txg.h @@ -74,13 +74,8 @@ extern void txg_rele_to_quiesce(txg_handle_t *txghp); extern void txg_rele_to_sync(txg_handle_t *txghp); extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); -/* - * Delay the caller by the specified number of ticks or until - * the txg closes (whichever comes first). This is intended - * to be used to throttle writers when the system nears its - * capacity. - */ -extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks); +extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta, + hrtime_t resolution); /* * Wait until the given transaction group has finished syncing. diff --git a/usr/src/uts/common/fs/zfs/sys/txg_impl.h b/usr/src/uts/common/fs/zfs/sys/txg_impl.h index 7b356eac1293..8f1b21b3d460 100644 --- a/usr/src/uts/common/fs/zfs/sys/txg_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/txg_impl.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. 
+ */ + #ifndef _SYS_TXG_IMPL_H #define _SYS_TXG_IMPL_H @@ -36,14 +40,14 @@ extern "C" { struct tx_cpu { kmutex_t tc_lock; kcondvar_t tc_cv[TXG_SIZE]; - uint64_t tc_count[TXG_SIZE]; + uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */ list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ - char tc_pad[16]; + char tc_pad[16]; /* pad to fill 3 cache lines */ }; typedef struct tx_state { - tx_cpu_t *tx_cpu; /* protects right to enter txg */ - kmutex_t tx_sync_lock; /* protects tx_state_t */ + tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */ + kmutex_t tx_sync_lock; /* protects the rest of this struct */ uint64_t tx_open_txg; /* currently open txg id */ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ uint64_t tx_syncing_txg; /* currently syncing txg id */ diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 58690e325fee..232cdd961edb 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -232,7 +232,7 @@ txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) } static void -txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time) +txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) { CALLB_CPR_SAFE_BEGIN(cpr); @@ -353,6 +353,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) ASSERT(txg == tx->tx_open_txg); tx->tx_open_txg++; + DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg); + DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg); + /* * Now that we've incremented tx_open_txg, we can let threads * enter the next transaction group. 
@@ -475,6 +478,7 @@ txg_sync_thread(dsl_pool_t *dp) txg = tx->tx_quiesced_txg; tx->tx_quiesced_txg = 0; tx->tx_syncing_txg = txg; + DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_quiesce_more_cv); dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", @@ -488,6 +492,7 @@ txg_sync_thread(dsl_pool_t *dp) mutex_enter(&tx->tx_sync_lock); tx->tx_synced_txg = txg; tx->tx_syncing_txg = 0; + DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_sync_done_cv); /* @@ -536,21 +541,22 @@ txg_quiesce_thread(dsl_pool_t *dp) */ dprintf("quiesce done, handing off txg %llu\n", txg); tx->tx_quiesced_txg = txg; + DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_sync_more_cv); cv_broadcast(&tx->tx_quiesce_done_cv); } } /* - * Delay this thread by 'ticks' if we are still in the open transaction - * group and there is already a waiting txg quiesing or quiesced. Abort - * the delay if this txg stalls or enters the quiesing state. + * Delay this thread by 'delay' nanoseconds if we are still in the open + * transaction group and there is already a waiting txg quiescing or quiesced. + * Abort the delay if this txg stalls or enters the quiescing state. 
*/ void -txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) +txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) { tx_state_t *tx = &dp->dp_tx; - clock_t timeout = ddi_get_lbolt() + ticks; + hrtime_t start = gethrtime(); /* don't delay if this txg could transition to quiesing immediately */ if (tx->tx_open_txg > txg || @@ -563,10 +569,11 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) return; } - while (ddi_get_lbolt() < timeout && - tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) - (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, - timeout); + while (gethrtime() - start < delay && + tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) { + (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv, + &tx->tx_sync_lock, delay, resolution, 0); + } mutex_exit(&tx->tx_sync_lock); } diff --git a/usr/src/uts/common/os/condvar.c b/usr/src/uts/common/os/condvar.c index 60ff344d827b..e9c418ffbd31 100644 --- a/usr/src/uts/common/os/condvar.c +++ b/usr/src/uts/common/os/condvar.c @@ -43,8 +43,6 @@ #include #include -clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, hrtime_t, int); - /* * CV_MAX_WAITERS is the maximum number of waiters we track; once * the number becomes higher than that, we look at the sleepq to diff --git a/usr/src/uts/common/sys/condvar.h b/usr/src/uts/common/sys/condvar.h index 56e660e5e206..ef72c3c567fa 100644 --- a/usr/src/uts/common/sys/condvar.h +++ b/usr/src/uts/common/sys/condvar.h @@ -94,6 +94,8 @@ extern void cv_destroy(kcondvar_t *); extern void cv_wait(kcondvar_t *, kmutex_t *); extern void cv_wait_stop(kcondvar_t *, kmutex_t *, int); extern clock_t cv_timedwait(kcondvar_t *, kmutex_t *, clock_t); +extern clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, hrtime_t, + int); extern clock_t cv_reltimedwait(kcondvar_t *, kmutex_t *, clock_t, time_res_t); extern int cv_wait_sig(kcondvar_t *, kmutex_t *); extern clock_t cv_timedwait_sig(kcondvar_t *, kmutex_t *, clock_t); diff --git 
a/usr/src/uts/common/sys/time.h b/usr/src/uts/common/sys/time.h index fcb9a290feac..9fc470486079 100644 --- a/usr/src/uts/common/sys/time.h +++ b/usr/src/uts/common/sys/time.h @@ -236,6 +236,9 @@ struct itimerval32 { #define MICROSEC 1000000 #define NANOSEC 1000000000 +#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) +#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) + #endif /* !defined(__XOPEN_OR_POSIX) || defined(__EXTENSIONS__) */ #ifndef _ASM