Skip to content

Commit

Permalink
deadlock between spa_errlog_lock and dp_config_rwlock
Browse files Browse the repository at this point in the history
There is a lock order inversion deadlock between `spa_errlog_lock` and
`dp_config_rwlock`:

A thread in `spa_delete_dataset_errlog()` is running from a sync task.
It is holding the `dp_config_rwlock` for writer (see
`dsl_sync_task_sync()`), and waiting for the `spa_errlog_lock`.

A thread in `dsl_pool_config_enter()` is holding the `spa_errlog_lock`
(see `spa_get_errlog_size()`) and waiting for the `dp_config_rwlock` (as
reader).

Note that this was introduced by openzfs#12812.

This commit address this by defining the lock ordering to be
dp_config_rwlock first, then spa_errlog_lock / spa_errlist_lock.
spa_get_errlog() and spa_get_errlog_size() can acquire the locks in this
order, and then process_error_block() and get_head_and_birth_txg() can
verify that the dp_config_rwlock is already held.

Tested by having some errors in the pool (via `zinject -t data
/path/to/file`), one thread running `zpool iostat 0.001`, and another
thread runs `zfs destroy` (in a loop, although it hits the first time).
This reproduces the problem easily without the fix, and works with the
fix.

Closes openzfs#14239
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
  • Loading branch information
ahrens committed Dec 13, 2022
1 parent 5f73bbb commit a76caae
Showing 1 changed file with 23 additions and 11 deletions.
34 changes: 23 additions & 11 deletions module/zfs/spa_errlog.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,8 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
dsl_dataset_t *ds;
objset_t *os;

dsl_pool_config_enter(dp, FTAG);
int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds);
if (error != 0) {
dsl_pool_config_exit(dp, FTAG);
return (error);
}
ASSERT(head_dataset_id);
Expand All @@ -172,7 +170,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
error = dmu_objset_from_ds(ds, &os);
if (error != 0) {
dsl_dataset_rele(ds, FTAG);
dsl_pool_config_exit(dp, FTAG);
return (error);
}

Expand All @@ -189,7 +186,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
ZFS_KEYSTATUS_UNAVAILABLE) {
zep->zb_birth = 0;
dsl_dataset_rele(ds, FTAG);
dsl_pool_config_exit(dp, FTAG);
return (0);
}

Expand All @@ -199,7 +195,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
error = dnode_hold(os, zep->zb_object, FTAG, &dn);
if (error != 0) {
dsl_dataset_rele(ds, FTAG);
dsl_pool_config_exit(dp, FTAG);
return (error);
}

Expand All @@ -225,7 +220,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
dsl_dataset_rele(ds, FTAG);
dsl_pool_config_exit(dp, FTAG);
return (error);
}

Expand Down Expand Up @@ -479,9 +473,6 @@ static int
process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
uint64_t *count, void *uaddr, boolean_t only_count)
{
dsl_pool_t *dp = spa->spa_dsl_pool;
uint64_t top_affected_fs;

/*
* If the zb_birth is 0 it means we failed to retrieve the birth txg
* of the block pointer. This happens when an encrypted filesystem is
Expand All @@ -504,13 +495,12 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
return (0);
}

dsl_pool_config_enter(dp, FTAG);
uint64_t top_affected_fs;
int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs);
if (error == 0)
error = check_filesystem(spa, top_affected_fs, zep, count,
uaddr, only_count);

dsl_pool_config_exit(dp, FTAG);
return (error);
}

Expand Down Expand Up @@ -687,6 +677,12 @@ spa_get_errlog_size(spa_t *spa)
{
uint64_t total = 0;

/*
* The pool config lock is needed to hold a dataset_t via (among other
* places) get_errlist_size() -> get_head_and_birth_txg(), and lock
* ordering requires that we get it before the spa_errlog_lock.
*/
dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
mutex_enter(&spa->spa_errlog_lock);
uint64_t count;
Expand Down Expand Up @@ -718,6 +714,7 @@ spa_get_errlog_size(spa_t *spa)
mutex_exit(&spa->spa_errlist_lock);
#endif
}
dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
return (total);
}

Expand Down Expand Up @@ -988,6 +985,12 @@ spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count)
int ret = 0;

#ifdef _KERNEL
/*
* The pool config lock is needed to hold a dataset_t via (among other
* places) process_error_list() -> get_head_and_birth_txg(), and lock
* ordering requires that we get it before the spa_errlog_lock.
*/
dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
mutex_enter(&spa->spa_errlog_lock);

ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
Expand All @@ -1006,6 +1009,7 @@ spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count)
mutex_exit(&spa->spa_errlist_lock);

mutex_exit(&spa->spa_errlog_lock);
dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
#else
(void) spa, (void) uaddr, (void) count;
#endif
Expand Down Expand Up @@ -1174,6 +1178,13 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
spa->spa_scrub_finished = B_FALSE;

mutex_exit(&spa->spa_errlist_lock);

/*
* The pool config lock is needed to hold a dataset_t via
* sync_error_list() -> get_head_and_birth_txg(), and lock ordering
* requires that we get it before the spa_errlog_lock.
*/
dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
mutex_enter(&spa->spa_errlog_lock);

tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
Expand Down Expand Up @@ -1218,6 +1229,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
dmu_tx_commit(tx);

mutex_exit(&spa->spa_errlog_lock);
dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
}

static void
Expand Down

0 comments on commit a76caae

Please sign in to comment.