Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[blob, buildbot] Illumos 5056 - ZFS deadlock on db_mtx and dn_holds #3240 #3241

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -1928,8 +1928,8 @@ dump_dir(objset_t *os)
if (dds.dds_type == DMU_OST_META) {
dds.dds_creation_txg = TXG_INITIAL;
usedobjs = BP_GET_FILL(os->os_rootbp);
refdbytes = os->os_spa->spa_dsl_pool->
dp_mos_dir->dd_phys->dd_used_bytes;
refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
dd_used_bytes;
} else {
dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
}
Expand Down
9 changes: 9 additions & 0 deletions include/sys/avl.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
* Use is subject to license terms.
*/

/*
* Copyright (c) 2014 by Delphix. All rights reserved.
*/

#ifndef _AVL_H
#define _AVL_H

Expand Down Expand Up @@ -259,6 +263,11 @@ extern boolean_t avl_update(avl_tree_t *, void *);
extern boolean_t avl_update_lt(avl_tree_t *, void *);
extern boolean_t avl_update_gt(avl_tree_t *, void *);

/*
* Swaps the contents of the two trees.
*/
extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2);

/*
* Return the number of nodes in the tree
*/
Expand Down
16 changes: 10 additions & 6 deletions include/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/

#ifndef _SYS_DBUF_H
Expand Down Expand Up @@ -66,8 +67,13 @@ extern "C" {
* | |
* | |
* +--------> NOFILL -------+
*
* DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
* to find all dbufs in a range of a dnode and must be less than any other
* dbuf_states_t (see comment on dn_dbufs in dnode.h).
*/
typedef enum dbuf_states {
DB_SEARCH = -1,
DB_UNCACHED,
DB_FILL,
DB_NOFILL,
Expand Down Expand Up @@ -217,14 +223,12 @@ typedef struct dmu_buf_impl {
* Our link on the owner dnode's dn_dbufs list.
* Protected by its dn_dbufs_mtx.
*/
list_node_t db_link;
avl_node_t db_link;

/* Data which is unique to data (leaf) blocks: */

/* stuff we store for the user (see dmu_buf_set_user) */
void *db_user_ptr;
void **db_user_data_ptr_ptr;
dmu_buf_evict_func_t *db_evict_func;
/* User callback information. */
dmu_buf_user_t *db_user;

uint8_t db_immediate_evict;
uint8_t db_freed_in_flight;
Expand Down
140 changes: 110 additions & 30 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2014 HybridCluster. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/

/* Portions Copyright 2010 Robert Milkowski */
Expand All @@ -39,11 +40,9 @@
* dmu_spa.h.
*/

#include <sys/zfs_context.h>
#include <sys/inttypes.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/cred.h>
#include <sys/time.h>
#include <sys/fs/zfs.h>
#include <sys/uio.h>

Expand Down Expand Up @@ -288,8 +287,6 @@ typedef struct dmu_buf {
void *db_data; /* data in buffer */
} dmu_buf_t;

typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);

/*
* The names of zap entries in the DIRECTORY_OBJECT of the MOS.
*/
Expand Down Expand Up @@ -475,43 +472,126 @@ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);

typedef void dmu_buf_evict_func_t(void *user_ptr);

/*
* A DMU buffer user object may be associated with a dbuf for the
* duration of its lifetime. This allows the user of a dbuf (client)
* to attach private data to a dbuf (e.g. in-core only data such as a
* dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
* when that dbuf has been evicted. Clients typically respond to the
* eviction notification by freeing their private data, thus ensuring
* the same lifetime for both dbuf and private data.
*
* The mapping from a dmu_buf_user_t to any client private data is the
* client's responsibility. All current consumers of the API with private
* data embed a dmu_buf_user_t as the first member of the structure for
* their private data. This allows conversions between the two types
* with a simple cast. Since the DMU buf user API never needs access
* to the private data, other strategies can be employed if necessary
* or convenient for the client (e.g. using container_of() to do the
* conversion for private data that cannot have the dmu_buf_user_t as
* its first member).
*
* Eviction callbacks are executed without the dbuf mutex held or any
* other type of mechanism to guarantee that the dbuf is still available.
* For this reason, users must assume the dbuf has already been freed
* and not reference the dbuf from the callback context.
*
* Users requesting "immediate eviction" are notified as soon as the dbuf
* is only referenced by dirty records (dirties == holds). Otherwise the
* notification occurs after eviction processing for the dbuf begins.
*/
typedef struct dmu_buf_user {
/*
* Asynchronous user eviction callback state.
*/
taskq_ent_t dbu_tqent;

/*
* This instance's eviction function pointer.
*
* Assigned by dmu_buf_init_user(), which asserts that this field is
* NULL on entry and that the function being installed is non-NULL.
*/
dmu_buf_evict_func_t *dbu_evict_func;
#ifdef ZFS_DEBUG
/*
* Pointer to user's dbuf pointer. NULL for clients that do
* not associate a dbuf with their user data.
*
* The dbuf pointer is cleared upon eviction so as to catch
* use-after-evict bugs in clients.
*
* This member exists only when ZFS_DEBUG is defined, so the size of
* dmu_buf_user_t differs between ZFS_DEBUG and non-ZFS_DEBUG builds.
*/
dmu_buf_t **dbu_clear_on_evict_dbufp;
#endif
} dmu_buf_user_t;

/*
 * Initialize the given dmu_buf_user_t instance with the eviction function
 * evict_func, to be called when the user is evicted.
 *
 *	dbu			user record to initialize; its
 *				dbu_evict_func must be NULL on entry.
 *	evict_func		eviction callback to install; must not be
 *				NULL.
 *	clear_on_evict_dbufp	stored in dbu_clear_on_evict_dbufp when
 *				ZFS_DEBUG is defined; ignored otherwise.
 *
 * NOTE: This function should only be called once on a given dmu_buf_user_t.
 * To allow enforcement of this, dbu must already be zeroed on entry.
 */
#ifdef __lint
/* Very ugly, but it beats issuing suppression directives in many Makefiles. */
extern void
dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
    dmu_buf_t **clear_on_evict_dbufp);
#else /* __lint */
static inline void
dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
    dmu_buf_t **clear_on_evict_dbufp)
{
	/* Catches a second initialization, or a dbu that was not zeroed. */
	ASSERT(dbu->dbu_evict_func == NULL);
	ASSERT(evict_func != NULL);
	dbu->dbu_evict_func = evict_func;
#ifdef ZFS_DEBUG
	dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
#else
	/*
	 * The field only exists in ZFS_DEBUG builds.  Reference the
	 * argument so that includers of this header do not get an
	 * unused-parameter warning in non-debug builds.
	 */
	(void) clear_on_evict_dbufp;
#endif
}
#endif /* __lint */

/*
* Returns NULL on success, or the existing user ptr if it's already
* been set.
* Attach user data to a dbuf and mark it for normal (when the dbuf's
* data is cleared or its reference count goes to zero) eviction processing.
*
* user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
*
* user_data_ptr_ptr should be NULL, or a pointer to a pointer which
* will be set to db->db_data when you are allowed to access it. Note
* that db->db_data (the pointer) can change when you do dmu_buf_read(),
* dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
* *user_data_ptr_ptr will be set to the new value when it changes.
* Returns NULL on success, or the existing user if another user currently
* owns the buffer.
*/
void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);

/*
* Attach user data to a dbuf and mark it for immediate (its dirty and
* reference counts are equal) eviction processing.
*
* If non-NULL, pageout func will be called when this buffer is being
* excised from the cache, so that you can clean up the data structure
* pointed to by user_ptr.
* Returns NULL on success, or the existing user if another user currently
* owns the buffer.
*/
void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);

/*
* Replace the current user of a dbuf.
*
* dmu_evict_user() will call the pageout func for all buffers in a
* objset with a given pageout func.
* If given the current user of a dbuf, replaces the dbuf's user with
* "new_user" and returns the user data pointer that was replaced.
* Otherwise returns the current, and unmodified, dbuf user pointer.
*/
void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
dmu_buf_evict_func_t *pageout_func);
void *dmu_buf_replace_user(dmu_buf_t *db,
dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);

/*
* set_user_ie is the same as set_user, but request immediate eviction
* when hold count goes to zero.
* Remove the specified user data for a DMU buffer.
*
* Returns the user that was removed on success, or the current user if
* another user currently owns the buffer.
*/
void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
void *user_ptr, void *user_data_ptr_ptr,
dmu_buf_evict_func_t *pageout_func);
void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);

/*
* Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
* Returns the user data (dmu_buf_user_t *) associated with this dbuf.
*/
void *dmu_buf_get_user(dmu_buf_t *db);

/* Block until any in-progress dmu buf user evictions complete. */
void dmu_buf_user_evict_wait(void);

/*
* Returns the blkptr associated with this dbuf, or NULL if not set.
*/
Expand Down
14 changes: 10 additions & 4 deletions include/sys/dmu_objset.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/

/* Portions Copyright 2010 Robert Milkowski */
Expand Down Expand Up @@ -74,22 +75,25 @@ struct objset {
arc_buf_t *os_phys_buf;
objset_phys_t *os_phys;
/*
* The following "special" dnodes have no parent and are exempt from
* dnode_move(), but they root their descendents in this objset using
* handles anyway, so that all access to dnodes from dbufs consistently
* uses handles.
* The following "special" dnodes have no parent, are exempt
* from dnode_move(), and are not recorded in os_dnodes, but they
* root their descendents in this objset using handles anyway, so
* that all access to dnodes from dbufs consistently uses handles.
*/
dnode_handle_t os_meta_dnode;
dnode_handle_t os_userused_dnode;
dnode_handle_t os_groupused_dnode;
zilog_t *os_zil;

list_node_t os_evicting_node;

/* can change, under dsl_dir's locks: */
enum zio_checksum os_checksum;
enum zio_compress os_compress;
uint8_t os_copies;
enum zio_checksum os_dedup_checksum;
boolean_t os_dedup_verify;
boolean_t os_evicting;
zfs_logbias_op_t os_logbias;
zfs_cache_type_t os_primary_cache;
zfs_cache_type_t os_secondary_cache;
Expand Down Expand Up @@ -168,6 +172,8 @@ int dmu_objset_userspace_upgrade(objset_t *os);
boolean_t dmu_objset_userspace_present(objset_t *os);
int dmu_fsname(const char *snapname, char *buf);

void dmu_objset_evict_done(objset_t *os);

void dmu_objset_init(void);
void dmu_objset_fini(void);

Expand Down
3 changes: 2 additions & 1 deletion include/sys/dmu_send.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/

#ifndef _DMU_SEND_H
Expand Down Expand Up @@ -56,6 +56,7 @@ typedef struct dmu_recv_cookie {
zio_cksum_t drc_cksum;
uint64_t drc_newsnapobj;
void *drc_owner;
cred_t *drc_cred;
} dmu_recv_cookie_t;

int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
Expand Down
19 changes: 16 additions & 3 deletions include/sys/dnode.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/

#ifndef _SYS_DNODE_H
Expand Down Expand Up @@ -233,7 +234,18 @@ typedef struct dnode {
refcount_t dn_holds;

kmutex_t dn_dbufs_mtx;
list_t dn_dbufs; /* descendent dbufs */
/*
* Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs
* can contain multiple dbufs of the same (level, blkid) when a
* dbuf is marked DB_EVICTING without being removed from
* dn_dbufs. To maintain the avl invariant that there cannot be
* duplicate entries, we order the dbufs by an arbitrary value -
* their address in memory. This means that dn_dbufs cannot be used to
* directly look up a dbuf. Instead, callers must use avl_walk, have
* a reference to the dbuf, or look up a non-existent node with
* db_state = DB_SEARCH (see dbuf_free_range for an example).
*/
avl_tree_t dn_dbufs;

/* protected by dn_struct_rwlock */
struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
Expand Down Expand Up @@ -266,8 +278,9 @@ typedef struct dnode_handle {
} dnode_handle_t;

typedef struct dnode_children {
dmu_buf_user_t dnc_dbu; /* User evict data */
size_t dnc_count; /* number of children */
dnode_handle_t dnc_children[1]; /* sized dynamically */
dnode_handle_t dnc_children[]; /* sized dynamically */
} dnode_children_t;

typedef struct free_range {
Expand All @@ -276,7 +289,7 @@ typedef struct free_range {
uint64_t fr_nblks;
} free_range_t;

dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
void dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
uint64_t object, dnode_handle_t *dnh);
void dnode_special_close(dnode_handle_t *dnh);

Expand Down
Loading