diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 1d76f2a7da3e..29d26abc59c9 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1928,8 +1928,8 @@ dump_dir(objset_t *os) if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; usedobjs = BP_GET_FILL(os->os_rootbp); - refdbytes = os->os_spa->spa_dsl_pool-> - dp_mos_dir->dd_phys->dd_used_bytes; + refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> + dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } diff --git a/include/sys/avl.h b/include/sys/avl.h index ba305c908239..9bc2b4a32e0e 100644 --- a/include/sys/avl.h +++ b/include/sys/avl.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + #ifndef _AVL_H #define _AVL_H @@ -259,6 +263,11 @@ extern boolean_t avl_update(avl_tree_t *, void *); extern boolean_t avl_update_lt(avl_tree_t *, void *); extern boolean_t avl_update_gt(avl_tree_t *, void *); +/* + * Swaps the contents of the two trees. + */ +extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2); + /* * Return the number of nodes in the tree */ diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 1eabfd7daccf..c2f4f8bd0efc 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -20,8 +20,9 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_DBUF_H @@ -66,8 +67,13 @@ extern "C" { * | | * | | * +--------> NOFILL -------+ + * + * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range + * to find all dbufs in a range of a dnode and must be less than any other + * dbuf_states_t (see comment on dn_dbufs in dnode.h). */ typedef enum dbuf_states { + DB_SEARCH = -1, DB_UNCACHED, DB_FILL, DB_NOFILL, @@ -217,14 +223,12 @@ typedef struct dmu_buf_impl { * Our link on the owner dnodes's dn_dbufs list. * Protected by its dn_dbufs_mtx. */ - list_node_t db_link; + avl_node_t db_link; /* Data which is unique to data (leaf) blocks: */ - /* stuff we store for the user (see dmu_buf_set_user) */ - void *db_user_ptr; - void **db_user_data_ptr_ptr; - dmu_buf_evict_func_t *db_evict_func; + /* User callback information. */ + dmu_buf_user_t *db_user; uint8_t db_immediate_evict; uint8_t db_freed_in_flight; diff --git a/include/sys/dmu.h b/include/sys/dmu.h index c9c687b5aa62..b2f1efae011a 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -24,6 +24,7 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -39,11 +40,9 @@ * dmu_spa.h. */ +#include #include -#include -#include #include -#include #include #include @@ -288,8 +287,6 @@ typedef struct dmu_buf { void *db_data; /* data in buffer */ } dmu_buf_t; -typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); - /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ @@ -475,43 +472,126 @@ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); +typedef void dmu_buf_evict_func_t(void *user_ptr); + +/* + * A DMU buffer user object may be associated with a dbuf for the + * duration of its lifetime. This allows the user of a dbuf (client) + * to attach private data to a dbuf (e.g. in-core only data such as a + * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified + * when that dbuf has been evicted. Clients typically respond to the + * eviction notification by freeing their private data, thus ensuring + * the same lifetime for both dbuf and private data. + * + * The mapping from a dmu_buf_user_t to any client private data is the + * client's responsibility. All current consumers of the API with private + * data embed a dmu_buf_user_t as the first member of the structure for + * their private data. This allows conversions between the two types + * with a simple cast. Since the DMU buf user API never needs access + * to the private data, other strategies can be employed if necessary + * or convenient for the client (e.g. using container_of() to do the + * conversion for private data that cannot have the dmu_buf_user_t as + * its first member). + * + * Eviction callbacks are executed without the dbuf mutex held or any + * other type of mechanism to guarantee that the dbuf is still available. + * For this reason, users must assume the dbuf has already been freed + * and not reference the dbuf from the callback context. + * + * Users requesting "immediate eviction" are notified as soon as the dbuf + * is only referenced by dirty records (dirties == holds). Otherwise the + * notification occurs after eviction processing for the dbuf begins. + */ +typedef struct dmu_buf_user { + /* + * Asynchronous user eviction callback state. + */ + taskq_ent_t dbu_tqent; + + /* This instance's eviction function pointer. */ + dmu_buf_evict_func_t *dbu_evict_func; +#ifdef ZFS_DEBUG + /* + * Pointer to user's dbuf pointer. NULL for clients that do + * not associate a dbuf with their user data. + * + * The dbuf pointer is cleared upon eviction so as to catch + * use-after-evict bugs in clients. + */ + dmu_buf_t **dbu_clear_on_evict_dbufp; +#endif +} dmu_buf_user_t; + +/* + * Initialize the given dmu_buf_user_t instance with the eviction function + * evict_func, to be called when the user is evicted. + * + * NOTE: This function should only be called once on a given dmu_buf_user_t. + * To allow enforcement of this, dbu must already be zeroed on entry. + */ +#ifdef __lint +/* Very ugly, but it beats issuing suppression directives in many Makefiles. */ +extern void +dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, + dmu_buf_t **clear_on_evict_dbufp); +#else /* __lint */ +static inline void +dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, + dmu_buf_t **clear_on_evict_dbufp) +{ + ASSERT(dbu->dbu_evict_func == NULL); + ASSERT(evict_func != NULL); + dbu->dbu_evict_func = evict_func; +#ifdef ZFS_DEBUG + dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; +#endif +} +#endif /* __lint */ + /* - * Returns NULL on success, or the existing user ptr if it's already - * been set. + * Attach user data to a dbuf and mark it for normal (when the dbuf's + * data is cleared or its reference count goes to zero) eviction processing. * - * user_ptr is for use by the user and can be obtained via dmu_buf_get_user(). - * - * user_data_ptr_ptr should be NULL, or a pointer to a pointer which - * will be set to db->db_data when you are allowed to access it. Note - * that db->db_data (the pointer) can change when you do dmu_buf_read(), - * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill(). - * *user_data_ptr_ptr will be set to the new value when it changes. + * Returns NULL on success, or the existing user if another user currently + * owns the buffer. + */ +void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Attach user data to a dbuf and mark it for immediate (its dirty and + * reference counts are equal) eviction processing. * - * If non-NULL, pageout func will be called when this buffer is being - * excised from the cache, so that you can clean up the data structure - * pointed to by user_ptr. + * Returns NULL on success, or the existing user if another user currently + * owns the buffer. + */ +void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Replace the current user of a dbuf. * - * dmu_evict_user() will call the pageout func for all buffers in a - * objset with a given pageout func. + * If given the current user of a dbuf, replaces the dbuf's user with + * "new_user" and returns the user data pointer that was replaced. + * Otherwise returns the current, and unmodified, dbuf user pointer. */ -void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *pageout_func); +void *dmu_buf_replace_user(dmu_buf_t *db, + dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); + /* - * set_user_ie is the same as set_user, but request immediate eviction - * when hold count goes to zero. + * Remove the specified user data for a DMU buffer. + * + * Returns the user that was removed on success, or the current user if + * another user currently owns the buffer. */ -void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, - void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func); -void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, - void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *pageout_func); -void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); +void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); /* - * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set. + * Returns the user data (dmu_buf_user_t *) associated with this dbuf. */ void *dmu_buf_get_user(dmu_buf_t *db); +/* Block until any in-progress dmu buf user evictions complete. */ +void dmu_buf_user_evict_wait(void); + /* * Returns the blkptr associated with this dbuf, or NULL if not set. */ diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index cbf0394e6db1..65ae850f4d28 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -74,22 +75,25 @@ struct objset { arc_buf_t *os_phys_buf; objset_phys_t *os_phys; /* - * The following "special" dnodes have no parent and are exempt from - * dnode_move(), but they root their descendents in this objset using - * handles anyway, so that all access to dnodes from dbufs consistently - * uses handles. + * The following "special" dnodes have no parent, are exempt + * from dnode_move(), and are not recorded in os_dnodes, but they + * root their descendents in this objset using handles anyway, so + * that all access to dnodes from dbufs consistently uses handles. */ dnode_handle_t os_meta_dnode; dnode_handle_t os_userused_dnode; dnode_handle_t os_groupused_dnode; zilog_t *os_zil; + list_node_t os_evicting_node; + /* can change, under dsl_dir's locks: */ enum zio_checksum os_checksum; enum zio_compress os_compress; uint8_t os_copies; enum zio_checksum os_dedup_checksum; boolean_t os_dedup_verify; + boolean_t os_evicting; zfs_logbias_op_t os_logbias; zfs_cache_type_t os_primary_cache; zfs_cache_type_t os_secondary_cache; @@ -168,6 +172,8 @@ int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); int dmu_fsname(const char *snapname, char *buf); +void dmu_objset_evict_done(objset_t *os); + void dmu_objset_init(void); void dmu_objset_fini(void); diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h index de590f1d503b..dc183c02c350 100644 --- a/include/sys/dmu_send.h +++ b/include/sys/dmu_send.h @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ #ifndef _DMU_SEND_H @@ -56,6 +56,7 @@ typedef struct dmu_recv_cookie { zio_cksum_t drc_cksum; uint64_t drc_newsnapobj; void *drc_owner; + cred_t *drc_cred; } dmu_recv_cookie_t; int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, diff --git a/include/sys/dnode.h b/include/sys/dnode.h index b63549b46753..50e01155903a 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_DNODE_H @@ -233,7 +234,18 @@ typedef struct dnode { refcount_t dn_holds; kmutex_t dn_dbufs_mtx; - list_t dn_dbufs; /* descendent dbufs */ + /* + * Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs + * can contain multiple dbufs of the same (level, blkid) when a + * dbuf is marked DB_EVICTING without being removed from + * dn_dbufs. To maintain the avl invariant that there cannot be + * duplicate entries, we order the dbufs by an arbitrary value - + * their address in memory. This means that dn_dbufs cannot be used to + * directly look up a dbuf. Instead, callers must use avl_walk, have + * a reference to the dbuf, or look up a non-existant node with + * db_state = DB_SEARCH (see dbuf_free_range for an example). + */ + avl_tree_t dn_dbufs; /* protected by dn_struct_rwlock */ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ @@ -266,8 +278,9 @@ typedef struct dnode_handle { } dnode_handle_t; typedef struct dnode_children { + dmu_buf_user_t dnc_dbu; /* User evict data */ size_t dnc_count; /* number of children */ - dnode_handle_t dnc_children[1]; /* sized dynamically */ + dnode_handle_t dnc_children[]; /* sized dynamically */ } dnode_children_t; typedef struct free_range { @@ -276,7 +289,7 @@ typedef struct free_range { uint64_t fr_nblks; } free_range_t; -dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp, +void dnode_special_open(struct objset *dd, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh); void dnode_special_close(dnode_handle_t *dnh); diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index 4ef70adc27cf..7e5c0a7cba7b 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -21,8 +21,9 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_DSL_DATASET_H @@ -48,7 +49,7 @@ struct dsl_pool; #define DS_FLAG_INCONSISTENT (1ULL<<0) #define DS_IS_INCONSISTENT(ds) \ - ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) + (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) /* * Do not allow this dataset to be promoted. @@ -68,7 +69,7 @@ struct dsl_pool; */ #define DS_FLAG_DEFER_DESTROY (1ULL<<3) #define DS_IS_DEFER_DESTROY(ds) \ - ((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY) + (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY) /* * DS_FIELD_* are strings that are used in the "extensified" dataset zap object. @@ -125,12 +126,14 @@ typedef struct dsl_dataset_phys { } dsl_dataset_phys_t; typedef struct dsl_dataset { + dmu_buf_user_t ds_dbu; + /* Immutable: */ struct dsl_dir *ds_dir; - dsl_dataset_phys_t *ds_phys; dmu_buf_t *ds_dbuf; uint64_t ds_object; uint64_t ds_fsid_guid; + boolean_t ds_is_snapshot; /* only used in syncing context, only valid for non-snapshots: */ struct dsl_dataset *ds_prev; @@ -177,17 +180,20 @@ typedef struct dsl_dataset { char ds_snapname[MAXNAMELEN]; } dsl_dataset_t; +static inline dsl_dataset_phys_t * +dsl_dataset_phys(dsl_dataset_t *ds) +{ + return (ds->ds_dbuf->db_data); +} + /* * The max length of a temporary tag prefix is the number of hex digits * required to express UINT64_MAX plus one for the hyphen. */ #define MAX_TAG_PREFIX_LEN 17 -#define dsl_dataset_is_snapshot(ds) \ - ((ds)->ds_phys->ds_num_children != 0) - #define DS_UNIQUE_IS_ACCURATE(ds) \ - (((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) + ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag, dsl_dataset_t **dsp); @@ -266,7 +272,7 @@ int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, dmu_tx_t *tx); int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx, boolean_t recv); + dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr); void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dmu_tx_t *tx); @@ -276,7 +282,8 @@ void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds); int dsl_dataset_get_snapname(dsl_dataset_t *ds); int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value); -int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx); +int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, + boolean_t adj_cnt); void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, zprop_source_t source, uint64_t value, dmu_tx_t *tx); void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx); diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index a0a3ef1ded19..55f3a8e5baa9 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -21,6 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_DSL_DIR_H @@ -38,6 +40,14 @@ extern "C" { struct dsl_dataset; +/* + * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object. + * They should be of the format :. + */ + +#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count" +#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" + typedef enum dd_used { DD_USED_HEAD, DD_USED_SNAP, @@ -75,12 +85,15 @@ typedef struct dsl_dir_phys { } dsl_dir_phys_t; struct dsl_dir { + dmu_buf_user_t dd_dbu; + /* These are immutable; no lock needed: */ uint64_t dd_object; - dsl_dir_phys_t *dd_phys; - dmu_buf_t *dd_dbuf; dsl_pool_t *dd_pool; + /* Stable until user eviction; no lock needed: */ + dmu_buf_t *dd_dbuf; + /* protected by lock on pool's dp_dirty_dirs list */ txg_node_t dd_dirty_link; @@ -102,7 +115,14 @@ struct dsl_dir { char dd_myname[MAXNAMELEN]; }; +static inline dsl_dir_phys_t * +dsl_dir_phys(dsl_dir_t *dd) +{ + return (dd->dd_dbuf->db_data); +} + void dsl_dir_rele(dsl_dir_t *dd, void *tag); +void dsl_dir_async_rele(dsl_dir_t *dd, void *tag); int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dir_t **, const char **tail); int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, @@ -129,8 +149,13 @@ int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota); int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation); +int dsl_dir_activate_fs_ss_limit(const char *); +int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *, + cred_t *); +void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *); int dsl_dir_rename(const char *oldname, const char *newname); -int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); +int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, + uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *); boolean_t dsl_dir_is_clone(dsl_dir_t *dd); void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, uint64_t reservation, cred_t *cr, dmu_tx_t *tx); @@ -138,6 +163,8 @@ void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx); +void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx); +boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 477d98daa173..87a91fef0e03 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -142,6 +142,10 @@ typedef enum { ZFS_PROP_LOGICALUSED, ZFS_PROP_LOGICALREFERENCED, ZFS_PROP_INCONSISTENT, /* not exposed to the user */ + ZFS_PROP_FILESYSTEM_LIMIT, + ZFS_PROP_SNAPSHOT_LIMIT, + ZFS_PROP_FILESYSTEM_COUNT, + ZFS_PROP_SNAPSHOT_COUNT, ZFS_PROP_SNAPDEV, ZFS_PROP_ACLTYPE, ZFS_PROP_SELINUX_CONTEXT, diff --git a/include/sys/sa.h b/include/sys/sa.h index 7b5b03a5629f..48e3bcd7cdf3 100644 --- a/include/sys/sa.h +++ b/include/sys/sa.h @@ -133,7 +133,6 @@ int sa_update_from_cb(sa_handle_t *, sa_attr_type_t, uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *); void sa_object_info(sa_handle_t *, dmu_object_info_t *); void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *); -void sa_update_user(sa_handle_t *, sa_handle_t *); void *sa_get_userdata(sa_handle_t *); void sa_set_userp(sa_handle_t *, void *); dmu_buf_t *sa_get_db(sa_handle_t *); diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h index fcbd8eb34e91..6f2f1db6dcf9 100644 --- a/include/sys/sa_impl.h +++ b/include/sys/sa_impl.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_SA_IMPL_H @@ -208,11 +209,12 @@ typedef enum sa_data_op { */ struct sa_handle { + dmu_buf_user_t sa_dbu; kmutex_t sa_lock; dmu_buf_t *sa_bonus; dmu_buf_t *sa_spill; objset_t *sa_os; - void *sa_userp; + void *sa_userp; sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */ sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */ }; diff --git a/include/sys/spa.h b/include/sys/spa.h index 83b6723a4f73..75bf1a0acbd8 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_SPA_H @@ -680,6 +681,7 @@ extern spa_t *spa_next(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); +extern void spa_async_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_NONE 0x00 @@ -789,6 +791,9 @@ extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); +extern void spa_evicting_os_register(spa_t *, objset_t *os); +extern void spa_evicting_os_deregister(spa_t *, objset_t *os); +extern void spa_evicting_os_wait(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_prev_software_version(spa_t *spa); extern int spa_busy(void); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 72e04e8cfe35..0b5f33bd8c20 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_SPA_IMPL_H @@ -144,6 +145,9 @@ struct spa { uint64_t spa_claim_max_txg; /* highest claimed birth txg */ timespec_t spa_loaded_ts; /* 1st successful open time */ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ + kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */ + list_t spa_evicting_os_list; /* Objsets being evicted. */ + kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ uint64_t spa_config_guid; /* config pool guid */ diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 1dc322e02f6f..028018a16591 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_ZAP_IMPL_H @@ -70,7 +71,7 @@ typedef struct mzap_ent { } mzap_ent_t; #define MZE_PHYS(zap, mze) \ - (&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid]) + (&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid]) /* * The (fat) zap is stored in one object. It is an array of @@ -104,7 +105,7 @@ struct zap_leaf; * word number (1<zap_f.zap_phys) \ + ((uint64_t *)zap_f_phys(zap)) \ [(idx) + (1<zap_dbuf->db_data); +} + +static inline mzap_phys_t * +zap_m_phys(zap_t *zap) +{ + return (zap->zap_dbuf->db_data); +} + typedef struct zap_name { zap_t *zn_zap; int zn_key_intlen; @@ -187,7 +198,7 @@ boolean_t zap_match(zap_name_t *zn, const char *matchname); int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); void zap_unlockdir(zap_t *zap); -void zap_evict(dmu_buf_t *db, void *vmzap); +void zap_evict(void *dbu); zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); void zap_name_free(zap_name_t *zn); int zap_hashbits(zap_t *zap); diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h index f6947a72d70e..e784c5963b2e 100644 --- a/include/sys/zap_leaf.h +++ b/include/sys/zap_leaf.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_ZAP_LEAF_H @@ -83,7 +84,7 @@ struct zap_stats; */ #define ZAP_LEAF_CHUNK(l, idx) \ ((zap_leaf_chunk_t *) \ - ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx] + (zap_leaf_phys(l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx] #define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry) typedef enum zap_chunk_type { @@ -152,13 +153,18 @@ typedef union zap_leaf_chunk { } zap_leaf_chunk_t; typedef struct zap_leaf { + dmu_buf_user_t l_dbu; krwlock_t l_rwlock; uint64_t l_blkid; /* 1<l_dbuf->db_data); +} typedef struct zap_entry_handle { /* Set by zap_leaf and public to ZAP */ diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 80074db4fbcc..e3215bebd818 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -48,6 +48,7 @@ typedef enum spa_feature { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_EMBEDDED_DATA, SPA_FEATURE_BOOKMARKS, + SPA_FEATURE_FS_SS_LIMIT, SPA_FEATURES } spa_feature_t; diff --git a/lib/libspl/include/sys/policy.h b/lib/libspl/include/sys/policy.h new file mode 100644 index 000000000000..f48ebf893d4f --- /dev/null +++ b/lib/libspl/include/sys/policy.h @@ -0,0 +1,26 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _LIBSPL_SYS_POLICY_H +#define _LIBSPL_SYS_POLICY_H + +#endif /* _LIBSPL_SYS_POLICY_H */ diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index dbcb92102a08..79ea8923348e 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . @@ -1934,6 +1935,10 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + case ZFS_PROP_FILESYSTEM_COUNT: + case ZFS_PROP_SNAPSHOT_COUNT: *val = getprop_uint64(zhp, prop, source); if (*source == NULL) { @@ -2339,6 +2344,30 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, } break; + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + case ZFS_PROP_FILESYSTEM_COUNT: + case ZFS_PROP_SNAPSHOT_COUNT: + + if (get_numeric_property(zhp, prop, src, &source, &val) != 0) + return (-1); + + /* + * If limit is UINT64_MAX, we translate this into 'none' (unless + * literal is set), and indicate that it's the default value. + * Otherwise, we print the number nicely and indicate that it's + * set locally. + */ + if (literal) { + (void) snprintf(propbuf, proplen, "%llu", + (u_longlong_t)val); + } else if (val == UINT64_MAX) { + (void) strlcpy(propbuf, "none", proplen); + } else { + zfs_nicenum(val, propbuf, proplen); + } + break; + case ZFS_PROP_REFRATIO: case ZFS_PROP_COMPRESSRATIO: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index d212858d5cd5..d340fa49ded9 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ @@ -1478,6 +1479,16 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, "use 'none' to disable quota/refquota")); goto error; } + + /* + * Special handling for "*_limit=none". In this case it's not + * 0 but UINT64_MAX. + */ + if ((type & ZFS_TYPE_DATASET) && isnone && + (prop == ZFS_PROP_FILESYSTEM_LIMIT || + prop == ZFS_PROP_SNAPSHOT_LIMIT)) { + *ivalp = UINT64_MAX; + } break; case PROP_TYPE_INDEX: diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index d80bb02bae3a..27034264bc15 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -1,6 +1,7 @@ '\" te .\" Copyright (c) 2013 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at @@ -198,6 +199,27 @@ This feature is \fBactive\fR while there are any filesystems, volumes, or snapshots which were created after enabling this feature. .RE +.sp +.ne 2 +.na +\fB\fBfilesystem_limits\fR\fR +.ad +.RS 4n +.TS +l l . +GUID com.joyent:filesystem_limits +READ\-ONLY COMPATIBLE yes +DEPENDENCIES extensible_dataset +.TE + +This feature enables filesystem and snapshot limits. These limits can be used +to control how many filesystems and/or snapshots can be created at the point in +the tree on which the limits are set. + +This feature is \fBactive\fR once either of the limit properties has been +set on a dataset. Once activated the feature is never deactivated. +.RE + .sp .ne 2 .na diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 index ab3d0214b739..9f47f6b7a428 100644 --- a/man/man8/zfs.8 +++ b/man/man8/zfs.8 @@ -26,6 +26,7 @@ .\" Copyright (c) 2012, Joyent, Inc. All rights reserved. .\" Copyright 2012 Nexenta Systems, Inc. All Rights Reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" .TH zfs 8 "Nov 19, 2013" "ZFS pool 28, filesystem 5" "System Administration Commands" .SH NAME @@ -449,6 +450,18 @@ This property is \fBon\fR if the snapshot has been marked for deferred destructi .ne 2 .mk .na +\fB\fBfilesystem_count\fR +.ad +.sp .6 +.RS 4n +The total number of filesystems and volumes that exist under this location in the +dataset tree. This value is only available when a \fBfilesystem_limit\fR has +been set somewhere in the tree under which the dataset resides. +.RE + +.sp +.ne 2 +.na \fB\fBlogicalreferenced\fR\fR .ad .sp .6 @@ -531,6 +544,18 @@ property. .ne 2 .mk .na +\fB\fBsnapshot_count\fR +.ad +.sp .6 +.RS 4n +The total number of snapshots that exist under this location in the dataset tree. +This value is only available when a \fBsnapshot_limit\fR has been set somewhere +in the tree under which the dataset resides. +.RE + +.sp +.ne 2 +.na \fB\fBtype\fR\fR .ad .sp .6 @@ -892,6 +917,21 @@ Zones are a Solaris feature and are not relevant on Linux. .ne 2 .mk .na +\fB\fBfilesystem_limit\fR=\fIcount\fR | \fBnone\fR\fR +.ad +.sp .6 +.RS 4n +Limits the number of filesystems and volumes that can exist under this point in +the dataset tree. The limit is not enforced if the user is allowed to change +the limit. Setting a filesystem_limit on a descendent of a filesystem that +already has a filesystem_limit does not override the ancestor's filesystem_limit, +but rather imposes an additional limit. This feature must be enabled to be used +(see \fBzpool-features\fR(5)). +.RE + +.sp +.ne 2 +.na \fB\fBmountpoint\fR=\fIpath\fR | \fBnone\fR | \fBlegacy\fR\fR .ad .sp .6 @@ -940,6 +980,22 @@ Quotas cannot be set on volumes, as the \fBvolsize\fR property acts as an implic .ne 2 .mk .na +\fB\fBsnapshot_limit\fR=\fIcount\fR | \fBnone\fR\fR +.ad +.sp .6 +.RS 4n +Limits the number of snapshots that can be created on a dataset and its +descendents. Setting a snapshot_limit on a descendent of a dataset that already +has a snapshot_limit does not override the ancestor's snapshot_limit, but +rather imposes an additional limit. The limit is not enforced if the user is +allowed to change the limit. For example, this means that recursive snapshots +taken from the global zone are counted against each delegated dataset within +a zone. This feature must be enabled to be used (see \fBzpool-features\fR(5)). +.RE + +.sp +.ne 2 +.na \fB\fBuserquota@\fR\fIuser\fR=\fIsize\fR | \fBnone\fR\fR .ad .sp .6 @@ -2983,6 +3039,7 @@ copies property dedup property devices property exec property +filesystem_limit property logbias property mlslabel property mountpoint property @@ -3001,6 +3058,7 @@ shareiscsi property sharenfs property sharesmb property snapdir property +snapshot_limit property utf8only property version property volblocksize property diff --git a/module/avl/avl.c b/module/avl/avl.c index f9971da20a0f..44ceafb9f0e5 100644 --- a/module/avl/avl.c +++ b/module/avl/avl.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + /* * AVL - generic AVL tree implementation for kernel use * @@ -85,6 +89,12 @@ * is a modified "avl_node_t *". The bottom bit (normally 0 for a * pointer) is set to indicate if that the new node has a value greater * than the value of the indicated "avl_node_t *". + * + * Note - in addition to userland (e.g. libavl and libutil) and the kernel + * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module, + * which each have their own compilation environments and subsequent + * requirements. Each of these environments must be considered when adding + * dependencies from avl.c. */ #include @@ -863,6 +873,24 @@ avl_update(avl_tree_t *t, void *obj) return (B_FALSE); } +void +avl_swap(avl_tree_t *tree1, avl_tree_t *tree2) +{ + avl_node_t *temp_node; + ulong_t temp_numnodes; + + ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar); + ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset); + ASSERT3U(tree1->avl_size, ==, tree2->avl_size); + + temp_node = tree1->avl_root; + temp_numnodes = tree1->avl_numnodes; + tree1->avl_root = tree2->avl_root; + tree1->avl_numnodes = tree2->avl_numnodes; + tree2->avl_root = temp_node; + tree2->avl_numnodes = temp_numnodes; +} + /* * initialize a new AVL tree */ diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 192f8f2210be..70f1d93dfc98 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -411,6 +411,18 @@ zfs_prop_init(void) zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "REFRESERV"); + zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, + " | none", "FSLIMIT"); + zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + " | none", "SSLIMIT"); + zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, + "", "FSCOUNT"); + zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "", "SSCOUNT"); /* inherit number properties */ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 7a0c666395c8..4ee5ba2df359 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -23,6 +23,7 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -78,10 +79,16 @@ static void dbuf_destroy(dmu_buf_impl_t *db); static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); +#ifndef __lint +extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, + dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp); +#endif /* ! __lint */ + /* * Global data structures and functions for the dbuf cache. */ static kmem_cache_t *dbuf_cache; +static taskq_t *dbu_evict_taskq; /* ARGSUSED */ static int @@ -93,7 +100,7 @@ dbuf_cons(void *vdb, void *unused, int kmflag) mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); refcount_create(&db->db_holds); - list_link_init(&db->db_link); + return (0); } @@ -247,20 +254,72 @@ dbuf_hash_remove(dmu_buf_impl_t *db) static arc_evict_func_t dbuf_do_evict; +typedef enum { + DBVU_EVICTING, + DBVU_NOT_EVICTING +} dbvu_verify_type_t; + +static void +dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) +{ +#ifdef ZFS_DEBUG + int64_t holds; + + if (db->db_user == NULL) + return; + + /* Only data blocks support the attachment of user data. */ + ASSERT(db->db_level == 0); + + /* Clients must resolve a dbuf before attaching user data. */ + ASSERT(db->db.db_data != NULL); + ASSERT3U(db->db_state, ==, DB_CACHED); + + holds = refcount_count(&db->db_holds); + if (verify_type == DBVU_EVICTING) { + /* + * Immediate eviction occurs when holds == dirtycnt. + * For normal eviction buffers, holds is zero on + * eviction, except when dbuf_fix_old_data() calls + * dbuf_clear_data(). However, the hold count can grow + * during eviction even though db_mtx is held (see + * dmu_bonus_hold() for an example), so we can only + * test the generic invariant that holds >= dirtycnt. + */ + ASSERT3U(holds, >=, db->db_dirtycnt); + } else { + if (db->db_immediate_evict == TRUE) + ASSERT3U(holds, >=, db->db_dirtycnt); + else + ASSERT3U(holds, >, 0); + } +#endif +} + static void dbuf_evict_user(dmu_buf_impl_t *db) { + dmu_buf_user_t *dbu = db->db_user; + ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level != 0 || db->db_evict_func == NULL) + if (dbu == NULL) return; - if (db->db_user_data_ptr_ptr) - *db->db_user_data_ptr_ptr = db->db.db_data; - db->db_evict_func(&db->db, db->db_user_ptr); - db->db_user_ptr = NULL; - db->db_user_data_ptr_ptr = NULL; - db->db_evict_func = NULL; + dbuf_verify_user(db, DBVU_EVICTING); + db->db_user = NULL; + +#ifdef ZFS_DEBUG + if (dbu->dbu_clear_on_evict_dbufp != NULL) + *dbu->dbu_clear_on_evict_dbufp = NULL; +#endif + + /* + * Invoke the callback from a taskq to avoid lock order reversals + * and limit stack depth. + */ + taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0, + &dbu->dbu_tqent); } boolean_t @@ -334,6 +393,12 @@ dbuf_init(void) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_FSTRANS, NULL); dbuf_stats_init(h); + + /* + * All entries are queued via taskq_dispatch_ent(), so min/maxalloc + * configuration is not required. + */ + dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); } void @@ -356,6 +421,7 @@ dbuf_fini(void) kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); #endif kmem_cache_destroy(dbuf_cache); + taskq_destroy(dbu_evict_taskq); } /* @@ -386,7 +452,7 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || - !list_is_empty(&dn->dn_dbufs)); + !avl_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); @@ -474,32 +540,27 @@ dbuf_verify(dmu_buf_impl_t *db) #endif static void -dbuf_update_data(dmu_buf_impl_t *db) +dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level == 0 && db->db_user_data_ptr_ptr) { - ASSERT(!refcount_is_zero(&db->db_holds)); - *db->db_user_data_ptr_ptr = db->db.db_data; - } + dbuf_evict_user(db); + db->db_buf = NULL; + db->db.db_data = NULL; + if (db->db_state != DB_NOFILL) + db->db_state = DB_UNCACHED; } static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(buf != NULL); + db->db_buf = buf; - if (buf != NULL) { - ASSERT(buf->b_data != NULL); - db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); - dbuf_update_data(db); - } else { - dbuf_evict_user(db); - db->db.db_data = NULL; - if (db->db_state != DB_NOFILL) - db->db_state = DB_UNCACHED; - } + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; + if (!arc_released(buf)) + arc_set_callback(buf, dbuf_do_evict, db); } /* @@ -521,7 +582,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); - dbuf_set_data(db, NULL); + dbuf_clear_data(db); mutex_exit(&db->db_mtx); } return (abuf); @@ -600,7 +661,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); DB_DNODE_EXIT(db); - dbuf_update_data(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return (0); @@ -762,7 +822,7 @@ dbuf_noread(dmu_buf_impl_t *db) dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { - dbuf_set_data(db, NULL); + dbuf_clear_data(db); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } @@ -818,7 +878,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { - dbuf_set_data(db, NULL); + dbuf_clear_data(db); } } @@ -866,21 +926,33 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) * receive; see comment below for details. */ void -dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) +dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, + dmu_tx_t *tx) { + dmu_buf_impl_t db_search; dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; + avl_index_t where; boolean_t freespill = - (start == DMU_SPILL_BLKID || end == DMU_SPILL_BLKID); + (start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID); + + if (end_blkid > dn->dn_maxblkid && !freespill) + end_blkid = dn->dn_maxblkid; + dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); - if (end > dn->dn_maxblkid && !freespill) - end = dn->dn_maxblkid; - dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); + db_search.db_level = 0; + db_search.db_blkid = start_blkid; + db_search.db_state = DB_SEARCH; mutex_enter(&dn->dn_dbufs_mtx); - if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz && - !freespill) { + if (start_blkid >= dn->dn_unlisted_l0_blkid && !freespill) { /* There can't be any dbufs in this range; no need to search. */ +#ifdef DEBUG + db = avl_find(&dn->dn_dbufs, &db_search, &where); + ASSERT3P(db, ==, NULL); + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + ASSERT(db == NULL || db->db_level > 0); +#endif mutex_exit(&dn->dn_dbufs_mtx); return; } else if (dmu_objset_is_receiving(dn->dn_objset)) { @@ -894,20 +966,26 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) atomic_inc_64(&zfs_free_range_recv_miss); } - for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) { - db_next = list_next(&dn->dn_dbufs, db); + db = avl_find(&dn->dn_dbufs, &db_search, &where); + ASSERT3P(db, ==, NULL); + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + + for (; db != NULL; db = db_next) { + db_next = AVL_NEXT(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* Skip indirect blocks. */ if (db->db_level != 0) - continue; + break; /* Skip direct blocks outside the range. */ - if (!freespill && (db->db_blkid < start || db->db_blkid > end)) - continue; + if (!freespill && db->db_blkid > end_blkid) + break; /* Skip all direct blocks, only free spill blocks. */ if (freespill && (db->db_blkid != DMU_SPILL_BLKID)) continue; + ASSERT3U(db->db_blkid, >=, start_blkid); + /* found a level 0 buffer in the range */ mutex_enter(&db->db_mtx); if (dbuf_undirty(db, tx)) { @@ -1423,6 +1501,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dr->dt.dl.dr_data != db->db_buf) VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); } + + if (db->db_level != 0) { + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); @@ -1432,7 +1516,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) arc_buf_t *buf = db->db_buf; ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); - dbuf_set_data(db, NULL); + dbuf_clear_data(db); VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); return (B_TRUE); @@ -1651,7 +1735,7 @@ dbuf_clear(dmu_buf_impl_t *db) dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { - list_remove(&dn->dn_dbufs, db); + avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); @@ -1781,9 +1865,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_parent = parent; db->db_blkptr = blkptr; - db->db_user_ptr = NULL; - db->db_user_data_ptr_ptr = NULL; - db->db_evict_func = NULL; + db->db_user = NULL; db->db_immediate_evict = 0; db->db_freed_in_flight = 0; @@ -1823,7 +1905,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, mutex_exit(&dn->dn_dbufs_mtx); return (odb); } - list_insert_head(&dn->dn_dbufs, db); + avl_add(&dn->dn_dbufs, db); if (db->db_level == 0 && db->db_blkid >= dn->dn_unlisted_l0_blkid) dn->dn_unlisted_l0_blkid = db->db_blkid + 1; @@ -1882,7 +1964,7 @@ dbuf_destroy(dmu_buf_impl_t *db) DB_DNODE_ENTER(db); dn = DB_DNODE(db); mutex_enter(&dn->dn_dbufs_mtx); - list_remove(&dn->dn_dbufs, db); + avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); mutex_exit(&dn->dn_dbufs_mtx); DB_DNODE_EXIT(db); @@ -1900,7 +1982,6 @@ dbuf_destroy(dmu_buf_impl_t *db) db->db_parent = NULL; db->db_buf = NULL; - ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); @@ -2034,7 +2115,6 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh) } (void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag); - dbuf_update_data(dh->dh_db); DBUF_VERIFY(dh->dh_db); mutex_exit(&dh->dh_db->db_mtx); @@ -2272,7 +2352,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) /* * This dbuf has anonymous data associated with it. */ - dbuf_set_data(db, NULL); + dbuf_clear_data(db); VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); } else { @@ -2305,7 +2385,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) } else { dbuf_clear(db); } - } else if (arc_buf_eviction_needed(db->db_buf)) { + } else if (db->db_objset->os_evicting || + arc_buf_eviction_needed(db->db_buf)) { dbuf_clear(db); } else { mutex_exit(&db->db_mtx); @@ -2324,56 +2405,57 @@ dbuf_refcount(dmu_buf_impl_t *db) } void * -dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, + dmu_buf_user_t *new_user) { - return (dmu_buf_update_user(db_fake, NULL, user_ptr, - user_data_ptr_ptr, evict_func)); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + mutex_enter(&db->db_mtx); + dbuf_verify_user(db, DBVU_NOT_EVICTING); + if (db->db_user == old_user) + db->db_user = new_user; + else + old_user = db->db_user; + dbuf_verify_user(db, DBVU_NOT_EVICTING); + mutex_exit(&db->db_mtx); + + return (old_user); } void * -dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - db->db_immediate_evict = TRUE; - return (dmu_buf_update_user(db_fake, NULL, user_ptr, - user_data_ptr_ptr, evict_func)); + return (dmu_buf_replace_user(db_fake, NULL, user)); } void * -dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, - void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) +dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(db->db_level == 0); - - ASSERT((user_ptr == NULL) == (evict_func == NULL)); - mutex_enter(&db->db_mtx); - - if (db->db_user_ptr == old_user_ptr) { - db->db_user_ptr = user_ptr; - db->db_user_data_ptr_ptr = user_data_ptr_ptr; - db->db_evict_func = evict_func; - - dbuf_update_data(db); - } else { - old_user_ptr = db->db_user_ptr; - } + db->db_immediate_evict = TRUE; + return (dmu_buf_set_user(db_fake, user)); +} - mutex_exit(&db->db_mtx); - return (old_user_ptr); +void * +dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + return (dmu_buf_replace_user(db_fake, user, NULL)); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(!refcount_is_zero(&db->db_holds)); - return (db->db_user_ptr); + dbuf_verify_user(db, DBVU_NOT_EVICTING); + return (db->db_user); +} + +void +dmu_buf_user_evict_wait() +{ + taskq_wait(dbu_evict_taskq); } boolean_t @@ -3041,7 +3123,6 @@ EXPORT_SYMBOL(dbuf_refcount); EXPORT_SYMBOL(dbuf_sync_list); EXPORT_SYMBOL(dmu_buf_set_user); EXPORT_SYMBOL(dmu_buf_set_user_ie); -EXPORT_SYMBOL(dmu_buf_update_user); EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_freeable); EXPORT_SYMBOL(dmu_buf_get_blkptr); diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index 30fabbb07957..32e451a77773 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -194,7 +194,7 @@ dmu_diff(const char *tosnap_name, const char *fromsnap_name, return (SET_ERROR(EXDEV)); } - fromtxg = fromsnap->ds_phys->ds_creation_txg; + fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg; dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_long_hold(tosnap, FTAG); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index f438ca62a11f..4c470da8e4d2 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -22,6 +22,8 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -346,7 +348,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), @@ -403,7 +405,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_secondary_cache = ZFS_CACHE_ALL; } - if (ds == NULL || !dsl_dataset_is_snapshot(ds)) + if (ds == NULL || !ds->ds_is_snapshot) os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); @@ -422,16 +424,13 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - DMU_META_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, - &os->os_meta_dnode); + dnode_special_open(os, &os->os_phys->os_meta_dnode, + DMU_META_DNODE_OBJECT, &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { - DMU_USERUSED_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, - &os->os_userused_dnode); - DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, - &os->os_groupused_dnode); + dnode_special_open(os, &os->os_phys->os_userused_dnode, + DMU_USERUSED_OBJECT, &os->os_userused_dnode); + dnode_special_open(os, &os->os_phys->os_groupused_dnode, + DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); } *osp = os; @@ -519,7 +518,7 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EINVAL)); - } else if (!readonly && dsl_dataset_is_snapshot(ds)) { + } else if (!readonly && ds->ds_is_snapshot) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EROFS)); } @@ -575,41 +574,57 @@ dmu_objset_disown(objset_t *os, void *tag) void dmu_objset_evict_dbufs(objset_t *os) { + dnode_t *dn_marker; dnode_t *dn; + dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP); + mutex_enter(&os->os_lock); + dn = list_head(&os->os_dnodes); + while (dn != NULL) { + /* + * Skip dnodes without holds. We have to do this dance + * because dnode_add_ref() only works if there is already a + * hold. If the dnode has no holds, then it has no dbufs. + */ + if (dnode_add_ref(dn, FTAG)) { + list_insert_after(&os->os_dnodes, dn, dn_marker); + mutex_exit(&os->os_lock); - /* process the mdn last, since the other dnodes have holds on it */ - list_remove(&os->os_dnodes, DMU_META_DNODE(os)); - list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); + dnode_evict_dbufs(dn); + dnode_rele(dn, FTAG); - /* - * Find the first dnode with holds. We have to do this dance - * because dnode_add_ref() only works if you already have a - * hold. If there are no holds then it has no dbufs so OK to - * skip. - */ - for (dn = list_head(&os->os_dnodes); - dn && !dnode_add_ref(dn, FTAG); - dn = list_next(&os->os_dnodes, dn)) - continue; - - while (dn) { - dnode_t *next_dn = dn; + mutex_enter(&os->os_lock); + dn = list_next(&os->os_dnodes, dn_marker); + list_remove(&os->os_dnodes, dn_marker); + } else { + dn = list_next(&os->os_dnodes, dn); + } + } + mutex_exit(&os->os_lock); - do { - next_dn = list_next(&os->os_dnodes, next_dn); - } while (next_dn && !dnode_add_ref(next_dn, FTAG)); + kmem_free(dn_marker, sizeof (dnode_t)); - mutex_exit(&os->os_lock); - dnode_evict_dbufs(dn); - dnode_rele(dn, FTAG); - mutex_enter(&os->os_lock); - dn = next_dn; + if (DMU_USERUSED_DNODE(os) != NULL) { + dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os)); + dnode_evict_dbufs(DMU_USERUSED_DNODE(os)); } - mutex_exit(&os->os_lock); + dnode_evict_dbufs(DMU_META_DNODE(os)); } +/* + * Objset eviction processing is split into into two pieces. + * The first marks the objset as evicting, evicts any dbufs that + * have a refcount of zero, and then queues up the objset for the + * second phase of eviction. Once os->os_dnodes has been cleared by + * dnode_buf_pageout()->dnode_destroy(), the second phase is executed. + * The second phase closes the special dnodes, dequeues the objset from + * the list of those undergoing eviction, and finally frees the objset. + * + * NOTE: Due to asynchronous eviction processing (invocation of + * dnode_buf_pageout()), it is possible for the meta dnode for the + * objset to have no holds even though os->os_dnodes is not empty. + */ void dmu_objset_evict(objset_t *os) { @@ -621,7 +636,7 @@ dmu_objset_evict(objset_t *os) ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os)); @@ -655,8 +670,24 @@ dmu_objset_evict(objset_t *os) if (os->os_sa) sa_tear_down(os); + os->os_evicting = B_TRUE; dmu_objset_evict_dbufs(os); + mutex_enter(&os->os_lock); + spa_evicting_os_register(os->os_spa, os); + if (list_is_empty(&os->os_dnodes)) { + mutex_exit(&os->os_lock); + dmu_objset_evict_done(os); + } else { + mutex_exit(&os->os_lock); + } +} + +void +dmu_objset_evict_done(objset_t *os) +{ + ASSERT3P(list_head(&os->os_dnodes), ==, NULL); + dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { dnode_special_close(&os->os_userused_dnode); @@ -664,8 +695,6 @@ dmu_objset_evict(objset_t *os) } zil_free(os->os_zil); - ASSERT3P(list_head(&os->os_dnodes), ==, NULL); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); /* @@ -680,6 +709,7 @@ dmu_objset_evict(objset_t *os) mutex_destroy(&os->os_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); + spa_evicting_os_deregister(os->os_spa, os); kmem_free(os, sizeof (objset_t)); } @@ -780,9 +810,11 @@ dmu_objset_create_check(void *arg, dmu_tx_t *tx) dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } + error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, + doca->doca_cred); dsl_dir_rele(pdd, FTAG); - return (0); + return (error); } static void @@ -866,6 +898,12 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx) dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EXDEV)); } + error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, + doca->doca_cred); + if (error != 0) { + dsl_dir_rele(pdd, FTAG); + return (SET_ERROR(EDQUOT)); + } dsl_dir_rele(pdd, FTAG); error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); @@ -879,7 +917,7 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx) } /* You can only clone snapshots, not the head datasets. */ - if (!dsl_dataset_is_snapshot(origin)) { + if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); return (SET_ERROR(EINVAL)); } @@ -1444,7 +1482,7 @@ int dmu_objset_is_snapshot(objset_t *os) { if (os->os_dsl_dataset != NULL) - return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); + return (os->os_dsl_dataset->ds_is_snapshot); else return (B_FALSE); } @@ -1456,12 +1494,12 @@ dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, dsl_dataset_t *ds = os->os_dsl_dataset; uint64_t ignored; - if (ds->ds_phys->ds_snapnames_zapobj == 0) + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) return (SET_ERROR(ENOENT)); return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, - real, maxlen, conflict)); + dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored, + MT_FIRST, real, maxlen, conflict)); } int @@ -1474,12 +1512,12 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name, ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); - if (ds->ds_phys->ds_snapnames_zapobj == 0) + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) return (SET_ERROR(ENOENT)); zap_cursor_init_serialized(&cursor, ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, *offp); + dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); @@ -1519,12 +1557,12 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, /* there is no next dir on a snapshot! */ if (os->os_dsl_dataset->ds_object != - dd->dd_phys->dd_head_dataset_obj) + dsl_dir_phys(dd)->dd_head_dataset_obj) return (SET_ERROR(ENOENT)); zap_cursor_init_serialized(&cursor, dd->dd_pool->dp_meta_objset, - dd->dd_phys->dd_child_dir_zapobj, *offp); + dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); @@ -1572,7 +1610,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, return (0); } - thisobj = dd->dd_phys->dd_head_dataset_obj; + thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* @@ -1580,7 +1618,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, */ if (flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, - dd->dd_phys->dd_child_dir_zapobj); + dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, @@ -1609,7 +1647,9 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { - uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + uint64_t snapobj; + + snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); @@ -1684,7 +1724,7 @@ dmu_objset_find_impl(spa_t *spa, const char *name, return (0); } - thisobj = dd->dd_phys->dd_head_dataset_obj; + thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* @@ -1692,7 +1732,7 @@ dmu_objset_find_impl(spa_t *spa, const char *name, */ if (flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, - dd->dd_phys->dd_child_dir_zapobj); + dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, @@ -1725,7 +1765,9 @@ dmu_objset_find_impl(spa_t *spa, const char *name, err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { - uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + uint64_t snapobj; + + snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 1f61368c5d65..108fa098b985 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -22,7 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. */ @@ -602,12 +602,12 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, featureflags); drr->drr_u.drr_begin.drr_creation_time = - ds->ds_phys->ds_creation_time; + dsl_dataset_phys(ds)->ds_creation_time; drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); if (is_clone) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; - drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; - if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid; + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; if (fromzb != NULL) { @@ -615,7 +615,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, fromtxg = fromzb->zbm_creation_txg; } dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", sizeof (drr->drr_u.drr_begin.drr_toname)); } @@ -628,7 +628,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, dsp->dsa_proc = curproc; dsp->dsa_os = os; dsp->dsa_off = off; - dsp->dsa_toguid = ds->ds_phys->ds_guid; + dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid; ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_incremental = (fromzb != NULL); @@ -713,9 +713,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, } if (!dsl_dataset_is_before(ds, fromds, 0)) err = SET_ERROR(EXDEV); - zb.zbm_creation_time = fromds->ds_phys->ds_creation_time; - zb.zbm_creation_txg = fromds->ds_phys->ds_creation_txg; - zb.zbm_guid = fromds->ds_phys->ds_guid; + zb.zbm_creation_time = + dsl_dataset_phys(fromds)->ds_creation_time; + zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; + zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, @@ -781,10 +782,10 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, if (!dsl_dataset_is_before(ds, fromds, 0)) err = SET_ERROR(EXDEV); zb.zbm_creation_time = - fromds->ds_phys->ds_creation_time; + dsl_dataset_phys(fromds)->ds_creation_time; zb.zbm_creation_txg = - fromds->ds_phys->ds_creation_txg; - zb.zbm_guid = fromds->ds_phys->ds_guid; + dsl_dataset_phys(fromds)->ds_creation_txg; + zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; is_clone = (ds->ds_dir != fromds->ds_dir); dsl_dataset_rele(fromds, FTAG); } @@ -819,7 +820,7 @@ dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) ASSERT(dsl_pool_config_held(dp)); /* tosnap must be a snapshot */ - if (!dsl_dataset_is_snapshot(ds)) + if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* @@ -831,7 +832,7 @@ dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) /* Get uncompressed size estimate of changed data. */ if (fromds == NULL) { - size = ds->ds_phys->ds_uncompressed_bytes; + size = dsl_dataset_phys(ds)->ds_uncompressed_bytes; } else { uint64_t used, comp; err = dsl_dataset_space_written(fromds, ds, @@ -885,21 +886,35 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, /* temporary clone name must not exist */ error = zap_lookup(dp->dp_meta_objset, - ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name, + dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, 8, 1, &val); if (error != ENOENT) return (error == 0 ? EBUSY : error); /* new snapshot name must not exist */ error = zap_lookup(dp->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, - 8, 1, &val); + dsl_dataset_phys(ds)->ds_snapnames_zapobj, + drba->drba_cookie->drc_tosnap, 8, 1, &val); if (error != ENOENT) return (error == 0 ? EEXIST : error); + /* + * Check snapshot limit before receiving. We'll recheck again at the + * end, but might as well abort before receiving if we're already over + * the limit. + * + * Note that we do not check the file system limit with + * dsl_dir_fscount_check because the temporary %clones don't count + * against that limit. + */ + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, + NULL, drba->drba_cred); + if (error != 0) + return (error); + if (fromguid != 0) { dsl_dataset_t *snap; - uint64_t obj = ds->ds_phys->ds_prev_snap_obj; + uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; /* Find snapshot in this dir that matches fromguid. */ while (obj != 0) { @@ -911,9 +926,9 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ENODEV)); } - if (snap->ds_phys->ds_guid == fromguid) + if (dsl_dataset_phys(snap)->ds_guid == fromguid) break; - obj = snap->ds_phys->ds_prev_snap_obj; + obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); } if (obj == 0) @@ -936,9 +951,9 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, dsl_dataset_rele(snap, FTAG); } else { /* if full, most recent snapshot must be $ORIGIN */ - if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) + if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= TXG_INITIAL) return (SET_ERROR(ENODEV)); - drba->drba_snapobj = ds->ds_phys->ds_prev_snap_obj; + drba->drba_snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj; } return (0); @@ -1015,6 +1030,25 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) if (error != 0) return (error); + /* + * Check filesystem and snapshot limits before receiving. We'll + * recheck snapshot limits again at the end (we create the + * filesystems and increment those counts during begin_sync). + */ + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, + ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + if (drba->drba_origin != NULL) { dsl_dataset_t *origin; error = dsl_dataset_hold(dp, drba->drba_origin, @@ -1023,12 +1057,12 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) dsl_dataset_rele(ds, FTAG); return (error); } - if (!dsl_dataset_is_snapshot(origin)) { + if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } - if (origin->ds_phys->ds_guid != fromguid) { + if (dsl_dataset_phys(origin)->ds_guid != fromguid) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENODEV)); @@ -1092,7 +1126,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); dmu_buf_will_dirty(newds->ds_dbuf, tx); - newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; /* * If we actually created a non-clone, we need to create the @@ -1124,6 +1158,7 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; + drc->drc_cred = CRED(); if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) drc->drc_byteswap = B_TRUE; @@ -1730,7 +1765,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, */ VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os)); - ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); + ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); @@ -1893,8 +1928,11 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) * the snap before drc_ds, because drc_ds can not * have any snaps of its own). */ - uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj; - while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) { + uint64_t obj; + + obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; + while (obj != + dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { dsl_dataset_t *snap; error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); @@ -1906,7 +1944,7 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) error = dsl_destroy_snapshot_check_impl( snap, B_FALSE); } - obj = snap->ds_phys->ds_prev_snap_obj; + obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); if (error != 0) return (error); @@ -1919,7 +1957,7 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) return (error); } error = dsl_dataset_snapshot_check_impl(origin_head, - drc->drc_tosnap, tx, B_TRUE); + drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); dsl_dataset_rele(origin_head, FTAG); if (error != 0) return (error); @@ -1927,7 +1965,7 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) error = dsl_destroy_head_check_impl(drc->drc_ds, 1); } else { error = dsl_dataset_snapshot_check_impl(drc->drc_ds, - drc->drc_tosnap, tx, B_TRUE); + drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); } return (error); } @@ -1952,13 +1990,16 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) * Destroy any snapshots of drc_tofs (origin_head) * after the origin (the snap before drc_ds). */ - uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj; - while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) { + uint64_t obj; + + obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; + while (obj != + dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { dsl_dataset_t *snap; VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &snap)); ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); - obj = snap->ds_phys->ds_prev_snap_obj; + obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_destroy_snapshot_sync_impl(snap, B_FALSE, tx); dsl_dataset_rele(snap, FTAG); @@ -1974,15 +2015,16 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) /* set snapshot's creation time and guid */ dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); - origin_head->ds_prev->ds_phys->ds_creation_time = + dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = drc->drc_drrb->drr_creation_time; - origin_head->ds_prev->ds_phys->ds_guid = + dsl_dataset_phys(origin_head->ds_prev)->ds_guid = drc->drc_drrb->drr_toguid; - origin_head->ds_prev->ds_phys->ds_flags &= + dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(origin_head->ds_dbuf, tx); - origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + dsl_dataset_phys(origin_head)->ds_flags &= + ~DS_FLAG_INCONSISTENT; dsl_dataset_rele(origin_head, FTAG); dsl_destroy_head_sync_impl(drc->drc_ds, tx); @@ -1996,15 +2038,17 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) /* set snapshot's creation time and guid */ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ds->ds_prev->ds_phys->ds_creation_time = + dsl_dataset_phys(ds->ds_prev)->ds_creation_time = drc->drc_drrb->drr_creation_time; - ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid; - ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + dsl_dataset_phys(ds->ds_prev)->ds_guid = + drc->drc_drrb->drr_toguid; + dsl_dataset_phys(ds->ds_prev)->ds_flags &= + ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; } - drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj; + drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; /* * Release the hold from dmu_recv_begin. This must be done before * we return to open context, so that when we free the dataset's dnode, @@ -2030,7 +2074,7 @@ add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); if (err == 0) { - gmep->guid = snapds->ds_phys->ds_guid; + gmep->guid = dsl_dataset_phys(snapds)->ds_guid; gmep->gme_ds = snapds; avl_add(guid_map, gmep); dsl_dataset_long_hold(snapds, gmep); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 9280a89b2f85..ac7386b6c5ae 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -539,7 +539,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); /* See comment on ZIL traversal in dsl_scan_visitds. */ - if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { + if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { uint32_t flags = ARC_WAIT; objset_phys_t *osp; arc_buf_t *buf; @@ -588,7 +588,7 @@ traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, - &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg)); + &dsl_dataset_phys(ds)->ds_bp, txg_start, NULL, flags, func, arg)); } int @@ -643,8 +643,8 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, continue; break; } - if (ds->ds_phys->ds_prev_snap_txg > txg) - txg = ds->ds_phys->ds_prev_snap_txg; + if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg) + txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; err = traverse_dataset(ds, txg, flags, func, arg); dsl_dataset_rele(ds, FTAG); if (err != 0) diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 3d6dcc70f305..89f45a781356 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -699,6 +699,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) { dmu_tx_hold_t *txh; dnode_t *dn; + dsl_dataset_phys_t *ds_phys; uint64_t nblocks; int epbs, err; @@ -773,8 +774,9 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) * we'll have to modify an indirect twig for each. */ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + ds_phys = dsl_dataset_phys(dn->dn_objset->os_dsl_dataset); for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) - if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj) + if (ds_phys->ds_prev_snap_obj) txh->txh_space_towrite += 3 << dn->dn_indblkshift; else txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift; diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index ef74621a0f6c..2358849319ab 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -62,6 +63,45 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT; static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); #endif /* _KERNEL */ +static int +dbuf_compare(const void *x1, const void *x2) +{ + const dmu_buf_impl_t *d1 = x1; + const dmu_buf_impl_t *d2 = x2; + + if (d1->db_level < d2->db_level) { + return (-1); + } + if (d1->db_level > d2->db_level) { + return (1); + } + + if (d1->db_blkid < d2->db_blkid) { + return (-1); + } + if (d1->db_blkid > d2->db_blkid) { + return (1); + } + + if (d1->db_state < d2->db_state) { + return (-1); + } + if (d1->db_state > d2->db_state) { + return (1); + } + + ASSERT3S(d1->db_state, !=, DB_SEARCH); + ASSERT3S(d2->db_state, !=, DB_SEARCH); + + if ((uintptr_t)d1 < (uintptr_t)d2) { + return (-1); + } + if ((uintptr_t)d1 > (uintptr_t)d2) { + return (1); + } + return (0); +} + /* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) @@ -116,7 +156,7 @@ dnode_cons(void *arg, void *unused, int kmflag) dn->dn_dbufs_count = 0; dn->dn_unlisted_l0_blkid = 0; - list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), + avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); dn->dn_moved = 0; @@ -169,7 +209,7 @@ dnode_dest(void *arg, void *unused) ASSERT0(dn->dn_dbufs_count); ASSERT0(dn->dn_unlisted_l0_blkid); - list_destroy(&dn->dn_dbufs); + avl_destroy(&dn->dn_dbufs); } void @@ -366,8 +406,9 @@ static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { - dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); + dnode_t *dn; + dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); ASSERT(!POINTER_IS_VALID(dn->dn_objset)); dn->dn_moved = 0; @@ -404,13 +445,31 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); mutex_enter(&os->os_lock); - list_insert_head(&os->os_dnodes, dn); + if (dnh->dnh_dnode != NULL) { + /* Lost the allocation race. */ + mutex_exit(&os->os_lock); + kmem_cache_free(dnode_cache, dn); + return (dnh->dnh_dnode); + } + + /* + * Exclude special dnodes from os_dnodes so an empty os_dnodes + * signifies that the special dnodes have no references from + * their children (the entries in os_dnodes). This allows + * dnode_destroy() to easily determine if the last child has + * been removed and then complete eviction of the objset. + */ + if (!DMU_OBJECT_IS_SPECIAL(object)) + list_insert_head(&os->os_dnodes, dn); membar_producer(); + /* - * Everything else must be valid before assigning dn_objset makes the - * dnode eligible for dnode_move(). + * Everything else must be valid before assigning dn_objset + * makes the dnode eligible for dnode_move(). */ dn->dn_objset = os; + + dnh->dnh_dnode = dn; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); @@ -424,12 +483,18 @@ static void dnode_destroy(dnode_t *dn) { objset_t *os = dn->dn_objset; + boolean_t complete_os_eviction = B_FALSE; ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); POINTER_INVALIDATE(&dn->dn_objset); - list_remove(&os->os_dnodes, dn); + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + list_remove(&os->os_dnodes, dn); + complete_os_eviction = + list_is_empty(&os->os_dnodes) && + list_link_active(&os->os_evicting_node); + } mutex_exit(&os->os_lock); /* the dnode can no longer move, so we can release the handle */ @@ -464,6 +529,9 @@ dnode_destroy(dnode_t *dn) dmu_zfetch_rele(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); + + if (complete_os_eviction) + dmu_objset_evict_done(os); } void @@ -503,7 +571,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT0(dn->dn_assigned_txg); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + ASSERT(avl_is_empty(&dn->dn_dbufs)); for (i = 0; i < TXG_SIZE; i++) { ASSERT0(dn->dn_next_nblkptr[i]); @@ -689,8 +757,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; ASSERT(refcount_count(&odn->dn_tx_holds) == 0); refcount_transfer(&ndn->dn_holds, &odn->dn_holds); - ASSERT(list_is_empty(&ndn->dn_dbufs)); - list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs); + ASSERT(avl_is_empty(&ndn->dn_dbufs)); + avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); ndn->dn_dbufs_count = odn->dn_dbufs_count; ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid; ndn->dn_bonus = odn->dn_bonus; @@ -724,7 +792,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) */ odn->dn_dbuf = NULL; odn->dn_handle = NULL; - list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t), + avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); odn->dn_dbufs_count = 0; odn->dn_unlisted_l0_blkid = 0; @@ -929,33 +997,32 @@ dnode_special_close(dnode_handle_t *dnh) */ while (refcount_count(&dn->dn_holds) > 0) delay(1); + ASSERT(dn->dn_dbuf == NULL || + dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); zrl_add(&dnh->dnh_zrlock); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } -dnode_t * +void dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh) { - dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh); - dnh->dnh_dnode = dn; + dnode_t *dn; + + dn = dnode_create(os, dnp, NULL, object, dnh); zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); - return (dn); } static void -dnode_buf_pageout(dmu_buf_t *db, void *arg) +dnode_buf_pageout(void *dbu) { - dnode_children_t *children_dnodes = arg; + dnode_children_t *children_dnodes = dbu; int i; - int epb = db->db_size >> DNODE_SHIFT; - ASSERT(epb == children_dnodes->dnc_count); - - for (i = 0; i < epb; i++) { + for (i = 0; i < children_dnodes->dnc_count; i++) { dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; dnode_t *dn; @@ -985,7 +1052,7 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg) dnh->dnh_dnode = NULL; } kmem_free(children_dnodes, sizeof (dnode_children_t) + - (epb - 1) * sizeof (dnode_handle_t)); + children_dnodes->dnc_count * sizeof (dnode_handle_t)); } /* @@ -1069,18 +1136,24 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, if (children_dnodes == NULL) { int i; dnode_children_t *winner; - children_dnodes = kmem_alloc(sizeof (dnode_children_t) + - (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP); + children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + + epb * sizeof (dnode_handle_t), KM_SLEEP); children_dnodes->dnc_count = epb; dnh = &children_dnodes->dnc_children[0]; for (i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); - dnh[i].dnh_dnode = NULL; } - if ((winner = dmu_buf_set_user(&db->db, children_dnodes, NULL, - dnode_buf_pageout))) { + dmu_buf_init_user(&children_dnodes->dnc_dbu, + dnode_buf_pageout, NULL); + winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); + if (winner != NULL) { + + for (i = 0; i < epb; i++) { + zrl_destroy(&dnh[i].dnh_zrlock); + } + kmem_free(children_dnodes, sizeof (dnode_children_t) + - (epb - 1) * sizeof (dnode_handle_t)); + epb * sizeof (dnode_handle_t)); children_dnodes = winner; } } @@ -1088,17 +1161,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, dnh = &children_dnodes->dnc_children[idx]; zrl_add(&dnh->dnh_zrlock); - if ((dn = dnh->dnh_dnode) == NULL) { + dn = dnh->dnh_dnode; + if (dn == NULL) { dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; - dnode_t *winner; dn = dnode_create(os, phys, db, object, dnh); - winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn); - if (winner != NULL) { - zrl_add(&dnh->dnh_zrlock); - dnode_destroy(dn); /* implicit zrl_remove() */ - dn = winner; - } } mutex_enter(&dn->dn_mtx); @@ -1112,10 +1179,10 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); } - mutex_exit(&dn->dn_mtx); - if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); + mutex_exit(&dn->dn_mtx); + /* Now we can rely on the hold to prevent the dnode from moving. */ zrl_remove(&dnh->dnh_zrlock); @@ -1233,7 +1300,8 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) return; } - ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); + ASSERT(!refcount_is_zero(&dn->dn_holds) || + !avl_is_empty(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]); ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]); @@ -1306,7 +1374,7 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx) int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { - dmu_buf_impl_t *db, *db_next; + dmu_buf_impl_t *db; int err; if (size == 0) @@ -1329,9 +1397,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) goto fail; mutex_enter(&dn->dn_dbufs_mtx); - for (db = list_head(&dn->dn_dbufs); db; db = db_next) { - db_next = list_next(&dn->dn_dbufs, db); - + for (db = avl_first(&dn->dn_dbufs); db != NULL; + db = AVL_NEXT(&dn->dn_dbufs, db)) { if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 1825e983551c..1bf243be51f7 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -402,57 +403,37 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) void dnode_evict_dbufs(dnode_t *dn) { - int progress; - int pass = 0; - - do { - dmu_buf_impl_t *db, marker; - int evicting = FALSE; - - progress = FALSE; - mutex_enter(&dn->dn_dbufs_mtx); - list_insert_tail(&dn->dn_dbufs, &marker); - db = list_head(&dn->dn_dbufs); - for (; db != ▮ db = list_head(&dn->dn_dbufs)) { - list_remove(&dn->dn_dbufs, db); - list_insert_tail(&dn->dn_dbufs, db); + dmu_buf_impl_t db_marker; + dmu_buf_impl_t *db, *db_next; + + mutex_enter(&dn->dn_dbufs_mtx); + for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { + #ifdef DEBUG - DB_DNODE_ENTER(db); - ASSERT3P(DB_DNODE(db), ==, dn); - DB_DNODE_EXIT(db); + DB_DNODE_ENTER(db); + ASSERT3P(DB_DNODE(db), ==, dn); + DB_DNODE_EXIT(db); #endif /* DEBUG */ - mutex_enter(&db->db_mtx); - if (db->db_state == DB_EVICTING) { - progress = TRUE; - evicting = TRUE; - mutex_exit(&db->db_mtx); - } else if (refcount_is_zero(&db->db_holds)) { - progress = TRUE; - dbuf_clear(db); /* exits db_mtx for us */ - } else { - mutex_exit(&db->db_mtx); - } - + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING && + refcount_is_zero(&db->db_holds)) { + db_marker.db_level = db->db_level; + db_marker.db_blkid = db->db_blkid; + db_marker.db_state = DB_SEARCH; + avl_insert_here(&dn->dn_dbufs, &db_marker, db, + AVL_BEFORE); + + dbuf_clear(db); + + db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); + avl_remove(&dn->dn_dbufs, &db_marker); + } else { + mutex_exit(&db->db_mtx); + db_next = AVL_NEXT(&dn->dn_dbufs, db); } - list_remove(&dn->dn_dbufs, &marker); - /* - * NB: we need to drop dn_dbufs_mtx between passes so - * that any DB_EVICTING dbufs can make progress. - * Ideally, we would have some cv we could wait on, but - * since we don't, just wait a bit to give the other - * thread a chance to run. - */ - mutex_exit(&dn->dn_dbufs_mtx); - if (evicting) - delay(1); - pass++; - if ((pass % 100) == 0) - dprintf("Exceeded %d passes evicting dbufs\n", pass); - } while (progress); - - if (pass >= 100) - dprintf("Required %d passes to evict dbufs\n", pass); + } + mutex_exit(&dn->dn_dbufs_mtx); dnode_evict_bonus(dn); } @@ -491,6 +472,9 @@ dnode_undirty_dbufs(list_t *list) ASSERT(db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_data == db->db_buf); dbuf_unoverride(dr); + } else { + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); @@ -513,8 +497,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); - ASSERT3P(dn->dn_bonus, ==, NULL); + ASSERT(avl_is_empty(&dn->dn_dbufs)); /* * XXX - It would be nice to assert this, but we may still diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c index 2cae5cd4d188..a4cb361d4e7a 100644 --- a/module/zfs/dsl_bookmark.c +++ b/module/zfs/dsl_bookmark.c @@ -65,7 +65,7 @@ dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname, if (bmark_zapobj == 0) return (SET_ERROR(ESRCH)); - if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else mt = MT_EXACT; @@ -120,7 +120,7 @@ dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name, int error; zfs_bookmark_phys_t bmark_phys; - if (!dsl_dataset_is_snapshot(snapds)) + if (!snapds->ds_is_snapshot) return (SET_ERROR(EINVAL)); error = dsl_bookmark_hold_ds(dp, bookmark_name, @@ -210,10 +210,11 @@ dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) &bmark_fs->ds_bookmarks, tx)); } - bmark_phys.zbm_guid = snapds->ds_phys->ds_guid; - bmark_phys.zbm_creation_txg = snapds->ds_phys->ds_creation_txg; + bmark_phys.zbm_guid = dsl_dataset_phys(snapds)->ds_guid; + bmark_phys.zbm_creation_txg = + dsl_dataset_phys(snapds)->ds_creation_txg; bmark_phys.zbm_creation_time = - snapds->ds_phys->ds_creation_time; + dsl_dataset_phys(snapds)->ds_creation_time; VERIFY0(zap_add(mos, bmark_fs->ds_bookmarks, shortname, sizeof (uint64_t), @@ -342,7 +343,7 @@ dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) uint64_t bmark_zapobj = ds->ds_bookmarks; matchtype_t mt; - if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else mt = MT_EXACT; diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 79cb6a3a25e5..1b640d699539 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -21,8 +21,9 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 RackTop Systems. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -61,6 +62,8 @@ #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE +extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds); + /* * Figure out how much of this delta should be propogated to the dsl_dir * layer. If there's a refreservation, that space has already been @@ -69,13 +72,15 @@ static int64_t parent_delta(dsl_dataset_t *ds, int64_t delta) { + dsl_dataset_phys_t *ds_phys; uint64_t old_bytes, new_bytes; if (ds->ds_reserved == 0) return (delta); - old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); - new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved); + ds_phys = dsl_dataset_phys(ds); + old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved); + new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved); ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); return (new_bytes - old_bytes); @@ -108,10 +113,10 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); - ds->ds_phys->ds_referenced_bytes += used; - ds->ds_phys->ds_compressed_bytes += compressed; - ds->ds_phys->ds_uncompressed_bytes += uncompressed; - ds->ds_phys->ds_unique_bytes += used; + dsl_dataset_phys(ds)->ds_referenced_bytes += used; + dsl_dataset_phys(ds)->ds_compressed_bytes += compressed; + dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed; + dsl_dataset_phys(ds)->ds_unique_bytes += used; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, compressed, uncompressed, tx); @@ -141,20 +146,20 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); - ASSERT(!dsl_dataset_is_snapshot(ds)); + ASSERT(!ds->ds_is_snapshot); dmu_buf_will_dirty(ds->ds_dbuf, tx); - if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { + if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); dsl_free(tx->tx_pool, tx->tx_txg, bp); mutex_enter(&ds->ds_lock); - ASSERT(ds->ds_phys->ds_unique_bytes >= used || + ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used || !DS_UNIQUE_IS_ACCURATE(ds)); delta = parent_delta(ds, -used); - ds->ds_phys->ds_unique_bytes -= used; + dsl_dataset_phys(ds)->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, -compressed, -uncompressed, tx); @@ -175,15 +180,15 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); } ASSERT3U(ds->ds_prev->ds_object, ==, - ds->ds_phys->ds_prev_snap_obj); - ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); + dsl_dataset_phys(ds)->ds_prev_snap_obj); + ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0); /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ - if (ds->ds_prev->ds_phys->ds_next_snap_obj == + if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object && bp->blk_birth > - ds->ds_prev->ds_phys->ds_prev_snap_txg) { + dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); mutex_enter(&ds->ds_prev->ds_lock); - ds->ds_prev->ds_phys->ds_unique_bytes += used; + dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { @@ -192,12 +197,12 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, } } mutex_enter(&ds->ds_lock); - ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used); - ds->ds_phys->ds_referenced_bytes -= used; - ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); - ds->ds_phys->ds_compressed_bytes -= compressed; - ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); - ds->ds_phys->ds_uncompressed_bytes -= uncompressed; + ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used); + dsl_dataset_phys(ds)->ds_referenced_bytes -= used; + ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed); + dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed; + ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed); + dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed; mutex_exit(&ds->ds_lock); return (used); @@ -223,7 +228,7 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) if (ds->ds_trysnap_txg > spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) trysnap = ds->ds_trysnap_txg; - return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); + return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap)); } boolean_t @@ -239,14 +244,15 @@ dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, return (B_TRUE); } -/* ARGSUSED */ static void -dsl_dataset_evict(dmu_buf_t *db, void *dsv) +dsl_dataset_evict(void *dbu) { - dsl_dataset_t *ds = dsv; + dsl_dataset_t *ds = dbu; ASSERT(ds->ds_owner == NULL); + ds->ds_dbuf = NULL; + unique_remove(ds->ds_fsid_guid); if (ds->ds_objset != NULL) @@ -258,15 +264,16 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) } bplist_destroy(&ds->ds_pending_deadlist); - if (ds->ds_phys->ds_deadlist_obj != 0) + if (ds->ds_deadlist.dl_os != NULL) dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_dir) - dsl_dir_rele(ds->ds_dir, ds); + dsl_dir_async_rele(ds->ds_dir, ds); ASSERT(!list_link_active(&ds->ds_synced_link)); mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_opening_lock); + mutex_destroy(&ds->ds_sendstream_lock); refcount_destroy(&ds->ds_longholds); kmem_free(ds, sizeof (dsl_dataset_t)); @@ -283,10 +290,10 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds) if (ds->ds_snapname[0]) return (0); - if (ds->ds_phys->ds_next_snap_obj == 0) + if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) return (0); - err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, + err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &headdbuf); if (err != 0) return (err); @@ -301,11 +308,11 @@ int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; matchtype_t mt; int err; - if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else mt = MT_EXACT; @@ -318,16 +325,17 @@ dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) } int -dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) +dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, + boolean_t adj_cnt) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; matchtype_t mt; int err; dsl_dir_snap_cmtime_update(ds->ds_dir); - if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else mt = MT_EXACT; @@ -335,6 +343,11 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) err = zap_remove_norm(mos, snapobj, name, mt, tx); if (err == ENOTSUP && mt == MT_FIRST) err = zap_remove(mos, snapobj, name, tx); + + if (err == 0 && adj_cnt) + dsl_fs_ss_count_adjust(ds->ds_dir, -1, + DD_FIELD_SNAPSHOT_COUNT, tx); + return (err); } @@ -368,7 +381,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ds->ds_dbuf = dbuf; ds->ds_object = dsobj; - ds->ds_phys = dbuf->db_data; + ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0; list_link_init(&ds->ds_synced_link); mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); @@ -378,18 +391,20 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, bplist_create(&ds->ds_pending_deadlist); dsl_deadlist_open(&ds->ds_deadlist, - mos, ds->ds_phys->ds_deadlist_obj); + mos, dsl_dataset_phys(ds)->ds_deadlist_obj); list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), offsetof(dmu_sendarg_t, dsa_link)); if (err == 0) { err = dsl_dir_hold_obj(dp, - ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); + dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, + &ds->ds_dir); } if (err != 0) { mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_opening_lock); + mutex_destroy(&ds->ds_sendstream_lock); refcount_destroy(&ds->ds_longholds); bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); @@ -398,11 +413,11 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, return (err); } - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { ds->ds_snapname[0] = '\0'; - if (ds->ds_phys->ds_prev_snap_obj != 0) { + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, + dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev); } if (doi.doi_type == DMU_OTN_ZAP_METADATA) { @@ -416,15 +431,16 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, } else { if (zfs_flags & ZFS_DEBUG_SNAPNAMES) err = dsl_dataset_get_snapname(ds); - if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) { + if (err == 0 && + dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { err = zap_count( ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_userrefs_obj, + dsl_dataset_phys(ds)->ds_userrefs_obj, &ds->ds_userrefs); } } - if (err == 0 && !dsl_dataset_is_snapshot(ds)) { + if (err == 0 && !ds->ds_is_snapshot) { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &ds->ds_reserved); @@ -437,8 +453,11 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_reserved = ds->ds_quota = 0; } - if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds, - &ds->ds_phys, dsl_dataset_evict)) != NULL) { + dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict, &ds->ds_dbuf); + if (err == 0) + winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); + + if (err != 0 || winner != NULL) { bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_prev) @@ -446,6 +465,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dir_rele(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_opening_lock); + mutex_destroy(&ds->ds_sendstream_lock); refcount_destroy(&ds->ds_longholds); kmem_free(ds, sizeof (dsl_dataset_t)); if (err != 0) { @@ -455,12 +475,12 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds = winner; } else { ds->ds_fsid_guid = - unique_insert(ds->ds_phys->ds_fsid_guid); + unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid); } } ASSERT3P(ds->ds_dbuf, ==, dbuf); - ASSERT3P(ds->ds_phys, ==, dbuf->db_data); - ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || + ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data); + ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 || spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); *dsp = ds; @@ -481,7 +501,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, return (err); ASSERT(dsl_pool_config_held(dp)); - obj = dd->dd_phys->dd_head_dataset_obj; + obj = dsl_dir_phys(dd)->dd_head_dataset_obj; if (obj != 0) err = dsl_dataset_hold_obj(dp, obj, tag, dsp); else @@ -609,16 +629,14 @@ dsl_dataset_rele(dsl_dataset_t *ds, void *tag) void dsl_dataset_disown(dsl_dataset_t *ds, void *tag) { - ASSERT(ds->ds_owner == tag && ds->ds_dbuf != NULL); + ASSERT3P(ds->ds_owner, ==, tag); + ASSERT(ds->ds_dbuf != NULL); mutex_enter(&ds->ds_lock); ds->ds_owner = NULL; mutex_exit(&ds->ds_lock); dsl_dataset_long_rele(ds, tag); - if (ds->ds_dbuf != NULL) - dsl_dataset_rele(ds, tag); - else - dsl_dataset_evict(NULL, ds); + dsl_dataset_rele(ds, tag); } boolean_t @@ -650,9 +668,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, origin = dp->dp_origin_snap; ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); - ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0); + ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0); ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); + ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0); dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); @@ -678,52 +696,55 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = - origin->ds_phys->ds_creation_txg; + dsl_dataset_phys(origin)->ds_creation_txg; dsphys->ds_referenced_bytes = - origin->ds_phys->ds_referenced_bytes; + dsl_dataset_phys(origin)->ds_referenced_bytes; dsphys->ds_compressed_bytes = - origin->ds_phys->ds_compressed_bytes; + dsl_dataset_phys(origin)->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = - origin->ds_phys->ds_uncompressed_bytes; - dsphys->ds_bp = origin->ds_phys->ds_bp; + dsl_dataset_phys(origin)->ds_uncompressed_bytes; + dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp; /* * Inherit flags that describe the dataset's contents * (INCONSISTENT) or properties (Case Insensitive). */ - dsphys->ds_flags |= origin->ds_phys->ds_flags & + dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags & (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET); dmu_buf_will_dirty(origin->ds_dbuf, tx); - origin->ds_phys->ds_num_children++; + dsl_dataset_phys(origin)->ds_num_children++; VERIFY0(dsl_dataset_hold_obj(dp, - origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds)); + dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj, + FTAG, &ohds)); dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); dsl_dataset_rele(ohds, FTAG); if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { - if (origin->ds_phys->ds_next_clones_obj == 0) { - origin->ds_phys->ds_next_clones_obj = + if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) { + dsl_dataset_phys(origin)->ds_next_clones_obj = zap_create(mos, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(mos, - origin->ds_phys->ds_next_clones_obj, dsobj, tx)); + dsl_dataset_phys(origin)->ds_next_clones_obj, + dsobj, tx)); } dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_origin_obj = origin->ds_object; + dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object; if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { - if (origin->ds_dir->dd_phys->dd_clones == 0) { + if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); - origin->ds_dir->dd_phys->dd_clones = + dsl_dir_phys(origin->ds_dir)->dd_clones = zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(mos, - origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); + dsl_dir_phys(origin->ds_dir)->dd_clones, + dsobj, tx)); } } @@ -733,7 +754,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_head_dataset_obj = dsobj; + dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj; return (dsobj); } @@ -767,6 +788,21 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_deleg_set_create_perms(dd, tx, cr); + /* + * Since we're creating a new node we know it's a leaf, so we can + * initialize the counts if the limit feature is active. + */ + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { + uint64_t cnt = 0; + objset_t *os = dd->dd_pool->dp_meta_objset; + + dsl_dir_zapify(dd, tx); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, + sizeof (cnt), 1, &cnt, tx)); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, + sizeof (cnt), 1, &cnt, tx)); + } + dsl_dir_rele(dd, FTAG); /* @@ -798,22 +834,22 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) uint64_t mrs_used; uint64_t dlused, dlcomp, dluncomp; - ASSERT(!dsl_dataset_is_snapshot(ds)); + ASSERT(!ds->ds_is_snapshot); - if (ds->ds_phys->ds_prev_snap_obj != 0) - mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes; + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) + mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes; else mrs_used = 0; dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); ASSERT3U(dlused, <=, mrs_used); - ds->ds_phys->ds_unique_bytes = - ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused); + dsl_dataset_phys(ds)->ds_unique_bytes = + dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused); if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) - ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; } void @@ -824,8 +860,9 @@ dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, int err; ASSERTV(uint64_t count); - ASSERT(ds->ds_phys->ds_num_children >= 2); - err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx); + ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2); + err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, + obj, tx); /* * The err should not be ENOENT, but a bug in a previous version * of the code could cause upgrade_clones_cb() to not set @@ -838,16 +875,16 @@ dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, */ if (err != ENOENT) VERIFY0(err); - ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj, + ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, &count)); - ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); + ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2); } blkptr_t * dsl_dataset_get_blkptr(dsl_dataset_t *ds) { - return (&ds->ds_phys->ds_bp); + return (&dsl_dataset_phys(ds)->ds_bp); } void @@ -859,7 +896,7 @@ dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) tx->tx_pool->dp_meta_rootbp = *bp; } else { dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_bp = *bp; + dsl_dataset_phys(ds)->ds_bp = *bp; } } @@ -879,7 +916,7 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) ASSERT(ds->ds_objset != NULL); - if (ds->ds_phys->ds_next_snap_obj != 0) + if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) panic("dirtying snapshot!"); dp = ds->ds_dir->dd_pool; @@ -917,7 +954,7 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) * outside of the reservation. */ ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); - asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); + asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved); if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) return (SET_ERROR(ENOSPC)); @@ -935,11 +972,12 @@ typedef struct dsl_dataset_snapshot_arg { nvlist_t *ddsa_snaps; nvlist_t *ddsa_props; nvlist_t *ddsa_errors; + cred_t *ddsa_cr; } dsl_dataset_snapshot_arg_t; int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx, boolean_t recv) + dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr) { int error; uint64_t value; @@ -953,7 +991,7 @@ dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, * We don't allow multiple snapshots of the same txg. If there * is already one, try again. */ - if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) + if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) return (SET_ERROR(EAGAIN)); /* @@ -977,6 +1015,18 @@ dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, if (!recv && DS_IS_INCONSISTENT(ds)) return (SET_ERROR(EBUSY)); + /* + * Skip the check for temporary snapshots or if we have already checked + * the counts in dsl_dataset_snapshot_check. This means we really only + * check the count here when we're receiving a stream. + */ + if (cnt != 0 && cr != NULL) { + error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr); + if (error != 0) + return (error); + } + error = dsl_dataset_snapshot_reserve_space(ds, tx); if (error != 0) return (error); @@ -992,6 +1042,103 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) nvpair_t *pair; int rv = 0; + /* + * Pre-compute how many total new snapshots will be created for each + * level in the tree and below. This is needed for validating the + * snapshot limit when either taking a recursive snapshot or when + * taking multiple snapshots. + * + * The problem is that the counts are not actually adjusted when + * we are checking, only when we finally sync. For a single snapshot, + * this is easy, the count will increase by 1 at each node up the tree, + * but its more complicated for the recursive/multiple snapshot case. + * + * The dsl_fs_ss_limit_check function does recursively check the count + * at each level up the tree but since it is validating each snapshot + * independently we need to be sure that we are validating the complete + * count for the entire set of snapshots. We do this by rolling up the + * counts for each component of the name into an nvlist and then + * checking each of those cases with the aggregated count. + * + * This approach properly handles not only the recursive snapshot + * case (where we get all of those on the ddsa_snaps list) but also + * the sibling case (e.g. snapshot a/b and a/c so that we will also + * validate the limit on 'a' using a count of 2). + * + * We validate the snapshot names in the third loop and only report + * name errors once. + */ + if (dmu_tx_is_syncing(tx)) { + char *nm; + nvlist_t *cnt_track = NULL; + cnt_track = fnvlist_alloc(); + + nm = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + /* Rollup aggregated counts into the cnt_track list */ + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; + pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + char *pdelim; + uint64_t val; + + (void) strlcpy(nm, nvpair_name(pair), sizeof (nm)); + pdelim = strchr(nm, '@'); + if (pdelim == NULL) + continue; + *pdelim = '\0'; + + do { + if (nvlist_lookup_uint64(cnt_track, nm, + &val) == 0) { + /* update existing entry */ + fnvlist_add_uint64(cnt_track, nm, + val + 1); + } else { + /* add to list */ + fnvlist_add_uint64(cnt_track, nm, 1); + } + + pdelim = strrchr(nm, '/'); + if (pdelim != NULL) + *pdelim = '\0'; + } while (pdelim != NULL); + } + + kmem_free(nm, MAXPATHLEN); + + /* Check aggregated counts at each level */ + for (pair = nvlist_next_nvpair(cnt_track, NULL); + pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) { + int error = 0; + char *name; + uint64_t cnt = 0; + dsl_dataset_t *ds; + + name = nvpair_name(pair); + cnt = fnvpair_value_uint64(pair); + ASSERT(cnt > 0); + + error = dsl_dataset_hold(dp, name, FTAG, &ds); + if (error == 0) { + error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, + ddsa->ddsa_cr); + dsl_dataset_rele(ds, FTAG); + } + + if (error != 0) { + if (ddsa->ddsa_errors != NULL) + fnvlist_add_int32(ddsa->ddsa_errors, + name, error); + rv = error; + /* only report one error for this check */ + break; + } + } + nvlist_free(cnt_track); + } + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { int error = 0; @@ -1012,8 +1159,9 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) if (error == 0) error = dsl_dataset_hold(dp, dsname, FTAG, &ds); if (error == 0) { + /* passing 0/NULL skips dsl_fs_ss_limit_check */ error = dsl_dataset_snapshot_check_impl(ds, - atp + 1, tx, B_FALSE); + atp + 1, tx, B_FALSE, 0, NULL); dsl_dataset_rele(ds, FTAG); } @@ -1025,6 +1173,7 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) rv = error; } } + return (rv); } @@ -1051,6 +1200,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, bcmp(&os->os_phys->os_zil_header, &zero_zil, sizeof (zero_zil)) == 0); + dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx); /* * The origin's ds_creation_txg has to be < TXG_INITIAL @@ -1070,32 +1220,35 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsphys->ds_fsid_guid = unique_create(); (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); - dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; - dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; + dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; dsphys->ds_next_snap_obj = ds->ds_object; dsphys->ds_num_children = 1; dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = crtxg; - dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; - dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes; - dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; - dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; - dsphys->ds_flags = ds->ds_phys->ds_flags; - dsphys->ds_bp = ds->ds_phys->ds_bp; + dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj; + dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes; + dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = + dsl_dataset_phys(ds)->ds_uncompressed_bytes; + dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags; + dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp; dmu_buf_rele(dbuf, FTAG); - ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); + ASSERT3U(ds->ds_prev != 0, ==, + dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); if (ds->ds_prev) { uint64_t next_clones_obj = - ds->ds_prev->ds_phys->ds_next_clones_obj; - ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == + dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj; + ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object || - ds->ds_prev->ds_phys->ds_num_children > 1); - if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { + dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1); + if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == + ds->ds_object) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, - ds->ds_prev->ds_phys->ds_creation_txg); - ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, + dsl_dataset_phys(ds->ds_prev)->ds_creation_txg); + dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj; } else if (next_clones_obj != 0) { dsl_dataset_remove_from_next_clones(ds->ds_prev, dsphys->ds_next_snap_obj, tx); @@ -1112,33 +1265,36 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, if (ds->ds_reserved) { int64_t delta; ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); - delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); + delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, + ds->ds_reserved); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); } dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, - UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx); + dsl_dataset_phys(ds)->ds_deadlist_obj = + dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX, + dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + dsl_deadlist_open(&ds->ds_deadlist, mos, + dsl_dataset_phys(ds)->ds_deadlist_obj); dsl_deadlist_add_key(&ds->ds_deadlist, - ds->ds_phys->ds_prev_snap_txg, tx); + dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); - ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); - ds->ds_phys->ds_prev_snap_obj = dsobj; - ds->ds_phys->ds_prev_snap_txg = crtxg; - ds->ds_phys->ds_unique_bytes = 0; + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg); + dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj; + dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg; + dsl_dataset_phys(ds)->ds_unique_bytes = 0; if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) - ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; - VERIFY0(zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, + VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj, snapname, 8, 1, &dsobj, tx)); if (ds->ds_prev) dsl_dataset_rele(ds->ds_prev, ds); VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); + dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev)); dsl_scan_ds_snapshotted(ds, tx); @@ -1227,6 +1383,7 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) ddsa.ddsa_snaps = snaps; ddsa.ddsa_props = props; ddsa.ddsa_errors = errors; + ddsa.ddsa_cr = CRED(); if (error == 0) { error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, @@ -1275,8 +1432,9 @@ dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) if (error != 0) return (error); + /* NULL cred means no limit check for tmp snapshot */ error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, - tx, B_FALSE); + tx, B_FALSE, 0, NULL); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); @@ -1355,14 +1513,14 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(ds->ds_objset != NULL); - ASSERT(ds->ds_phys->ds_next_snap_obj == 0); + ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0); /* * in case we had to change ds_fsid_guid when we opened it, * sync it out now. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; + dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid; dmu_objset_sync(ds->ds_objset, zio, tx); } @@ -1384,13 +1542,14 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) * due to a bug in a previous version of the code. * Only trust it if it has the right number of entries. */ - if (ds->ds_phys->ds_next_clones_obj != 0) { - VERIFY0(zap_count(mos, ds->ds_phys->ds_next_clones_obj, + if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { + VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, &count)); } - if (count != ds->ds_phys->ds_num_children - 1) + if (count != dsl_dataset_phys(ds)->ds_num_children - 1) goto fail; - for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); + for (zap_cursor_init(&zc, mos, + dsl_dataset_phys(ds)->ds_next_clones_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *clone; @@ -1417,18 +1576,18 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) ASSERT(dsl_pool_config_held(dp)); - ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : - (ds->ds_phys->ds_uncompressed_bytes * 100 / - ds->ds_phys->ds_compressed_bytes); + ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 : + (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 / + dsl_dataset_phys(ds)->ds_compressed_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED, - ds->ds_phys->ds_uncompressed_bytes); + dsl_dataset_phys(ds)->ds_uncompressed_bytes); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, - ds->ds_phys->ds_unique_bytes); + dsl_dataset_phys(ds)->ds_unique_bytes); get_clones_stat(ds, nv); } else { dsl_dir_stats(ds->ds_dir, nv); @@ -1439,17 +1598,17 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, - ds->ds_phys->ds_creation_time); + dsl_dataset_phys(ds)->ds_creation_time); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, - ds->ds_phys->ds_creation_txg); + dsl_dataset_phys(ds)->ds_creation_txg); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, ds->ds_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, ds->ds_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, - ds->ds_phys->ds_guid); + dsl_dataset_phys(ds)->ds_guid); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, - ds->ds_phys->ds_unique_bytes); + dsl_dataset_phys(ds)->ds_unique_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, ds->ds_object); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, @@ -1457,14 +1616,14 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, DS_IS_DEFER_DESTROY(ds) ? 1 : 0); - if (ds->ds_phys->ds_prev_snap_obj != 0) { + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { uint64_t written, comp, uncomp; dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_dataset_t *prev; int err; err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err == 0) { err = dsl_dataset_space_written(prev, ds, &written, &comp, &uncomp); @@ -1484,13 +1643,15 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) dsl_pool_t *dp = ds->ds_dir->dd_pool; ASSERT(dsl_pool_config_held(dp)); - stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; - stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; - stat->dds_guid = ds->ds_phys->ds_guid; + stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg; + stat->dds_inconsistent = + dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT; + stat->dds_guid = dsl_dataset_phys(ds)->ds_guid; stat->dds_origin[0] = '\0'; - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { stat->dds_is_snapshot = B_TRUE; - stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; + stat->dds_num_clones = + dsl_dataset_phys(ds)->ds_num_children - 1; } else { stat->dds_is_snapshot = B_FALSE; stat->dds_num_clones = 0; @@ -1499,7 +1660,8 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) dsl_dataset_t *ods; VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); + dsl_dir_phys(ds->ds_dir)->dd_origin_obj, + FTAG, &ods)); dsl_dataset_name(ods, stat->dds_origin); dsl_dataset_rele(ods, FTAG); } @@ -1517,10 +1679,11 @@ dsl_dataset_space(dsl_dataset_t *ds, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { - *refdbytesp = ds->ds_phys->ds_referenced_bytes; + *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); - if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) - *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; + if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) + *availbytesp += + ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes; if (ds->ds_quota != 0) { /* * Adjust available bytes according to refquota @@ -1531,7 +1694,7 @@ dsl_dataset_space(dsl_dataset_t *ds, else *availbytesp = 0; } - *usedobjsp = BP_GET_FILL(&ds->ds_phys->ds_bp); + *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp); *availobjsp = DN_MAX_OBJECT - *usedobjsp; } @@ -1541,8 +1704,8 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); if (snap == NULL) return (B_FALSE); - if (ds->ds_phys->ds_bp.blk_birth > - snap->ds_phys->ds_creation_txg) { + if (dsl_dataset_phys(ds)->ds_bp.blk_birth > + dsl_dataset_phys(snap)->ds_creation_txg) { objset_t *os, *os_snap; /* * It may be that only the ZIL differs, because it was @@ -1644,11 +1807,13 @@ dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, spa_history_log_internal_ds(ds, "rename", tx, "-> @%s", ddrsa->ddrsa_newsnapname); - VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx)); + VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx, + B_FALSE)); mutex_enter(&ds->ds_lock); (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname); mutex_exit(&ds->ds_lock); - VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, + VERIFY0(zap_add(dp->dp_meta_objset, + dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); dsl_dataset_rele(ds, FTAG); @@ -1760,13 +1925,13 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) return (error); /* must not be a snapshot */ - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } /* must have a most recent snapshot */ - if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { + if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } @@ -1785,7 +1950,7 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair), zfs_prop_to_name(ZFS_PROP_CREATETXG)); uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value"); - if (createtxg > ds->ds_phys->ds_prev_snap_txg) { + if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) { fnvlist_free(bookmarks); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EEXIST)); @@ -1804,7 +1969,7 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) * the refquota. */ if (ds->ds_quota != 0 && - ds->ds_prev->ds_phys->ds_referenced_bytes > ds->ds_quota) { + dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EDQUOT)); } @@ -1817,7 +1982,7 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) * this space, but the freeing happens over many txg's. */ unused_refres_delta = (int64_t)MIN(ds->ds_reserved, - ds->ds_phys->ds_unique_bytes); + dsl_dataset_phys(ds)->ds_unique_bytes); if (unused_refres_delta > 0 && unused_refres_delta > @@ -1896,6 +2061,7 @@ typedef struct dsl_dataset_promote_arg { dsl_dataset_t *origin_origin; /* origin of the origin */ uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; char *err_ds; + cred_t *cr; } dsl_dataset_promote_arg_t; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); @@ -1913,6 +2079,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) dsl_dataset_t *origin_ds; int err; uint64_t unused; + uint64_t ss_mv_cnt; err = promote_hold(ddpa, dp, FTAG); if (err != 0) @@ -1920,7 +2087,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) hds = ddpa->ddpa_clone; - if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) { + if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) { promote_rele(ddpa, FTAG); return (SET_ERROR(EXDEV)); } @@ -1939,9 +2106,10 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) /* compute origin's new unique space */ snap = list_tail(&ddpa->clone_snaps); - ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); + ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, + origin_ds->ds_object); dsl_deadlist_space_range(&snap->ds->ds_deadlist, - origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, + dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX, &ddpa->unique, &unused, &unused); /* @@ -1959,14 +2127,17 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) * Note however, if we stop before we reach the ORIGIN we get: * uN + kN + kN-1 + ... + kM - uM-1 */ - ddpa->used = origin_ds->ds_phys->ds_referenced_bytes; - ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes; - ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; + ss_mv_cnt = 0; + ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes; + ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes; + ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes; for (snap = list_head(&ddpa->shared_snaps); snap; snap = list_next(&ddpa->shared_snaps, snap)) { uint64_t val, dlused, dlcomp, dluncomp; dsl_dataset_t *ds = snap->ds; + ss_mv_cnt++; + /* * If there are long holds, we won't be able to evict * the objset. @@ -1988,7 +2159,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) goto out; /* The very first snapshot does not have a deadlist */ - if (ds->ds_phys->ds_prev_snap_obj == 0) + if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0) continue; dsl_deadlist_space(&ds->ds_deadlist, @@ -2003,15 +2174,18 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) * so we need to subtract out the clone origin's used space. */ if (ddpa->origin_origin) { - ddpa->used -= ddpa->origin_origin->ds_phys->ds_referenced_bytes; - ddpa->comp -= ddpa->origin_origin->ds_phys->ds_compressed_bytes; + ddpa->used -= + dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes; + ddpa->comp -= + dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes; ddpa->uncomp -= - ddpa->origin_origin->ds_phys->ds_uncompressed_bytes; + dsl_dataset_phys(ddpa->origin_origin)-> + ds_uncompressed_bytes; } - /* Check that there is enough space here */ + /* Check that there is enough space and limit headroom here */ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, - ddpa->used); + 0, ss_mv_cnt, ddpa->used, ddpa->cr); if (err != 0) goto out; @@ -2021,7 +2195,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) * it is the amount of space that will be on all of their * deadlists (that was not born before their new origin). */ - if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) { uint64_t space; /* @@ -2043,9 +2217,11 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) goto out; ddpa->cloneusedsnap += space; } - if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags & + DD_FLAG_USED_BREAKDOWN) { err = snaplist_space(&ddpa->origin_snaps, - origin_ds->ds_phys->ds_creation_txg, &ddpa->originusedsnap); + dsl_dataset_phys(origin_ds)->ds_creation_txg, + &ddpa->originusedsnap); if (err != 0) goto out; } @@ -2072,7 +2248,7 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) VERIFY0(promote_hold(ddpa, dp, FTAG)); hds = ddpa->ddpa_clone; - ASSERT0(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE); + ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE); snap = list_head(&ddpa->shared_snaps); origin_ds = snap->ds; @@ -2090,47 +2266,49 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) /* change origin's next snap */ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); - oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; + oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj; snap = list_tail(&ddpa->clone_snaps); - ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); - origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; + ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, + origin_ds->ds_object); + dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object; /* change the origin's next clone */ - if (origin_ds->ds_phys->ds_next_clones_obj) { + if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) { dsl_dataset_remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); VERIFY0(zap_add_int(dp->dp_meta_objset, - origin_ds->ds_phys->ds_next_clones_obj, + dsl_dataset_phys(origin_ds)->ds_next_clones_obj, oldnext_obj, tx)); } /* change origin */ dmu_buf_will_dirty(dd->dd_dbuf, tx); - ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); - dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; + ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object); + dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj; dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; dmu_buf_will_dirty(odd->dd_dbuf, tx); - odd->dd_phys->dd_origin_obj = origin_ds->ds_object; + dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object; origin_head->ds_dir->dd_origin_txg = - origin_ds->ds_phys->ds_creation_txg; + dsl_dataset_phys(origin_ds)->ds_creation_txg; /* change dd_clone entries */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { VERIFY0(zap_remove_int(dp->dp_meta_objset, - odd->dd_phys->dd_clones, hds->ds_object, tx)); + dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx)); VERIFY0(zap_add_int(dp->dp_meta_objset, - ddpa->origin_origin->ds_dir->dd_phys->dd_clones, + dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, hds->ds_object, tx)); VERIFY0(zap_remove_int(dp->dp_meta_objset, - ddpa->origin_origin->ds_dir->dd_phys->dd_clones, + dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, origin_head->ds_object, tx)); - if (dd->dd_phys->dd_clones == 0) { - dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, - DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); + if (dsl_dir_phys(dd)->dd_clones == 0) { + dsl_dir_phys(dd)->dd_clones = + zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES, + DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(dp->dp_meta_objset, - dd->dd_phys->dd_clones, origin_head->ds_object, tx)); + dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx)); } /* move snapshots to this dir */ @@ -2151,28 +2329,30 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) /* move snap name entry */ VERIFY0(dsl_dataset_get_snapname(ds)); VERIFY0(dsl_dataset_snap_remove(origin_head, - ds->ds_snapname, tx)); + ds->ds_snapname, tx, B_TRUE)); VERIFY0(zap_add(dp->dp_meta_objset, - hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, + dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); + dsl_fs_ss_count_adjust(hds->ds_dir, 1, + DD_FIELD_SNAPSHOT_COUNT, tx); /* change containing dsl_dir */ dmu_buf_will_dirty(ds->ds_dbuf, tx); - ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); - ds->ds_phys->ds_dir_obj = dd->dd_object; + ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object); + dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object; ASSERT3P(ds->ds_dir, ==, odd); dsl_dir_rele(ds->ds_dir, ds); VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object, NULL, ds, &ds->ds_dir)); /* move any clone references */ - if (ds->ds_phys->ds_next_clones_obj && + if (dsl_dataset_phys(ds)->ds_next_clones_obj && spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj); + dsl_dataset_phys(ds)->ds_next_clones_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *cnds; @@ -2188,12 +2368,13 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &cnds)); - o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; + o = dsl_dir_phys(cnds->ds_dir)-> + dd_head_dataset_obj; VERIFY0(zap_remove_int(dp->dp_meta_objset, - odd->dd_phys->dd_clones, o, tx)); + dsl_dir_phys(odd)->dd_clones, o, tx)); VERIFY0(zap_add_int(dp->dp_meta_objset, - dd->dd_phys->dd_clones, o, tx)); + dsl_dir_phys(dd)->dd_clones, o, tx)); dsl_dataset_rele(cnds, FTAG); } zap_cursor_fini(&zc); @@ -2210,7 +2391,7 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) */ delta = ddpa->cloneusedsnap - - dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]; ASSERT3S(delta, >=, 0); ASSERT3U(ddpa->used, >=, delta); dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); @@ -2218,14 +2399,14 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx); delta = ddpa->originusedsnap - - odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP]; ASSERT3S(delta, <=, 0); ASSERT3U(ddpa->used, >=, -delta); dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); dsl_dir_diduse_space(odd, DD_USED_HEAD, -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx); - origin_ds->ds_phys->ds_unique_bytes = ddpa->unique; + dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique; /* log history record */ spa_history_log_internal_ds(hds, "promote", tx, ""); @@ -2260,12 +2441,12 @@ snaplist_make(dsl_pool_t *dp, return (err); if (first_obj == 0) - first_obj = ds->ds_dir->dd_phys->dd_origin_obj; + first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj; snap = kmem_alloc(sizeof (*snap), KM_SLEEP); snap->ds = ds; list_insert_tail(l, snap); - obj = ds->ds_phys->ds_prev_snap_obj; + obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; } return (0); @@ -2315,13 +2496,13 @@ promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) return (error); dd = ddpa->ddpa_clone->ds_dir; - if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) || + if (ddpa->ddpa_clone->ds_is_snapshot || !dsl_dir_is_clone(dd)) { dsl_dataset_rele(ddpa->ddpa_clone, tag); return (SET_ERROR(EINVAL)); } - error = snaplist_make(dp, 0, dd->dd_phys->dd_origin_obj, + error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj, &ddpa->shared_snaps, tag); if (error != 0) goto out; @@ -2332,16 +2513,16 @@ promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) goto out; snap = list_head(&ddpa->shared_snaps); - ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); - error = snaplist_make(dp, dd->dd_phys->dd_origin_obj, - snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, + ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj); + error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj, + dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj, &ddpa->origin_snaps, tag); if (error != 0) goto out; - if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { + if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) { error = dsl_dataset_hold_obj(dp, - snap->ds->ds_dir->dd_phys->dd_origin_obj, + dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj, tag, &ddpa->origin_origin); if (error != 0) goto out; @@ -2385,13 +2566,15 @@ dsl_dataset_promote(const char *name, char *conflsnap) if (error != 0) return (error); error = zap_count(dmu_objset_pool(os)->dp_meta_objset, - dmu_objset_ds(os)->ds_phys->ds_snapnames_zapobj, &numsnaps); + dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj, + &numsnaps); dmu_objset_rele(os, FTAG); if (error != 0) return (error); ddpa.ddpa_clonename = name; ddpa.err_ds = conflsnap; + ddpa.cr = CRED(); return (dsl_sync_task(name, dsl_dataset_promote_check, dsl_dataset_promote_sync, &ddpa, 2 + numsnaps)); @@ -2404,8 +2587,8 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, int64_t unused_refres_delta; /* they should both be heads */ - if (dsl_dataset_is_snapshot(clone) || - dsl_dataset_is_snapshot(origin_head)) + if (clone->ds_is_snapshot || + origin_head->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* if we are not forcing, the branch point should be just before them */ @@ -2434,9 +2617,9 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, /* check amount of any unconsumed refreservation */ unused_refres_delta = (int64_t)MIN(origin_head->ds_reserved, - origin_head->ds_phys->ds_unique_bytes) - + dsl_dataset_phys(origin_head)->ds_unique_bytes) - (int64_t)MIN(origin_head->ds_reserved, - clone->ds_phys->ds_unique_bytes); + dsl_dataset_phys(clone)->ds_unique_bytes); if (unused_refres_delta > 0 && unused_refres_delta > @@ -2445,7 +2628,8 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, /* clone can't be over the head's refquota */ if (origin_head->ds_quota != 0 && - clone->ds_phys->ds_referenced_bytes > origin_head->ds_quota) + dsl_dataset_phys(clone)->ds_referenced_bytes > + origin_head->ds_quota) return (SET_ERROR(EDQUOT)); return (0); @@ -2460,7 +2644,7 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, ASSERT(clone->ds_reserved == 0); ASSERT(origin_head->ds_quota == 0 || - clone->ds_phys->ds_unique_bytes <= origin_head->ds_quota); + dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota); ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); dmu_buf_will_dirty(clone->ds_dbuf, tx); @@ -2478,9 +2662,9 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, unused_refres_delta = (int64_t)MIN(origin_head->ds_reserved, - origin_head->ds_phys->ds_unique_bytes) - + dsl_dataset_phys(origin_head)->ds_unique_bytes) - (int64_t)MIN(origin_head->ds_reserved, - clone->ds_phys->ds_unique_bytes); + dsl_dataset_phys(clone)->ds_unique_bytes); /* * Reset origin's unique bytes, if it exists. @@ -2491,16 +2675,17 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dmu_buf_will_dirty(origin->ds_dbuf, tx); dsl_deadlist_space_range(&clone->ds_deadlist, - origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, - &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); + dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX, + &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp); } /* swap blkptrs */ { blkptr_t tmp; - tmp = origin_head->ds_phys->ds_bp; - origin_head->ds_phys->ds_bp = clone->ds_phys->ds_bp; - clone->ds_phys->ds_bp = tmp; + tmp = dsl_dataset_phys(origin_head)->ds_bp; + dsl_dataset_phys(origin_head)->ds_bp = + dsl_dataset_phys(clone)->ds_bp; + dsl_dataset_phys(clone)->ds_bp = tmp; } /* set dd_*_bytes */ @@ -2509,7 +2694,7 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, uint64_t cdl_used, cdl_comp, cdl_uncomp; uint64_t odl_used, odl_comp, odl_uncomp; - ASSERT3U(clone->ds_dir->dd_phys-> + ASSERT3U(dsl_dir_phys(clone->ds_dir)-> dd_used_breakdown[DD_USED_SNAP], ==, 0); dsl_deadlist_space(&clone->ds_deadlist, @@ -2517,13 +2702,18 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_deadlist_space(&origin_head->ds_deadlist, &odl_used, &odl_comp, &odl_uncomp); - dused = clone->ds_phys->ds_referenced_bytes + cdl_used - - (origin_head->ds_phys->ds_referenced_bytes + odl_used); - dcomp = clone->ds_phys->ds_compressed_bytes + cdl_comp - - (origin_head->ds_phys->ds_compressed_bytes + odl_comp); - duncomp = clone->ds_phys->ds_uncompressed_bytes + + dused = dsl_dataset_phys(clone)->ds_referenced_bytes + + cdl_used - + (dsl_dataset_phys(origin_head)->ds_referenced_bytes + + odl_used); + dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes + + cdl_comp - + (dsl_dataset_phys(origin_head)->ds_compressed_bytes + + odl_comp); + duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes + cdl_uncomp - - (origin_head->ds_phys->ds_uncompressed_bytes + odl_uncomp); + (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes + + odl_uncomp); dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD, dused, dcomp, duncomp, tx); @@ -2547,14 +2737,14 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, } /* swap ds_*_bytes */ - SWITCH64(origin_head->ds_phys->ds_referenced_bytes, - clone->ds_phys->ds_referenced_bytes); - SWITCH64(origin_head->ds_phys->ds_compressed_bytes, - clone->ds_phys->ds_compressed_bytes); - SWITCH64(origin_head->ds_phys->ds_uncompressed_bytes, - clone->ds_phys->ds_uncompressed_bytes); - SWITCH64(origin_head->ds_phys->ds_unique_bytes, - clone->ds_phys->ds_unique_bytes); + SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes, + dsl_dataset_phys(clone)->ds_referenced_bytes); + SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes, + dsl_dataset_phys(clone)->ds_compressed_bytes); + SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes, + dsl_dataset_phys(clone)->ds_uncompressed_bytes); + SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes, + dsl_dataset_phys(clone)->ds_unique_bytes); /* apply any parent delta for change in unconsumed refreservation */ dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV, @@ -2565,12 +2755,12 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, */ dsl_deadlist_close(&clone->ds_deadlist); dsl_deadlist_close(&origin_head->ds_deadlist); - SWITCH64(origin_head->ds_phys->ds_deadlist_obj, - clone->ds_phys->ds_deadlist_obj); + SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj, + dsl_dataset_phys(clone)->ds_deadlist_obj); dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset, - clone->ds_phys->ds_deadlist_obj); + dsl_dataset_phys(clone)->ds_deadlist_obj); dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, - origin_head->ds_phys->ds_deadlist_obj); + dsl_dataset_phys(origin_head)->ds_deadlist_obj); dsl_scan_ds_clone_swapped(origin_head, clone, tx); @@ -2621,10 +2811,11 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, /* * Make a space adjustment for reserved bytes. */ - if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { + if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) { ASSERT3U(*used, >=, - ds->ds_reserved - ds->ds_phys->ds_unique_bytes); - *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); + ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); + *used -= + (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); *ref_rsrv = asize - MIN(asize, parent_delta(ds, asize + inflight)); } @@ -2639,9 +2830,10 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, * on-disk is over quota and there are no pending changes (which * may free up space for us). */ - if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) { + if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >= + ds->ds_quota) { if (inflight > 0 || - ds->ds_phys->ds_referenced_bytes < ds->ds_quota) + dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota) error = SET_ERROR(ERESTART); else error = SET_ERROR(EDQUOT); @@ -2675,7 +2867,7 @@ dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) if (error != 0) return (error); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } @@ -2693,7 +2885,7 @@ dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) return (0); } - if (newval < ds->ds_phys->ds_referenced_bytes || + if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes || newval < ds->ds_reserved) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOSPC)); @@ -2758,7 +2950,7 @@ dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx) if (error != 0) return (error); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } @@ -2783,7 +2975,7 @@ dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx) mutex_enter(&ds->ds_lock); if (!DS_UNIQUE_IS_ACCURATE(ds)) dsl_dataset_recalc_head_uniq(ds); - unique = ds->ds_phys->ds_unique_bytes; + unique = dsl_dataset_phys(ds)->ds_unique_bytes; mutex_exit(&ds->ds_lock); if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) { @@ -2820,7 +3012,7 @@ dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); - unique = ds->ds_phys->ds_unique_bytes; + unique = dsl_dataset_phys(ds)->ds_unique_bytes; delta = MAX(0, (int64_t)(newval - unique)) - MAX(0, (int64_t)(ds->ds_reserved - unique)); ds->ds_reserved = newval; @@ -2885,16 +3077,16 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, ASSERT(dsl_pool_config_held(dp)); *usedp = 0; - *usedp += new->ds_phys->ds_referenced_bytes; - *usedp -= oldsnap->ds_phys->ds_referenced_bytes; + *usedp += dsl_dataset_phys(new)->ds_referenced_bytes; + *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes; *compp = 0; - *compp += new->ds_phys->ds_compressed_bytes; - *compp -= oldsnap->ds_phys->ds_compressed_bytes; + *compp += dsl_dataset_phys(new)->ds_compressed_bytes; + *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes; *uncompp = 0; - *uncompp += new->ds_phys->ds_uncompressed_bytes; - *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes; + *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes; + *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes; snapobj = new->ds_object; while (snapobj != oldsnap->ds_object) { @@ -2909,8 +3101,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, break; } - if (snap->ds_phys->ds_prev_snap_txg == - oldsnap->ds_phys->ds_creation_txg) { + if (dsl_dataset_phys(snap)->ds_prev_snap_txg == + dsl_dataset_phys(oldsnap)->ds_creation_txg) { /* * The blocks in the deadlist can not be born after * ds_prev_snap_txg, so get the whole deadlist space, @@ -2923,7 +3115,7 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, &used, &comp, &uncomp); } else { dsl_deadlist_space_range(&snap->ds_deadlist, - 0, oldsnap->ds_phys->ds_creation_txg, + 0, dsl_dataset_phys(oldsnap)->ds_creation_txg, &used, &comp, &uncomp); } *usedp += used; @@ -2935,7 +3127,7 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap * was not a snapshot of/before new. */ - snapobj = snap->ds_phys->ds_prev_snap_obj; + snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj; if (snap != new) dsl_dataset_rele(snap, FTAG); if (snapobj == 0) { @@ -2971,21 +3163,21 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, uint64_t snapobj; dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; - ASSERT(dsl_dataset_is_snapshot(firstsnap)); - ASSERT(dsl_dataset_is_snapshot(lastsnap)); + ASSERT(firstsnap->ds_is_snapshot); + ASSERT(lastsnap->ds_is_snapshot); /* * Check that the snapshots are in the same dsl_dir, and firstsnap * is before lastsnap. */ if (firstsnap->ds_dir != lastsnap->ds_dir || - firstsnap->ds_phys->ds_creation_txg > - lastsnap->ds_phys->ds_creation_txg) + dsl_dataset_phys(firstsnap)->ds_creation_txg > + dsl_dataset_phys(lastsnap)->ds_creation_txg) return (SET_ERROR(EINVAL)); *usedp = *compp = *uncompp = 0; - snapobj = lastsnap->ds_phys->ds_next_snap_obj; + snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj; while (snapobj != firstsnap->ds_object) { dsl_dataset_t *ds; uint64_t used, comp, uncomp; @@ -2995,13 +3187,13 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, break; dsl_deadlist_space_range(&ds->ds_deadlist, - firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX, + dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX, &used, &comp, &uncomp); *usedp += used; *compp += comp; *uncompp += uncomp; - snapobj = ds->ds_phys->ds_prev_snap_obj; + snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj; ASSERT3U(snapobj, !=, 0); dsl_dataset_rele(ds, FTAG); } @@ -3027,13 +3219,13 @@ dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, dsl_dataset_t *origin; ASSERT(dsl_pool_config_held(dp)); - ASSERT(dsl_dataset_is_snapshot(earlier) || earlier_txg != 0); + ASSERT(earlier->ds_is_snapshot || earlier_txg != 0); if (earlier_txg == 0) - earlier_txg = earlier->ds_phys->ds_creation_txg; + earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg; - if (dsl_dataset_is_snapshot(later) && - earlier_txg >= later->ds_phys->ds_creation_txg) + if (later->ds_is_snapshot && + earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg) return (B_FALSE); if (later->ds_dir == earlier->ds_dir) @@ -3041,10 +3233,10 @@ dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, if (!dsl_dir_is_clone(later->ds_dir)) return (B_FALSE); - if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object) + if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object) return (B_TRUE); error = dsl_dataset_hold_obj(dp, - later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin); + dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin); if (error != 0) return (B_FALSE); ret = dsl_dataset_is_before(origin, earlier, earlier_txg); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 8a4362ff9c1a..55810f5c3024 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -121,6 +122,8 @@ dsl_deadlist_close(dsl_deadlist_t *dl) void *cookie = NULL; dsl_deadlist_entry_t *dle; + dl->dl_os = NULL; + if (dl->dl_oldfmt) { dl->dl_oldfmt = B_FALSE; bpobj_close(&dl->dl_bpobj); @@ -310,8 +313,9 @@ dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, while (mrs_obj != 0) { dsl_dataset_t *ds; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); - dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx); - mrs_obj = ds->ds_phys->ds_prev_snap_obj; + dsl_deadlist_add_key(&dl, + dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); + mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; dsl_dataset_rele(ds, FTAG); } dsl_deadlist_close(&dl); diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c index 99670dfe0724..23bbe5b94692 100644 --- a/module/zfs/dsl_deleg.c +++ b/module/zfs/dsl_deleg.c @@ -164,10 +164,10 @@ dsl_deleg_set_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); - zapobj = dd->dd_phys->dd_deleg_zapobj; + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; if (zapobj == 0) { dmu_buf_will_dirty(dd->dd_dbuf, tx); - zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); } @@ -208,7 +208,7 @@ dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx) uint64_t zapobj; VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); - zapobj = dd->dd_phys->dd_deleg_zapobj; + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; if (zapobj == 0) { dsl_dir_rele(dd, FTAG); return; @@ -337,14 +337,14 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) nvlist_t *sp_nvp; uint64_t n; - if (dd->dd_phys->dd_deleg_zapobj == 0 || - zap_count(mos, dd->dd_phys->dd_deleg_zapobj, &n) != 0 || - n == 0) + if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 || + zap_count(mos, + dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0) continue; sp_nvp = fnvlist_alloc(); for (zap_cursor_init(basezc, mos, - dd->dd_phys->dd_deleg_zapobj); + dsl_dir_phys(dd)->dd_deleg_zapobj); zap_cursor_retrieve(basezc, baseza) == 0; zap_cursor_advance(basezc)) { nvlist_t *perms_nvp; @@ -570,7 +570,7 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) SPA_VERSION_DELEGATED_PERMS) return (SET_ERROR(EPERM)); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { /* * Snapshots are treated as descendents only, * local permissions do not apply. @@ -603,7 +603,7 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) if (!zoned) break; } - zapobj = dd->dd_phys->dd_deleg_zapobj; + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; if (zapobj == 0) continue; @@ -682,7 +682,7 @@ copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj, { objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t jumpobj, pjumpobj; - uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; zap_cursor_t zc; zap_attribute_t za; char whokey[ZFS_MAX_DELEG_NAME]; @@ -695,7 +695,7 @@ copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj, if (zapobj == 0) { dmu_buf_will_dirty(dd->dd_dbuf, tx); - zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); } @@ -733,7 +733,7 @@ dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr) return; for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) { - uint64_t pzapobj = dd->dd_phys->dd_deleg_zapobj; + uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; if (pzapobj == 0) continue; diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 9765ba1553a8..6a52ed22f003 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2013 by Joyent, Inc. All rights reserved. */ #include @@ -50,7 +51,7 @@ typedef struct dmu_snapshots_destroy_arg { int dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) { - if (!dsl_dataset_is_snapshot(ds)) + if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); if (dsl_dataset_long_held(ds)) @@ -77,7 +78,7 @@ dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) /* * Can't delete a branch point. */ - if (ds->ds_phys->ds_num_children > 1) + if (dsl_dataset_phys(ds)->ds_num_children > 1) return (SET_ERROR(EEXIST)); return (0); @@ -146,12 +147,12 @@ process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) ASSERT(!BP_IS_HOLE(bp)); - if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { + if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); if (poa->ds_prev && !poa->after_branch_point && bp->blk_birth > - poa->ds_prev->ds_phys->ds_prev_snap_txg) { - poa->ds_prev->ds_phys->ds_unique_bytes += + dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { + dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes += bp_get_dsize_sync(dp->dp_spa, bp); } } else { @@ -182,7 +183,7 @@ process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, process_old_cb, &poa, tx)); VERIFY0(zio_wait(poa.pio)); - ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); + ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes); /* change snapused */ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, @@ -191,12 +192,14 @@ process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, /* swap next's deadlist to our deadlist */ dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_close(&ds_next->ds_deadlist); - deadlist_obj = ds->ds_phys->ds_deadlist_obj; - ds->ds_phys->ds_deadlist_obj = ds_next->ds_phys->ds_deadlist_obj; - ds_next->ds_phys->ds_deadlist_obj = deadlist_obj; - dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj; + dsl_dataset_phys(ds)->ds_deadlist_obj = + dsl_dataset_phys(ds_next)->ds_deadlist_obj; + dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj; + dsl_deadlist_open(&ds->ds_deadlist, mos, + dsl_dataset_phys(ds)->ds_deadlist_obj); dsl_deadlist_open(&ds_next->ds_deadlist, mos, - ds_next->ds_phys->ds_deadlist_obj); + dsl_dataset_phys(ds_next)->ds_deadlist_obj); } static void @@ -211,13 +214,13 @@ dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) * find the clones, but dsl_deadlist_remove_key() is a no-op so it * doesn't matter. */ - if (ds->ds_dir->dd_phys->dd_clones == 0) + if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0) return; zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - for (zap_cursor_init(zc, mos, ds->ds_dir->dd_phys->dd_clones); + for (zap_cursor_init(zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dataset_t *clone; @@ -252,19 +255,20 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); ASSERT(refcount_is_zero(&ds->ds_longholds)); if (defer && - (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)) { + (ds->ds_userrefs > 0 || + dsl_dataset_phys(ds)->ds_num_children > 1)) { ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY; spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); return; } - ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); /* We need to log before removing it from the namespace. */ spa_history_log_internal_ds(ds, "destroy", tx, ""); @@ -273,42 +277,44 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) obj = ds->ds_object; - if (ds->ds_phys->ds_prev_snap_obj != 0) { + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { ASSERT3P(ds->ds_prev, ==, NULL); VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev)); after_branch_point = - (ds_prev->ds_phys->ds_next_snap_obj != obj); + (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj); dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); if (after_branch_point && - ds_prev->ds_phys->ds_next_clones_obj != 0) { + dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) { dsl_dataset_remove_from_next_clones(ds_prev, obj, tx); - if (ds->ds_phys->ds_next_snap_obj != 0) { + if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { VERIFY0(zap_add_int(mos, - ds_prev->ds_phys->ds_next_clones_obj, - ds->ds_phys->ds_next_snap_obj, tx)); + dsl_dataset_phys(ds_prev)-> + ds_next_clones_obj, + dsl_dataset_phys(ds)->ds_next_snap_obj, + tx)); } } if (!after_branch_point) { - ds_prev->ds_phys->ds_next_snap_obj = - ds->ds_phys->ds_next_snap_obj; + dsl_dataset_phys(ds_prev)->ds_next_snap_obj = + dsl_dataset_phys(ds)->ds_next_snap_obj; } } VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); - ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); + dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next)); + ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj); - old_unique = ds_next->ds_phys->ds_unique_bytes; + old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes; dmu_buf_will_dirty(ds_next->ds_dbuf, tx); - ds_next->ds_phys->ds_prev_snap_obj = - ds->ds_phys->ds_prev_snap_obj; - ds_next->ds_phys->ds_prev_snap_txg = - ds->ds_phys->ds_prev_snap_txg; - ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, - ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); + dsl_dataset_phys(ds_next)->ds_prev_snap_obj = + dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsl_dataset_phys(ds_next)->ds_prev_snap_txg = + dsl_dataset_phys(ds)->ds_prev_snap_txg; + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, + ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0); if (ds_next->ds_deadlist.dl_oldfmt) { process_old_deadlist(ds, ds_prev, ds_next, @@ -317,40 +323,40 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) /* Adjust prev's unique space. */ if (ds_prev && !after_branch_point) { dsl_deadlist_space_range(&ds_next->ds_deadlist, - ds_prev->ds_phys->ds_prev_snap_txg, - ds->ds_phys->ds_prev_snap_txg, + dsl_dataset_phys(ds_prev)->ds_prev_snap_txg, + dsl_dataset_phys(ds)->ds_prev_snap_txg, &used, &comp, &uncomp); - ds_prev->ds_phys->ds_unique_bytes += used; + dsl_dataset_phys(ds_prev)->ds_unique_bytes += used; } /* Adjust snapused. */ dsl_deadlist_space_range(&ds_next->ds_deadlist, - ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, + dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX, &used, &comp, &uncomp); dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -used, -comp, -uncomp, tx); /* Move blocks to be freed to pool's free list. */ dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, - &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, + &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, used, comp, uncomp, tx); /* Merge our deadlist into next's and free it. */ dsl_deadlist_merge(&ds_next->ds_deadlist, - ds->ds_phys->ds_deadlist_obj, tx); + dsl_dataset_phys(ds)->ds_deadlist_obj, tx); } dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); + dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_deadlist_obj = 0; + dsl_dataset_phys(ds)->ds_deadlist_obj = 0; /* Collapse range in clone heads */ dsl_dataset_remove_clones_key(ds, - ds->ds_phys->ds_creation_txg, tx); + dsl_dataset_phys(ds)->ds_creation_txg, tx); - if (dsl_dataset_is_snapshot(ds_next)) { + if (ds_next->ds_is_snapshot) { dsl_dataset_t *ds_nextnext; /* @@ -363,20 +369,21 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) * deadlist). */ VERIFY0(dsl_dataset_hold_obj(dp, - ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext)); + dsl_dataset_phys(ds_next)->ds_next_snap_obj, + FTAG, &ds_nextnext)); dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, - ds->ds_phys->ds_prev_snap_txg, - ds->ds_phys->ds_creation_txg, + dsl_dataset_phys(ds)->ds_prev_snap_txg, + dsl_dataset_phys(ds)->ds_creation_txg, &used, &comp, &uncomp); - ds_next->ds_phys->ds_unique_bytes += used; + dsl_dataset_phys(ds_next)->ds_unique_bytes += used; dsl_dataset_rele(ds_nextnext, FTAG); ASSERT3P(ds_next->ds_prev, ==, NULL); /* Collapse range in this head. */ VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds)); + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds)); dsl_deadlist_remove_key(&hds->ds_deadlist, - ds->ds_phys->ds_creation_txg, tx); + dsl_dataset_phys(ds)->ds_creation_txg, tx); dsl_dataset_rele(hds, FTAG); } else { @@ -385,7 +392,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) ds_next->ds_prev = NULL; if (ds_prev) { VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, + dsl_dataset_phys(ds)->ds_prev_snap_obj, ds_next, &ds_next->ds_prev)); } @@ -399,7 +406,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) if (old_unique < ds_next->ds_reserved) { int64_t mrsdelta; uint64_t new_unique = - ds_next->ds_phys->ds_unique_bytes; + dsl_dataset_phys(ds_next)->ds_unique_bytes; ASSERT(old_unique <= new_unique); mrsdelta = MIN(new_unique - old_unique, @@ -420,9 +427,9 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) } /* remove from snapshot namespace */ - ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); + ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0); VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head)); VERIFY0(dsl_dataset_get_snapname(ds)); #ifdef ZFS_DEBUG { @@ -434,7 +441,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) ASSERT3U(val, ==, obj); } #endif - VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx)); + VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE)); dsl_dataset_rele(ds_head, FTAG); if (ds_prev != NULL) @@ -442,17 +449,20 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - if (ds->ds_phys->ds_next_clones_obj != 0) { + if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { ASSERTV(uint64_t count); ASSERT0(zap_count(mos, - ds->ds_phys->ds_next_clones_obj, &count) && count == 0); + dsl_dataset_phys(ds)->ds_next_clones_obj, &count) && + count == 0); VERIFY0(dmu_object_free(mos, - ds->ds_phys->ds_next_clones_obj, tx)); + dsl_dataset_phys(ds)->ds_next_clones_obj, tx)); } - if (ds->ds_phys->ds_props_obj != 0) - VERIFY0(zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); - if (ds->ds_phys->ds_userrefs_obj != 0) - VERIFY0(zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); + if (dsl_dataset_phys(ds)->ds_props_obj != 0) + VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj, + tx)); + if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) + VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, + tx)); dsl_dir_rele(ds->ds_dir, ds); ds->ds_dir = NULL; dmu_object_free_zapified(mos, obj, tx); @@ -555,7 +565,8 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); } else { ASSERT(zilog == NULL); - ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); + ASSERT3U(bp->blk_birth, >, + dsl_dataset_phys(ka->ds)->ds_prev_snap_txg); (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); } @@ -577,9 +588,10 @@ old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) ka.ds = ds; ka.tx = tx; VERIFY0(traverse_dataset(ds, - ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, + dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST, kill_blkptr, &ka)); - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + dsl_dataset_phys(ds)->ds_unique_bytes == 0); } typedef struct dsl_destroy_head_arg { @@ -593,8 +605,8 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) uint64_t count; objset_t *mos; - ASSERT(!dsl_dataset_is_snapshot(ds)); - if (dsl_dataset_is_snapshot(ds)) + ASSERT(!ds->ds_is_snapshot); + if (ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); if (refcount_count(&ds->ds_longholds) != expected_holds) @@ -608,21 +620,21 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) * from.) */ if (ds->ds_prev != NULL && - ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) + dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object) return (SET_ERROR(EBUSY)); /* * Can't delete if there are children of this fs. */ error = zap_count(mos, - ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); + dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count); if (error != 0) return (error); if (count != 0) return (SET_ERROR(EEXIST)); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && - ds->ds_prev->ds_phys->ds_num_children == 2 && + dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && ds->ds_prev->ds_userrefs == 0) { /* We need to remove the origin snapshot as well. */ if (!refcount_is_zero(&ds->ds_prev->ds_longholds)) @@ -660,7 +672,18 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); - ASSERT0(dd->dd_phys->dd_head_dataset_obj); + ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj); + + /* + * Decrement the filesystem count for all parent filesystems. + * + * When we receive an incremental stream into a filesystem that already + * exists, a temporary clone is created. We never count this temporary + * clone, whose name begins with a '%'. + */ + if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL) + dsl_fs_ss_count_adjust(dd->dd_parent, -1, + DD_FIELD_FILESYSTEM_COUNT, tx); /* * Remove our reservation. The impl() routine avoids setting the @@ -668,16 +691,17 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) */ dsl_dir_set_reservation_sync_impl(dd, 0, tx); - ASSERT0(dd->dd_phys->dd_used_bytes); - ASSERT0(dd->dd_phys->dd_reserved); + ASSERT0(dsl_dir_phys(dd)->dd_used_bytes); + ASSERT0(dsl_dir_phys(dd)->dd_reserved); for (t = 0; t < DD_USED_NUM; t++) - ASSERT0(dd->dd_phys->dd_used_breakdown[t]); + ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]); - VERIFY0(zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); - VERIFY0(zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); - VERIFY0(dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); + VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx)); + VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx)); + VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx)); VERIFY0(zap_remove(mos, - dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); + dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, + dd->dd_myname, tx)); dsl_dir_rele(dd, FTAG); dmu_object_free_zapified(mos, ddobj, tx); @@ -692,10 +716,10 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) boolean_t rmorigin; objset_t *os; - ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); ASSERT(ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); - ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object); + ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); /* We need to log before removing it from the namespace. */ @@ -703,7 +727,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) rmorigin = (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && - ds->ds_prev->ds_phys->ds_num_children == 2 && + dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && ds->ds_prev->ds_userrefs == 0); /* Remove our reservation. */ @@ -718,20 +742,21 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) obj = ds->ds_object; - if (ds->ds_phys->ds_prev_snap_obj != 0) { + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { /* This is a clone */ ASSERT(ds->ds_prev != NULL); - ASSERT3U(ds->ds_prev->ds_phys->ds_next_snap_obj, !=, obj); - ASSERT0(ds->ds_phys->ds_next_snap_obj); + ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=, + obj); + ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj); dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - if (ds->ds_prev->ds_phys->ds_next_clones_obj != 0) { + if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) { dsl_dataset_remove_from_next_clones(ds->ds_prev, obj, tx); } - ASSERT3U(ds->ds_prev->ds_phys->ds_num_children, >, 1); - ds->ds_prev->ds_phys->ds_num_children--; + ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1); + dsl_dataset_phys(ds->ds_prev)->ds_num_children--; } /* @@ -740,9 +765,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) * safe to ignore the deadlist contents.) */ dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); + dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_deadlist_obj = 0; + dsl_dataset_phys(ds)->ds_deadlist_obj = 0; VERIFY0(dmu_objset_from_ds(ds, &os)); @@ -771,15 +796,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) scn->scn_async_destroying = B_TRUE; } - used = ds->ds_dir->dd_phys->dd_used_bytes; - comp = ds->ds_dir->dd_phys->dd_compressed_bytes; - uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; + used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; + comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; + uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - ds->ds_phys->ds_unique_bytes == used); + dsl_dataset_phys(ds)->ds_unique_bytes == used); bptree_add(mos, dp->dp_bptree_obj, - &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, + &dsl_dataset_phys(ds)->ds_bp, + dsl_dataset_phys(ds)->ds_prev_snap_txg, used, comp, uncomp, tx); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, -used, -comp, -uncomp, tx); @@ -790,7 +816,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) if (ds->ds_prev != NULL) { if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { VERIFY0(zap_remove_int(mos, - ds->ds_prev->ds_dir->dd_phys->dd_clones, + dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones, ds->ds_object, tx)); } prevobj = ds->ds_prev->ds_object; @@ -809,22 +835,22 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) /* Erase the link in the dir */ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); - ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0; ddobj = ds->ds_dir->dd_object; - ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); - VERIFY0(zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx)); + ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0); + VERIFY0(zap_destroy(mos, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx)); if (ds->ds_bookmarks != 0) { - VERIFY0(zap_destroy(mos, - ds->ds_bookmarks, tx)); + VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); } spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - ASSERT0(ds->ds_phys->ds_next_clones_obj); - ASSERT0(ds->ds_phys->ds_props_obj); - ASSERT0(ds->ds_phys->ds_userrefs_obj); + ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj); + ASSERT0(dsl_dataset_phys(ds)->ds_props_obj); + ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj); dsl_dir_rele(ds->ds_dir, ds); ds->ds_dir = NULL; dmu_object_free_zapified(mos, obj, tx); @@ -862,7 +888,7 @@ dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx) /* Mark it as inconsistent on-disk, in case we crash */ dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; spa_history_log_internal_ds(ds, "destroy begin", tx, ""); dsl_dataset_rele(ds, FTAG); @@ -905,7 +931,8 @@ dsl_destroy_head(const char *name) if (error == 0) { uint64_t obj; uint64_t prev_snap_txg = - dmu_objset_ds(os)->ds_phys->ds_prev_snap_txg; + dsl_dataset_phys(dmu_objset_ds(os))-> + ds_prev_snap_txg; for (obj = 0; error == 0; error = dmu_object_next(os, &obj, FALSE, prev_snap_txg)) diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index b94b68e15774..0b54b8b4e405 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -22,6 +22,8 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -39,19 +41,101 @@ #include #include #include +#include +#include +#include #include #include "zfs_namecheck.h" +#include "zfs_prop.h" + +/* + * Filesystem and Snapshot Limits + * ------------------------------ + * + * These limits are used to restrict the number of filesystems and/or snapshots + * that can be created at a given level in the tree or below. A typical + * use-case is with a delegated dataset where the administrator wants to ensure + * that a user within the zone is not creating too many additional filesystems + * or snapshots, even though they're not exceeding their space quota. + * + * The filesystem and snapshot counts are stored as extensible properties. This + * capability is controlled by a feature flag and must be enabled to be used. + * Once enabled, the feature is not active until the first limit is set. At + * that point, future operations to create/destroy filesystems or snapshots + * will validate and update the counts. + * + * Because the count properties will not exist before the feature is active, + * the counts are updated when a limit is first set on an uninitialized + * dsl_dir node in the tree (The filesystem/snapshot count on a node includes + * all of the nested filesystems/snapshots. Thus, a new leaf node has a + * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and + * snapshot count properties on a node indicate uninitialized counts on that + * node.) When first setting a limit on an uninitialized node, the code starts + * at the filesystem with the new limit and descends into all sub-filesystems + * to add the count properties. + * + * In practice this is lightweight since a limit is typically set when the + * filesystem is created and thus has no children. Once valid, changing the + * limit value won't require a re-traversal since the counts are already valid. + * When recursively fixing the counts, if a node with a limit is encountered + * during the descent, the counts are known to be valid and there is no need to + * descend into that filesystem's children. The counts on filesystems above the + * one with the new limit will still be uninitialized, unless a limit is + * eventually set on one of those filesystems. The counts are always recursively + * updated when a limit is set on a dataset, unless there is already a limit. + * When a new limit value is set on a filesystem with an existing limit, it is + * possible for the new limit to be less than the current count at that level + * since a user who can change the limit is also allowed to exceed the limit. + * + * Once the feature is active, then whenever a filesystem or snapshot is + * created, the code recurses up the tree, validating the new count against the + * limit at each initialized level. In practice, most levels will not have a + * limit set. If there is a limit at any initialized level up the tree, the + * check must pass or the creation will fail. Likewise, when a filesystem or + * snapshot is destroyed, the counts are recursively adjusted all the way up + * the initizized nodes in the tree. Renaming a filesystem into different point + * in the tree will first validate, then update the counts on each branch up to + * the common ancestor. A receive will also validate the counts and then update + * them. + * + * An exception to the above behavior is that the limit is not enforced if the + * user has permission to modify the limit. This is primarily so that + * recursive snapshots in the global zone always work. We want to prevent a + * denial-of-service in which a lower level delegated dataset could max out its + * limit and thus block recursive snapshots from being taken in the global zone. + * Because of this, it is possible for the snapshot count to be over the limit + * and snapshots taken in the global zone could cause a lower level dataset to + * hit or exceed its limit. The administrator taking the global zone recursive + * snapshot should be aware of this side-effect and behave accordingly. + * For consistency, the filesystem limit is also not enforced if the user can + * modify the limit. + * + * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() + * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in + * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by + * dsl_dir_init_fs_ss_count(). + * + * There is a special case when we receive a filesystem that already exists. In + * this case a temporary clone name of %X is created (see dmu_recv_begin). We + * never update the filesystem counts for temporary clones. + * + * Likewise, we do not update the snapshot counts for temporary snapshots, + * such as those created by zfs diff. + */ + +extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); -/* ARGSUSED */ static void -dsl_dir_evict(dmu_buf_t *db, void *arg) +dsl_dir_evict(void *dbu) { - dsl_dir_t *dd = arg; + dsl_dir_t *dd = dbu; int t; ASSERTV(dsl_pool_t *dp = dd->dd_pool); + dd->dd_dbuf = NULL; + for (t = 0; t < TXG_SIZE; t++) { ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); ASSERT(dd->dd_tempreserved[t] == 0); @@ -59,9 +143,9 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) } if (dd->dd_parent) - dsl_dir_rele(dd->dd_parent, dd); + dsl_dir_async_rele(dd->dd_parent, dd); - spa_close(dd->dd_pool->dp_spa, dd); + spa_async_close(dd->dd_pool->dp_spa, dd); /* * The props callback list should have been cleaned up by @@ -101,7 +185,6 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_object = ddobj; dd->dd_dbuf = dbuf; dd->dd_pool = dp; - dd->dd_phys = dbuf->db_data; mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), @@ -109,9 +192,10 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dsl_dir_snap_cmtime_update(dd); - if (dd->dd_phys->dd_parent_obj) { - err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj, - NULL, dd, &dd->dd_parent); + if (dsl_dir_phys(dd)->dd_parent_obj) { + err = dsl_dir_hold_obj(dp, + dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, + &dd->dd_parent); if (err != 0) goto errout; if (tail) { @@ -119,14 +203,16 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, uint64_t foundobj; err = zap_lookup(dp->dp_meta_objset, - dd->dd_parent->dd_phys->dd_child_dir_zapobj, - tail, sizeof (foundobj), 1, &foundobj); + dsl_dir_phys(dd->dd_parent)-> + dd_child_dir_zapobj, tail, + sizeof (foundobj), 1, &foundobj); ASSERT(err || foundobj == ddobj); #endif (void) strcpy(dd->dd_myname, tail); } else { err = zap_value_search(dp->dp_meta_objset, - dd->dd_parent->dd_phys->dd_child_dir_zapobj, + dsl_dir_phys(dd->dd_parent)-> + dd_child_dir_zapobj, ddobj, 0, dd->dd_myname); } if (err != 0) @@ -145,7 +231,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, * Just look at its phys directly instead. */ err = dmu_bonus_hold(dp->dp_meta_objset, - dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus); + dsl_dir_phys(dd)->dd_origin_obj, FTAG, + &origin_bonus); if (err != 0) goto errout; origin_phys = origin_bonus->db_data; @@ -154,9 +241,9 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dmu_buf_rele(origin_bonus, FTAG); } - winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, - dsl_dir_evict); - if (winner) { + dmu_buf_init_user(&dd->dd_dbu, dsl_dir_evict, &dd->dd_dbuf); + winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); + if (winner != NULL) { if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); @@ -200,6 +287,21 @@ dsl_dir_rele(dsl_dir_t *dd, void *tag) dmu_buf_rele(dd->dd_dbuf, tag); } +/* + * Remove a reference to the given dsl dir that is being asynchronously + * released. Async releases occur from a taskq performing eviction of + * dsl datasets and dirs. This process is identical to a normal release + * with the exception of using the async API for releasing the reference on + * the spa. + */ +void +dsl_dir_async_rele(dsl_dir_t *dd, void *tag) +{ + dprintf_dd(dd, "%s\n", ""); + spa_async_close(dd->dd_pool->dp_spa, tag); + dmu_buf_rele(dd->dd_dbuf, tag); +} + /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ void dsl_dir_name(dsl_dir_t *dd, char *buf) @@ -333,7 +435,7 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, } while (next != NULL) { - dsl_dir_t *child_ds; + dsl_dir_t *child_dd; err = getcomponent(next, buf, &nextnext); if (err != 0) break; @@ -341,10 +443,10 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, if (next[0] == '@') break; dprintf("looking up %s in obj%lld\n", - buf, dd->dd_phys->dd_child_dir_zapobj); + buf, dsl_dir_phys(dd)->dd_child_dir_zapobj); err = zap_lookup(dp->dp_meta_objset, - dd->dd_phys->dd_child_dir_zapobj, + dsl_dir_phys(dd)->dd_child_dir_zapobj, buf, sizeof (ddobj), 1, &ddobj); if (err != 0) { if (err == ENOENT) @@ -352,11 +454,11 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, break; } - err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds); + err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); if (err != 0) break; dsl_dir_rele(dd, tag); - dd = child_ds; + dd = child_dd; next = nextnext; } @@ -384,6 +486,401 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, return (err); } +/* + * If the counts are already initialized for this filesystem and its + * descendants then do nothing, otherwise initialize the counts. + * + * The counts on this filesystem, and those below, may be uninitialized due to + * either the use of a pre-existing pool which did not support the + * filesystem/snapshot limit feature, or one in which the feature had not yet + * been enabled. + * + * Recursively descend the filesystem tree and update the filesystem/snapshot + * counts on each filesystem below, then update the cumulative count on the + * current filesystem. If the filesystem already has a count set on it, + * then we know that its counts, and the counts on the filesystems below it, + * are already correct, so we don't have to update this filesystem. + */ +static void +dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) +{ + uint64_t my_fs_cnt = 0; + uint64_t my_ss_cnt = 0; + dsl_pool_t *dp = dd->dd_pool; + objset_t *os = dp->dp_meta_objset; + zap_cursor_t *zc; + zap_attribute_t *za; + dsl_dataset_t *ds; + + ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); + ASSERT(dsl_pool_config_held(dp)); + ASSERT(dmu_tx_is_syncing(tx)); + + dsl_dir_zapify(dd, tx); + + /* + * If the filesystem count has already been initialized then we + * don't need to recurse down any further. + */ + if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) + return; + + zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + + /* Iterate my child dirs */ + for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); + zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { + dsl_dir_t *chld_dd; + uint64_t count; + + VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, + &chld_dd)); + + /* + * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and + * temporary datasets. + */ + if (chld_dd->dd_myname[0] == '$' || + chld_dd->dd_myname[0] == '%') { + dsl_dir_rele(chld_dd, FTAG); + continue; + } + + my_fs_cnt++; /* count this child */ + + dsl_dir_init_fs_ss_count(chld_dd, tx); + + VERIFY0(zap_lookup(os, chld_dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); + my_fs_cnt += count; + VERIFY0(zap_lookup(os, chld_dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); + my_ss_cnt += count; + + dsl_dir_rele(chld_dd, FTAG); + } + zap_cursor_fini(zc); + /* Count my snapshots (we counted children's snapshots above) */ + VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, + dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); + + for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + /* Don't count temporary snapshots */ + if (za->za_name[0] != '%') + my_ss_cnt++; + } + zap_cursor_fini(zc); + + dsl_dataset_rele(ds, FTAG); + + kmem_free(zc, sizeof (zap_cursor_t)); + kmem_free(za, sizeof (zap_attribute_t)); + + /* we're in a sync task, update counts */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, + sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, + sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); +} + +static int +dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) +{ + char *ddname = (char *)arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + dsl_dir_t *dd; + int error; + + error = dsl_dataset_hold(dp, ddname, FTAG, &ds); + if (error != 0) + return (error); + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + dd = ds->ds_dir; + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && + dsl_dir_is_zapified(dd) && + zap_contains(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT) == 0) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EALREADY)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) +{ + char *ddname = (char *)arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + spa_t *spa; + + VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); + + spa = dsl_dataset_get_spa(ds); + + if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { + /* + * Since the feature was not active and we're now setting a + * limit, increment the feature-active counter so that the + * feature becomes active for the first time. + * + * We are already in a sync task so we can update the MOS. + */ + spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); + } + + /* + * Since we are now setting a non-UINT64_MAX limit on the filesystem, + * we need to ensure the counts are correct. Descend down the tree from + * this point and update all of the counts to be accurate. + */ + dsl_dir_init_fs_ss_count(ds->ds_dir, tx); + + dsl_dataset_rele(ds, FTAG); +} + +/* + * Make sure the feature is enabled and activate it if necessary. + * Since we're setting a limit, ensure the on-disk counts are valid. + * This is only called by the ioctl path when setting a limit value. + * + * We do not need to validate the new limit, since users who can change the + * limit are also allowed to exceed the limit. + */ +int +dsl_dir_activate_fs_ss_limit(const char *ddname) +{ + int error; + + error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, + dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0); + + if (error == EALREADY) + error = 0; + + return (error); +} + +/* + * Used to determine if the filesystem_limit or snapshot_limit should be + * enforced. We allow the limit to be exceeded if the user has permission to + * write the property value. We pass in the creds that we got in the open + * context since we will always be the GZ root in syncing context. We also have + * to handle the case where we are allowed to change the limit on the current + * dataset, but there may be another limit in the tree above. + * + * We can never modify these two properties within a non-global zone. In + * addition, the other checks are modeled on zfs_secpolicy_write_perms. We + * can't use that function since we are already holding the dp_config_rwlock. + * In addition, we already have the dd and dealing with snapshots is simplified + * in this code. + */ + +typedef enum { + ENFORCE_ALWAYS, + ENFORCE_NEVER, + ENFORCE_ABOVE +} enforce_res_t; + +static enforce_res_t +dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) +{ + enforce_res_t enforce = ENFORCE_ALWAYS; + uint64_t obj; + dsl_dataset_t *ds; + uint64_t zoned; + + ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || + prop == ZFS_PROP_SNAPSHOT_LIMIT); + +#ifdef _KERNEL +#if 0 + if (crgetzoneid(cr) != GLOBAL_ZONEID) + return (ENFORCE_ALWAYS); +#endif + + if (secpolicy_zfs(cr) == 0) + return (ENFORCE_NEVER); +#endif + + if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) + return (ENFORCE_ALWAYS); + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + + if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) + return (ENFORCE_ALWAYS); + + if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) { + /* Only root can access zoned fs's from the GZ */ + enforce = ENFORCE_ALWAYS; + } else { + if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) + enforce = ENFORCE_ABOVE; + } + + dsl_dataset_rele(ds, FTAG); + return (enforce); +} + +/* + * Check if adding additional child filesystem(s) would exceed any filesystem + * limits or adding additional snapshot(s) would exceed any snapshot limits. + * The prop argument indicates which limit to check. + * + * Note that all filesystem limits up to the root (or the highest + * initialized) filesystem or the given ancestor must be satisfied. + */ +int +dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, + dsl_dir_t *ancestor, cred_t *cr) +{ + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t limit, count; + char *count_prop; + enforce_res_t enforce; + int err = 0; + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || + prop == ZFS_PROP_SNAPSHOT_LIMIT); + + /* + * If we're allowed to change the limit, don't enforce the limit + * e.g. this can happen if a snapshot is taken by an administrative + * user in the global zone (i.e. a recursive snapshot by root). + * However, we must handle the case of delegated permissions where we + * are allowed to change the limit on the current dataset, but there + * is another limit in the tree above. + */ + enforce = dsl_enforce_ds_ss_limits(dd, prop, cr); + if (enforce == ENFORCE_NEVER) + return (0); + + /* + * e.g. if renaming a dataset with no snapshots, count adjustment + * is 0. + */ + if (delta == 0) + return (0); + + if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { + /* + * We don't enforce the limit for temporary snapshots. This is + * indicated by a NULL cred_t argument. + */ + if (cr == NULL) + return (0); + + count_prop = DD_FIELD_SNAPSHOT_COUNT; + } else { + count_prop = DD_FIELD_FILESYSTEM_COUNT; + } + + /* + * If an ancestor has been provided, stop checking the limit once we + * hit that dir. We need this during rename so that we don't overcount + * the check once we recurse up to the common ancestor. + */ + if (ancestor == dd) + return (0); + + /* + * If we hit an uninitialized node while recursing up the tree, we can + * stop since we know there is no limit here (or above). The counts are + * not valid on this node and we know we won't touch this node's counts. + */ + if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object, + count_prop, sizeof (count), 1, &count) == ENOENT) + return (0); + + err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, + B_FALSE); + if (err != 0) + return (err); + + /* Is there a limit which we've hit? */ + if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) + return (SET_ERROR(EDQUOT)); + + if (dd->dd_parent != NULL) + err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, + ancestor, cr); + + return (err); +} + +/* + * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all + * parents. When a new filesystem/snapshot is created, increment the count on + * all parents, and when a filesystem/snapshot is destroyed, decrement the + * count. + */ +void +dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, + dmu_tx_t *tx) +{ + int err; + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t count; + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || + strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); + + /* + * When we receive an incremental stream into a filesystem that already + * exists, a temporary clone is created. We don't count this temporary + * clone, whose name begins with a '%'. We also ignore hidden ($FREE, + * $MOS & $ORIGIN) objsets. + */ + if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') && + strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) + return; + + /* + * e.g. if renaming a dataset with no snapshots, count adjustment is 0 + */ + if (delta == 0) + return; + + /* + * If we hit an uninitialized node while recursing up the tree, we can + * stop since we know the counts are not valid on this node and we + * know we shouldn't touch this node's counts. An uninitialized count + * on the node indicates that either the feature has not yet been + * activated or there are no limits on this part of the tree. + */ + if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, + prop, sizeof (count), 1, &count)) == ENOENT) + return; + VERIFY0(err); + + count += delta; + /* Use a signed verify to make sure we're not neg. */ + VERIFY3S(count, >=, 0); + + VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, + tx)); + + /* Roll up this additional count into our ancestors */ + if (dd->dd_parent != NULL) + dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); +} + uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx) @@ -396,7 +893,7 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); if (pds) { - VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, + VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, name, sizeof (uint64_t), 1, &ddobj, tx)); } else { /* it's the root dir */ @@ -408,8 +905,12 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, ddphys = dbuf->db_data; ddphys->dd_creation_time = gethrestime_sec(); - if (pds) + if (pds) { ddphys->dd_parent_obj = pds->dd_object; + + /* update the filesystem counts */ + dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); + } ddphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); ddphys->dd_child_dir_zapobj = zap_create(mos, @@ -424,9 +925,9 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, boolean_t dsl_dir_is_clone(dsl_dir_t *dd) { - return (dd->dd_phys->dd_origin_obj && + return (dsl_dir_phys(dd)->dd_origin_obj && (dd->dd_pool->dp_origin_snap == NULL || - dd->dd_phys->dd_origin_obj != + dsl_dir_phys(dd)->dd_origin_obj != dd->dd_pool->dp_origin_snap->ds_object)); } @@ -435,35 +936,52 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { mutex_enter(&dd->dd_lock); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, - dd->dd_phys->dd_used_bytes); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota); + dsl_dir_phys(dd)->dd_used_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, + dsl_dir_phys(dd)->dd_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, - dd->dd_phys->dd_reserved); + dsl_dir_phys(dd)->dd_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, - dd->dd_phys->dd_compressed_bytes == 0 ? 100 : - (dd->dd_phys->dd_uncompressed_bytes * 100 / - dd->dd_phys->dd_compressed_bytes)); + dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 : + (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / + dsl_dir_phys(dd)->dd_compressed_bytes)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, - dd->dd_phys->dd_uncompressed_bytes); - if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + dsl_dir_phys(dd)->dd_uncompressed_bytes); + if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, - dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]); + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, - dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]); + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, - dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]); + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, - dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] + - dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]); + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); } mutex_exit(&dd->dd_lock); + if (dsl_dir_is_zapified(dd)) { + uint64_t count; + objset_t *os = dd->dd_pool->dp_meta_objset; + + if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, + sizeof (count), 1, &count) == 0) { + dsl_prop_nvlist_add_uint64(nv, + ZFS_PROP_FILESYSTEM_COUNT, count); + } + if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, + sizeof (count), 1, &count) == 0) { + dsl_prop_nvlist_add_uint64(nv, + ZFS_PROP_SNAPSHOT_COUNT, count); + } + } + if (dsl_dir_is_clone(dd)) { dsl_dataset_t *ds; char buf[MAXNAMELEN]; VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, - dd->dd_phys->dd_origin_obj, FTAG, &ds)); + dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); @@ -475,7 +993,7 @@ dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; - ASSERT(dd->dd_phys); + ASSERT(dsl_dir_phys(dd)); if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { /* up the hold count until we can be written out */ @@ -486,8 +1004,9 @@ dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) static int64_t parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) { - uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved); - uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved); + uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved); + uint64_t new_accounted = + MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); return (new_accounted - old_accounted); } @@ -546,9 +1065,9 @@ dsl_dir_space_available(dsl_dir_t *dd, } mutex_enter(&dd->dd_lock); - if (dd->dd_phys->dd_quota != 0) - quota = dd->dd_phys->dd_quota; - used = dd->dd_phys->dd_used_bytes; + if (dsl_dir_phys(dd)->dd_quota != 0) + quota = dsl_dir_phys(dd)->dd_quota; + used = dsl_dir_phys(dd)->dd_used_bytes; if (!ondiskonly) used += dsl_dir_space_towrite(dd); @@ -557,12 +1076,12 @@ dsl_dir_space_available(dsl_dir_t *dd, quota = MIN(quota, poolsize); } - if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) { + if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { /* * We have some space reserved, in addition to what our * parent gave us. */ - parentspace += dd->dd_phys->dd_reserved - used; + parentspace += dsl_dir_phys(dd)->dd_reserved - used; } if (dd == ancestor) { @@ -621,7 +1140,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, est_inflight = dsl_dir_space_towrite(dd); for (i = 0; i < TXG_SIZE; i++) est_inflight += dd->dd_tempreserved[i]; - used_on_disk = dd->dd_phys->dd_used_bytes; + used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; /* * On the first iteration, fetch the dataset's used-on-disk and @@ -645,10 +1164,10 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, * If this transaction will result in a net free of space, * we want to let it through. */ - if (ignorequota || netfree || dd->dd_phys->dd_quota == 0) + if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) quota = UINT64_MAX; else - quota = dd->dd_phys->dd_quota; + quota = dsl_dir_phys(dd)->dd_quota; /* * Adjust the quota against the actual pool size at the root @@ -703,7 +1222,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, /* see if it's OK with our parent */ if (dd->dd_parent && parent_rsrv) { - boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0); + boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); return (dsl_dir_tempreserve_impl(dd->dd_parent, parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); @@ -828,7 +1347,7 @@ dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; est_used = dsl_dir_space_towrite(dd) + - dd->dd_phys->dd_used_bytes; + dsl_dir_phys(dd)->dd_used_bytes; parent_space = parent_delta(dd, est_used, space); mutex_exit(&dd->dd_lock); @@ -863,27 +1382,28 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, if (needlock) mutex_enter(&dd->dd_lock); - accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used); - ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used); + accounted_delta = + parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); + ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); ASSERT(compressed >= 0 || - dd->dd_phys->dd_compressed_bytes >= -compressed); + dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || - dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); - dd->dd_phys->dd_used_bytes += used; - dd->dd_phys->dd_uncompressed_bytes += uncompressed; - dd->dd_phys->dd_compressed_bytes += compressed; + dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); + dsl_dir_phys(dd)->dd_used_bytes += used; + dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; + dsl_dir_phys(dd)->dd_compressed_bytes += compressed; - if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { ASSERT(used > 0 || - dd->dd_phys->dd_used_breakdown[type] >= -used); - dd->dd_phys->dd_used_breakdown[type] += used; + dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); + dsl_dir_phys(dd)->dd_used_breakdown[type] += used; #ifdef DEBUG { dd_used_t t; uint64_t u = 0; for (t = 0; t < DD_USED_NUM; t++) - u += dd->dd_phys->dd_used_breakdown[t]; - ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes); + u += dsl_dir_phys(dd)->dd_used_breakdown[t]; + ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); } #endif } @@ -907,17 +1427,18 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, ASSERT(oldtype < DD_USED_NUM); ASSERT(newtype < DD_USED_NUM); - if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN)) + if (delta == 0 || + !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) return; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); ASSERT(delta > 0 ? - dd->dd_phys->dd_used_breakdown[oldtype] >= delta : - dd->dd_phys->dd_used_breakdown[newtype] >= -delta); - ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta)); - dd->dd_phys->dd_used_breakdown[oldtype] -= delta; - dd->dd_phys->dd_used_breakdown[newtype] += delta; + dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : + dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); + ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); + dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; + dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; mutex_exit(&dd->dd_lock); } @@ -961,8 +1482,8 @@ dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) */ towrite = dsl_dir_space_towrite(ds->ds_dir); if ((dmu_tx_is_syncing(tx) || towrite == 0) && - (newval < ds->ds_dir->dd_phys->dd_reserved || - newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) { + (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || + newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { error = SET_ERROR(ENOSPC); } mutex_exit(&ds->ds_dir->dd_lock); @@ -995,7 +1516,7 @@ dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); - ds->ds_dir->dd_phys->dd_quota = newval; + dsl_dir_phys(ds->ds_dir)->dd_quota = newval; mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); } @@ -1046,7 +1567,7 @@ dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) } mutex_enter(&dd->dd_lock); - used = dd->dd_phys->dd_used_bytes; + used = dsl_dir_phys(dd)->dd_used_bytes; mutex_exit(&dd->dd_lock); if (dd->dd_parent) { @@ -1056,13 +1577,13 @@ dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; } - if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) { + if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { uint64_t delta = MAX(used, newval) - - MAX(used, dd->dd_phys->dd_reserved); + MAX(used, dsl_dir_phys(dd)->dd_reserved); if (delta > avail || - (dd->dd_phys->dd_quota > 0 && - newval > dd->dd_phys->dd_quota)) + (dsl_dir_phys(dd)->dd_quota > 0 && + newval > dsl_dir_phys(dd)->dd_quota)) error = SET_ERROR(ENOSPC); } @@ -1079,9 +1600,9 @@ dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); - used = dd->dd_phys->dd_used_bytes; - delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved); - dd->dd_phys->dd_reserved = value; + used = dsl_dir_phys(dd)->dd_used_bytes; + delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); + dsl_dir_phys(dd)->dd_reserved = value; if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ @@ -1158,7 +1679,7 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) return (delta); mutex_enter(&dd->dd_lock); - delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta); + delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); mutex_exit(&dd->dd_lock); return (would_change(dd->dd_parent, delta, ancestor)); } @@ -1166,6 +1687,7 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) typedef struct dsl_dir_rename_arg { const char *ddra_oldname; const char *ddra_newname; + cred_t *ddra_cred; } dsl_dir_rename_arg_t; /* ARGSUSED */ @@ -1230,10 +1752,51 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx) } } + if (dmu_tx_is_syncing(tx)) { + if (spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_FS_SS_LIMIT)) { + /* + * Although this is the check function and we don't + * normally make on-disk changes in check functions, + * we need to do that here. + * + * Ensure this portion of the tree's counts have been + * initialized in case the new parent has limits set. + */ + dsl_dir_init_fs_ss_count(dd, tx); + } + } + if (newparent != dd->dd_parent) { /* is there enough space? */ uint64_t myspace = - MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved); + MAX(dsl_dir_phys(dd)->dd_used_bytes, + dsl_dir_phys(dd)->dd_reserved); + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t fs_cnt = 0; + uint64_t ss_cnt = 0; + + if (dsl_dir_is_zapified(dd)) { + int err; + + err = zap_lookup(os, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, + &fs_cnt); + if (err != ENOENT && err != 0) + return (err); + + /* + * have to add 1 for the filesystem itself that we're + * moving + */ + fs_cnt++; + + err = zap_lookup(os, dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, + &ss_cnt); + if (err != ENOENT && err != 0) + return (err); + } /* no rename into our descendant */ if (closest_common_ancestor(dd, newparent) == dd) { @@ -1243,7 +1806,7 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx) } error = dsl_dir_transfer_possible(dd->dd_parent, - newparent, myspace); + newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); @@ -1275,18 +1838,50 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) "-> %s", ddra->ddra_newname); if (newparent != dd->dd_parent) { + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t fs_cnt = 0; + uint64_t ss_cnt = 0; + + /* + * We already made sure the dd counts were initialized in the + * check function. + */ + if (spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_FS_SS_LIMIT)) { + VERIFY0(zap_lookup(os, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, + &fs_cnt)); + /* add 1 for the filesystem itself that we're moving */ + fs_cnt++; + + VERIFY0(zap_lookup(os, dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, + &ss_cnt)); + } + + dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, + DD_FIELD_FILESYSTEM_COUNT, tx); + dsl_fs_ss_count_adjust(newparent, fs_cnt, + DD_FIELD_FILESYSTEM_COUNT, tx); + + dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, + DD_FIELD_SNAPSHOT_COUNT, tx); + dsl_fs_ss_count_adjust(newparent, ss_cnt, + DD_FIELD_SNAPSHOT_COUNT, tx); + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, - -dd->dd_phys->dd_used_bytes, - -dd->dd_phys->dd_compressed_bytes, - -dd->dd_phys->dd_uncompressed_bytes, tx); + -dsl_dir_phys(dd)->dd_used_bytes, + -dsl_dir_phys(dd)->dd_compressed_bytes, + -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD, - dd->dd_phys->dd_used_bytes, - dd->dd_phys->dd_compressed_bytes, - dd->dd_phys->dd_uncompressed_bytes, tx); + dsl_dir_phys(dd)->dd_used_bytes, + dsl_dir_phys(dd)->dd_compressed_bytes, + dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); - if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) { - uint64_t unused_rsrv = dd->dd_phys->dd_reserved - - dd->dd_phys->dd_used_bytes; + if (dsl_dir_phys(dd)->dd_reserved > + dsl_dir_phys(dd)->dd_used_bytes) { + uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - + dsl_dir_phys(dd)->dd_used_bytes; dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, -unused_rsrv, 0, 0, tx); @@ -1298,18 +1893,19 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) dmu_buf_will_dirty(dd->dd_dbuf, tx); /* remove from old parent zapobj */ - error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, + error = zap_remove(mos, + dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, dd->dd_myname, tx); ASSERT0(error); (void) strcpy(dd->dd_myname, mynewname); dsl_dir_rele(dd->dd_parent, dd); - dd->dd_phys->dd_parent_obj = newparent->dd_object; + dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; VERIFY0(dsl_dir_hold_obj(dp, newparent->dd_object, NULL, dd, &dd->dd_parent)); /* add to new parent zapobj */ - VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj, + VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, dd->dd_myname, 8, 1, &dd->dd_object, tx)); #ifdef _KERNEL @@ -1329,17 +1925,20 @@ dsl_dir_rename(const char *oldname, const char *newname) ddra.ddra_oldname = oldname; ddra.ddra_newname = newname; + ddra.ddra_cred = CRED(); return (dsl_sync_task(oldname, dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3)); } int -dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) +dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, + uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr) { dsl_dir_t *ancestor; int64_t adelta; uint64_t avail; + int err; ancestor = closest_common_ancestor(sdd, tdd); adelta = would_change(sdd, -space, ancestor); @@ -1347,6 +1946,15 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) if (avail < space) return (SET_ERROR(ENOSPC)); + err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, + ancestor, cr); + if (err != 0) + return (err); + err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, + ancestor, cr); + if (err != 0) + return (err); + return (0); } @@ -1380,6 +1988,15 @@ dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); } +boolean_t +dsl_dir_is_zapified(dsl_dir_t *dd) +{ + dmu_object_info_t doi; + + dmu_object_info_from_db(dd->dd_dbuf, &doi); + return (doi.doi_type == DMU_OTN_ZAP_METADATA); +} + #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index b54c03bc33fe..717dfbe9f6a0 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -137,7 +138,7 @@ dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) int err; err = zap_lookup(dp->dp_meta_objset, - dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, + dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj, name, sizeof (obj), 1, &obj); if (err) return (err); @@ -219,11 +220,11 @@ dsl_pool_open(dsl_pool_t *dp) err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); if (err) goto out; - err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, - FTAG, &ds); + err = dsl_dataset_hold_obj(dp, + dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds); if (err == 0) { err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, &dp->dp_origin_snap); dsl_dataset_rele(ds, FTAG); } @@ -319,6 +320,8 @@ dsl_pool_close(dsl_pool_t *dp) arc_flush(dp->dp_spa); txg_fini(dp); dsl_scan_fini(dp); + dmu_buf_user_evict_wait(); + rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); taskq_destroy(dp->dp_iput_taskq); @@ -683,15 +686,15 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) if (err) return (err); - while (ds->ds_phys->ds_prev_snap_obj != 0) { - err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, - FTAG, &prev); + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) + if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) break; dsl_dataset_rele(ds, FTAG); ds = prev; @@ -705,7 +708,7 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) * The $ORIGIN can't have any data, or the accounting * will be wrong. */ - ASSERT0(prev->ds_phys->ds_bp.blk_birth); + ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth); /* The origin doesn't get attached to itself */ if (ds->ds_object == prev->ds_object) { @@ -714,33 +717,35 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) } dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_prev_snap_obj = prev->ds_object; - ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg; + dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object; + dsl_dataset_phys(ds)->ds_prev_snap_txg = + dsl_dataset_phys(prev)->ds_creation_txg; dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); - ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object; + dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object; dmu_buf_will_dirty(prev->ds_dbuf, tx); - prev->ds_phys->ds_num_children++; + dsl_dataset_phys(prev)->ds_num_children++; - if (ds->ds_phys->ds_next_snap_obj == 0) { + if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) { ASSERT(ds->ds_prev == NULL); VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); + dsl_dataset_phys(ds)->ds_prev_snap_obj, + ds, &ds->ds_prev)); } } - ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object); - ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object); + ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object); + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object); - if (prev->ds_phys->ds_next_clones_obj == 0) { + if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) { dmu_buf_will_dirty(prev->ds_dbuf, tx); - prev->ds_phys->ds_next_clones_obj = + dsl_dataset_phys(prev)->ds_next_clones_obj = zap_create(dp->dp_meta_objset, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(dp->dp_meta_objset, - prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); + dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx)); dsl_dataset_rele(ds, FTAG); if (prev != dp->dp_origin_snap) @@ -765,20 +770,22 @@ upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) dmu_tx_t *tx = arg; objset_t *mos = dp->dp_meta_objset; - if (ds->ds_dir->dd_phys->dd_origin_obj != 0) { + if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) { dsl_dataset_t *origin; VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin)); + dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin)); - if (origin->ds_dir->dd_phys->dd_clones == 0) { + if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); - origin->ds_dir->dd_phys->dd_clones = zap_create(mos, - DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); + dsl_dir_phys(origin->ds_dir)->dd_clones = + zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, + 0, tx); } VERIFY0(zap_add_int(dp->dp_meta_objset, - origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx)); + dsl_dir_phys(origin->ds_dir)->dd_clones, + ds->ds_object, tx)); dsl_dataset_rele(origin, FTAG); } @@ -826,7 +833,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) NULL, 0, kcred, tx); VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); - VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, &dp->dp_origin_snap)); dsl_dataset_rele(ds, FTAG); } diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c index d71247326926..4c0e579907c8 100644 --- a/module/zfs/dsl_prop.c +++ b/module/zfs/dsl_prop.c @@ -105,8 +105,8 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, } /* Check for a local value. */ - err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, - intsz, numints, buf); + err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, + propname, intsz, numints, buf); if (err != ENOENT) { if (setpoint != NULL && err == 0) dsl_dir_name(dd, setpoint); @@ -117,14 +117,14 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, * Skip the check for a received value if there is an explicit * inheritance entry. */ - err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, + err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, inheritstr); if (err != 0 && err != ENOENT) break; if (err == ENOENT) { /* Check for a received value. */ - err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, + err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, recvdstr, intsz, numints, buf); if (err != ENOENT) { if (setpoint != NULL && err == 0) { @@ -163,19 +163,17 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, { zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t inheritable; - boolean_t snapshot; uint64_t zapobj; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); - snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)); - zapobj = (ds->ds_phys == NULL ? 0 : ds->ds_phys->ds_props_obj); + zapobj = dsl_dataset_phys(ds)->ds_props_obj; if (zapobj != 0) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; int err; - ASSERT(snapshot); + ASSERT(ds->ds_is_snapshot); /* Check for a local value. */ err = zap_lookup(mos, zapobj, propname, intsz, numints, buf); @@ -215,7 +213,7 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, } return (dsl_prop_get_dd(ds->ds_dir, propname, - intsz, numints, buf, setpoint, snapshot)); + intsz, numints, buf, setpoint, ds->ds_is_snapshot)); } /* @@ -327,7 +325,7 @@ dsl_prop_predict(dsl_dir_t *dd, const char *propname, } mos = dd->dd_pool->dp_meta_objset; - zapobj = dd->dd_phys->dd_props_zapobj; + zapobj = dsl_dir_phys(dd)->dd_props_zapobj; recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); version = spa_version(dd->dd_pool->dp_spa); @@ -486,7 +484,8 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, * If the prop is set here, then this change is not * being inherited here or below; stop the recursion. */ - err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname); + err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, + propname); if (err == 0) { dsl_dir_rele(dd, FTAG); return; @@ -497,7 +496,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, mutex_enter(&dd->dd_lock); for (cbr = list_head(&dd->dd_prop_cbs); cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { - uint64_t propobj = cbr->cbr_ds->ds_phys->ds_props_obj; + uint64_t propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj; if (strcmp(cbr->cbr_propname, propname) != 0) continue; @@ -515,7 +514,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); for (zap_cursor_init(&zc, mos, - dd->dd_phys->dd_child_dir_zapobj); + dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { dsl_prop_changed_notify(dp, za->za_first_integer, @@ -544,17 +543,17 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, isint = (dodefault(propname, 8, 1, &intval) == 0); - if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { ASSERT(version >= SPA_VERSION_SNAP_PROPS); - if (ds->ds_phys->ds_props_obj == 0) { + if (dsl_dataset_phys(ds)->ds_props_obj == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_props_obj = + dsl_dataset_phys(ds)->ds_props_obj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); } - zapobj = ds->ds_phys->ds_props_obj; + zapobj = dsl_dataset_phys(ds)->ds_props_obj; } else { - zapobj = ds->ds_dir->dd_phys->dd_props_zapobj; + zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj; } if (version < SPA_VERSION_RECVD_PROPS) { @@ -641,7 +640,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, if (isint) { VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); - if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_prop_cb_record_t *cbr; /* * It's a snapshot; nothing can inherit this @@ -759,7 +758,7 @@ dsl_props_set_check(void *arg, dmu_tx_t *tx) } } - if (dsl_dataset_is_snapshot(ds) && version < SPA_VERSION_SNAP_PROPS) { + if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOTSUP)); } @@ -982,16 +981,16 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (dsl_dataset_is_snapshot(ds)) + if (ds->ds_is_snapshot) flags |= DSL_PROP_GET_SNAPSHOT; ASSERT(dsl_pool_config_held(dp)); - if (ds->ds_phys->ds_props_obj != 0) { + if (dsl_dataset_phys(ds)->ds_props_obj != 0) { ASSERT(flags & DSL_PROP_GET_SNAPSHOT); dsl_dataset_name(ds, setpoint); - err = dsl_prop_get_all_impl(mos, ds->ds_phys->ds_props_obj, - setpoint, flags, *nvp); + err = dsl_prop_get_all_impl(mos, + dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp); if (err) goto out; } @@ -1004,8 +1003,8 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, flags |= DSL_PROP_GET_INHERITING; } dsl_dir_name(dd, setpoint); - err = dsl_prop_get_all_impl(mos, dd->dd_phys->dd_props_zapobj, - setpoint, flags, *nvp); + err = dsl_prop_get_all_impl(mos, + dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp); if (err) break; } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 8b166bcc68eb..5f70b18719c8 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -415,8 +415,8 @@ static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; - if (dsl_dataset_is_snapshot(ds)) - return (MIN(smt, ds->ds_phys->ds_creation_txg)); + if (ds->ds_is_snapshot) + return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); return (smt); } @@ -846,14 +846,15 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) return; if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { /* Note, scn_cur_{min,max}_txg stays the same. */ scn->scn_phys.scn_bookmark.zb_objset = - ds->ds_phys->ds_next_snap_obj; + dsl_dataset_phys(ds)->ds_next_snap_obj; zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, - (u_longlong_t)ds->ds_phys->ds_next_snap_obj); + (u_longlong_t)dsl_dataset_phys(ds)-> + ds_next_snap_obj); scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; } else { SET_BOOKMARK(&scn->scn_phys.scn_bookmark, @@ -864,10 +865,10 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) } } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { - ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { /* * We keep the same mintxg; it could be > * ds_creation_txg if the previous snapshot was @@ -875,11 +876,13 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) */ VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0); + dsl_dataset_phys(ds)->ds_next_snap_obj, + mintxg, tx) == 0); zfs_dbgmsg("destroying ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, - (u_longlong_t)ds->ds_phys->ds_next_snap_obj); + (u_longlong_t)dsl_dataset_phys(ds)-> + ds_next_snap_obj); } else { zfs_dbgmsg("destroying ds %llu; in queue; removing", (u_longlong_t)ds->ds_object); @@ -906,26 +909,26 @@ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) if (scn->scn_phys.scn_state != DSS_SCANNING) return; - ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); + ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { scn->scn_phys.scn_bookmark.zb_objset = - ds->ds_phys->ds_prev_snap_obj; + dsl_dataset_phys(ds)->ds_prev_snap_obj; zfs_dbgmsg("snapshotting ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, - (u_longlong_t)ds->ds_phys->ds_prev_snap_obj); + (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0); + dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("snapshotting ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, - (u_longlong_t)ds->ds_phys->ds_prev_snap_obj); + (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } dsl_scan_sync_state(scn, tx); } @@ -958,8 +961,8 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) ds1->ds_object, &mintxg) == 0) { int err; - ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg); - ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); err = zap_add_int_key(dp->dp_meta_objset, @@ -977,8 +980,8 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) (u_longlong_t)ds2->ds_object); } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { - ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg); - ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, @@ -1006,17 +1009,17 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) int err; dsl_scan_t *scn = dp->dp_scan; - if (hds->ds_dir->dd_phys->dd_origin_obj != eca->originobj) + if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj) return (0); err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); - while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); dsl_dataset_rele(ds, FTAG); if (err) @@ -1025,7 +1028,7 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) } VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, - ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); + dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0); dsl_dataset_rele(ds, FTAG); return (0); } @@ -1050,14 +1053,14 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) * ZIL here, rather than in scan_recurse(), because the regular * snapshot block-sharing rules don't apply to it. */ - if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds)) + if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot) dsl_scan_zil(dp, &os->os_zil_header); /* * Iterate over the bps in this ds. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx); + dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP); dsl_dataset_name(ds, dsname); @@ -1091,14 +1094,15 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) /* * Add descendent datasets to work queue. */ - if (ds->ds_phys->ds_next_snap_obj != 0) { + if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj, - ds->ds_phys->ds_creation_txg, tx) == 0); + scn->scn_phys.scn_queue_obj, + dsl_dataset_phys(ds)->ds_next_snap_obj, + dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0); } - if (ds->ds_phys->ds_num_children > 1) { + if (dsl_dataset_phys(ds)->ds_num_children > 1) { boolean_t usenext = B_FALSE; - if (ds->ds_phys->ds_next_clones_obj != 0) { + if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { uint64_t count; /* * A bug in a previous version of the code could @@ -1108,17 +1112,17 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) * next_clones_obj when its count is correct. */ int err = zap_count(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, &count); + dsl_dataset_phys(ds)->ds_next_clones_obj, &count); if (err == 0 && - count == ds->ds_phys->ds_num_children - 1) + count == dsl_dataset_phys(ds)->ds_num_children - 1) usenext = B_TRUE; } if (usenext) { VERIFY0(zap_join_key(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, + dsl_dataset_phys(ds)->ds_next_clones_obj, scn->scn_phys.scn_queue_obj, - ds->ds_phys->ds_creation_txg, tx)); + dsl_dataset_phys(ds)->ds_creation_txg, tx)); } else { struct enqueue_clones_arg eca; eca.tx = tx; @@ -1146,10 +1150,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) if (err) return (err); - while (ds->ds_phys->ds_prev_snap_obj != 0) { + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, - FTAG, &prev); + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return (err); @@ -1158,7 +1162,7 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) /* * If this is a clone, we don't need to worry about it for now. */ - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { + if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(prev, FTAG); return (0); @@ -1168,7 +1172,7 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) } VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0); + ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0); dsl_dataset_rele(ds, FTAG); return (0); } @@ -1348,7 +1352,7 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) } else { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, - ds->ds_phys->ds_prev_snap_txg); + dsl_dataset_phys(ds)->ds_prev_snap_txg); } scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); dsl_dataset_rele(ds, FTAG); @@ -1534,9 +1538,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (err != 0) return; if (!scn->scn_async_destroying && zfs_free_leak_on_eio && - (dp->dp_free_dir->dd_phys->dd_used_bytes != 0 || - dp->dp_free_dir->dd_phys->dd_compressed_bytes != 0 || - dp->dp_free_dir->dd_phys->dd_uncompressed_bytes != 0)) { + (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || + dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || + dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { /* * We have finished background destroying, but there is still * some space left in the dp_free_dir. Transfer this leaked @@ -1551,19 +1555,19 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) rrw_exit(&dp->dp_config_rwlock, FTAG); } dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, - dp->dp_free_dir->dd_phys->dd_used_bytes, - dp->dp_free_dir->dd_phys->dd_compressed_bytes, - dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx); + dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, + dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, + dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, - -dp->dp_free_dir->dd_phys->dd_used_bytes, - -dp->dp_free_dir->dd_phys->dd_compressed_bytes, - -dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx); + -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, + -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, + -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); } if (!scn->scn_async_destroying) { /* finished; verify that space accounting went to zero */ - ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes); - ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes); - ASSERT0(dp->dp_free_dir->dd_phys->dd_uncompressed_bytes); + ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); + ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); + ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); } if (scn->scn_phys.scn_state != DSS_SCANNING) diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c index 5f345f498a84..8b11cd957154 100644 --- a/module/zfs/dsl_synctask.c +++ b/module/zfs/dsl_synctask.c @@ -152,7 +152,7 @@ dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx) */ quota = dsl_pool_adjustedsize(dp, B_FALSE) - metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); - used = dp->dp_root_dir->dd_phys->dd_used_bytes; + used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; /* MOS space is triple-dittoed, so we multiply by 3. */ if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) { dst->dst_error = SET_ERROR(ENOSPC); diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c index 1d6c9df8979b..4007d4c5ea35 100644 --- a/module/zfs/dsl_userhold.c +++ b/module/zfs/dsl_userhold.c @@ -64,10 +64,10 @@ dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag, return (SET_ERROR(E2BIG)); /* tags must be unique (if ds already exists) */ - if (ds != NULL && ds->ds_phys->ds_userrefs_obj != 0) { + if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { uint64_t value; - error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, + error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, htag, 8, 1, &value); if (error == 0) error = SET_ERROR(EEXIST); @@ -141,16 +141,16 @@ dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds, ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - if (ds->ds_phys->ds_userrefs_obj == 0) { + if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) { /* * This is the first user hold for this dataset. Create * the userrefs zap object. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); - zapobj = ds->ds_phys->ds_userrefs_obj = + zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); } else { - zapobj = ds->ds_phys->ds_userrefs_obj; + zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj; } ds->ds_userrefs++; @@ -354,7 +354,7 @@ dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura, objset_t *mos; int numholds; - if (!dsl_dataset_is_snapshot(ds)) + if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); if (nvlist_empty(holds)) @@ -362,7 +362,7 @@ dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura, numholds = 0; mos = ds->ds_dir->dd_pool->dp_meta_objset; - zapobj = ds->ds_phys->ds_userrefs_obj; + zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj; VERIFY0(nvlist_alloc(&holds_found, NV_UNIQUE_NAME, KM_SLEEP)); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; @@ -400,7 +400,8 @@ dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura, numholds++; } - if (DS_IS_DEFER_DESTROY(ds) && ds->ds_phys->ds_num_children == 1 && + if (DS_IS_DEFER_DESTROY(ds) && + dsl_dataset_phys(ds)->ds_num_children == 1 && ds->ds_userrefs == numholds) { /* we need to destroy the snapshot as well */ if (dsl_dataset_long_held(ds)) { @@ -488,8 +489,8 @@ dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds, error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx); VERIFY(error == 0 || error == ENOENT); - VERIFY0(zap_remove(mos, ds->ds_phys->ds_userrefs_obj, holdname, - tx)); + VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, + holdname, tx)); ds->ds_userrefs--; spa_history_log_internal_ds(ds, "release", tx, @@ -519,7 +520,7 @@ dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx) fnvpair_value_nvlist(pair), tx); if (nvlist_exists(ddura->ddura_todelete, name)) { ASSERT(ds->ds_userrefs == 0 && - ds->ds_phys->ds_num_children == 1 && + dsl_dataset_phys(ds)->ds_num_children == 1 && DS_IS_DEFER_DESTROY(ds)); dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx); } @@ -651,13 +652,13 @@ dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl) return (err); } - if (ds->ds_phys->ds_userrefs_obj != 0) { + if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { zap_attribute_t *za; zap_cursor_t zc; za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_userrefs_obj); + dsl_dataset_phys(ds)->ds_userrefs_obj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { fnvlist_add_uint64(nvl, za->za_name, diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 9063d1dae449..8e857e12930a 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -209,6 +210,7 @@ sa_cache_constructor(void *buf, void *unused, int kmflag) { sa_handle_t *hdl = buf; + hdl->sa_dbu.dbu_evict_func = NULL; hdl->sa_bonus_tab = NULL; hdl->sa_spill_tab = NULL; hdl->sa_os = NULL; @@ -224,6 +226,7 @@ static void sa_cache_destructor(void *buf, void *unused) { sa_handle_t *hdl = buf; + hdl->sa_dbu.dbu_evict_func = NULL; mutex_destroy(&hdl->sa_lock); } @@ -1112,6 +1115,9 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, if (sa->sa_user_table) kmem_free(sa->sa_user_table, sa->sa_user_table_sz); mutex_exit(&sa->sa_lock); + avl_destroy(&sa->sa_layout_hash_tree); + avl_destroy(&sa->sa_layout_num_tree); + mutex_destroy(&sa->sa_lock); kmem_free(sa, sizeof (sa_os_t)); return ((error == ECKSUM) ? EIO : error); } @@ -1148,6 +1154,7 @@ sa_tear_down(objset_t *os) avl_destroy(&sa->sa_layout_hash_tree); avl_destroy(&sa->sa_layout_num_tree); + mutex_destroy(&sa->sa_lock); kmem_free(sa, sizeof (sa_os_t)); os->os_sa = NULL; @@ -1302,10 +1309,10 @@ sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) } /*ARGSUSED*/ -void -sa_evict(dmu_buf_t *db, void *sap) +static void +sa_evict(void *dbu) { - panic("evicting sa dbuf %p\n", (void *)db); + panic("evicting sa dbuf\n"); } static void @@ -1357,9 +1364,10 @@ sa_spill_rele(sa_handle_t *hdl) void sa_handle_destroy(sa_handle_t *hdl) { + dmu_buf_t *db = hdl->sa_bonus; + mutex_enter(&hdl->sa_lock); - (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl, - NULL, NULL, NULL); + (void) dmu_buf_remove_user(db, &hdl->sa_dbu); if (hdl->sa_bonus_tab) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); @@ -1384,7 +1392,7 @@ sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { int error = 0; - sa_handle_t *handle; + sa_handle_t *handle = NULL; #ifdef ZFS_DEBUG dmu_object_info_t doi; @@ -1395,9 +1403,12 @@ sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, /* find handle, if it exists */ /* if one doesn't exist then create a new one, and initialize it */ - handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL; + if (hdl_type == SA_HDL_SHARED) + handle = dmu_buf_get_user(db); + if (handle == NULL) { - sa_handle_t *newhandle; + sa_handle_t *winner = NULL; + handle = kmem_cache_alloc(sa_cache, KM_SLEEP); handle->sa_userp = userp; handle->sa_bonus = db; @@ -1405,13 +1416,15 @@ sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, handle->sa_spill = NULL; error = sa_build_index(handle, SA_BONUS); - newhandle = (hdl_type == SA_HDL_SHARED) ? - dmu_buf_set_user_ie(db, handle, - NULL, sa_evict) : NULL; - if (newhandle != NULL) { + if (hdl_type == SA_HDL_SHARED) { + dmu_buf_init_user(&handle->sa_dbu, sa_evict, NULL); + winner = dmu_buf_set_user_ie(db, &handle->sa_dbu); + } + + if (winner != NULL) { kmem_cache_free(sa_cache, handle); - handle = newhandle; + handle = winner; } } *handlepp = handle; @@ -1943,14 +1956,6 @@ sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) blksize, nblocks); } -void -sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl) -{ - (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus, - oldhdl, newhdl, NULL, sa_evict); - oldhdl->sa_bonus = NULL; -} - void sa_set_userp(sa_handle_t *hdl, void *ptr) { @@ -2049,7 +2054,6 @@ EXPORT_SYMBOL(sa_size); EXPORT_SYMBOL(sa_update_from_cb); EXPORT_SYMBOL(sa_object_info); EXPORT_SYMBOL(sa_object_size); -EXPORT_SYMBOL(sa_update_user); EXPORT_SYMBOL(sa_get_userdata); EXPORT_SYMBOL(sa_set_userp); EXPORT_SYMBOL(sa_get_db); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 8bee15094685..2f9c241d89c2 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* @@ -237,7 +238,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) */ if (pool->dp_free_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, - pool->dp_free_dir->dd_phys->dd_used_bytes, src); + dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, + src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 0, src); @@ -245,7 +247,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) if (pool->dp_leak_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, - pool->dp_leak_dir->dd_phys->dd_used_bytes, src); + dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, + src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 0, src); @@ -1093,6 +1096,8 @@ spa_activate(spa_t *spa, int mode) list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); + list_create(&spa->spa_evicting_os_list, sizeof (objset_t), + offsetof(objset_t, os_evicting_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); @@ -1121,9 +1126,12 @@ spa_deactivate(spa_t *spa) ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); + spa_evicting_os_wait(spa); + txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); + list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); @@ -2134,6 +2142,11 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, mosconfig, &ereport); } + /* + * Don't count references from objsets that are already closed + * and are making their way through the eviction process. + */ + spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { @@ -3768,6 +3781,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_history_log_version(spa, "create"); + /* + * Don't count references from objsets that are already closed + * and are making their way through the eviction process. + */ + spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); mutex_exit(&spa_namespace_lock); @@ -4309,6 +4327,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, */ if (spa->spa_sync_on) txg_wait_synced(spa->spa_dsl_pool, 0); + spa_evicting_os_wait(spa); /* * A pool cannot be exported or destroyed if there are active diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index 14e681e77d8b..8e0033d949da 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -520,7 +520,7 @@ spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, dsl_dir_name(dd, namebuf); fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, - dd->dd_phys->dd_head_dataset_obj); + dsl_dir_phys(dd)->dd_head_dataset_obj); va_start(adx, fmt); log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index ec0c0195947c..050bc52fefa2 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -525,6 +526,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); @@ -533,6 +535,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); @@ -621,6 +624,7 @@ spa_remove(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0); nvlist_free(spa->spa_config_splitting); @@ -655,6 +659,7 @@ spa_remove(spa_t *spa) bplist_destroy(&spa->spa_free_bplist[t]); cv_destroy(&spa->spa_async_cv); + cv_destroy(&spa->spa_evicting_os_cv); cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); @@ -662,6 +667,7 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); + mutex_destroy(&spa->spa_evicting_os_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); @@ -717,6 +723,20 @@ spa_close(spa_t *spa, void *tag) (void) refcount_remove(&spa->spa_refcount, tag); } +/* + * Remove a reference to the given spa_t held by a dsl dir that is + * being asynchronously released. Async releases occur from a taskq + * performing eviction of dsl datasets and dirs. The namespace lock + * isn't held and the hold by the object being evicted may contribute to + * spa_minref (e.g. dataset or directory released during pool export), + * so the asserts in spa_close() do not apply. + */ +void +spa_async_close(spa_t *spa, void *tag) +{ + (void) refcount_remove(&spa->spa_refcount, tag); +} + /* * Check to see if the spa refcount is zero. Must be called with * spa_namespace_lock held. We really compare against spa_minref, which is the @@ -1600,6 +1620,34 @@ spa_log_class(spa_t *spa) return (spa->spa_log_class); } +void +spa_evicting_os_register(spa_t *spa, objset_t *os) +{ + mutex_enter(&spa->spa_evicting_os_lock); + list_insert_head(&spa->spa_evicting_os_list, os); + mutex_exit(&spa->spa_evicting_os_lock); +} + +void +spa_evicting_os_deregister(spa_t *spa, objset_t *os) +{ + mutex_enter(&spa->spa_evicting_os_lock); + list_remove(&spa->spa_evicting_os_list, os); + cv_broadcast(&spa->spa_evicting_os_cv); + mutex_exit(&spa->spa_evicting_os_lock); +} + +void +spa_evicting_os_wait(spa_t *spa) +{ + mutex_enter(&spa->spa_evicting_os_lock); + while (!list_is_empty(&spa->spa_evicting_os_list)) + cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); + mutex_exit(&spa->spa_evicting_os_lock); + + dmu_buf_user_evict_wait(); +} + int spa_max_replication(spa_t *spa) { diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 5ffa138a6b4b..fa2dfc7d5a33 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* @@ -50,9 +51,9 @@ int fzap_default_block_shift = 14; /* 16k blocksize */ -static void zap_leaf_pageout(dmu_buf_t *db, void *vl); -static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); +extern inline zap_phys_t *zap_f_phys(zap_t *zap); +static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); void fzap_byteswap(void *vbuf, size_t size) @@ -80,13 +81,12 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); zap->zap_ismicro = FALSE; - (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, - &zap->zap_f.zap_phys, zap_evict); + zap->zap_dbu.dbu_evict_func = zap_evict; mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; - zp = zap->zap_f.zap_phys; + zp = zap_f_phys(zap); /* * explicitly zero it since it might be coming from an * initialized microzap @@ -117,7 +117,6 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); l->l_dbuf = db; - l->l_phys = db->db_data; zap_leaf_init(l, zp->zap_normflags != 0); @@ -325,10 +324,10 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) * If we are within 2 bits of running out, stop growing, since * this is already an aberrant condition. */ - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) + if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) return (SET_ERROR(ENOSPC)); - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { /* * We are outgrowing the "embedded" ptrtbl (the one * stored in the header block). Give it its own entire @@ -338,9 +337,9 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) dmu_buf_t *db_new; int err; - ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, + ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - ASSERT0(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk); + ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); newblk = zap_allocate_blocks(zap, 1); err = dmu_buf_hold(zap->zap_objset, zap->zap_object, @@ -353,17 +352,17 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); dmu_buf_rele(db_new, FTAG); - zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk; - zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1; - zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++; + zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk; + zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1; + zap_f_phys(zap)->zap_ptrtbl.zt_shift++; - ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, - zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << + ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << (FZAP_BLOCK_SHIFT(zap)-3)); return (0); } else { - return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl, zap_ptrtbl_transfer, tx)); } } @@ -373,8 +372,8 @@ zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) { dmu_buf_will_dirty(zap->zap_dbuf, tx); mutex_enter(&zap->zap_f.zap_num_entries_mtx); - ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta); - zap->zap_f.zap_phys->zap_num_entries += delta; + ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta); + zap_f_phys(zap)->zap_num_entries += delta; mutex_exit(&zap->zap_f.zap_num_entries_mtx); } @@ -383,16 +382,25 @@ zap_allocate_blocks(zap_t *zap, int nblocks) { uint64_t newblk; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - newblk = zap->zap_f.zap_phys->zap_freeblk; - zap->zap_f.zap_phys->zap_freeblk += nblocks; + newblk = zap_f_phys(zap)->zap_freeblk; + zap_f_phys(zap)->zap_freeblk += nblocks; return (newblk); } +static void +zap_leaf_pageout(void *dbu) +{ + zap_leaf_t *l = dbu; + + rw_destroy(&l->l_rwlock); + kmem_free(l, sizeof (zap_leaf_t)); +} + static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { void *winner; - zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -400,18 +408,18 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = zap_allocate_blocks(zap, 1); l->l_dbuf = NULL; - l->l_phys = NULL; VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); - winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout); + dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf); + winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu); ASSERT(winner == NULL); dmu_buf_will_dirty(l->l_dbuf, tx); zap_leaf_init(l, zap->zap_normflags != 0); - zap->zap_f.zap_phys->zap_num_leafs++; + zap_f_phys(zap)->zap_num_leafs++; return (l); } @@ -421,7 +429,7 @@ fzap_count(zap_t *zap, uint64_t *count) { ASSERT(!zap->zap_ismicro); mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ - *count = zap->zap_f.zap_phys->zap_num_entries; + *count = zap_f_phys(zap)->zap_num_entries; mutex_exit(&zap->zap_f.zap_num_entries_mtx); return (0); } @@ -437,16 +445,6 @@ zap_put_leaf(zap_leaf_t *l) dmu_buf_rele(l->l_dbuf, NULL); } -_NOTE(ARGSUSED(0)) -static void -zap_leaf_pageout(dmu_buf_t *db, void *vl) -{ - zap_leaf_t *l = vl; - - rw_destroy(&l->l_rwlock); - kmem_free(l, sizeof (zap_leaf_t)); -} - static zap_leaf_t * zap_open_leaf(uint64_t blkid, dmu_buf_t *db) { @@ -454,20 +452,20 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db) ASSERT(blkid != 0); - l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); + l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = blkid; l->l_bs = highbit64(db->db_size) - 1; l->l_dbuf = db; - l->l_phys = NULL; - winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout); + dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf); + winner = dmu_buf_set_user(db, &l->l_dbu); rw_exit(&l->l_rwlock); if (winner != NULL) { /* someone else set it first */ - zap_leaf_pageout(NULL, l); + zap_leaf_pageout(&l->l_dbu); l = winner; } @@ -476,7 +474,7 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db) * chain. There should be no chained leafs (as we have removed * support for them). */ - ASSERT0(l->l_phys->l_hdr.lh_pad1); + ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1); /* * There should be more hash entries than there can be @@ -486,11 +484,11 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db) /* The chunks should begin at the end of the hash table */ ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *) - &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); + &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); /* The chunks should end at the end of the block */ ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - - (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size); + (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size); return (l); } @@ -523,16 +521,15 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, rw_enter(&l->l_rwlock, lt); /* - * Must lock before dirtying, otherwise l->l_phys could change, + * Must lock before dirtying, otherwise zap_leaf_phys(l) could change, * causing ASSERT below to fail. */ if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); ASSERT3U(l->l_blkid, ==, blkid); ASSERT3P(l->l_dbuf, ==, db); - ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data); - ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF); - ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); *lp = l; return (0); @@ -543,13 +540,13 @@ zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) { ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { ASSERT3U(idx, <, - (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift)); + (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); return (0); } else { - return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl, idx, valp)); } } @@ -560,11 +557,11 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) ASSERT(tx != NULL); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) { + if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; return (0); } else { - return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl, idx, blk, tx)); } } @@ -576,16 +573,17 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) int err; ASSERT(zap->zap_dbuf == NULL || - zap->zap_f.zap_phys == zap->zap_dbuf->db_data); - ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC); - idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + zap_f_phys(zap) == zap->zap_dbuf->db_data); + ASSERT3U(zap_f_phys(zap)->zap_magic, ==, ZAP_MAGIC); + idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); err = zap_idx_to_blk(zap, idx, &blk); if (err != 0) return (err); err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); - ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) == - (*lp)->l_phys->l_hdr.lh_prefix); + ASSERT(err || + ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) == + zap_leaf_phys(*lp)->l_hdr.lh_prefix); return (err); } @@ -597,16 +595,16 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp) zap_leaf_t *nl; int prefix_diff, i, err; uint64_t sibling; - int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len; + int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; - ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, - l->l_phys->l_hdr.lh_prefix); + zap_leaf_phys(l)->l_hdr.lh_prefix); if (zap_tryupgradedir(zap, tx) == 0 || - old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { + old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { /* We failed to upgrade, or need to grow the pointer table */ objset_t *os = zap->zap_objset; uint64_t object = zap->zap_object; @@ -621,7 +619,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp) ASSERT(!zap->zap_ismicro); while (old_prefix_len == - zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { + zap_f_phys(zap)->zap_ptrtbl.zt_shift) { err = zap_grow_ptrtbl(zap, tx); if (err) return (err); @@ -631,18 +629,18 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp) if (err) return (err); - if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) { + if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { /* it split while our locks were down */ *lp = l; return (0); } } ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift); ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, - l->l_phys->l_hdr.lh_prefix); + zap_leaf_phys(l)->l_hdr.lh_prefix); - prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - + prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - (old_prefix_len + 1); sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; @@ -664,7 +662,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp) ASSERT0(err); /* we checked for i/o errors above */ } - if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) { + if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) { /* we want the sibling */ zap_put_leaf(l); *lp = nl; @@ -680,13 +678,13 @@ static void zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) { zap_t *zap = zn->zn_zap; - int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; - int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift && - l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); + int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && + zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); zap_put_leaf(l); - if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) { + if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { int err; /* @@ -706,7 +704,7 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) } /* could have finished growing while our locks were down */ - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift) + if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) (void) zap_grow_ptrtbl(zap, tx); } } @@ -937,7 +935,7 @@ fzap_prefetch(zap_name_t *zn) int bs; idx = ZAP_HASH_IDX(zn->zn_hash, - zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + zap_f_phys(zap)->zap_ptrtbl.zt_shift); if (zap_idx_to_blk(zap, idx, &blk) != 0) return; bs = FZAP_BLOCK_SHIFT(zap); @@ -1169,8 +1167,8 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) if (zc->zc_leaf && (ZAP_HASH_IDX(zc->zc_hash, - zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) != - zc->zc_leaf->l_phys->l_hdr.lh_prefix)) { + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; @@ -1191,10 +1189,11 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) if (err == ENOENT) { uint64_t nocare = - (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1; + (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1; zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; zc->zc_cd = 0; - if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) { + if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 || + zc->zc_hash == 0) { zc->zc_hash = -1ULL; } else { zap_put_leaf(zc->zc_leaf); @@ -1286,25 +1285,25 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) /* * Set zap_phys_t fields */ - zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs; - zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries; - zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk; - zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type; - zs->zs_magic = zap->zap_f.zap_phys->zap_magic; - zs->zs_salt = zap->zap_f.zap_phys->zap_salt; + zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs; + zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries; + zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk; + zs->zs_block_type = zap_f_phys(zap)->zap_block_type; + zs->zs_magic = zap_f_phys(zap)->zap_magic; + zs->zs_salt = zap_f_phys(zap)->zap_salt; /* * Set zap_ptrtbl fields */ - zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; - zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk; + zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift; + zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk; zs->zs_ptrtbl_blks_copied = - zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied; - zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk; - zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; - zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; + zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied; + zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk; + zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks; + zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { /* the ptrtbl is entirely in the header block. */ zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); @@ -1312,16 +1311,16 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) int b; dmu_prefetch(zap->zap_objset, zap->zap_object, - zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs, - zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs); + zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs); - for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; + for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; b++) { dmu_buf_t *db; int err; err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs, + (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { zap_stats_ptrtbl(zap, db->db_data, @@ -1358,7 +1357,7 @@ fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, * could extend the table. */ if (add) { - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) + if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) *towrite += zap->zap_dbuf->db_size; else *towrite += (zap->zap_dbuf->db_size * 3); diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c index 9578048250e2..3abc08cff476 100644 --- a/module/zfs/zap_leaf.c +++ b/module/zfs/zap_leaf.c @@ -48,10 +48,12 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); #define LEAF_HASH(l, h) \ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ - ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len))) + ((h) >> \ + (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) -#define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)]) +#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) +extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l); static void zap_memset(void *a, int c, size_t n) @@ -107,8 +109,11 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) { int i; zap_leaf_t l; + dmu_buf_t l_dbuf; + + l_dbuf.db_data = buf; l.l_bs = highbit64(size) - 1; - l.l_phys = buf; + l.l_dbuf = &l_dbuf; buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type); buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix); @@ -161,18 +166,20 @@ zap_leaf_init(zap_leaf_t *l, boolean_t sort) int i; l->l_bs = highbit64(l->l_dbuf->db_size) - 1; - zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header)); - zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); + zap_memset(&zap_leaf_phys(l)->l_hdr, 0, + sizeof (struct zap_leaf_header)); + zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + 2*ZAP_LEAF_HASH_NUMENTRIES(l)); for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END; - l->l_phys->l_hdr.lh_block_type = ZBT_LEAF; - l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC; - l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); + zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF; + zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC; + zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); if (sort) - l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; + zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; } /* @@ -184,15 +191,16 @@ zap_leaf_chunk_alloc(zap_leaf_t *l) { int chunk; - ASSERT(l->l_phys->l_hdr.lh_nfree > 0); + ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); - chunk = l->l_phys->l_hdr.lh_freelist; + chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); - l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; + zap_leaf_phys(l)->l_hdr.lh_freelist = + ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; - l->l_phys->l_hdr.lh_nfree--; + zap_leaf_phys(l)->l_hdr.lh_nfree--; return (chunk); } @@ -201,16 +209,16 @@ static void zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) { struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free; - ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT(zlf->lf_type != ZAP_CHUNK_FREE); zlf->lf_type = ZAP_CHUNK_FREE; - zlf->lf_next = l->l_phys->l_hdr.lh_freelist; + zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist; bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */ - l->l_phys->l_hdr.lh_freelist = chunk; + zap_leaf_phys(l)->l_hdr.lh_freelist = chunk; - l->l_phys->l_hdr.lh_nfree++; + zap_leaf_phys(l)->l_hdr.lh_nfree++; } /* @@ -396,7 +404,7 @@ zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) uint16_t *chunkp; struct zap_leaf_entry *le; - ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); again: for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); @@ -416,7 +424,7 @@ zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) * lowest-cd match for MT_FIRST. */ ASSERT(zn->zn_matchtype == MT_EXACT || - (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); + (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); if (zap_leaf_array_match(l, zn, le->le_name_chunk, le->le_name_numints)) { zeh->zeh_num_integers = le->le_value_numints; @@ -456,10 +464,10 @@ zap_leaf_lookup_closest(zap_leaf_t *l, uint16_t lh; struct zap_leaf_entry *le; - ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { - for (chunk = l->l_phys->l_hash[lh]; + for (chunk = zap_leaf_phys(l)->l_hash[lh]; chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); @@ -539,7 +547,7 @@ zap_entry_update(zap_entry_handle_t *zeh, delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); - if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks) + if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks) return (SET_ERROR(EAGAIN)); zap_leaf_array_free(l, &le->le_value_chunk); @@ -569,7 +577,7 @@ zap_entry_remove(zap_entry_handle_t *zeh) *zeh->zeh_chunkp = le->le_next; zap_leaf_chunk_free(l, entry_chunk); - l->l_phys->l_hdr.lh_nentries--; + zap_leaf_phys(l)->l_hdr.lh_nentries--; } int @@ -593,7 +601,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, if (cd == ZAP_NEED_CD) { /* find the lowest unused cd */ - if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { + if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { cd = 0; for (chunk = *LEAF_HASH_ENTPTR(l, h); @@ -629,7 +637,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, ASSERT3U(cd, <, zap_maxcd(zn->zn_zap)); } - if (l->l_phys->l_hdr.lh_nfree < numchunks) + if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks) return (SET_ERROR(EAGAIN)); /* make the entry */ @@ -650,7 +658,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, /* XXX if we did the search above, we could just use that */ chunkp = zap_leaf_rehash_entry(l, chunk); - l->l_phys->l_hdr.lh_nentries++; + zap_leaf_phys(l)->l_hdr.lh_nentries++; zeh->zeh_leaf = l; zeh->zeh_num_integers = num_integers; @@ -784,8 +792,8 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) zap_leaf_chunk_free(l, entry); - l->l_phys->l_hdr.lh_nentries--; - nl->l_phys->l_hdr.lh_nentries++; + zap_leaf_phys(l)->l_hdr.lh_nentries--; + zap_leaf_phys(nl)->l_hdr.lh_nentries++; } /* @@ -795,19 +803,22 @@ void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { int i; - int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len; + int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* set new prefix and prefix_len */ - l->l_phys->l_hdr.lh_prefix <<= 1; - l->l_phys->l_hdr.lh_prefix_len++; - nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1; - nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len; + zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; + zap_leaf_phys(l)->l_hdr.lh_prefix_len++; + zap_leaf_phys(nl)->l_hdr.lh_prefix = + zap_leaf_phys(l)->l_hdr.lh_prefix | 1; + zap_leaf_phys(nl)->l_hdr.lh_prefix_len = + zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* break existing hash chains */ - zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); + zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + 2*ZAP_LEAF_HASH_NUMENTRIES(l)); if (sort) - l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; + zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; /* * Transfer entries whose hash bit 'bit' is set to nl; rehash @@ -835,25 +846,25 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) { int i, n; - n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - - l->l_phys->l_hdr.lh_prefix_len; + n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + zap_leaf_phys(l)->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; - n = l->l_phys->l_hdr.lh_nentries/5; + n = zap_leaf_phys(l)->l_hdr.lh_nentries/5; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_with_n5_entries[n]++; n = ((1<l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / + zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / (1<zs_blocks_n_tenths_full[n]++; for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { int nentries = 0; - int chunk = l->l_phys->l_hash[i]; + int chunk = zap_leaf_phys(l)->l_hash[i]; while (chunk != CHAIN_END) { struct zap_leaf_entry *le = diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index dfa7c6615659..256bc8de9d40 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -38,6 +39,8 @@ #include #endif +extern inline mzap_phys_t *zap_m_phys(zap_t *zap); + static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags); uint64_t @@ -45,7 +48,7 @@ zap_getflags(zap_t *zap) { if (zap->zap_ismicro) return (0); - return (zap->zap_u.zap_fat.zap_phys->zap_flags); + return (zap_f_phys(zap)->zap_flags); } int @@ -384,7 +387,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) * it, because zap_lockdir() checks zap_ismicro without the lock * held. */ - winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict); + dmu_buf_init_user(&zap->zap_dbu, zap_evict, &zap->zap_dbuf); + winner = dmu_buf_set_user(db, &zap->zap_dbu); if (winner != NULL) { rw_exit(&zap->zap_rwlock); @@ -396,15 +400,15 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) } if (zap->zap_ismicro) { - zap->zap_salt = zap->zap_m.zap_phys->mz_salt; - zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags; + zap->zap_salt = zap_m_phys(zap)->mz_salt; + zap->zap_normflags = zap_m_phys(zap)->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; avl_create(&zap->zap_m.zap_avl, mze_compare, sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = - &zap->zap_m.zap_phys->mz_chunk[i]; + &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0]) { zap_name_t *zn; @@ -416,8 +420,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) } } } else { - zap->zap_salt = zap->zap_f.zap_phys->zap_salt; - zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags; + zap->zap_salt = zap_f_phys(zap)->zap_salt; + zap->zap_normflags = zap_f_phys(zap)->zap_normflags; ASSERT3U(sizeof (struct zap_leaf_header), ==, 2*ZAP_LEAF_CHUNKSIZE); @@ -427,7 +431,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) * other members. */ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, - &zap->zap_f.zap_phys->zap_salt); + &zap_f_phys(zap)->zap_salt); /* * The embedded pointer table should end at the end of @@ -435,7 +439,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) */ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, 1<zap_f.zap_phys, ==, + (uintptr_t)zap_f_phys(zap), ==, zap->zap_dbuf->db_size); } rw_exit(&zap->zap_rwlock); @@ -673,11 +677,10 @@ zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) return (dmu_object_free(os, zapobj, tx)); } -_NOTE(ARGSUSED(0)) void -zap_evict(dmu_buf_t *db, void *vzap) +zap_evict(void *dbu) { - zap_t *zap = vzap; + zap_t *zap = dbu; rw_destroy(&zap->zap_rwlock); @@ -936,7 +939,7 @@ mzap_addent(zap_name_t *zn, uint64_t value) #ifdef ZFS_DEBUG for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { ASSERTV(mzap_ent_phys_t *mze); - ASSERT(mze = &zap->zap_m.zap_phys->mz_chunk[i]); + ASSERT(mze = &zap_m_phys(zap)->mz_chunk[i]); ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } #endif @@ -947,7 +950,7 @@ mzap_addent(zap_name_t *zn, uint64_t value) again: for (i = start; i < zap->zap_m.zap_num_chunks; i++) { - mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; + mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0] == 0) { mze->mze_value = value; mze->mze_cd = cd; @@ -1149,7 +1152,7 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, err = SET_ERROR(ENOENT); } else { zap->zap_m.zap_num_entries--; - bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], + bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid], sizeof (mzap_ent_phys_t)); mze_remove(zap, mze); } diff --git a/module/zfs/zfeature_common.c b/module/zfs/zfeature_common.c index a901448220b6..461456275f9f 100644 --- a/module/zfs/zfeature_common.c +++ b/module/zfs/zfeature_common.c @@ -215,6 +215,17 @@ zpool_feature_init(void) B_TRUE, B_FALSE, B_FALSE, bookmarks_deps); } + { + static const spa_feature_t filesystem_limits_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_FS_SS_LIMIT, + "com.joyent:filesystem_limits", "filesystem_limits", + "Filesystem and snapshot limits.", B_TRUE, B_FALSE, B_FALSE, + filesystem_limits_deps); + } + zfeature_register(SPA_FEATURE_EMBEDDED_DATA, "com.delphix:embedded_data", "embedded_data", "Blocks which compress very well use even less space.", diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index cd7697058983..380e065e44a2 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -23,10 +23,9 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Portions Copyright 2012 Pawel Jakub Dawidek - * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 201i3 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. @@ -641,12 +640,14 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, break; case ZFS_PROP_QUOTA: + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curproc)) { uint64_t zoned; char setpoint[MAXNAMELEN]; /* * Unprivileged users are allowed to modify the - * quota on things *under* (ie. contained by) + * limit on things *under* (ie. contained by) * the thing they own. */ if (dsl_prop_get_integer(dsname, "zoned", &zoned, @@ -944,7 +945,7 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) dd = clone->ds_dir; error = dsl_dataset_hold_obj(dd->dd_pool, - dd->dd_phys->dd_origin_obj, FTAG, &origin); + dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); if (error != 0) { dsl_dataset_rele(clone, FTAG); dsl_pool_rele(dp, FTAG); @@ -2417,6 +2418,21 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_refquota(dsname, source, intval); break; + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + if (intval == UINT64_MAX) { + /* clearing the limit, just do it */ + err = 0; + } else { + err = dsl_dir_activate_fs_ss_limit(dsname); + } + /* + * Set err to -1 to force the zfs_set_prop_nvlist code down the + * default path to set the value in the nvlist. + */ + if (err == 0) + err = -1; + break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; @@ -4232,7 +4248,8 @@ zfs_ioc_send(zfs_cmd_t *zc) } if (dsl_dir_is_clone(tosnap->ds_dir)) - zc->zc_fromobj = tosnap->ds_dir->dd_phys->dd_origin_obj; + zc->zc_fromobj = + dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } @@ -4670,7 +4687,7 @@ zfs_ioc_next_obj(zfs_cmd_t *zc) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, - os->os_dsl_dataset->ds_phys->ds_prev_snap_txg); + dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg); dmu_objset_rele(os, FTAG); return (error); diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c index 257ab4254bbd..c9a9da7528d7 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -22,8 +22,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ -#include -#include +#include #include #include #include diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 15897b363de6..be1a86b24ed7 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -497,7 +497,7 @@ zilog_dirty(zilog_t *zilog, uint64_t txg) dsl_pool_t *dp = zilog->zl_dmu_pool; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); - if (dsl_dataset_is_snapshot(ds)) + if (ds->ds_is_snapshot) panic("dirtying snapshot!"); if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {