diff options
author | Justin T. Gibbs <justing@spectralogic.com> | 2015-04-02 14:44:32 +1100 |
---|---|---|
committer | Brian Behlendorf <behlendorf1@llnl.gov> | 2015-04-28 16:25:34 -0700 |
commit | 0c66c32d1d8b64a261cceb5f50a9e86777c5d0b2 (patch) | |
tree | 82f5630e8a4e77931e9992db3a7fac1964414716 /include | |
parent | d683ddbb7272a179da3918cc4f922d92a2195ba2 (diff) |
Illumos 5056 - ZFS deadlock on db_mtx and dn_holds
5056 ZFS deadlock on db_mtx and dn_holds
Author: Justin Gibbs <justing@spectralogic.com>
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
References:
https://www.illumos.org/issues/5056
https://github.com/illumos/illumos-gate/commit/bc9014e
Porting Notes:
sa_handle_get_from_db():
- the original patch also includes an otherwise unmentioned fix for
the possible use of an uninitialised variable
dmu_objset_open_impl():
- Under Illumos, list_link_init() is the same as filling a list_node_t
with NULLs, so a missing list_link_init() call on a zeroed containing
structure (e.g. one allocated with kmem_zalloc(), as here) goes
unnoticed. Under Linux this is not the case: an uninitialised
list_node_t causes a failure some time later, when it is used or destroyed.
dmu_objset_evict_dbufs():
- reduce stack usage using kmem_alloc()
Ported-by: Chris Dunlop <chris@onthe.net.au>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Diffstat (limited to 'include')
-rw-r--r-- | include/sys/dbuf.h | 6 | ||||
-rw-r--r-- | include/sys/dmu.h | 133 | ||||
-rw-r--r-- | include/sys/dmu_objset.h | 14 | ||||
-rw-r--r-- | include/sys/dnode.h | 4 | ||||
-rw-r--r-- | include/sys/dsl_dataset.h | 10 | ||||
-rw-r--r-- | include/sys/dsl_dir.h | 4 | ||||
-rw-r--r-- | include/sys/sa.h | 1 | ||||
-rw-r--r-- | include/sys/sa_impl.h | 4 | ||||
-rw-r--r-- | include/sys/spa.h | 5 | ||||
-rw-r--r-- | include/sys/spa_impl.h | 4 | ||||
-rw-r--r-- | include/sys/zap_impl.h | 4 | ||||
-rw-r--r-- | include/sys/zap_leaf.h | 2 |
12 files changed, 151 insertions, 40 deletions
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index d4e39b73f..c2f4f8bd0 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_DBUF_H @@ -226,9 +227,8 @@ typedef struct dmu_buf_impl { /* Data which is unique to data (leaf) blocks: */ - /* stuff we store for the user (see dmu_buf_set_user) */ - void *db_user_ptr; - dmu_buf_evict_func_t *db_evict_func; + /* User callback information. */ + dmu_buf_user_t *db_user; uint8_t db_immediate_evict; uint8_t db_freed_in_flight; diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 127cdcdb7..b2f1efae0 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -24,6 +24,7 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -39,11 +40,9 @@ * dmu_spa.h. */ +#include <sys/zfs_context.h> #include <sys/inttypes.h> -#include <sys/types.h> -#include <sys/param.h> #include <sys/cred.h> -#include <sys/time.h> #include <sys/fs/zfs.h> #include <sys/uio.h> @@ -288,8 +287,6 @@ typedef struct dmu_buf { void *db_data; /* data in buffer */ } dmu_buf_t; -typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); - /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. 
*/ @@ -475,36 +472,126 @@ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); +typedef void dmu_buf_evict_func_t(void *user_ptr); + +/* + * A DMU buffer user object may be associated with a dbuf for the + * duration of its lifetime. This allows the user of a dbuf (client) + * to attach private data to a dbuf (e.g. in-core only data such as a + * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified + * when that dbuf has been evicted. Clients typically respond to the + * eviction notification by freeing their private data, thus ensuring + * the same lifetime for both dbuf and private data. + * + * The mapping from a dmu_buf_user_t to any client private data is the + * client's responsibility. All current consumers of the API with private + * data embed a dmu_buf_user_t as the first member of the structure for + * their private data. This allows conversions between the two types + * with a simple cast. Since the DMU buf user API never needs access + * to the private data, other strategies can be employed if necessary + * or convenient for the client (e.g. using container_of() to do the + * conversion for private data that cannot have the dmu_buf_user_t as + * its first member). + * + * Eviction callbacks are executed without the dbuf mutex held or any + * other type of mechanism to guarantee that the dbuf is still available. + * For this reason, users must assume the dbuf has already been freed + * and not reference the dbuf from the callback context. + * + * Users requesting "immediate eviction" are notified as soon as the dbuf + * is only referenced by dirty records (dirties == holds). Otherwise the + * notification occurs after eviction processing for the dbuf begins. + */ +typedef struct dmu_buf_user { + /* + * Asynchronous user eviction callback state. 
+ */ + taskq_ent_t dbu_tqent; + + /* This instance's eviction function pointer. */ + dmu_buf_evict_func_t *dbu_evict_func; +#ifdef ZFS_DEBUG + /* + * Pointer to user's dbuf pointer. NULL for clients that do + * not associate a dbuf with their user data. + * + * The dbuf pointer is cleared upon eviction so as to catch + * use-after-evict bugs in clients. + */ + dmu_buf_t **dbu_clear_on_evict_dbufp; +#endif +} dmu_buf_user_t; + +/* + * Initialize the given dmu_buf_user_t instance with the eviction function + * evict_func, to be called when the user is evicted. + * + * NOTE: This function should only be called once on a given dmu_buf_user_t. + * To allow enforcement of this, dbu must already be zeroed on entry. + */ +#ifdef __lint +/* Very ugly, but it beats issuing suppression directives in many Makefiles. */ +extern void +dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, + dmu_buf_t **clear_on_evict_dbufp); +#else /* __lint */ +static inline void +dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, + dmu_buf_t **clear_on_evict_dbufp) +{ + ASSERT(dbu->dbu_evict_func == NULL); + ASSERT(evict_func != NULL); + dbu->dbu_evict_func = evict_func; +#ifdef ZFS_DEBUG + dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; +#endif +} +#endif /* __lint */ + /* - * Returns NULL on success, or the existing user ptr if it's already - * been set. + * Attach user data to a dbuf and mark it for normal (when the dbuf's + * data is cleared or its reference count goes to zero) eviction processing. * - * user_ptr is for use by the user and can be obtained via dmu_buf_get_user(). + * Returns NULL on success, or the existing user if another user currently + * owns the buffer. + */ +void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Attach user data to a dbuf and mark it for immediate (its dirty and + * reference counts are equal) eviction processing. 
* - * If non-NULL, pageout func will be called when this buffer is being - * excised from the cache, so that you can clean up the data structure - * pointed to by user_ptr. + * Returns NULL on success, or the existing user if another user currently + * owns the buffer. + */ +void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Replace the current user of a dbuf. * - * dmu_evict_user() will call the pageout func for all buffers in a - * objset with a given pageout func. + * If given the current user of a dbuf, replaces the dbuf's user with + * "new_user" and returns the user data pointer that was replaced. + * Otherwise returns the current, and unmodified, dbuf user pointer. */ -void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, - dmu_buf_evict_func_t *pageout_func); +void *dmu_buf_replace_user(dmu_buf_t *db, + dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); + /* - * set_user_ie is the same as set_user, but request immediate eviction - * when hold count goes to zero. + * Remove the specified user data for a DMU buffer. + * + * Returns the user that was removed on success, or the current user if + * another user currently owns the buffer. */ -void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, - dmu_buf_evict_func_t *pageout_func); -void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, - void *user_ptr, dmu_buf_evict_func_t *pageout_func); -void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); +void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); /* - * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set. + * Returns the user data (dmu_buf_user_t *) associated with this dbuf. */ void *dmu_buf_get_user(dmu_buf_t *db); +/* Block until any in-progress dmu buf user evictions complete. */ +void dmu_buf_user_evict_wait(void); + /* * Returns the blkptr associated with this dbuf, or NULL if not set. 
*/ diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index cbf0394e6..65ae850f4 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -74,22 +75,25 @@ struct objset { arc_buf_t *os_phys_buf; objset_phys_t *os_phys; /* - * The following "special" dnodes have no parent and are exempt from - * dnode_move(), but they root their descendents in this objset using - * handles anyway, so that all access to dnodes from dbufs consistently - * uses handles. + * The following "special" dnodes have no parent, are exempt + * from dnode_move(), and are not recorded in os_dnodes, but they + * root their descendents in this objset using handles anyway, so + * that all access to dnodes from dbufs consistently uses handles. 
*/ dnode_handle_t os_meta_dnode; dnode_handle_t os_userused_dnode; dnode_handle_t os_groupused_dnode; zilog_t *os_zil; + list_node_t os_evicting_node; + /* can change, under dsl_dir's locks: */ enum zio_checksum os_checksum; enum zio_compress os_compress; uint8_t os_copies; enum zio_checksum os_dedup_checksum; boolean_t os_dedup_verify; + boolean_t os_evicting; zfs_logbias_op_t os_logbias; zfs_cache_type_t os_primary_cache; zfs_cache_type_t os_secondary_cache; @@ -168,6 +172,8 @@ int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); int dmu_fsname(const char *snapname, char *buf); +void dmu_objset_evict_done(objset_t *os); + void dmu_objset_init(void); void dmu_objset_fini(void); diff --git a/include/sys/dnode.h b/include/sys/dnode.h index 90a334ba7..50e011559 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_DNODE_H @@ -277,6 +278,7 @@ typedef struct dnode_handle { } dnode_handle_t; typedef struct dnode_children { + dmu_buf_user_t dnc_dbu; /* User evict data */ size_t dnc_count; /* number of children */ dnode_handle_t dnc_children[]; /* sized dynamically */ } dnode_children_t; @@ -287,7 +289,7 @@ typedef struct free_range { uint64_t fr_nblks; } free_range_t; -dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp, +void dnode_special_open(struct objset *dd, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh); void dnode_special_close(dnode_handle_t *dnh); diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index c6280f2b8..7e5c0a7cb 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -23,6 +23,7 @@ * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
* Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_DSL_DATASET_H @@ -125,11 +126,14 @@ typedef struct dsl_dataset_phys { } dsl_dataset_phys_t; typedef struct dsl_dataset { + dmu_buf_user_t ds_dbu; + /* Immutable: */ struct dsl_dir *ds_dir; dmu_buf_t *ds_dbuf; uint64_t ds_object; uint64_t ds_fsid_guid; + boolean_t ds_is_snapshot; /* only used in syncing context, only valid for non-snapshots: */ struct dsl_dataset *ds_prev; @@ -188,12 +192,6 @@ dsl_dataset_phys(dsl_dataset_t *ds) */ #define MAX_TAG_PREFIX_LEN 17 -static inline boolean_t -dsl_dataset_is_snapshot(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_num_children != 0); -} - #define DS_UNIQUE_IS_ACCURATE(ds) \ ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index 46223f84c..55f3a8e5b 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
*/ #ifndef _SYS_DSL_DIR_H @@ -84,6 +85,8 @@ typedef struct dsl_dir_phys { } dsl_dir_phys_t; struct dsl_dir { + dmu_buf_user_t dd_dbu; + /* These are immutable; no lock needed: */ uint64_t dd_object; dsl_pool_t *dd_pool; @@ -119,6 +122,7 @@ dsl_dir_phys(dsl_dir_t *dd) } void dsl_dir_rele(dsl_dir_t *dd, void *tag); +void dsl_dir_async_rele(dsl_dir_t *dd, void *tag); int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dir_t **, const char **tail); int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, diff --git a/include/sys/sa.h b/include/sys/sa.h index 7b5b03a56..48e3bcd7c 100644 --- a/include/sys/sa.h +++ b/include/sys/sa.h @@ -133,7 +133,6 @@ int sa_update_from_cb(sa_handle_t *, sa_attr_type_t, uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *); void sa_object_info(sa_handle_t *, dmu_object_info_t *); void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *); -void sa_update_user(sa_handle_t *, sa_handle_t *); void *sa_get_userdata(sa_handle_t *); void sa_set_userp(sa_handle_t *, void *); dmu_buf_t *sa_get_db(sa_handle_t *); diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h index fcbd8eb34..6f2f1db6d 100644 --- a/include/sys/sa_impl.h +++ b/include/sys/sa_impl.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
*/ #ifndef _SYS_SA_IMPL_H @@ -208,11 +209,12 @@ typedef enum sa_data_op { */ struct sa_handle { + dmu_buf_user_t sa_dbu; kmutex_t sa_lock; dmu_buf_t *sa_bonus; dmu_buf_t *sa_spill; objset_t *sa_os; - void *sa_userp; + void *sa_userp; sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */ sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */ }; diff --git a/include/sys/spa.h b/include/sys/spa.h index d60195513..ea43713a4 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_SPA_H @@ -680,6 +681,7 @@ extern spa_t *spa_next(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); +extern void spa_async_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_NONE 0x00 @@ -789,6 +791,9 @@ extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); +extern void spa_evicting_os_register(spa_t *, objset_t *os); +extern void spa_evicting_os_deregister(spa_t *, objset_t *os); +extern void spa_evicting_os_wait(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_prev_software_version(spa_t *spa); extern uint8_t spa_get_failmode(spa_t *spa); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 19ba11537..25f28ef84 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_SPA_IMPL_H @@ -144,6 +145,9 @@ struct spa { uint64_t spa_claim_max_txg; /* highest claimed birth txg */ timespec_t spa_loaded_ts; /* 1st successful open time */ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ + kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */ + list_t spa_evicting_os_list; /* Objsets being evicted. */ + kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ uint64_t spa_config_guid; /* config pool guid */ diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 4b51a2ae2..028018a16 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_ZAP_IMPL_H @@ -140,6 +141,7 @@ typedef struct zap_phys { typedef struct zap_table_phys zap_table_phys_t; typedef struct zap { + dmu_buf_user_t zap_dbu; objset_t *zap_objset; uint64_t zap_object; struct dmu_buf *zap_dbuf; @@ -196,7 +198,7 @@ boolean_t zap_match(zap_name_t *zn, const char *matchname); int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); void zap_unlockdir(zap_t *zap); -void zap_evict(dmu_buf_t *db, void *vmzap); +void zap_evict(void *dbu); zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); void zap_name_free(zap_name_t *zn); int zap_hashbits(zap_t *zap); diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h index d3749f368..e784c5963 100644 --- a/include/sys/zap_leaf.h +++ b/include/sys/zap_leaf.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
*/ #ifndef _SYS_ZAP_LEAF_H @@ -152,6 +153,7 @@ typedef union zap_leaf_chunk { } zap_leaf_chunk_t; typedef struct zap_leaf { + dmu_buf_user_t l_dbu; krwlock_t l_rwlock; uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */ int l_bs; /* block size shift */ |