summary | refs | log | tree | commit | diff | stats
path: root/include
diff options
context:
space:
mode:
author: Justin T. Gibbs <[email protected]> 2015-04-02 14:44:32 +1100
committer: Brian Behlendorf <[email protected]> 2015-04-28 16:25:34 -0700
commit: 0c66c32d1d8b64a261cceb5f50a9e86777c5d0b2 (patch)
tree: 82f5630e8a4e77931e9992db3a7fac1964414716 /include
parent: d683ddbb7272a179da3918cc4f922d92a2195ba2 (diff)
Illumos 5056 - ZFS deadlock on db_mtx and dn_holds
5056 ZFS deadlock on db_mtx and dn_holds

Author: Justin Gibbs <[email protected]>
Reviewed by: Will Andrews <[email protected]>
Reviewed by: Matt Ahrens <[email protected]>
Reviewed by: George Wilson <[email protected]>
Approved by: Dan McDonald <[email protected]>

References:
  https://www.illumos.org/issues/5056
  https://github.com/illumos/illumos-gate/commit/bc9014e

Porting Notes:

sa_handle_get_from_db():
  - the original patch includes an otherwise unmentioned fix for a
    possible usage of an uninitialised variable

dmu_objset_open_impl():
  - Under Illumos list_link_init() is the same as filling a list_node_t
    with NULLs, so they don't notice if they miss doing list_link_init()
    on a zero'd containing structure (e.g. allocated with kmem_zalloc as
    here). Under Linux, not so much: an uninitialised list_node_t goes
    "Boom!" some time later when it's used or destroyed.

dmu_objset_evict_dbufs():
  - reduce stack usage using kmem_alloc()

Ported-by: Chris Dunlop <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Diffstat (limited to 'include')
-rw-r--r-- include/sys/dbuf.h 6
-rw-r--r-- include/sys/dmu.h 133
-rw-r--r-- include/sys/dmu_objset.h 14
-rw-r--r-- include/sys/dnode.h 4
-rw-r--r-- include/sys/dsl_dataset.h 10
-rw-r--r-- include/sys/dsl_dir.h 4
-rw-r--r-- include/sys/sa.h 1
-rw-r--r-- include/sys/sa_impl.h 4
-rw-r--r-- include/sys/spa.h 5
-rw-r--r-- include/sys/spa_impl.h 4
-rw-r--r-- include/sys/zap_impl.h 4
-rw-r--r-- include/sys/zap_leaf.h 2
12 files changed, 151 insertions, 40 deletions
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index d4e39b73f..c2f4f8bd0 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_DBUF_H
@@ -226,9 +227,8 @@ typedef struct dmu_buf_impl {
/* Data which is unique to data (leaf) blocks: */
- /* stuff we store for the user (see dmu_buf_set_user) */
- void *db_user_ptr;
- dmu_buf_evict_func_t *db_evict_func;
+ /* User callback information. */
+ dmu_buf_user_t *db_user;
uint8_t db_immediate_evict;
uint8_t db_freed_in_flight;
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 127cdcdb7..b2f1efae0 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -24,6 +24,7 @@
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -39,11 +40,9 @@
* dmu_spa.h.
*/
+#include <sys/zfs_context.h>
#include <sys/inttypes.h>
-#include <sys/types.h>
-#include <sys/param.h>
#include <sys/cred.h>
-#include <sys/time.h>
#include <sys/fs/zfs.h>
#include <sys/uio.h>
@@ -288,8 +287,6 @@ typedef struct dmu_buf {
void *db_data; /* data in buffer */
} dmu_buf_t;
-typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
-
/*
* The names of zap entries in the DIRECTORY_OBJECT of the MOS.
*/
@@ -475,36 +472,126 @@ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
+typedef void dmu_buf_evict_func_t(void *user_ptr);
+
+/*
+ * A DMU buffer user object may be associated with a dbuf for the
+ * duration of its lifetime. This allows the user of a dbuf (client)
+ * to attach private data to a dbuf (e.g. in-core only data such as a
+ * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
+ * when that dbuf has been evicted. Clients typically respond to the
+ * eviction notification by freeing their private data, thus ensuring
+ * the same lifetime for both dbuf and private data.
+ *
+ * The mapping from a dmu_buf_user_t to any client private data is the
+ * client's responsibility. All current consumers of the API with private
+ * data embed a dmu_buf_user_t as the first member of the structure for
+ * their private data. This allows conversions between the two types
+ * with a simple cast. Since the DMU buf user API never needs access
+ * to the private data, other strategies can be employed if necessary
+ * or convenient for the client (e.g. using container_of() to do the
+ * conversion for private data that cannot have the dmu_buf_user_t as
+ * its first member).
+ *
+ * Eviction callbacks are executed without the dbuf mutex held or any
+ * other type of mechanism to guarantee that the dbuf is still available.
+ * For this reason, users must assume the dbuf has already been freed
+ * and not reference the dbuf from the callback context.
+ *
+ * Users requesting "immediate eviction" are notified as soon as the dbuf
+ * is only referenced by dirty records (dirties == holds). Otherwise the
+ * notification occurs after eviction processing for the dbuf begins.
+ */
+typedef struct dmu_buf_user {
+ /*
+ * Asynchronous user eviction callback state.
+ */
+ taskq_ent_t dbu_tqent;
+
+ /* This instance's eviction function pointer. */
+ dmu_buf_evict_func_t *dbu_evict_func;
+#ifdef ZFS_DEBUG
+ /*
+ * Pointer to user's dbuf pointer. NULL for clients that do
+ * not associate a dbuf with their user data.
+ *
+ * The dbuf pointer is cleared upon eviction so as to catch
+ * use-after-evict bugs in clients.
+ */
+ dmu_buf_t **dbu_clear_on_evict_dbufp;
+#endif
+} dmu_buf_user_t;
+
+/*
+ * Initialize the given dmu_buf_user_t instance with the eviction function
+ * evict_func, to be called when the user is evicted.
+ *
+ * NOTE: This function should only be called once on a given dmu_buf_user_t.
+ * To allow enforcement of this, dbu must already be zeroed on entry.
+ */
+#ifdef __lint
+/* Very ugly, but it beats issuing suppression directives in many Makefiles. */
+extern void
+dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
+ dmu_buf_t **clear_on_evict_dbufp);
+#else /* __lint */
+static inline void
+dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
+ dmu_buf_t **clear_on_evict_dbufp)
+{
+ ASSERT(dbu->dbu_evict_func == NULL);
+ ASSERT(evict_func != NULL);
+ dbu->dbu_evict_func = evict_func;
+#ifdef ZFS_DEBUG
+ dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
+#endif
+}
+#endif /* __lint */
+
/*
- * Returns NULL on success, or the existing user ptr if it's already
- * been set.
+ * Attach user data to a dbuf and mark it for normal (when the dbuf's
+ * data is cleared or its reference count goes to zero) eviction processing.
*
- * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
+ * Returns NULL on success, or the existing user if another user currently
+ * owns the buffer.
+ */
+void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Attach user data to a dbuf and mark it for immediate (its dirty and
+ * reference counts are equal) eviction processing.
*
- * If non-NULL, pageout func will be called when this buffer is being
- * excised from the cache, so that you can clean up the data structure
- * pointed to by user_ptr.
+ * Returns NULL on success, or the existing user if another user currently
+ * owns the buffer.
+ */
+void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Replace the current user of a dbuf.
*
- * dmu_evict_user() will call the pageout func for all buffers in a
- * objset with a given pageout func.
+ * If given the current user of a dbuf, replaces the dbuf's user with
+ * "new_user" and returns the user data pointer that was replaced.
+ * Otherwise returns the current, and unmodified, dbuf user pointer.
*/
-void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr,
- dmu_buf_evict_func_t *pageout_func);
+void *dmu_buf_replace_user(dmu_buf_t *db,
+ dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);
+
/*
- * set_user_ie is the same as set_user, but request immediate eviction
- * when hold count goes to zero.
+ * Remove the specified user data for a DMU buffer.
+ *
+ * Returns the user that was removed on success, or the current user if
+ * another user currently owns the buffer.
*/
-void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
- dmu_buf_evict_func_t *pageout_func);
-void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
- void *user_ptr, dmu_buf_evict_func_t *pageout_func);
-void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
+void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
/*
- * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
+ * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
*/
void *dmu_buf_get_user(dmu_buf_t *db);
+/* Block until any in-progress dmu buf user evictions complete. */
+void dmu_buf_user_evict_wait(void);
+
/*
* Returns the blkptr associated with this dbuf, or NULL if not set.
*/
diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h
index cbf0394e6..65ae850f4 100644
--- a/include/sys/dmu_objset.h
+++ b/include/sys/dmu_objset.h
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -74,22 +75,25 @@ struct objset {
arc_buf_t *os_phys_buf;
objset_phys_t *os_phys;
/*
- * The following "special" dnodes have no parent and are exempt from
- * dnode_move(), but they root their descendents in this objset using
- * handles anyway, so that all access to dnodes from dbufs consistently
- * uses handles.
+ * The following "special" dnodes have no parent, are exempt
+ * from dnode_move(), and are not recorded in os_dnodes, but they
+ * root their descendents in this objset using handles anyway, so
+ * that all access to dnodes from dbufs consistently uses handles.
*/
dnode_handle_t os_meta_dnode;
dnode_handle_t os_userused_dnode;
dnode_handle_t os_groupused_dnode;
zilog_t *os_zil;
+ list_node_t os_evicting_node;
+
/* can change, under dsl_dir's locks: */
enum zio_checksum os_checksum;
enum zio_compress os_compress;
uint8_t os_copies;
enum zio_checksum os_dedup_checksum;
boolean_t os_dedup_verify;
+ boolean_t os_evicting;
zfs_logbias_op_t os_logbias;
zfs_cache_type_t os_primary_cache;
zfs_cache_type_t os_secondary_cache;
@@ -168,6 +172,8 @@ int dmu_objset_userspace_upgrade(objset_t *os);
boolean_t dmu_objset_userspace_present(objset_t *os);
int dmu_fsname(const char *snapname, char *buf);
+void dmu_objset_evict_done(objset_t *os);
+
void dmu_objset_init(void);
void dmu_objset_fini(void);
diff --git a/include/sys/dnode.h b/include/sys/dnode.h
index 90a334ba7..50e011559 100644
--- a/include/sys/dnode.h
+++ b/include/sys/dnode.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_DNODE_H
@@ -277,6 +278,7 @@ typedef struct dnode_handle {
} dnode_handle_t;
typedef struct dnode_children {
+ dmu_buf_user_t dnc_dbu; /* User evict data */
size_t dnc_count; /* number of children */
dnode_handle_t dnc_children[]; /* sized dynamically */
} dnode_children_t;
@@ -287,7 +289,7 @@ typedef struct free_range {
uint64_t fr_nblks;
} free_range_t;
-dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
+void dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
uint64_t object, dnode_handle_t *dnh);
void dnode_special_close(dnode_handle_t *dnh);
diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h
index c6280f2b8..7e5c0a7cb 100644
--- a/include/sys/dsl_dataset.h
+++ b/include/sys/dsl_dataset.h
@@ -23,6 +23,7 @@
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_DSL_DATASET_H
@@ -125,11 +126,14 @@ typedef struct dsl_dataset_phys {
} dsl_dataset_phys_t;
typedef struct dsl_dataset {
+ dmu_buf_user_t ds_dbu;
+
/* Immutable: */
struct dsl_dir *ds_dir;
dmu_buf_t *ds_dbuf;
uint64_t ds_object;
uint64_t ds_fsid_guid;
+ boolean_t ds_is_snapshot;
/* only used in syncing context, only valid for non-snapshots: */
struct dsl_dataset *ds_prev;
@@ -188,12 +192,6 @@ dsl_dataset_phys(dsl_dataset_t *ds)
*/
#define MAX_TAG_PREFIX_LEN 17
-static inline boolean_t
-dsl_dataset_is_snapshot(dsl_dataset_t *ds)
-{
- return (dsl_dataset_phys(ds)->ds_num_children != 0);
-}
-
#define DS_UNIQUE_IS_ACCURATE(ds) \
((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h
index 46223f84c..55f3a8e5b 100644
--- a/include/sys/dsl_dir.h
+++ b/include/sys/dsl_dir.h
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_DSL_DIR_H
@@ -84,6 +85,8 @@ typedef struct dsl_dir_phys {
} dsl_dir_phys_t;
struct dsl_dir {
+ dmu_buf_user_t dd_dbu;
+
/* These are immutable; no lock needed: */
uint64_t dd_object;
dsl_pool_t *dd_pool;
@@ -119,6 +122,7 @@ dsl_dir_phys(dsl_dir_t *dd)
}
void dsl_dir_rele(dsl_dir_t *dd, void *tag);
+void dsl_dir_async_rele(dsl_dir_t *dd, void *tag);
int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
dsl_dir_t **, const char **tail);
int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
diff --git a/include/sys/sa.h b/include/sys/sa.h
index 7b5b03a56..48e3bcd7c 100644
--- a/include/sys/sa.h
+++ b/include/sys/sa.h
@@ -133,7 +133,6 @@ int sa_update_from_cb(sa_handle_t *, sa_attr_type_t,
uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
void sa_object_info(sa_handle_t *, dmu_object_info_t *);
void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
-void sa_update_user(sa_handle_t *, sa_handle_t *);
void *sa_get_userdata(sa_handle_t *);
void sa_set_userp(sa_handle_t *, void *);
dmu_buf_t *sa_get_db(sa_handle_t *);
diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h
index fcbd8eb34..6f2f1db6d 100644
--- a/include/sys/sa_impl.h
+++ b/include/sys/sa_impl.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_SA_IMPL_H
@@ -208,11 +209,12 @@ typedef enum sa_data_op {
*/
struct sa_handle {
+ dmu_buf_user_t sa_dbu;
kmutex_t sa_lock;
dmu_buf_t *sa_bonus;
dmu_buf_t *sa_spill;
objset_t *sa_os;
- void *sa_userp;
+ void *sa_userp;
sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */
sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */
};
diff --git a/include/sys/spa.h b/include/sys/spa.h
index d60195513..ea43713a4 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_SPA_H
@@ -680,6 +681,7 @@ extern spa_t *spa_next(spa_t *prev);
/* Refcount functions */
extern void spa_open_ref(spa_t *spa, void *tag);
extern void spa_close(spa_t *spa, void *tag);
+extern void spa_async_close(spa_t *spa, void *tag);
extern boolean_t spa_refcount_zero(spa_t *spa);
#define SCL_NONE 0x00
@@ -789,6 +791,9 @@ extern uint64_t spa_version(spa_t *spa);
extern boolean_t spa_deflate(spa_t *spa);
extern metaslab_class_t *spa_normal_class(spa_t *spa);
extern metaslab_class_t *spa_log_class(spa_t *spa);
+extern void spa_evicting_os_register(spa_t *, objset_t *os);
+extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
+extern void spa_evicting_os_wait(spa_t *spa);
extern int spa_max_replication(spa_t *spa);
extern int spa_prev_software_version(spa_t *spa);
extern uint8_t spa_get_failmode(spa_t *spa);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 19ba11537..25f28ef84 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_SPA_IMPL_H
@@ -144,6 +145,9 @@ struct spa {
uint64_t spa_claim_max_txg; /* highest claimed birth txg */
timespec_t spa_loaded_ts; /* 1st successful open time */
objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
+ kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */
+ list_t spa_evicting_os_list; /* Objsets being evicted. */
+ kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */
txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
vdev_t *spa_root_vdev; /* top-level vdev container */
uint64_t spa_config_guid; /* config pool guid */
diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h
index 4b51a2ae2..028018a16 100644
--- a/include/sys/zap_impl.h
+++ b/include/sys/zap_impl.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_ZAP_IMPL_H
@@ -140,6 +141,7 @@ typedef struct zap_phys {
typedef struct zap_table_phys zap_table_phys_t;
typedef struct zap {
+ dmu_buf_user_t zap_dbu;
objset_t *zap_objset;
uint64_t zap_object;
struct dmu_buf *zap_dbuf;
@@ -196,7 +198,7 @@ boolean_t zap_match(zap_name_t *zn, const char *matchname);
int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
void zap_unlockdir(zap_t *zap);
-void zap_evict(dmu_buf_t *db, void *vmzap);
+void zap_evict(void *dbu);
zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
void zap_name_free(zap_name_t *zn);
int zap_hashbits(zap_t *zap);
diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h
index d3749f368..e784c5963 100644
--- a/include/sys/zap_leaf.h
+++ b/include/sys/zap_leaf.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_ZAP_LEAF_H
@@ -152,6 +153,7 @@ typedef union zap_leaf_chunk {
} zap_leaf_chunk_t;
typedef struct zap_leaf {
+ dmu_buf_user_t l_dbu;
krwlock_t l_rwlock;
uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */
int l_bs; /* block size shift */