summary refs log tree commit diff stats
path: root/module/zfs
diff options
context:
space:
mode:
author	Boris Protopopov <[email protected]>	2017-06-13 12:03:44 -0400
committer	Brian Behlendorf <[email protected]>	2017-06-27 12:51:44 -0400
commit	58404a73db9e245778aa7fd7028fbf742f8b815b (patch)
tree	1cbf1a1d4dc76b1bb200953085fb883d34c6e3c6 /module/zfs
parent	82710e993a3481b2c3cdefb6f5fc31f65e1c6798 (diff)
Refine use of zv_state_lock.
Use zv_state_lock to protect all members of zvol_state structure, add relevant ASSERT()s. Take zv_suspend_lock before zv_state_lock, do not hold zv_state_lock across suspend/resume. Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Boris Protopopov <[email protected]> Closes #6226
Diffstat (limited to 'module/zfs')
-rw-r--r--	module/zfs/zvol.c	233
1 file changed, 165 insertions, 68 deletions
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 8890aaaf7..75ed06e03 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -50,7 +50,15 @@
* time comes to remove the structure from the list, it is not in use, and
* therefore, it can be taken off zvol_state_list and freed.
*
- * The minor operations are issued to the spa->spa_zvol_taskq quues, that are
+ * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
+ * e.g. for the duration of receive and rollback operations. This lock can be
+ * held for significant periods of time. Given that it is undesirable to hold
+ * mutexes for long periods of time, the following lock ordering applies:
+ * - take zvol_state_lock if necessary, to protect zvol_state_list
+ * - take zv_suspend_lock if necessary, by the code path in question
+ * - take zv_state_lock to protect zvol_state_t
+ *
+ * The minor operations are issued to spa->spa_zvol_taskq queues, that are
* single-threaded (to preserve order of minor operations), and are executed
* through the zvol_task_cb that dispatches the specific operations. Therefore,
* these operations are serialized per pool. Consequently, we can be certain
@@ -156,49 +164,87 @@ zvol_name_hash(const char *name)
}
/*
- * Find a zvol_state_t given the full major+minor dev_t.
+ * Find a zvol_state_t given the full major+minor dev_t. If found,
+ * return with zv_state_lock taken, otherwise, return (NULL) without
+ * taking zv_state_lock.
*/
static zvol_state_t *
zvol_find_by_dev(dev_t dev)
{
zvol_state_t *zv;
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ mutex_enter(&zvol_state_lock);
for (zv = list_head(&zvol_state_list); zv != NULL;
zv = list_next(&zvol_state_list, zv)) {
- if (zv->zv_dev == dev)
+ mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_dev == dev) {
+ mutex_exit(&zvol_state_lock);
return (zv);
+ }
+ mutex_exit(&zv->zv_state_lock);
}
+ mutex_exit(&zvol_state_lock);
return (NULL);
}
/*
* Find a zvol_state_t given the name and hash generated by zvol_name_hash.
+ * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
+ * return (NULL) without the taking locks. The zv_suspend_lock is always taken
+ * before zv_state_lock. The mode argument indicates the mode (including none)
+ * for zv_suspend_lock to be taken.
*/
static zvol_state_t *
-zvol_find_by_name_hash(const char *name, uint64_t hash)
+zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
{
zvol_state_t *zv;
struct hlist_node *p;
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ mutex_enter(&zvol_state_lock);
hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
zv = hlist_entry(p, zvol_state_t, zv_hlink);
+ mutex_enter(&zv->zv_state_lock);
if (zv->zv_hash == hash &&
- strncmp(zv->zv_name, name, MAXNAMELEN) == 0)
+ strncmp(zv->zv_name, name, MAXNAMELEN) == 0) {
+ /*
+ * this is the right zvol, take the locks in the
+ * right order
+ */
+ if (mode != RW_NONE &&
+ !rw_tryenter(&zv->zv_suspend_lock, mode)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, mode);
+ mutex_enter(&zv->zv_state_lock);
+ /*
+ * zvol cannot be renamed as we continue
+ * to hold zvol_state_lock
+ */
+ ASSERT(zv->zv_hash == hash &&
+ strncmp(zv->zv_name, name, MAXNAMELEN)
+ == 0);
+ }
+ mutex_exit(&zvol_state_lock);
return (zv);
+ }
+ mutex_exit(&zv->zv_state_lock);
}
+ mutex_exit(&zvol_state_lock);
+
return (NULL);
}
/*
- * Find a zvol_state_t given the name provided at zvol_alloc() time.
+ * Find a zvol_state_t given the name.
+ * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
+ * return (NULL) without the taking locks. The zv_suspend_lock is always taken
+ * before zv_state_lock. The mode argument indicates the mode (including none)
+ * for zv_suspend_lock to be taken.
*/
static zvol_state_t *
-zvol_find_by_name(const char *name)
+zvol_find_by_name(const char *name, int mode)
{
- return (zvol_find_by_name_hash(name, zvol_name_hash(name)));
+ return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode));
}
@@ -295,9 +341,12 @@ zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
{
struct block_device *bdev;
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
bdev = bdget_disk(zv->zv_disk, 0);
if (bdev == NULL)
return;
+
set_capacity(zv->zv_disk, volsize >> 9);
zv->zv_volsize = volsize;
check_disk_size_change(zv->zv_disk, bdev);
@@ -391,13 +440,14 @@ zvol_set_volsize(const char *name, uint64_t volsize)
if (readonly)
return (SET_ERROR(EROFS));
- mutex_enter(&zvol_state_lock);
- zv = zvol_find_by_name(name);
- if (zv != NULL)
- mutex_enter(&zv->zv_state_lock);
- mutex_exit(&zvol_state_lock);
+ zv = zvol_find_by_name(name, RW_READER);
+
+ ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
+ RW_READ_HELD(&zv->zv_suspend_lock)));
if (zv == NULL || zv->zv_objset == NULL) {
+ if (zv != NULL)
+ rw_exit(&zv->zv_suspend_lock);
if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
FTAG, &os)) != 0) {
if (zv != NULL)
@@ -408,7 +458,6 @@ zvol_set_volsize(const char *name, uint64_t volsize)
if (zv != NULL)
zv->zv_objset = os;
} else {
- rw_enter(&zv->zv_suspend_lock, RW_READER);
os = zv->zv_objset;
}
@@ -435,6 +484,7 @@ out:
if (zv != NULL)
mutex_exit(&zv->zv_state_lock);
+
return (SET_ERROR(error));
}
@@ -485,23 +535,20 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize)
dmu_tx_t *tx;
int error;
- mutex_enter(&zvol_state_lock);
+ zv = zvol_find_by_name(name, RW_READER);
- zv = zvol_find_by_name(name);
- if (zv == NULL) {
- mutex_exit(&zvol_state_lock);
+ if (zv == NULL)
return (SET_ERROR(ENXIO));
- }
- mutex_enter(&zv->zv_state_lock);
- mutex_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
+ RW_READ_HELD(&zv->zv_suspend_lock));
if (zv->zv_flags & ZVOL_RDONLY) {
mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
return (SET_ERROR(EROFS));
}
- rw_enter(&zv->zv_suspend_lock, RW_READER);
-
tx = dmu_tx_create(zv->zv_objset);
dmu_tx_hold_bonus(tx, ZVOL_OBJ);
error = dmu_tx_assign(tx, TXG_WAIT);
@@ -516,9 +563,9 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize)
if (error == 0)
zv->zv_volblocksize = volblocksize;
}
- rw_exit(&zv->zv_suspend_lock);
mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
return (SET_ERROR(error));
}
@@ -1060,6 +1107,9 @@ zvol_setup_zv(zvol_state_t *zv)
uint64_t ro;
objset_t *os = zv->zv_objset;
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
+ RW_LOCK_HELD(&zv->zv_suspend_lock));
+
error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
if (error)
return (SET_ERROR(error));
@@ -1094,6 +1144,9 @@ zvol_setup_zv(zvol_state_t *zv)
static void
zvol_shutdown_zv(zvol_state_t *zv)
{
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
+ RW_LOCK_HELD(&zv->zv_suspend_lock));
+
zil_close(zv->zv_zilog);
zv->zv_zilog = NULL;
@@ -1127,24 +1180,27 @@ zvol_suspend(const char *name)
{
zvol_state_t *zv;
- mutex_enter(&zvol_state_lock);
- zv = zvol_find_by_name(name);
- if (zv == NULL) {
- mutex_exit(&zvol_state_lock);
+ zv = zvol_find_by_name(name, RW_WRITER);
+
+ if (zv == NULL)
return (NULL);
- }
- mutex_enter(&zv->zv_state_lock);
- mutex_exit(&zvol_state_lock);
/* block all I/O, release in zvol_resume. */
- rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
+ RW_WRITE_HELD(&zv->zv_suspend_lock));
atomic_inc(&zv->zv_suspend_ref);
if (zv->zv_open_count > 0)
zvol_shutdown_zv(zv);
+ /*
+ * do not hold zv_state_lock across suspend/resume to
+ * avoid locking up zvol lookups
+ */
mutex_exit(&zv->zv_state_lock);
+
+ /* zv_suspend_lock is released in zvol_resume() */
return (zv);
}
@@ -1155,11 +1211,8 @@ zvol_resume(zvol_state_t *zv)
ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
- /*
- * Cannot take zv_state_lock here with zv_suspend_lock
- * held; however, the latter is held in exclusive mode,
- * so it is not necessary to do so
- */
+ mutex_enter(&zv->zv_state_lock);
+
if (zv->zv_open_count > 0) {
VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset));
VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv);
@@ -1168,6 +1221,9 @@ zvol_resume(zvol_state_t *zv)
error = zvol_setup_zv(zv);
}
+
+ mutex_exit(&zv->zv_state_lock);
+
rw_exit(&zv->zv_suspend_lock);
/*
* We need this because we don't hold zvol_state_lock while releasing
@@ -1186,6 +1242,9 @@ zvol_first_open(zvol_state_t *zv)
objset_t *os;
int error, locked = 0;
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
/*
* In all other cases the spa_namespace_lock is taken before the
* bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
@@ -1233,6 +1292,9 @@ out_mutex:
static void
zvol_last_close(zvol_state_t *zv)
{
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
zvol_shutdown_zv(zv);
dmu_objset_disown(zv->zv_objset, zv);
@@ -1243,14 +1305,15 @@ static int
zvol_open(struct block_device *bdev, fmode_t flag)
{
zvol_state_t *zv;
- int error = 0, drop_suspend = 0;
+ int error = 0;
+ boolean_t drop_suspend = B_FALSE;
ASSERT(!mutex_owned(&zvol_state_lock));
mutex_enter(&zvol_state_lock);
/*
- * Obtain a copy of private_data under the lock to make sure
- * that either the result of zvol free code path setting
+ * Obtain a copy of private_data under the zvol_state_lock to make
+ * sure that either the result of zvol free code path setting
* bdev->bd_disk->private_data to NULL is observed, or zvol_free()
* is not called on this zv because of the positive zv_open_count.
*/
@@ -1259,14 +1322,25 @@ zvol_open(struct block_device *bdev, fmode_t flag)
mutex_exit(&zvol_state_lock);
return (SET_ERROR(-ENXIO));
}
+
+ /* take zv_suspend_lock before zv_state_lock */
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
mutex_enter(&zv->zv_state_lock);
- mutex_exit(&zvol_state_lock);
+ /*
+ * make sure zvol is not suspended during first open
+ * (hold zv_suspend_lock), otherwise, drop the lock
+ */
if (zv->zv_open_count == 0) {
- /* make sure zvol is not suspended when first open */
- rw_enter(&zv->zv_suspend_lock, RW_READER);
- drop_suspend = 1;
+ drop_suspend = B_TRUE;
+ } else {
+ rw_exit(&zv->zv_suspend_lock);
+ }
+
+ mutex_exit(&zvol_state_lock);
+ if (zv->zv_open_count == 0) {
error = zvol_first_open(zv);
if (error)
goto out_mutex;
@@ -1285,9 +1359,9 @@ out_open_count:
if (zv->zv_open_count == 0)
zvol_last_close(zv);
out_mutex:
+ mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
- mutex_exit(&zv->zv_state_lock);
return (SET_ERROR(error));
}
@@ -1300,27 +1374,38 @@ static int
zvol_release(struct gendisk *disk, fmode_t mode)
{
zvol_state_t *zv;
+ boolean_t drop_suspend = B_FALSE;
ASSERT(!mutex_owned(&zvol_state_lock));
mutex_enter(&zvol_state_lock);
zv = disk->private_data;
ASSERT(zv && zv->zv_open_count > 0);
+
+ /* take zv_suspend_lock before zv_state_lock */
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
mutex_enter(&zv->zv_state_lock);
mutex_exit(&zvol_state_lock);
- /* make sure zvol is not suspended when last close */
+ /*
+ * make sure zvol is not suspended during last close
+ * (hold zv_suspend_lock), otherwise, drop the lock
+ */
if (zv->zv_open_count == 1)
- rw_enter(&zv->zv_suspend_lock, RW_READER);
+ drop_suspend = B_TRUE;
+ else
+ rw_exit(&zv->zv_suspend_lock);
zv->zv_open_count--;
- if (zv->zv_open_count == 0) {
+ if (zv->zv_open_count == 0)
zvol_last_close(zv);
- rw_exit(&zv->zv_suspend_lock);
- }
mutex_exit(&zv->zv_state_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+
#ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
return (0);
#endif
@@ -1430,10 +1515,11 @@ zvol_probe(dev_t dev, int *part, void *arg)
zvol_state_t *zv;
struct kobject *kobj;
- mutex_enter(&zvol_state_lock);
zv = zvol_find_by_dev(dev);
kobj = zv ? get_disk(zv->zv_disk) : NULL;
- mutex_exit(&zvol_state_lock);
+ ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock));
+ if (zv)
+ mutex_exit(&zv->zv_state_lock);
return (kobj);
}
@@ -1571,6 +1657,8 @@ zvol_free(void *arg)
zvol_state_t *zv = arg;
ASSERT(!MUTEX_HELD(&zvol_state_lock));
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
ASSERT(zv->zv_open_count == 0);
ASSERT(zv->zv_disk->private_data == NULL);
@@ -1611,17 +1699,14 @@ zvol_create_minor_impl(const char *name)
return (SET_ERROR(-idx));
minor = idx << ZVOL_MINOR_BITS;
- mutex_enter(&zvol_state_lock);
-
- zv = zvol_find_by_name_hash(name, hash);
+ zv = zvol_find_by_name_hash(name, hash, RW_NONE);
if (zv) {
- mutex_exit(&zvol_state_lock);
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ mutex_exit(&zv->zv_state_lock);
ida_simple_remove(&zvol_ida, idx);
return (SET_ERROR(EEXIST));
}
- mutex_exit(&zvol_state_lock);
-
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
@@ -1715,10 +1800,9 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname)
int readonly = get_disk_ro(zv->zv_disk);
ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
- rw_enter(&zv->zv_suspend_lock, RW_READER);
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
- rw_exit(&zv->zv_suspend_lock);
/* move to new hashtable entry */
zv->zv_hash = zvol_name_hash(zv->zv_name);
@@ -1960,15 +2044,15 @@ zvol_remove_minors_impl(const char *name)
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
+ mutex_enter(&zv->zv_state_lock);
if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
(strncmp(zv->zv_name, name, namelen) == 0 &&
(zv->zv_name[namelen] == '/' ||
zv->zv_name[namelen] == '@'))) {
/*
- * By taking zv_state_lock here, we guarantee that no
+ * By holding zv_state_lock here, we guarantee that no
* one is currently using this zv
*/
- mutex_enter(&zv->zv_state_lock);
/* If in use, leave alone */
if (zv->zv_open_count > 0 ||
@@ -1979,7 +2063,10 @@ zvol_remove_minors_impl(const char *name)
zvol_remove(zv);
- /* clear this so zvol_open won't open it */
+ /*
+ * clear this while holding zvol_state_lock so
+ * zvol_open won't open it
+ */
zv->zv_disk->private_data = NULL;
/* Drop zv_state_lock before zvol_free() */
@@ -1992,6 +2079,8 @@ zvol_remove_minors_impl(const char *name)
list_insert_head(&free_list, zv);
else
tid = t;
+ } else {
+ mutex_exit(&zv->zv_state_lock);
}
}
mutex_exit(&zvol_state_lock);
@@ -2025,12 +2114,12 @@ zvol_remove_minor_impl(const char *name)
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
+ mutex_enter(&zv->zv_state_lock);
if (strcmp(zv->zv_name, name) == 0) {
/*
- * By taking zv_state_lock here, we guarantee that no
+ * By holding zv_state_lock here, we guarantee that no
* one is currently using this zv
*/
- mutex_enter(&zv->zv_state_lock);
/* If in use, leave alone */
if (zv->zv_open_count > 0 ||
@@ -2045,6 +2134,8 @@ zvol_remove_minor_impl(const char *name)
mutex_exit(&zv->zv_state_lock);
break;
+ } else {
+ mutex_exit(&zv->zv_state_lock);
}
}
@@ -2077,9 +2168,13 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
+ mutex_enter(&zv->zv_state_lock);
+
/* If in use, leave alone */
- if (zv->zv_open_count > 0)
+ if (zv->zv_open_count > 0) {
+ mutex_exit(&zv->zv_state_lock);
continue;
+ }
if (strcmp(zv->zv_name, oldname) == 0) {
zvol_rename_minor(zv, newname);
@@ -2091,6 +2186,8 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
zv->zv_name + oldnamelen + 1);
zvol_rename_minor(zv, name);
}
+
+ mutex_exit(&zv->zv_state_lock);
}
mutex_exit(&zvol_state_lock);