author | Boris Protopopov <[email protected]> | 2017-06-13 12:03:44 -0400 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2017-06-27 12:51:44 -0400 |
commit | 58404a73db9e245778aa7fd7028fbf742f8b815b (patch) | |
tree | 1cbf1a1d4dc76b1bb200953085fb883d34c6e3c6 /module/zfs | |
parent | 82710e993a3481b2c3cdefb6f5fc31f65e1c6798 (diff) | |
Refine use of zv_state_lock.
Use zv_state_lock to protect all members of the zvol_state structure and
add the relevant ASSERT()s. Take zv_suspend_lock before zv_state_lock;
do not hold zv_state_lock across suspend/resume.
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Boris Protopopov <[email protected]>
Closes #6226
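The locking rule above — take zv_suspend_lock before zv_state_lock, and never hold zv_state_lock across suspend/resume — is the heart of the change. Below is a minimal userspace sketch of the lookup half of that rule, assuming POSIX threads as a stand-in for the SPL's kmutex_t/krwlock_t; zvol_lookup() and the struct layout are illustrative only, not the code in this patch.

```c
/*
 * Userspace model of the lock ordering in this commit, with pthreads
 * in place of the SPL primitives:
 *   1. zvol_state_lock  - global, protects zvol_state_list
 *   2. zv_suspend_lock  - per-zvol rwlock, may be held for a long time
 *   3. zv_state_lock    - per-zvol mutex, protects zvol_state_t members
 */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

typedef struct zvol_state {
	char			zv_name[256];
	int			zv_open_count;
	pthread_rwlock_t	zv_suspend_lock;
	pthread_mutex_t		zv_state_lock;
	struct zvol_state	*zv_next;
} zvol_state_t;

static pthread_mutex_t zvol_state_lock = PTHREAD_MUTEX_INITIALIZER;
static zvol_state_t *zvol_state_list;

/*
 * Find a zvol by name; on success both zv_suspend_lock (read or write,
 * depending on 'writer') and zv_state_lock are held, on a miss no locks
 * are held -- the same contract the patch gives zvol_find_by_name().
 */
static zvol_state_t *
zvol_lookup(const char *name, int writer)
{
	zvol_state_t *zv;

	pthread_mutex_lock(&zvol_state_lock);
	for (zv = zvol_state_list; zv != NULL; zv = zv->zv_next) {
		pthread_mutex_lock(&zv->zv_state_lock);
		if (strcmp(zv->zv_name, name) != 0) {
			pthread_mutex_unlock(&zv->zv_state_lock);
			continue;
		}
		/*
		 * Respect the ordering: if the suspend lock cannot be
		 * taken without blocking, back out of zv_state_lock,
		 * block on zv_suspend_lock, then re-take zv_state_lock.
		 * The name cannot change underneath us because the
		 * global zvol_state_lock is still held.
		 */
		if ((writer ? pthread_rwlock_trywrlock(&zv->zv_suspend_lock)
		    : pthread_rwlock_tryrdlock(&zv->zv_suspend_lock)) != 0) {
			pthread_mutex_unlock(&zv->zv_state_lock);
			if (writer)
				pthread_rwlock_wrlock(&zv->zv_suspend_lock);
			else
				pthread_rwlock_rdlock(&zv->zv_suspend_lock);
			pthread_mutex_lock(&zv->zv_state_lock);
		}
		pthread_mutex_unlock(&zvol_state_lock);
		return (zv);		/* both locks held */
	}
	pthread_mutex_unlock(&zvol_state_lock);
	return (NULL);			/* no locks held */
}

int
main(void)
{
	zvol_state_t zv = { .zv_name = "pool/vol" };

	pthread_rwlock_init(&zv.zv_suspend_lock, NULL);
	pthread_mutex_init(&zv.zv_state_lock, NULL);
	zvol_state_list = &zv;

	zvol_state_t *found = zvol_lookup("pool/vol", 0);
	if (found != NULL) {
		printf("found %s with both locks held\n", found->zv_name);
		pthread_mutex_unlock(&found->zv_state_lock);
		pthread_rwlock_unlock(&found->zv_suspend_lock);
	}
	return (0);
}
```

The try-then-block sequence mirrors what the patch does in zvol_find_by_name_hash(): if the suspend lock cannot be acquired while zv_state_lock is held, the state lock is dropped, the suspend lock is taken blocking, and the state lock is re-acquired; the global zvol_state_lock is held throughout, so the zvol cannot be renamed in between.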
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/zvol.c | 233 |
1 file changed, 165 insertions, 68 deletions
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 8890aaaf7..75ed06e03 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -50,7 +50,15 @@ * time comes to remove the structure from the list, it is not in use, and * therefore, it can be taken off zvol_state_list and freed. * - * The minor operations are issued to the spa->spa_zvol_taskq quues, that are + * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol, + * e.g. for the duration of receive and rollback operations. This lock can be + * held for significant periods of time. Given that it is undesirable to hold + * mutexes for long periods of time, the following lock ordering applies: + * - take zvol_state_lock if necessary, to protect zvol_state_list + * - take zv_suspend_lock if necessary, by the code path in question + * - take zv_state_lock to protect zvol_state_t + * + * The minor operations are issued to spa->spa_zvol_taskq queues, that are * single-threaded (to preserve order of minor operations), and are executed * through the zvol_task_cb that dispatches the specific operations. Therefore, * these operations are serialized per pool. Consequently, we can be certain @@ -156,49 +164,87 @@ zvol_name_hash(const char *name) } /* - * Find a zvol_state_t given the full major+minor dev_t. + * Find a zvol_state_t given the full major+minor dev_t. If found, + * return with zv_state_lock taken, otherwise, return (NULL) without + * taking zv_state_lock. */ static zvol_state_t * zvol_find_by_dev(dev_t dev) { zvol_state_t *zv; - ASSERT(MUTEX_HELD(&zvol_state_lock)); + mutex_enter(&zvol_state_lock); for (zv = list_head(&zvol_state_list); zv != NULL; zv = list_next(&zvol_state_list, zv)) { - if (zv->zv_dev == dev) + mutex_enter(&zv->zv_state_lock); + if (zv->zv_dev == dev) { + mutex_exit(&zvol_state_lock); return (zv); + } + mutex_exit(&zv->zv_state_lock); } + mutex_exit(&zvol_state_lock); return (NULL); } /* * Find a zvol_state_t given the name and hash generated by zvol_name_hash. + * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, + * return (NULL) without the taking locks. The zv_suspend_lock is always taken + * before zv_state_lock. The mode argument indicates the mode (including none) + * for zv_suspend_lock to be taken. */ static zvol_state_t * -zvol_find_by_name_hash(const char *name, uint64_t hash) +zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) { zvol_state_t *zv; struct hlist_node *p; - ASSERT(MUTEX_HELD(&zvol_state_lock)); + mutex_enter(&zvol_state_lock); hlist_for_each(p, ZVOL_HT_HEAD(hash)) { zv = hlist_entry(p, zvol_state_t, zv_hlink); + mutex_enter(&zv->zv_state_lock); if (zv->zv_hash == hash && - strncmp(zv->zv_name, name, MAXNAMELEN) == 0) + strncmp(zv->zv_name, name, MAXNAMELEN) == 0) { + /* + * this is the right zvol, take the locks in the + * right order + */ + if (mode != RW_NONE && + !rw_tryenter(&zv->zv_suspend_lock, mode)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, mode); + mutex_enter(&zv->zv_state_lock); + /* + * zvol cannot be renamed as we continue + * to hold zvol_state_lock + */ + ASSERT(zv->zv_hash == hash && + strncmp(zv->zv_name, name, MAXNAMELEN) + == 0); + } + mutex_exit(&zvol_state_lock); return (zv); + } + mutex_exit(&zv->zv_state_lock); } + mutex_exit(&zvol_state_lock); + return (NULL); } /* - * Find a zvol_state_t given the name provided at zvol_alloc() time. + * Find a zvol_state_t given the name. 
+ * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, + * return (NULL) without the taking locks. The zv_suspend_lock is always taken + * before zv_state_lock. The mode argument indicates the mode (including none) + * for zv_suspend_lock to be taken. */ static zvol_state_t * -zvol_find_by_name(const char *name) +zvol_find_by_name(const char *name, int mode) { - return (zvol_find_by_name_hash(name, zvol_name_hash(name))); + return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode)); } @@ -295,9 +341,12 @@ zvol_size_changed(zvol_state_t *zv, uint64_t volsize) { struct block_device *bdev; + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + bdev = bdget_disk(zv->zv_disk, 0); if (bdev == NULL) return; + set_capacity(zv->zv_disk, volsize >> 9); zv->zv_volsize = volsize; check_disk_size_change(zv->zv_disk, bdev); @@ -391,13 +440,14 @@ zvol_set_volsize(const char *name, uint64_t volsize) if (readonly) return (SET_ERROR(EROFS)); - mutex_enter(&zvol_state_lock); - zv = zvol_find_by_name(name); - if (zv != NULL) - mutex_enter(&zv->zv_state_lock); - mutex_exit(&zvol_state_lock); + zv = zvol_find_by_name(name, RW_READER); + + ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) && + RW_READ_HELD(&zv->zv_suspend_lock))); if (zv == NULL || zv->zv_objset == NULL) { + if (zv != NULL) + rw_exit(&zv->zv_suspend_lock); if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, FTAG, &os)) != 0) { if (zv != NULL) @@ -408,7 +458,6 @@ zvol_set_volsize(const char *name, uint64_t volsize) if (zv != NULL) zv->zv_objset = os; } else { - rw_enter(&zv->zv_suspend_lock, RW_READER); os = zv->zv_objset; } @@ -435,6 +484,7 @@ out: if (zv != NULL) mutex_exit(&zv->zv_state_lock); + return (SET_ERROR(error)); } @@ -485,23 +535,20 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize) dmu_tx_t *tx; int error; - mutex_enter(&zvol_state_lock); + zv = zvol_find_by_name(name, RW_READER); - zv = zvol_find_by_name(name); - if (zv == NULL) { - mutex_exit(&zvol_state_lock); + if (zv == NULL) return (SET_ERROR(ENXIO)); - } - mutex_enter(&zv->zv_state_lock); - mutex_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock) && + RW_READ_HELD(&zv->zv_suspend_lock)); if (zv->zv_flags & ZVOL_RDONLY) { mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); return (SET_ERROR(EROFS)); } - rw_enter(&zv->zv_suspend_lock, RW_READER); - tx = dmu_tx_create(zv->zv_objset); dmu_tx_hold_bonus(tx, ZVOL_OBJ); error = dmu_tx_assign(tx, TXG_WAIT); @@ -516,9 +563,9 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize) if (error == 0) zv->zv_volblocksize = volblocksize; } - rw_exit(&zv->zv_suspend_lock); mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); return (SET_ERROR(error)); } @@ -1060,6 +1107,9 @@ zvol_setup_zv(zvol_state_t *zv) uint64_t ro; objset_t *os = zv->zv_objset; + ASSERT(MUTEX_HELD(&zv->zv_state_lock) && + RW_LOCK_HELD(&zv->zv_suspend_lock)); + error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) return (SET_ERROR(error)); @@ -1094,6 +1144,9 @@ zvol_setup_zv(zvol_state_t *zv) static void zvol_shutdown_zv(zvol_state_t *zv) { + ASSERT(MUTEX_HELD(&zv->zv_state_lock) && + RW_LOCK_HELD(&zv->zv_suspend_lock)); + zil_close(zv->zv_zilog); zv->zv_zilog = NULL; @@ -1127,24 +1180,27 @@ zvol_suspend(const char *name) { zvol_state_t *zv; - mutex_enter(&zvol_state_lock); - zv = zvol_find_by_name(name); - if (zv == NULL) { - mutex_exit(&zvol_state_lock); + zv = zvol_find_by_name(name, RW_WRITER); + + if (zv == NULL) return (NULL); - } - 
mutex_enter(&zv->zv_state_lock); - mutex_exit(&zvol_state_lock); /* block all I/O, release in zvol_resume. */ - rw_enter(&zv->zv_suspend_lock, RW_WRITER); + ASSERT(MUTEX_HELD(&zv->zv_state_lock) && + RW_WRITE_HELD(&zv->zv_suspend_lock)); atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) zvol_shutdown_zv(zv); + /* + * do not hold zv_state_lock across suspend/resume to + * avoid locking up zvol lookups + */ mutex_exit(&zv->zv_state_lock); + + /* zv_suspend_lock is released in zvol_resume() */ return (zv); } @@ -1155,11 +1211,8 @@ zvol_resume(zvol_state_t *zv) ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); - /* - * Cannot take zv_state_lock here with zv_suspend_lock - * held; however, the latter is held in exclusive mode, - * so it is not necessary to do so - */ + mutex_enter(&zv->zv_state_lock); + if (zv->zv_open_count > 0) { VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset)); VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv); @@ -1168,6 +1221,9 @@ zvol_resume(zvol_state_t *zv) error = zvol_setup_zv(zv); } + + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); /* * We need this because we don't hold zvol_state_lock while releasing @@ -1186,6 +1242,9 @@ zvol_first_open(zvol_state_t *zv) objset_t *os; int error, locked = 0; + ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + /* * In all other cases the spa_namespace_lock is taken before the * bdev->bd_mutex lock. But in this case the Linux __blkdev_get() @@ -1233,6 +1292,9 @@ out_mutex: static void zvol_last_close(zvol_state_t *zv) { + ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + zvol_shutdown_zv(zv); dmu_objset_disown(zv->zv_objset, zv); @@ -1243,14 +1305,15 @@ static int zvol_open(struct block_device *bdev, fmode_t flag) { zvol_state_t *zv; - int error = 0, drop_suspend = 0; + int error = 0; + boolean_t drop_suspend = B_FALSE; ASSERT(!mutex_owned(&zvol_state_lock)); mutex_enter(&zvol_state_lock); /* - * Obtain a copy of private_data under the lock to make sure - * that either the result of zvol free code path setting + * Obtain a copy of private_data under the zvol_state_lock to make + * sure that either the result of zvol free code path setting * bdev->bd_disk->private_data to NULL is observed, or zvol_free() * is not called on this zv because of the positive zv_open_count. 
*/ @@ -1259,14 +1322,25 @@ zvol_open(struct block_device *bdev, fmode_t flag) mutex_exit(&zvol_state_lock); return (SET_ERROR(-ENXIO)); } + + /* take zv_suspend_lock before zv_state_lock */ + rw_enter(&zv->zv_suspend_lock, RW_READER); + mutex_enter(&zv->zv_state_lock); - mutex_exit(&zvol_state_lock); + /* + * make sure zvol is not suspended during first open + * (hold zv_suspend_lock), otherwise, drop the lock + */ if (zv->zv_open_count == 0) { - /* make sure zvol is not suspended when first open */ - rw_enter(&zv->zv_suspend_lock, RW_READER); - drop_suspend = 1; + drop_suspend = B_TRUE; + } else { + rw_exit(&zv->zv_suspend_lock); + } + + mutex_exit(&zvol_state_lock); + if (zv->zv_open_count == 0) { error = zvol_first_open(zv); if (error) goto out_mutex; @@ -1285,9 +1359,9 @@ out_open_count: if (zv->zv_open_count == 0) zvol_last_close(zv); out_mutex: + mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); - mutex_exit(&zv->zv_state_lock); return (SET_ERROR(error)); } @@ -1300,27 +1374,38 @@ static int zvol_release(struct gendisk *disk, fmode_t mode) { zvol_state_t *zv; + boolean_t drop_suspend = B_FALSE; ASSERT(!mutex_owned(&zvol_state_lock)); mutex_enter(&zvol_state_lock); zv = disk->private_data; ASSERT(zv && zv->zv_open_count > 0); + + /* take zv_suspend_lock before zv_state_lock */ + rw_enter(&zv->zv_suspend_lock, RW_READER); + mutex_enter(&zv->zv_state_lock); mutex_exit(&zvol_state_lock); - /* make sure zvol is not suspended when last close */ + /* + * make sure zvol is not suspended during last close + * (hold zv_suspend_lock), otherwise, drop the lock + */ if (zv->zv_open_count == 1) - rw_enter(&zv->zv_suspend_lock, RW_READER); + drop_suspend = B_TRUE; + else + rw_exit(&zv->zv_suspend_lock); zv->zv_open_count--; - if (zv->zv_open_count == 0) { + if (zv->zv_open_count == 0) zvol_last_close(zv); - rw_exit(&zv->zv_suspend_lock); - } mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + #ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID return (0); #endif @@ -1430,10 +1515,11 @@ zvol_probe(dev_t dev, int *part, void *arg) zvol_state_t *zv; struct kobject *kobj; - mutex_enter(&zvol_state_lock); zv = zvol_find_by_dev(dev); kobj = zv ? 
get_disk(zv->zv_disk) : NULL; - mutex_exit(&zvol_state_lock); + ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock)); + if (zv) + mutex_exit(&zv->zv_state_lock); return (kobj); } @@ -1571,6 +1657,8 @@ zvol_free(void *arg) zvol_state_t *zv = arg; ASSERT(!MUTEX_HELD(&zvol_state_lock)); + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); ASSERT(zv->zv_open_count == 0); ASSERT(zv->zv_disk->private_data == NULL); @@ -1611,17 +1699,14 @@ zvol_create_minor_impl(const char *name) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; - mutex_enter(&zvol_state_lock); - - zv = zvol_find_by_name_hash(name, hash); + zv = zvol_find_by_name_hash(name, hash, RW_NONE); if (zv) { - mutex_exit(&zvol_state_lock); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + mutex_exit(&zv->zv_state_lock); ida_simple_remove(&zvol_ida, idx); return (SET_ERROR(EEXIST)); } - mutex_exit(&zvol_state_lock); - doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os); @@ -1715,10 +1800,9 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname) int readonly = get_disk_ro(zv->zv_disk); ASSERT(MUTEX_HELD(&zvol_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - rw_enter(&zv->zv_suspend_lock, RW_READER); strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); - rw_exit(&zv->zv_suspend_lock); /* move to new hashtable entry */ zv->zv_hash = zvol_name_hash(zv->zv_name); @@ -1960,15 +2044,15 @@ zvol_remove_minors_impl(const char *name) for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); + mutex_enter(&zv->zv_state_lock); if (name == NULL || strcmp(zv->zv_name, name) == 0 || (strncmp(zv->zv_name, name, namelen) == 0 && (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'))) { /* - * By taking zv_state_lock here, we guarantee that no + * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ - mutex_enter(&zv->zv_state_lock); /* If in use, leave alone */ if (zv->zv_open_count > 0 || @@ -1979,7 +2063,10 @@ zvol_remove_minors_impl(const char *name) zvol_remove(zv); - /* clear this so zvol_open won't open it */ + /* + * clear this while holding zvol_state_lock so + * zvol_open won't open it + */ zv->zv_disk->private_data = NULL; /* Drop zv_state_lock before zvol_free() */ @@ -1992,6 +2079,8 @@ zvol_remove_minors_impl(const char *name) list_insert_head(&free_list, zv); else tid = t; + } else { + mutex_exit(&zv->zv_state_lock); } } mutex_exit(&zvol_state_lock); @@ -2025,12 +2114,12 @@ zvol_remove_minor_impl(const char *name) for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); + mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, name) == 0) { /* - * By taking zv_state_lock here, we guarantee that no + * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ - mutex_enter(&zv->zv_state_lock); /* If in use, leave alone */ if (zv->zv_open_count > 0 || @@ -2045,6 +2134,8 @@ zvol_remove_minor_impl(const char *name) mutex_exit(&zv->zv_state_lock); break; + } else { + mutex_exit(&zv->zv_state_lock); } } @@ -2077,9 +2168,13 @@ zvol_rename_minors_impl(const char *oldname, const char *newname) for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); + mutex_enter(&zv->zv_state_lock); + /* If in use, leave alone */ - if (zv->zv_open_count > 0) + if (zv->zv_open_count > 0) { + mutex_exit(&zv->zv_state_lock); 
continue; + } if (strcmp(zv->zv_name, oldname) == 0) { zvol_rename_minor(zv, newname); @@ -2091,6 +2186,8 @@ zvol_rename_minors_impl(const char *oldname, const char *newname) zv->zv_name + oldnamelen + 1); zvol_rename_minor(zv, name); } + + mutex_exit(&zv->zv_state_lock); } mutex_exit(&zvol_state_lock); |
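The open and release paths in the diff follow the same discipline: zv_suspend_lock is taken before zv_state_lock and is kept only across the first open (or, on release, the last close), otherwise it is dropped immediately. Below is a rough sketch of the open side, continuing the pthread model from the earlier example; zvol_open_sketch is an invented name, and the real zvol_open() additionally holds the global zvol_state_lock while reading bdev->bd_disk->private_data, which is omitted here.

```c
/* Open-path sketch on the pthread model above (illustrative only). */
static int
zvol_open_sketch(zvol_state_t *zv)
{
	int drop_suspend = 0;

	/* take zv_suspend_lock before zv_state_lock, per the ordering rule */
	pthread_rwlock_rdlock(&zv->zv_suspend_lock);
	pthread_mutex_lock(&zv->zv_state_lock);

	if (zv->zv_open_count == 0) {
		/* first open: keep the suspend lock so setup can't race a suspend */
		drop_suspend = 1;
	} else {
		/* not the first open: the suspend lock is not needed */
		pthread_rwlock_unlock(&zv->zv_suspend_lock);
	}

	zv->zv_open_count++;

	pthread_mutex_unlock(&zv->zv_state_lock);
	if (drop_suspend)
		pthread_rwlock_unlock(&zv->zv_suspend_lock);
	return (0);
}
```

The release side is symmetric: the suspend lock is retained only when zv_open_count is about to drop to zero, and both locks are released in the reverse of the order they were taken.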
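Finally, the suspend/resume pair shows why zv_state_lock must not be held across the suspension window: zv_suspend_lock can stay write-held for the duration of a receive or rollback, while dropping zv_state_lock keeps lookups and opens from stalling behind it. Again only a hedged sketch on the same model, with invented names; the patch's own functions are zvol_suspend() and zvol_resume().

```c
/*
 * Suspend/resume sketch: zv_suspend_lock stays write-held for the whole
 * suspension, zv_state_lock is dropped before returning.
 */
static zvol_state_t *
zvol_suspend_sketch(const char *name)
{
	/* writer mode: returns with zv_suspend_lock (write) + zv_state_lock */
	zvol_state_t *zv = zvol_lookup(name, 1);

	if (zv == NULL)
		return (NULL);

	/* quiesce per-zvol state here while both locks are held ... */

	/* never hold zv_state_lock across the suspension window */
	pthread_mutex_unlock(&zv->zv_state_lock);

	/* zv_suspend_lock is released in zvol_resume_sketch() */
	return (zv);
}

static void
zvol_resume_sketch(zvol_state_t *zv)
{
	pthread_mutex_lock(&zv->zv_state_lock);
	/* re-establish per-zvol state (objset, zilog) here in the real code */
	pthread_mutex_unlock(&zv->zv_state_lock);
	pthread_rwlock_unlock(&zv->zv_suspend_lock);
}
```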