aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChunwei Chen <[email protected]>2017-01-19 13:56:36 -0800
committerBrian Behlendorf <[email protected]>2017-01-19 13:56:36 -0800
commit040dab993936d832df4c7624bbcdb71c3fb9b34b (patch)
tree0261aedba3c5e6fdda94a0a10388d065f0802924
parent76fe529b392068dfb7575739542cd4f69d2d4343 (diff)
Suspend/resume zvol for recv and rollback
When doing recv and rollback, dsl_dataset_clone_swap_sync_impl will be called to swap out the ds_objset and do dmu_objset_evict on the old one. However, currently zv->zv_objset will not be swapped out accordingly, so if anyone currently holds a fd on the zvol, we risk hitting a use-after-free. We fix this by introducing the suspend and resume mechanism of zsb to zv. Before recv or rollback, we use zvol_suspend to block all access to zv_objset and shut it down. After the recv or rollback, we use zvol_resume to swap in zv_objset with the new ds_objset and unblock the access. Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Chunwei Chen <[email protected]> Closes #4866 Closes #5609
-rw-r--r--include/sys/zvol.h7
-rw-r--r--module/zfs/dsl_dataset.c5
-rw-r--r--module/zfs/zfs_ioctl.c8
-rw-r--r--module/zfs/zvol.c190
4 files changed, 168 insertions, 42 deletions
diff --git a/include/sys/zvol.h b/include/sys/zvol.h
index 2fb20fbf9..f149da977 100644
--- a/include/sys/zvol.h
+++ b/include/sys/zvol.h
@@ -35,14 +35,14 @@
#define SPEC_MAXOFFSET_T ((1LL << ((NBBY * sizeof (daddr32_t)) + \
DEV_BSHIFT - 1)) - 1)
-extern void *zvol_tag;
-
extern void zvol_create_minors(spa_t *spa, const char *name, boolean_t async);
extern void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async);
extern void zvol_rename_minors(spa_t *spa, const char *oldname,
const char *newname, boolean_t async);
#ifdef _KERNEL
+typedef struct zvol_state zvol_state_t;
+
extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
extern int zvol_check_volblocksize(const char *name, uint64_t volblocksize);
extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
@@ -51,6 +51,9 @@ extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
extern int zvol_set_volsize(const char *, uint64_t);
extern int zvol_set_volblocksize(const char *, uint64_t);
extern int zvol_set_snapdev(const char *, zprop_source_t, uint64_t);
+extern zvol_state_t *zvol_suspend(const char *);
+extern int zvol_resume(zvol_state_t *);
+extern void *zvol_tag(zvol_state_t *);
extern int zvol_init(void);
extern void zvol_fini(void);
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 43fecf28b..ddab3eddb 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -2124,8 +2124,7 @@ dsl_dataset_rename_snapshot(const char *fsname,
* only one long hold on the dataset. We're not allowed to change anything here
* so we don't permanently release the long hold or regular hold here. We want
* to do this only when syncing to avoid the dataset unexpectedly going away
- * when we release the long hold. Allow a long hold to exist for volumes, this
- * may occur when asynchronously registering the minor with the kernel.
+ * when we release the long hold.
*/
static int
dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
@@ -2140,7 +2139,7 @@ dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
dsl_dataset_long_rele(ds, owner);
}
- held = (dsl_dataset_long_held(ds) && (ds->ds_owner != zvol_tag));
+ held = dsl_dataset_long_held(ds);
if (owner != NULL)
dsl_dataset_long_hold(ds, owner);
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index ba4e0ee3f..659345917 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -3641,6 +3641,7 @@ static int
zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl)
{
zfs_sb_t *zsb;
+ zvol_state_t *zv;
int error;
if (get_zfs_sb(fsname, &zsb) == 0) {
@@ -3653,6 +3654,9 @@ zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl)
error = error ? error : resume_err;
}
deactivate_super(zsb->z_sb);
+ } else if ((zv = zvol_suspend(fsname)) != NULL) {
+ error = dsl_dataset_rollback(fsname, zvol_tag(zv), outnvl);
+ zvol_resume(zv);
} else {
error = dsl_dataset_rollback(fsname, NULL, outnvl);
}
@@ -4240,6 +4244,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin,
if (error == 0) {
zfs_sb_t *zsb = NULL;
+ zvol_state_t *zv = NULL;
if (get_zfs_sb(tofs, &zsb) == 0) {
/* online recv */
@@ -4255,6 +4260,9 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin,
error = zfs_resume_fs(zsb, tofs);
error = error ? error : end_err;
deactivate_super(zsb->z_sb);
+ } else if ((zv = zvol_suspend(tofs)) != NULL) {
+ error = dmu_recv_end(&drc, zvol_tag(zv));
+ zvol_resume(zv);
} else {
error = dmu_recv_end(&drc, NULL);
}
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 10c963ca5..aad110b1b 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -61,7 +61,6 @@ unsigned long zvol_max_discard_blocks = 16384;
static kmutex_t zvol_state_lock;
static list_t zvol_state_list;
-void *zvol_tag = "zvol_tag";
#define ZVOL_HT_SIZE 1024
static struct hlist_head *zvol_htable;
@@ -71,7 +70,7 @@ static DEFINE_IDA(zvol_ida);
/*
* The in-core state of each volume.
*/
-typedef struct zvol_state {
+struct zvol_state {
char zv_name[MAXNAMELEN]; /* name */
uint64_t zv_volsize; /* advertised space */
uint64_t zv_volblocksize; /* volume block size */
@@ -88,7 +87,9 @@ typedef struct zvol_state {
list_node_t zv_next; /* next zvol_state_t linkage */
uint64_t zv_hash; /* name hash */
struct hlist_node zv_hlink; /* hash link */
-} zvol_state_t;
+ atomic_t zv_suspend_ref; /* refcount for suspend */
+ krwlock_t zv_suspend_lock; /* suspend lock */
+};
typedef enum {
ZVOL_ASYNC_CREATE_MINORS,
@@ -373,6 +374,7 @@ zvol_set_volsize(const char *name, uint64_t volsize)
if (zv != NULL)
zv->zv_objset = os;
} else {
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
os = zv->zv_objset;
}
@@ -392,6 +394,8 @@ out:
dmu_objset_disown(os, FTAG);
if (zv != NULL)
zv->zv_objset = NULL;
+ } else {
+ rw_exit(&zv->zv_suspend_lock);
}
mutex_exit(&zvol_state_lock);
return (error);
@@ -457,6 +461,8 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize)
goto out;
}
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
tx = dmu_tx_create(zv->zv_objset);
dmu_tx_hold_bonus(tx, ZVOL_OBJ);
error = dmu_tx_assign(tx, TXG_WAIT);
@@ -471,6 +477,7 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize)
if (error == 0)
zv->zv_volblocksize = volblocksize;
}
+ rw_exit(&zv->zv_suspend_lock);
out:
mutex_exit(&zvol_state_lock);
@@ -785,6 +792,8 @@ zvol_request(struct request_queue *q, struct bio *bio)
#endif
int error = 0;
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
uio.uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
uio.uio_skip = BIO_BI_SKIP(bio);
uio.uio_resid = BIO_BI_SIZE(bio);
@@ -836,6 +845,7 @@ out2:
generic_end_io_acct(rw, &zv->zv_disk->part0, start);
out1:
BIO_END_IO(bio, -error);
+ rw_exit(&zv->zv_suspend_lock);
spl_fstrans_unmark(cookie);
#ifdef HAVE_MAKE_REQUEST_FN_RET_INT
return (0);
@@ -947,32 +957,28 @@ zvol_remove(zvol_state_t *zv)
hlist_del(&zv->zv_hlink);
}
+/*
+ * Setup zv after we just own the zv->objset
+ */
static int
-zvol_first_open(zvol_state_t *zv)
+zvol_setup_zv(zvol_state_t *zv)
{
- objset_t *os;
uint64_t volsize;
int error;
uint64_t ro;
-
- /* lie and say we're read-only */
- error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os);
- if (error)
- return (SET_ERROR(-error));
-
- zv->zv_objset = os;
+ objset_t *os = zv->zv_objset;
error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
if (error)
- goto out_owned;
+ return (SET_ERROR(error));
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
if (error)
- goto out_owned;
+ return (SET_ERROR(error));
- error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
+ error = dmu_bonus_hold(os, ZVOL_OBJ, zv, &zv->zv_dbuf);
if (error)
- goto out_owned;
+ return (SET_ERROR(error));
set_capacity(zv->zv_disk, volsize >> 9);
zv->zv_volsize = volsize;
@@ -986,23 +992,20 @@ zvol_first_open(zvol_state_t *zv)
set_disk_ro(zv->zv_disk, 0);
zv->zv_flags &= ~ZVOL_RDONLY;
}
-
-out_owned:
- if (error) {
- dmu_objset_disown(os, zvol_tag);
- zv->zv_objset = NULL;
- }
-
- return (SET_ERROR(-error));
+ return (0);
}
+/*
+ * Shutdown every zv_objset related stuff except zv_objset itself.
+ * The is the reverse of zvol_setup_zv.
+ */
static void
-zvol_last_close(zvol_state_t *zv)
+zvol_shutdown_zv(zvol_state_t *zv)
{
zil_close(zv->zv_zilog);
zv->zv_zilog = NULL;
- dmu_buf_rele(zv->zv_dbuf, zvol_tag);
+ dmu_buf_rele(zv->zv_dbuf, zv);
zv->zv_dbuf = NULL;
/*
@@ -1012,8 +1015,98 @@ zvol_last_close(zvol_state_t *zv)
!(zv->zv_flags & ZVOL_RDONLY))
txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
(void) dmu_objset_evict_dbufs(zv->zv_objset);
+}
+
+/*
+ * return the proper tag for rollback and recv
+ */
+void *
+zvol_tag(zvol_state_t *zv)
+{
+ ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
+ return (zv->zv_open_count > 0 ? zv : NULL);
+}
+
+/*
+ * Suspend the zvol for recv and rollback.
+ */
+zvol_state_t *
+zvol_suspend(const char *name)
+{
+ zvol_state_t *zv;
+
+ mutex_enter(&zvol_state_lock);
+ zv = zvol_find_by_name(name);
+ if (zv == NULL)
+ goto out;
- dmu_objset_disown(zv->zv_objset, zvol_tag);
+ /* block all I/O, release in zvol_resume. */
+ rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+
+ atomic_inc(&zv->zv_suspend_ref);
+
+ if (zv->zv_open_count > 0)
+ zvol_shutdown_zv(zv);
+out:
+ mutex_exit(&zvol_state_lock);
+ return (zv);
+}
+
+int
+zvol_resume(zvol_state_t *zv)
+{
+ int error = 0;
+
+ ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
+ if (zv->zv_open_count > 0) {
+ VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset));
+ VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv);
+ VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset));
+ dmu_objset_rele(zv->zv_objset, zv);
+
+ error = zvol_setup_zv(zv);
+ }
+ rw_exit(&zv->zv_suspend_lock);
+ /*
+ * We need this because we don't hold zvol_state_lock while releasing
+ * zv_suspend_lock. zvol_remove_minors_impl thus cannot check
+ * zv_suspend_lock to determine it is safe to free because rwlock is
+ * not inherent atomic.
+ */
+ atomic_dec(&zv->zv_suspend_ref);
+
+ return (SET_ERROR(error));
+}
+
+static int
+zvol_first_open(zvol_state_t *zv)
+{
+ objset_t *os;
+ int error;
+
+ /* lie and say we're read-only */
+ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zv, &os);
+ if (error)
+ return (SET_ERROR(-error));
+
+ zv->zv_objset = os;
+
+ error = zvol_setup_zv(zv);
+
+ if (error) {
+ dmu_objset_disown(os, zv);
+ zv->zv_objset = NULL;
+ }
+
+ return (SET_ERROR(-error));
+}
+
+static void
+zvol_last_close(zvol_state_t *zv)
+{
+ zvol_shutdown_zv(zv);
+
+ dmu_objset_disown(zv->zv_objset, zv);
zv->zv_objset = NULL;
}
@@ -1021,7 +1114,7 @@ static int
zvol_open(struct block_device *bdev, fmode_t flag)
{
zvol_state_t *zv;
- int error = 0, drop_mutex = 0;
+ int error = 0, drop_mutex = 0, drop_suspend = 0;
/*
* If the caller is already holding the mutex do not take it
@@ -1047,6 +1140,10 @@ zvol_open(struct block_device *bdev, fmode_t flag)
}
if (zv->zv_open_count == 0) {
+ /* make sure zvol is not suspended when first open */
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+ drop_suspend = 1;
+
error = zvol_first_open(zv);
if (error)
goto out_mutex;
@@ -1064,8 +1161,9 @@ zvol_open(struct block_device *bdev, fmode_t flag)
out_open_count:
if (zv->zv_open_count == 0)
zvol_last_close(zv);
-
out_mutex:
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
if (drop_mutex)
mutex_exit(&zvol_state_lock);
@@ -1089,9 +1187,15 @@ zvol_release(struct gendisk *disk, fmode_t mode)
drop_mutex = 1;
}
+ /* make sure zvol is not suspended when last close */
+ if (zv->zv_open_count == 1)
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
zv->zv_open_count--;
- if (zv->zv_open_count == 0)
+ if (zv->zv_open_count == 0) {
zvol_last_close(zv);
+ rw_exit(&zv->zv_suspend_lock);
+ }
if (drop_mutex)
mutex_exit(&zvol_state_lock);
@@ -1110,6 +1214,7 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
ASSERT(zv && zv->zv_open_count > 0);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
switch (cmd) {
case BLKFLSBUF:
zil_commit(zv->zv_zilog, ZVOL_OBJ);
@@ -1121,8 +1226,8 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
default:
error = -ENOTTY;
break;
-
}
+ rw_exit(&zv->zv_suspend_lock);
return (SET_ERROR(error));
}
@@ -1296,6 +1401,7 @@ zvol_alloc(dev_t dev, const char *name)
strlcpy(zv->zv_name, name, MAXNAMELEN);
zfs_rlock_init(&zv->zv_range_lock);
+ rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
zv->zv_disk->major = zvol_major;
zv->zv_disk->first_minor = (dev & MINORMASK);
@@ -1325,6 +1431,7 @@ zvol_free_impl(void *arg)
zvol_state_t *zv = arg;
ASSERT(zv->zv_open_count == 0);
+ rw_destroy(&zv->zv_suspend_lock);
zfs_rlock_destroy(&zv->zv_range_lock);
zv->zv_disk->private_data = NULL;
@@ -1380,7 +1487,7 @@ zvol_create_minor_impl(const char *name)
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
- error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os);
+ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
if (error)
goto out_doi;
@@ -1446,7 +1553,7 @@ zvol_create_minor_impl(const char *name)
zv->zv_objset = NULL;
out_dmu_objset_disown:
- dmu_objset_disown(os, zvol_tag);
+ dmu_objset_disown(os, FTAG);
out_doi:
kmem_free(doi, sizeof (dmu_object_info_t));
out:
@@ -1479,7 +1586,14 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname)
ASSERT(MUTEX_HELD(&zvol_state_lock));
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
+ rw_exit(&zv->zv_suspend_lock);
+
+ /* move to new hashtable entry */
+ zv->zv_hash = zvol_name_hash(zv->zv_name);
+ hlist_del(&zv->zv_hlink);
+ hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
/*
* The block device's read-only state is briefly changed causing
@@ -1512,11 +1626,11 @@ zvol_prefetch_minors_impl(void *arg)
char *dsname = job->name;
objset_t *os = NULL;
- job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, zvol_tag,
+ job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, FTAG,
&os);
if (job->error == 0) {
dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
- dmu_objset_disown(os, zvol_tag);
+ dmu_objset_disown(os, FTAG);
}
}
@@ -1718,7 +1832,8 @@ zvol_remove_minors_impl(const char *name)
zv->zv_name[namelen] == '@'))) {
/* If in use, leave alone */
- if (zv->zv_open_count > 0)
+ if (zv->zv_open_count > 0 ||
+ atomic_read(&zv->zv_suspend_ref))
continue;
zvol_remove(zv);
@@ -1759,7 +1874,8 @@ zvol_remove_minor_impl(const char *name)
if (strcmp(zv->zv_name, name) == 0) {
/* If in use, leave alone */
- if (zv->zv_open_count > 0)
+ if (zv->zv_open_count > 0 ||
+ atomic_read(&zv->zv_suspend_ref))
continue;
zvol_remove(zv);
zvol_free(zv);