diff options
author | Boris Protopopov <[email protected]> | 2014-03-22 05:07:14 -0400 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2016-03-10 09:49:22 -0800 |
commit | a0bd735adb1b1eb81fef10b4db102ee051c4d4ff (patch) | |
tree | 121fcde3000a116f0c33143b28a530a87fd6073a /module/zfs/zvol.c | |
parent | eb0856779f7b57162c9179f238104f6d6e150745 (diff) |
Add support for asynchronous zvol minor operations
zfsonlinux issue #2217 - zvol minor operations: check snapdev
property before traversing snapshots of a dataset
zfsonlinux issue #3681 - lock order inversion between zvol_open()
and dsl_pool_sync()...zvol_rename_minors()
Create a per-pool zvol taskq for asynchronous zvol tasks.
There are a few key design decisions to be aware of.
* Each taskq must be single threaded to ensure tasks are always
processed in the order in which they were dispatched.
* There is a taskq per-pool in order to keep the pools independent.
This way if one pool is suspended it will not impact another.
* The preferred location to dispatch a zvol minor task is a sync
task. In this context there is easy access to the spa_t and
minimal error handling is required because the sync task must
succeed.
Support for asynchronous zvol minor operations addresses issue #3681.
Signed-off-by: Boris Protopopov <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #2217
Closes #3678
Closes #3681
Diffstat (limited to 'module/zfs/zvol.c')
-rw-r--r-- | module/zfs/zvol.c | 494 |
1 files changed, 373 insertions, 121 deletions
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 034cf6a6a..ab4d3ceb7 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -42,6 +42,7 @@ #include <sys/dmu_traverse.h> #include <sys/dsl_dataset.h> #include <sys/dsl_prop.h> +#include <sys/dsl_dir.h> #include <sys/zap.h> #include <sys/zfeature.h> #include <sys/zil_impl.h> @@ -49,6 +50,7 @@ #include <sys/zio.h> #include <sys/zfs_rlock.h> #include <sys/zfs_znode.h> +#include <sys/spa_impl.h> #include <sys/zvol.h> #include <linux/blkdev_compat.h> @@ -81,6 +83,23 @@ typedef struct zvol_state { list_node_t zv_next; /* next zvol_state_t linkage */ } zvol_state_t; +typedef enum { + ZVOL_ASYNC_CREATE_MINORS, + ZVOL_ASYNC_REMOVE_MINORS, + ZVOL_ASYNC_RENAME_MINORS, + ZVOL_ASYNC_SET_SNAPDEV, + ZVOL_ASYNC_MAX +} zvol_async_op_t; + +typedef struct { + zvol_async_op_t op; + char pool[MAXNAMELEN]; + char name1[MAXNAMELEN]; + char name2[MAXNAMELEN]; + zprop_source_t source; + uint64_t snapdev; +} zvol_task_t; + #define ZVOL_RDONLY 0x1 /* @@ -977,6 +996,7 @@ zvol_first_open(zvol_state_t *zv) error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) { dmu_objset_disown(os, zvol_tag); + zv->zv_objset = NULL; goto out_mutex; } @@ -984,6 +1004,7 @@ zvol_first_open(zvol_state_t *zv) error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); if (error) { dmu_objset_disown(os, zvol_tag); + zv->zv_objset = NULL; goto out_mutex; } @@ -1036,7 +1057,7 @@ zvol_open(struct block_device *bdev, fmode_t flag) /* * If the caller is already holding the mutex do not take it - * again, this will happen as part of zvol_create_minor(). + * again, this will happen as part of zvol_create_minor_impl(). * Once add_disk() is called the device is live and the kernel * will attempt to open it to read the partition information. */ @@ -1355,31 +1376,13 @@ zvol_free(zvol_state_t *zv) kmem_free(zv, sizeof (zvol_state_t)); } +/* + * Create a block device minor node and setup the linkage between it + * and the specified volume. 
Once this function returns the block + * device is live and ready for use. + */ static int -__zvol_snapdev_hidden(const char *name) -{ - uint64_t snapdev; - char *parent; - char *atp; - int error = 0; - - parent = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strlcpy(parent, name, MAXPATHLEN); - - if ((atp = strrchr(parent, '@')) != NULL) { - *atp = '\0'; - error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL); - if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN)) - error = SET_ERROR(ENODEV); - } - - kmem_free(parent, MAXPATHLEN); - - return (SET_ERROR(error)); -} - -static int -__zvol_create_minor(const char *name, boolean_t ignore_snapdev) +zvol_create_minor_impl(const char *name) { zvol_state_t *zv; objset_t *os; @@ -1389,7 +1392,7 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) unsigned minor = 0; int error = 0; - ASSERT(MUTEX_HELD(&zvol_state_lock)); + mutex_enter(&zvol_state_lock); zv = zvol_find_by_name(name); if (zv) { @@ -1397,12 +1400,6 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) goto out; } - if (ignore_snapdev == B_FALSE) { - error = __zvol_snapdev_hidden(name); - if (error) - goto out; - } - doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os); @@ -1489,69 +1486,18 @@ out: */ mutex_exit(&zvol_state_lock); add_disk(zv->zv_disk); - mutex_enter(&zvol_state_lock); + } else { + mutex_exit(&zvol_state_lock); } return (SET_ERROR(error)); } /* - * Create a block device minor node and setup the linkage between it - * and the specified volume. Once this function returns the block - * device is live and ready for use. 
- */ -int -zvol_create_minor(const char *name) -{ - int error; - - mutex_enter(&zvol_state_lock); - error = __zvol_create_minor(name, B_FALSE); - mutex_exit(&zvol_state_lock); - - return (SET_ERROR(error)); -} - -static int -__zvol_remove_minor(const char *name) -{ - zvol_state_t *zv; - - ASSERT(MUTEX_HELD(&zvol_state_lock)); - - zv = zvol_find_by_name(name); - if (zv == NULL) - return (SET_ERROR(ENXIO)); - - if (zv->zv_open_count > 0) - return (SET_ERROR(EBUSY)); - - zvol_remove(zv); - zvol_free(zv); - - return (0); -} - -/* - * Remove a block device minor node for the specified volume. - */ -int -zvol_remove_minor(const char *name) -{ - int error; - - mutex_enter(&zvol_state_lock); - error = __zvol_remove_minor(name); - mutex_exit(&zvol_state_lock); - - return (SET_ERROR(error)); -} - -/* * Rename a block device minor mode for the specified volume. */ static void -__zvol_rename_minor(zvol_state_t *zv, const char *newname) +zvol_rename_minor(zvol_state_t *zv, const char *newname) { int readonly = get_disk_ro(zv->zv_disk); @@ -1571,30 +1517,120 @@ __zvol_rename_minor(zvol_state_t *zv, const char *newname) set_disk_ro(zv->zv_disk, readonly); } + +/* + * Mask errors to continue dmu_objset_find() traversal + */ +static int +zvol_create_snap_minor_cb(const char *dsname, void *arg) +{ + const char *name = (const char *)arg; + + /* skip the designated dataset */ + if (name && strcmp(dsname, name) == 0) + return (0); + + /* at this point, the dsname should name a snapshot */ + if (strchr(dsname, '@') == 0) { + dprintf("zvol_create_snap_minor_cb(): " + "%s is not a shapshot name\n", dsname); + } else { + (void) zvol_create_minor_impl(dsname); + } + + return (0); +} + +/* + * Mask errors to continue dmu_objset_find() traversal + */ static int zvol_create_minors_cb(const char *dsname, void *arg) { - (void) zvol_create_minor(dsname); + uint64_t snapdev; + int error; + + error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL); + if (error) + return (0); + + /* + * 
Given the name and the 'snapdev' property, create device minor nodes + * with the linkages to zvols/snapshots as needed. + * If the name represents a zvol, create a minor node for the zvol, then + * check if its snapshots are 'visible', and if so, iterate over the + * snapshots and create device minor nodes for those. + */ + if (strchr(dsname, '@') == 0) { + /* create minor for the 'dsname' explicitly */ + error = zvol_create_minor_impl(dsname); + if ((error == 0 || error == EEXIST) && + (snapdev == ZFS_SNAPDEV_VISIBLE)) { + fstrans_cookie_t cookie = spl_fstrans_mark(); + /* + * traverse snapshots only, do not traverse children, + * and skip the 'dsname' + */ + error = dmu_objset_find((char *)dsname, + zvol_create_snap_minor_cb, (void *)dsname, + DS_FIND_SNAPSHOTS); + spl_fstrans_unmark(cookie); + } + } else { + dprintf("zvol_create_minors_cb(): %s is not a zvol name\n", + dsname); + } return (0); } /* - * Create minors for specified dataset including children and snapshots. + * Create minors for the specified dataset, including children and snapshots. + * Pay attention to the 'snapdev' property and iterate over the snapshots + * only if they are 'visible'. This approach allows one to assure that the + * snapshot metadata is read from disk only if it is needed. + * + * The name can represent a dataset to be recursively scanned for zvols and + * their snapshots, or a single zvol snapshot. If the name represents a + * dataset, the scan is performed in two nested stages: + * - scan the dataset for zvols, and + * - for each zvol, create a minor node, then check if the zvol's snapshots + * are 'visible', and only then iterate over the snapshots if needed + * + * If the name represents a snapshot, a check is perfromed if the snapshot is + * 'visible' (which also verifies that the parent is a zvol), and if so, + * a minor node for that snapshot is created. 
*/ -int -zvol_create_minors(const char *name) +static int +zvol_create_minors_impl(const char *name) { int error = 0; fstrans_cookie_t cookie; + char *atp, *parent; if (zvol_inhibit_dev) return (0); - cookie = spl_fstrans_mark(); - error = dmu_objset_find((char *)name, zvol_create_minors_cb, - NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); - spl_fstrans_unmark(cookie); + parent = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) strlcpy(parent, name, MAXPATHLEN); + + if ((atp = strrchr(parent, '@')) != NULL) { + uint64_t snapdev; + + *atp = '\0'; + error = dsl_prop_get_integer(parent, "snapdev", + &snapdev, NULL); + + if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) + error = zvol_create_minor_impl(name); + } else { + cookie = spl_fstrans_mark(); + error = dmu_objset_find(parent, zvol_create_minors_cb, + NULL, DS_FIND_CHILDREN); + spl_fstrans_unmark(cookie); + } + + kmem_free(parent, MAXPATHLEN); return (SET_ERROR(error)); } @@ -1602,8 +1638,8 @@ zvol_create_minors(const char *name) /* * Remove minors for specified dataset including children and snapshots. */ -void -zvol_remove_minors(const char *name) +static void +zvol_remove_minors_impl(const char *name) { zvol_state_t *zv, *zv_next; int namelen = ((name) ? strlen(name) : 0); @@ -1633,11 +1669,41 @@ zvol_remove_minors(const char *name) mutex_exit(&zvol_state_lock); } +/* Remove minor for this specific snapshot only */ +static void +zvol_remove_minor_impl(const char *name) +{ + zvol_state_t *zv, *zv_next; + + if (zvol_inhibit_dev) + return; + + if (strchr(name, '@') == NULL) + return; + + mutex_enter(&zvol_state_lock); + + for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { + zv_next = list_next(&zvol_state_list, zv); + + if (strcmp(zv->zv_name, name) == 0) { + /* If in use, leave alone */ + if (zv->zv_open_count > 0) + continue; + zvol_remove(zv); + zvol_free(zv); + break; + } + } + + mutex_exit(&zvol_state_lock); +} + /* * Rename minors for specified dataset including children and snapshots. 
*/ -void -zvol_rename_minors(const char *oldname, const char *newname) +static void +zvol_rename_minors_impl(const char *oldname, const char *newname) { zvol_state_t *zv, *zv_next; int oldnamelen, newnamelen; @@ -1660,14 +1726,14 @@ zvol_rename_minors(const char *oldname, const char *newname) continue; if (strcmp(zv->zv_name, oldname) == 0) { - __zvol_rename_minor(zv, newname); + zvol_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { snprintf(name, MAXNAMELEN, "%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); - __zvol_rename_minor(zv, name); + zvol_rename_minor(zv, name); } } @@ -1676,42 +1742,227 @@ zvol_rename_minors(const char *oldname, const char *newname) kmem_free(name, MAXNAMELEN); } +typedef struct zvol_snapdev_cb_arg { + uint64_t snapdev; +} zvol_snapdev_cb_arg_t; + static int -snapdev_snapshot_changed_cb(const char *dsname, void *arg) { - uint64_t snapdev = *(uint64_t *) arg; +zvol_set_snapdev_cb(const char *dsname, void *param) { + zvol_snapdev_cb_arg_t *arg = param; if (strchr(dsname, '@') == NULL) return (0); - switch (snapdev) { + switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: - mutex_enter(&zvol_state_lock); - (void) __zvol_create_minor(dsname, B_TRUE); - mutex_exit(&zvol_state_lock); + (void) zvol_create_minor_impl(dsname); break; case ZFS_SNAPDEV_HIDDEN: - (void) zvol_remove_minor(dsname); + (void) zvol_remove_minor_impl(dsname); break; } return (0); } +static void +zvol_set_snapdev_impl(char *name, uint64_t snapdev) +{ + zvol_snapdev_cb_arg_t arg = {snapdev}; + fstrans_cookie_t cookie = spl_fstrans_mark(); + /* + * The zvol_set_snapdev_sync() sets snapdev appropriately + * in the dataset hierarchy. Here, we only scan snapshots. 
+ */ + dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS); + spl_fstrans_unmark(cookie); +} + +static zvol_task_t * +zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, + uint64_t snapdev) +{ + zvol_task_t *task; + char *delim; + + /* Never allow tasks on hidden names. */ + if (name1[0] == '$') + return (NULL); + + task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); + task->op = op; + task->snapdev = snapdev; + delim = strchr(name1, '/'); + strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN); + + strlcpy(task->name1, name1, MAXNAMELEN); + if (name2 != NULL) + strlcpy(task->name2, name2, MAXNAMELEN); + + return (task); +} + +static void +zvol_task_free(zvol_task_t *task) +{ + kmem_free(task, sizeof (zvol_task_t)); +} + +/* + * The worker thread function performed asynchronously. + */ +static void +zvol_task_cb(void *param) +{ + zvol_task_t *task = (zvol_task_t *)param; + + switch (task->op) { + case ZVOL_ASYNC_CREATE_MINORS: + (void) zvol_create_minors_impl(task->name1); + break; + case ZVOL_ASYNC_REMOVE_MINORS: + zvol_remove_minors_impl(task->name1); + break; + case ZVOL_ASYNC_RENAME_MINORS: + zvol_rename_minors_impl(task->name1, task->name2); + break; + case ZVOL_ASYNC_SET_SNAPDEV: + zvol_set_snapdev_impl(task->name1, task->snapdev); + break; + default: + VERIFY(0); + break; + } + + zvol_task_free(task); +} + +typedef struct zvol_set_snapdev_arg { + const char *zsda_name; + uint64_t zsda_value; + zprop_source_t zsda_source; + dmu_tx_t *zsda_tx; +} zvol_set_snapdev_arg_t; + +/* + * Sanity check the dataset for safe use by the sync task. No additional + * conditions are imposed. 
+ */ +static int +zvol_set_snapdev_check(void *arg, dmu_tx_t *tx) +{ + zvol_set_snapdev_arg_t *zsda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd; + int error; + + error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); + if (error != 0) + return (error); + + dsl_dir_rele(dd, FTAG); + + return (error); +} + +static int +zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + zvol_set_snapdev_arg_t *zsda = arg; + char dsname[MAXNAMELEN]; + zvol_task_t *task; + + dsl_dataset_name(ds, dsname); + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV), + zsda->zsda_source, sizeof (zsda->zsda_value), 1, + &zsda->zsda_value, zsda->zsda_tx); + + task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, + NULL, zsda->zsda_value); + if (task == NULL) + return (0); + + (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, + task, TQ_SLEEP); + return (0); +} + +/* + * Traverse all child snapshot datasets and apply snapdev appropriately. + */ +static void +zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx) +{ + zvol_set_snapdev_arg_t *zsda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd; + + VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); + zsda->zsda_tx = tx; + + dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb, + zsda, DS_FIND_CHILDREN); + + dsl_dir_rele(dd, FTAG); +} + int -zvol_set_snapdev(const char *dsname, uint64_t snapdev) { - fstrans_cookie_t cookie; +zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev) +{ + zvol_set_snapdev_arg_t zsda; - if (zvol_inhibit_dev) - /* caller should continue to modify snapdev property */ - return (-1); + zsda.zsda_name = ddname; + zsda.zsda_source = source; + zsda.zsda_value = snapdev; - cookie = spl_fstrans_mark(); - (void) dmu_objset_find((char *) dsname, snapdev_snapshot_changed_cb, - &snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); - spl_fstrans_unmark(cookie); + return (dsl_sync_task(ddname, zvol_set_snapdev_check, + 
zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); +} + +void +zvol_create_minors(spa_t *spa, const char *name, boolean_t async) +{ + zvol_task_t *task; + taskqid_t id; + + task = zvol_task_alloc(ZVOL_ASYNC_CREATE_MINORS, name, NULL, ~0ULL); + if (task == NULL) + return; + + id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); + if ((async == B_FALSE) && (id != 0)) + taskq_wait_id(spa->spa_zvol_taskq, id); +} + +void +zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) +{ + zvol_task_t *task; + taskqid_t id; + + task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL); + if (task == NULL) + return; - /* caller should continue to modify snapdev property */ - return (-1); + id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); + if ((async == B_FALSE) && (id != 0)) + taskq_wait_id(spa->spa_zvol_taskq, id); +} + +void +zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, + boolean_t async) +{ + zvol_task_t *task; + taskqid_t id; + + task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL); + if (task == NULL) + return; + + id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); + if ((async == B_FALSE) && (id != 0)) + taskq_wait_id(spa->spa_zvol_taskq, id); } int @@ -1721,7 +1972,6 @@ zvol_init(void) list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); - mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); error = register_blkdev(zvol_major, ZVOL_DRIVER); @@ -1745,11 +1995,13 @@ out: void zvol_fini(void) { - zvol_remove_minors(NULL); + zvol_remove_minors_impl(NULL); + blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); unregister_blkdev(zvol_major, ZVOL_DRIVER); - mutex_destroy(&zvol_state_lock); + list_destroy(&zvol_state_list); + mutex_destroy(&zvol_state_lock); } module_param(zvol_inhibit_dev, uint, 0644); |