summaryrefslogtreecommitdiffstats
path: root/module/zfs/zvol.c
diff options
context:
space:
mode:
authorBoris Protopopov <[email protected]>2014-03-22 05:07:14 -0400
committerBrian Behlendorf <[email protected]>2016-03-10 09:49:22 -0800
commita0bd735adb1b1eb81fef10b4db102ee051c4d4ff (patch)
tree121fcde3000a116f0c33143b28a530a87fd6073a /module/zfs/zvol.c
parenteb0856779f7b57162c9179f238104f6d6e150745 (diff)
Add support for asynchronous zvol minor operations
zfsonlinux issue #2217 - zvol minor operations: check snapdev property before traversing snapshots of a dataset zfsonlinux issue #3681 - lock order inversion between zvol_open() and dsl_pool_sync()...zvol_rename_minors() Create a per-pool zvol taskq for asynchronous zvol tasks. There are a few key design decisions to be aware of. * Each taskq must be single threaded to ensure tasks are always processed in the order in which they were dispatched. * There is a taskq per-pool in order to keep the pools independent. This way if one pool is suspended it will not impact another. * The preferred location to dispatch a zvol minor task is a sync task. In this context there is easy access to the spa_t and minimal error handling is required because the sync task must succeed. Support for asynchronous zvol minor operations addresses issue #3681. Signed-off-by: Boris Protopopov <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #2217 Closes #3678 Closes #3681
Diffstat (limited to 'module/zfs/zvol.c')
-rw-r--r--module/zfs/zvol.c494
1 files changed, 373 insertions, 121 deletions
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 034cf6a6a..ab4d3ceb7 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -42,6 +42,7 @@
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
@@ -49,6 +50,7 @@
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <linux/blkdev_compat.h>
@@ -81,6 +83,23 @@ typedef struct zvol_state {
list_node_t zv_next; /* next zvol_state_t linkage */
} zvol_state_t;
+typedef enum {
+ ZVOL_ASYNC_CREATE_MINORS,
+ ZVOL_ASYNC_REMOVE_MINORS,
+ ZVOL_ASYNC_RENAME_MINORS,
+ ZVOL_ASYNC_SET_SNAPDEV,
+ ZVOL_ASYNC_MAX
+} zvol_async_op_t;
+
+typedef struct {
+ zvol_async_op_t op;
+ char pool[MAXNAMELEN];
+ char name1[MAXNAMELEN];
+ char name2[MAXNAMELEN];
+ zprop_source_t source;
+ uint64_t snapdev;
+} zvol_task_t;
+
#define ZVOL_RDONLY 0x1
/*
@@ -977,6 +996,7 @@ zvol_first_open(zvol_state_t *zv)
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
if (error) {
dmu_objset_disown(os, zvol_tag);
+ zv->zv_objset = NULL;
goto out_mutex;
}
@@ -984,6 +1004,7 @@ zvol_first_open(zvol_state_t *zv)
error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
if (error) {
dmu_objset_disown(os, zvol_tag);
+ zv->zv_objset = NULL;
goto out_mutex;
}
@@ -1036,7 +1057,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
/*
* If the caller is already holding the mutex do not take it
- * again, this will happen as part of zvol_create_minor().
+ * again, this will happen as part of zvol_create_minor_impl().
* Once add_disk() is called the device is live and the kernel
* will attempt to open it to read the partition information.
*/
@@ -1355,31 +1376,13 @@ zvol_free(zvol_state_t *zv)
kmem_free(zv, sizeof (zvol_state_t));
}
+/*
+ * Create a block device minor node and setup the linkage between it
+ * and the specified volume. Once this function returns the block
+ * device is live and ready for use.
+ */
static int
-__zvol_snapdev_hidden(const char *name)
-{
- uint64_t snapdev;
- char *parent;
- char *atp;
- int error = 0;
-
- parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- (void) strlcpy(parent, name, MAXPATHLEN);
-
- if ((atp = strrchr(parent, '@')) != NULL) {
- *atp = '\0';
- error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL);
- if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN))
- error = SET_ERROR(ENODEV);
- }
-
- kmem_free(parent, MAXPATHLEN);
-
- return (SET_ERROR(error));
-}
-
-static int
-__zvol_create_minor(const char *name, boolean_t ignore_snapdev)
+zvol_create_minor_impl(const char *name)
{
zvol_state_t *zv;
objset_t *os;
@@ -1389,7 +1392,7 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
unsigned minor = 0;
int error = 0;
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ mutex_enter(&zvol_state_lock);
zv = zvol_find_by_name(name);
if (zv) {
@@ -1397,12 +1400,6 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
goto out;
}
- if (ignore_snapdev == B_FALSE) {
- error = __zvol_snapdev_hidden(name);
- if (error)
- goto out;
- }
-
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os);
@@ -1489,69 +1486,18 @@ out:
*/
mutex_exit(&zvol_state_lock);
add_disk(zv->zv_disk);
- mutex_enter(&zvol_state_lock);
+ } else {
+ mutex_exit(&zvol_state_lock);
}
return (SET_ERROR(error));
}
/*
- * Create a block device minor node and setup the linkage between it
- * and the specified volume. Once this function returns the block
- * device is live and ready for use.
- */
-int
-zvol_create_minor(const char *name)
-{
- int error;
-
- mutex_enter(&zvol_state_lock);
- error = __zvol_create_minor(name, B_FALSE);
- mutex_exit(&zvol_state_lock);
-
- return (SET_ERROR(error));
-}
-
-static int
-__zvol_remove_minor(const char *name)
-{
- zvol_state_t *zv;
-
- ASSERT(MUTEX_HELD(&zvol_state_lock));
-
- zv = zvol_find_by_name(name);
- if (zv == NULL)
- return (SET_ERROR(ENXIO));
-
- if (zv->zv_open_count > 0)
- return (SET_ERROR(EBUSY));
-
- zvol_remove(zv);
- zvol_free(zv);
-
- return (0);
-}
-
-/*
- * Remove a block device minor node for the specified volume.
- */
-int
-zvol_remove_minor(const char *name)
-{
- int error;
-
- mutex_enter(&zvol_state_lock);
- error = __zvol_remove_minor(name);
- mutex_exit(&zvol_state_lock);
-
- return (SET_ERROR(error));
-}
-
-/*
* Rename a block device minor node for the specified volume.
*/
static void
-__zvol_rename_minor(zvol_state_t *zv, const char *newname)
+zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
int readonly = get_disk_ro(zv->zv_disk);
@@ -1571,30 +1517,120 @@ __zvol_rename_minor(zvol_state_t *zv, const char *newname)
set_disk_ro(zv->zv_disk, readonly);
}
+
+/*
+ * Mask errors to continue dmu_objset_find() traversal
+ */
+static int
+zvol_create_snap_minor_cb(const char *dsname, void *arg)
+{
+ const char *name = (const char *)arg;
+
+ /* skip the designated dataset */
+ if (name && strcmp(dsname, name) == 0)
+ return (0);
+
+ /* at this point, the dsname should name a snapshot */
+ if (strchr(dsname, '@') == 0) {
+ dprintf("zvol_create_snap_minor_cb(): "
+ "%s is not a shapshot name\n", dsname);
+ } else {
+ (void) zvol_create_minor_impl(dsname);
+ }
+
+ return (0);
+}
+
+/*
+ * Mask errors to continue dmu_objset_find() traversal
+ */
static int
zvol_create_minors_cb(const char *dsname, void *arg)
{
- (void) zvol_create_minor(dsname);
+ uint64_t snapdev;
+ int error;
+
+ error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL);
+ if (error)
+ return (0);
+
+ /*
+ * Given the name and the 'snapdev' property, create device minor nodes
+ * with the linkages to zvols/snapshots as needed.
+ * If the name represents a zvol, create a minor node for the zvol, then
+ * check if its snapshots are 'visible', and if so, iterate over the
+ * snapshots and create device minor nodes for those.
+ */
+ if (strchr(dsname, '@') == 0) {
+ /* create minor for the 'dsname' explicitly */
+ error = zvol_create_minor_impl(dsname);
+ if ((error == 0 || error == EEXIST) &&
+ (snapdev == ZFS_SNAPDEV_VISIBLE)) {
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ /*
+ * traverse snapshots only, do not traverse children,
+ * and skip the 'dsname'
+ */
+ error = dmu_objset_find((char *)dsname,
+ zvol_create_snap_minor_cb, (void *)dsname,
+ DS_FIND_SNAPSHOTS);
+ spl_fstrans_unmark(cookie);
+ }
+ } else {
+ dprintf("zvol_create_minors_cb(): %s is not a zvol name\n",
+ dsname);
+ }
return (0);
}
/*
- * Create minors for specified dataset including children and snapshots.
+ * Create minors for the specified dataset, including children and snapshots.
+ * Pay attention to the 'snapdev' property and iterate over the snapshots
+ * only if they are 'visible'. This approach allows one to assure that the
+ * snapshot metadata is read from disk only if it is needed.
+ *
+ * The name can represent a dataset to be recursively scanned for zvols and
+ * their snapshots, or a single zvol snapshot. If the name represents a
+ * dataset, the scan is performed in two nested stages:
+ * - scan the dataset for zvols, and
+ * - for each zvol, create a minor node, then check if the zvol's snapshots
+ * are 'visible', and only then iterate over the snapshots if needed
+ *
+ * If the name represents a snapshot, a check is performed if the snapshot is
+ * 'visible' (which also verifies that the parent is a zvol), and if so,
+ * a minor node for that snapshot is created.
*/
-int
-zvol_create_minors(const char *name)
+static int
+zvol_create_minors_impl(const char *name)
{
int error = 0;
fstrans_cookie_t cookie;
+ char *atp, *parent;
if (zvol_inhibit_dev)
return (0);
- cookie = spl_fstrans_mark();
- error = dmu_objset_find((char *)name, zvol_create_minors_cb,
- NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
- spl_fstrans_unmark(cookie);
+ parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ (void) strlcpy(parent, name, MAXPATHLEN);
+
+ if ((atp = strrchr(parent, '@')) != NULL) {
+ uint64_t snapdev;
+
+ *atp = '\0';
+ error = dsl_prop_get_integer(parent, "snapdev",
+ &snapdev, NULL);
+
+ if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
+ error = zvol_create_minor_impl(name);
+ } else {
+ cookie = spl_fstrans_mark();
+ error = dmu_objset_find(parent, zvol_create_minors_cb,
+ NULL, DS_FIND_CHILDREN);
+ spl_fstrans_unmark(cookie);
+ }
+
+ kmem_free(parent, MAXPATHLEN);
return (SET_ERROR(error));
}
@@ -1602,8 +1638,8 @@ zvol_create_minors(const char *name)
/*
* Remove minors for specified dataset including children and snapshots.
*/
-void
-zvol_remove_minors(const char *name)
+static void
+zvol_remove_minors_impl(const char *name)
{
zvol_state_t *zv, *zv_next;
int namelen = ((name) ? strlen(name) : 0);
@@ -1633,11 +1669,41 @@ zvol_remove_minors(const char *name)
mutex_exit(&zvol_state_lock);
}
+/* Remove minor for this specific snapshot only */
+static void
+zvol_remove_minor_impl(const char *name)
+{
+ zvol_state_t *zv, *zv_next;
+
+ if (zvol_inhibit_dev)
+ return;
+
+ if (strchr(name, '@') == NULL)
+ return;
+
+ mutex_enter(&zvol_state_lock);
+
+ for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+ zv_next = list_next(&zvol_state_list, zv);
+
+ if (strcmp(zv->zv_name, name) == 0) {
+ /* If in use, leave alone */
+ if (zv->zv_open_count > 0)
+ continue;
+ zvol_remove(zv);
+ zvol_free(zv);
+ break;
+ }
+ }
+
+ mutex_exit(&zvol_state_lock);
+}
+
/*
* Rename minors for specified dataset including children and snapshots.
*/
-void
-zvol_rename_minors(const char *oldname, const char *newname)
+static void
+zvol_rename_minors_impl(const char *oldname, const char *newname)
{
zvol_state_t *zv, *zv_next;
int oldnamelen, newnamelen;
@@ -1660,14 +1726,14 @@ zvol_rename_minors(const char *oldname, const char *newname)
continue;
if (strcmp(zv->zv_name, oldname) == 0) {
- __zvol_rename_minor(zv, newname);
+ zvol_rename_minor(zv, newname);
} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
(zv->zv_name[oldnamelen] == '/' ||
zv->zv_name[oldnamelen] == '@')) {
snprintf(name, MAXNAMELEN, "%s%c%s", newname,
zv->zv_name[oldnamelen],
zv->zv_name + oldnamelen + 1);
- __zvol_rename_minor(zv, name);
+ zvol_rename_minor(zv, name);
}
}
@@ -1676,42 +1742,227 @@ zvol_rename_minors(const char *oldname, const char *newname)
kmem_free(name, MAXNAMELEN);
}
+typedef struct zvol_snapdev_cb_arg {
+ uint64_t snapdev;
+} zvol_snapdev_cb_arg_t;
+
static int
-snapdev_snapshot_changed_cb(const char *dsname, void *arg) {
- uint64_t snapdev = *(uint64_t *) arg;
+zvol_set_snapdev_cb(const char *dsname, void *param) {
+ zvol_snapdev_cb_arg_t *arg = param;
if (strchr(dsname, '@') == NULL)
return (0);
- switch (snapdev) {
+ switch (arg->snapdev) {
case ZFS_SNAPDEV_VISIBLE:
- mutex_enter(&zvol_state_lock);
- (void) __zvol_create_minor(dsname, B_TRUE);
- mutex_exit(&zvol_state_lock);
+ (void) zvol_create_minor_impl(dsname);
break;
case ZFS_SNAPDEV_HIDDEN:
- (void) zvol_remove_minor(dsname);
+ (void) zvol_remove_minor_impl(dsname);
break;
}
return (0);
}
+static void
+zvol_set_snapdev_impl(char *name, uint64_t snapdev)
+{
+ zvol_snapdev_cb_arg_t arg = {snapdev};
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ /*
+ * The zvol_set_snapdev_sync() sets snapdev appropriately
+ * in the dataset hierarchy. Here, we only scan snapshots.
+ */
+ dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS);
+ spl_fstrans_unmark(cookie);
+}
+
+static zvol_task_t *
+zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
+ uint64_t snapdev)
+{
+ zvol_task_t *task;
+ char *delim;
+
+ /* Never allow tasks on hidden names. */
+ if (name1[0] == '$')
+ return (NULL);
+
+ task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+ task->op = op;
+ task->snapdev = snapdev;
+ delim = strchr(name1, '/');
+ strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN);
+
+ strlcpy(task->name1, name1, MAXNAMELEN);
+ if (name2 != NULL)
+ strlcpy(task->name2, name2, MAXNAMELEN);
+
+ return (task);
+}
+
+static void
+zvol_task_free(zvol_task_t *task)
+{
+ kmem_free(task, sizeof (zvol_task_t));
+}
+
+/*
+ * The worker thread function performed asynchronously.
+ */
+static void
+zvol_task_cb(void *param)
+{
+ zvol_task_t *task = (zvol_task_t *)param;
+
+ switch (task->op) {
+ case ZVOL_ASYNC_CREATE_MINORS:
+ (void) zvol_create_minors_impl(task->name1);
+ break;
+ case ZVOL_ASYNC_REMOVE_MINORS:
+ zvol_remove_minors_impl(task->name1);
+ break;
+ case ZVOL_ASYNC_RENAME_MINORS:
+ zvol_rename_minors_impl(task->name1, task->name2);
+ break;
+ case ZVOL_ASYNC_SET_SNAPDEV:
+ zvol_set_snapdev_impl(task->name1, task->snapdev);
+ break;
+ default:
+ VERIFY(0);
+ break;
+ }
+
+ zvol_task_free(task);
+}
+
+typedef struct zvol_set_snapdev_arg {
+ const char *zsda_name;
+ uint64_t zsda_value;
+ zprop_source_t zsda_source;
+ dmu_tx_t *zsda_tx;
+} zvol_set_snapdev_arg_t;
+
+/*
+ * Sanity check the dataset for safe use by the sync task. No additional
+ * conditions are imposed.
+ */
+static int
+zvol_set_snapdev_check(void *arg, dmu_tx_t *tx)
+{
+ zvol_set_snapdev_arg_t *zsda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd;
+ int error;
+
+ error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
+ if (error != 0)
+ return (error);
+
+ dsl_dir_rele(dd, FTAG);
+
+ return (error);
+}
+
+static int
+zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ zvol_set_snapdev_arg_t *zsda = arg;
+ char dsname[MAXNAMELEN];
+ zvol_task_t *task;
+
+ dsl_dataset_name(ds, dsname);
+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV),
+ zsda->zsda_source, sizeof (zsda->zsda_value), 1,
+ &zsda->zsda_value, zsda->zsda_tx);
+
+ task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname,
+ NULL, zsda->zsda_value);
+ if (task == NULL)
+ return (0);
+
+ (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
+ task, TQ_SLEEP);
+ return (0);
+}
+
+/*
+ * Traverse all child snapshot datasets and apply snapdev appropriately.
+ */
+static void
+zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx)
+{
+ zvol_set_snapdev_arg_t *zsda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd;
+
+ VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
+ zsda->zsda_tx = tx;
+
+ dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb,
+ zsda, DS_FIND_CHILDREN);
+
+ dsl_dir_rele(dd, FTAG);
+}
+
int
-zvol_set_snapdev(const char *dsname, uint64_t snapdev) {
- fstrans_cookie_t cookie;
+zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev)
+{
+ zvol_set_snapdev_arg_t zsda;
- if (zvol_inhibit_dev)
- /* caller should continue to modify snapdev property */
- return (-1);
+ zsda.zsda_name = ddname;
+ zsda.zsda_source = source;
+ zsda.zsda_value = snapdev;
- cookie = spl_fstrans_mark();
- (void) dmu_objset_find((char *) dsname, snapdev_snapshot_changed_cb,
- &snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
- spl_fstrans_unmark(cookie);
+ return (dsl_sync_task(ddname, zvol_set_snapdev_check,
+ zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
+}
+
+void
+zvol_create_minors(spa_t *spa, const char *name, boolean_t async)
+{
+ zvol_task_t *task;
+ taskqid_t id;
+
+ task = zvol_task_alloc(ZVOL_ASYNC_CREATE_MINORS, name, NULL, ~0ULL);
+ if (task == NULL)
+ return;
+
+ id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
+ if ((async == B_FALSE) && (id != 0))
+ taskq_wait_id(spa->spa_zvol_taskq, id);
+}
+
+void
+zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
+{
+ zvol_task_t *task;
+ taskqid_t id;
+
+ task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL);
+ if (task == NULL)
+ return;
- /* caller should continue to modify snapdev property */
- return (-1);
+ id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
+ if ((async == B_FALSE) && (id != 0))
+ taskq_wait_id(spa->spa_zvol_taskq, id);
+}
+
+void
+zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
+ boolean_t async)
+{
+ zvol_task_t *task;
+ taskqid_t id;
+
+ task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL);
+ if (task == NULL)
+ return;
+
+ id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
+ if ((async == B_FALSE) && (id != 0))
+ taskq_wait_id(spa->spa_zvol_taskq, id);
}
int
@@ -1721,7 +1972,6 @@ zvol_init(void)
list_create(&zvol_state_list, sizeof (zvol_state_t),
offsetof(zvol_state_t, zv_next));
-
mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
error = register_blkdev(zvol_major, ZVOL_DRIVER);
@@ -1745,11 +1995,13 @@ out:
void
zvol_fini(void)
{
- zvol_remove_minors(NULL);
+ zvol_remove_minors_impl(NULL);
+
blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
unregister_blkdev(zvol_major, ZVOL_DRIVER);
- mutex_destroy(&zvol_state_lock);
+
list_destroy(&zvol_state_list);
+ mutex_destroy(&zvol_state_lock);
}
module_param(zvol_inhibit_dev, uint, 0644);