summaryrefslogtreecommitdiffstats
path: root/module
diff options
context:
space:
mode:
authorMatthew Macy <[email protected]>2019-09-25 09:20:30 -0700
committerBrian Behlendorf <[email protected]>2019-09-25 09:20:30 -0700
commit5df7e9d85ced424292e8312a7848b6092a29c930 (patch)
tree391e1488d4a8081ce22c7401c6392a72dc661c13 /module
parentd359e99c38f66732d42278c32d52cfcf1839aa4f (diff)
OpenZFS restructuring - zvol
Refactor the zvol in to platform dependent and independent bits. Reviewed-by: Allan Jude <[email protected]> Reviewed-by: Jorgen Lundman <[email protected]> Reviewed-by: Igor Kozhukhov <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Matt Macy <[email protected]> Closes #9295
Diffstat (limited to 'module')
-rw-r--r--module/os/linux/zfs/Makefile.in1
-rw-r--r--module/os/linux/zfs/zvol_os.c1120
-rw-r--r--module/zfs/dmu_objset.c2
-rw-r--r--module/zfs/zfs_ioctl.c6
-rw-r--r--module/zfs/zvol.c1157
5 files changed, 1180 insertions, 1106 deletions
diff --git a/module/os/linux/zfs/Makefile.in b/module/os/linux/zfs/Makefile.in
index 2165e7f74..a4f198907 100644
--- a/module/os/linux/zfs/Makefile.in
+++ b/module/os/linux/zfs/Makefile.in
@@ -33,3 +33,4 @@ $(MODULE)-objs += ../os/linux/zfs/zpl_file.o
$(MODULE)-objs += ../os/linux/zfs/zpl_inode.o
$(MODULE)-objs += ../os/linux/zfs/zpl_super.o
$(MODULE)-objs += ../os/linux/zfs/zpl_xattr.o
+$(MODULE)-objs += ../os/linux/zfs/zvol_os.o
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
new file mode 100644
index 000000000..b89028dc3
--- /dev/null
+++ b/module/os/linux/zfs/zvol_os.c
@@ -0,0 +1,1120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/dataset_kstats.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zil_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/zio.h>
+#include <sys/zfs_rlock.h>
+#include <sys/spa_impl.h>
+#include <sys/zvol.h>
+#include <sys/zvol_impl.h>
+
+#include <linux/blkdev_compat.h>
+#include <linux/task_io_accounting_ops.h>
+
+unsigned int zvol_major = ZVOL_MAJOR;
+unsigned int zvol_request_sync = 0;
+unsigned int zvol_prefetch_bytes = (128 * 1024);
+unsigned long zvol_max_discard_blocks = 16384;
+unsigned int zvol_threads = 32;
+
+struct zvol_state_os {
+ struct gendisk *zvo_disk; /* generic disk */
+ struct request_queue *zvo_queue; /* request queue */
+ dataset_kstats_t zvo_kstat; /* zvol kstats */
+ dev_t zvo_dev; /* device id */
+};
+
+taskq_t *zvol_taskq;
+static struct ida zvol_ida;
+
+typedef struct zv_request {
+ zvol_state_t *zv;
+ struct bio *bio;
+ locked_range_t *lr;
+} zv_request_t;
+
+/*
+ * Given a path, return TRUE if path is a ZVOL.
+ */
+static boolean_t
+zvol_is_zvol_impl(const char *device)
+{
+ struct block_device *bdev;
+ unsigned int major;
+
+ bdev = vdev_lookup_bdev(device);
+ if (IS_ERR(bdev))
+ return (B_FALSE);
+
+ major = MAJOR(bdev->bd_dev);
+ bdput(bdev);
+
+ if (major == zvol_major)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+static void
+uio_from_bio(uio_t *uio, struct bio *bio)
+{
+ uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
+ uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
+ uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;
+ uio->uio_segflg = UIO_BVEC;
+ uio->uio_limit = MAXOFFSET_T;
+ uio->uio_resid = BIO_BI_SIZE(bio);
+ uio->uio_skip = BIO_BI_SKIP(bio);
+}
+
+static void
+zvol_write(void *arg)
+{
+ int error = 0;
+
+ zv_request_t *zvr = arg;
+ struct bio *bio = zvr->bio;
+ uio_t uio = { { 0 }, 0 };
+ uio_from_bio(&uio, bio);
+
+ zvol_state_t *zv = zvr->zv;
+ ASSERT(zv && zv->zv_open_count > 0);
+ ASSERT(zv->zv_zilog != NULL);
+
+ ssize_t start_resid = uio.uio_resid;
+ unsigned long start_jif = jiffies;
+ blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
+ bio_sectors(bio), &zv->zv_zso->zvo_disk->part0);
+
+ boolean_t sync =
+ bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+
+ uint64_t volsize = zv->zv_volsize;
+ while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
+ uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
+ uint64_t off = uio.uio_loffset;
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+
+ if (bytes > volsize - off) /* don't write past the end */
+ bytes = volsize - off;
+
+ dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
+
+ /* This will only fail for ENOSPC */
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ break;
+ }
+ error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+ if (error == 0) {
+ zvol_log_write(zv, tx, off, bytes, sync);
+ }
+ dmu_tx_commit(tx);
+
+ if (error)
+ break;
+ }
+ rangelock_exit(zvr->lr);
+
+ int64_t nwritten = start_resid - uio.uio_resid;
+ dataset_kstats_update_write_kstats(&zv->zv_zso->zvo_kstat, nwritten);
+ task_io_account_write(nwritten);
+
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+ rw_exit(&zv->zv_suspend_lock);
+ blk_generic_end_io_acct(zv->zv_zso->zvo_queue,
+ WRITE, &zv->zv_zso->zvo_disk->part0, start_jif);
+ BIO_END_IO(bio, -error);
+ kmem_free(zvr, sizeof (zv_request_t));
+}
+
+static void
+zvol_discard(void *arg)
+{
+ zv_request_t *zvr = arg;
+ struct bio *bio = zvr->bio;
+ zvol_state_t *zv = zvr->zv;
+ uint64_t start = BIO_BI_SECTOR(bio) << 9;
+ uint64_t size = BIO_BI_SIZE(bio);
+ uint64_t end = start + size;
+ boolean_t sync;
+ int error = 0;
+ dmu_tx_t *tx;
+ unsigned long start_jif;
+
+ ASSERT(zv && zv->zv_open_count > 0);
+ ASSERT(zv->zv_zilog != NULL);
+
+ start_jif = jiffies;
+ blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
+ bio_sectors(bio), &zv->zv_zso->zvo_disk->part0);
+
+ sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+
+ if (end > zv->zv_volsize) {
+ error = SET_ERROR(EIO);
+ goto unlock;
+ }
+
+ /*
+ * Align the request to volume block boundaries when a secure erase is
+ * not required. This will prevent dnode_free_range() from zeroing out
+ * the unaligned parts which is slow (read-modify-write) and useless
+ * since we are not freeing any space by doing so.
+ */
+ if (!bio_is_secure_erase(bio)) {
+ start = P2ROUNDUP(start, zv->zv_volblocksize);
+ end = P2ALIGN(end, zv->zv_volblocksize);
+ size = end - start;
+ }
+
+ if (start >= end)
+ goto unlock;
+
+ tx = dmu_tx_create(zv->zv_objset);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ } else {
+ zvol_log_truncate(zv, tx, start, size, B_TRUE);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset,
+ ZVOL_OBJ, start, size);
+ }
+unlock:
+ rangelock_exit(zvr->lr);
+
+ if (error == 0 && sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+ rw_exit(&zv->zv_suspend_lock);
+ blk_generic_end_io_acct(zv->zv_zso->zvo_queue, WRITE,
+ &zv->zv_zso->zvo_disk->part0, start_jif);
+ BIO_END_IO(bio, -error);
+ kmem_free(zvr, sizeof (zv_request_t));
+}
+
+static void
+zvol_read(void *arg)
+{
+ int error = 0;
+
+ zv_request_t *zvr = arg;
+ struct bio *bio = zvr->bio;
+ uio_t uio = { { 0 }, 0 };
+ uio_from_bio(&uio, bio);
+
+ zvol_state_t *zv = zvr->zv;
+ ASSERT(zv && zv->zv_open_count > 0);
+
+ ssize_t start_resid = uio.uio_resid;
+ unsigned long start_jif = jiffies;
+ blk_generic_start_io_acct(zv->zv_zso->zvo_queue, READ, bio_sectors(bio),
+ &zv->zv_zso->zvo_disk->part0);
+
+ uint64_t volsize = zv->zv_volsize;
+ while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
+ uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
+
+ /* don't read past the end */
+ if (bytes > volsize - uio.uio_loffset)
+ bytes = volsize - uio.uio_loffset;
+
+ error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+ }
+ rangelock_exit(zvr->lr);
+
+ int64_t nread = start_resid - uio.uio_resid;
+ dataset_kstats_update_read_kstats(&zv->zv_zso->zvo_kstat, nread);
+ task_io_account_read(nread);
+
+ rw_exit(&zv->zv_suspend_lock);
+ blk_generic_end_io_acct(zv->zv_zso->zvo_queue, READ,
+ &zv->zv_zso->zvo_disk->part0, start_jif);
+ BIO_END_IO(bio, -error);
+ kmem_free(zvr, sizeof (zv_request_t));
+}
+
+static MAKE_REQUEST_FN_RET
+zvol_request(struct request_queue *q, struct bio *bio)
+{
+ zvol_state_t *zv = q->queuedata;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+ uint64_t size = BIO_BI_SIZE(bio);
+ int rw = bio_data_dir(bio);
+ zv_request_t *zvr;
+
+ if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
+ printk(KERN_INFO
+ "%s: bad access: offset=%llu, size=%lu\n",
+ zv->zv_zso->zvo_disk->disk_name,
+ (long long unsigned)offset,
+ (long unsigned)size);
+
+ BIO_END_IO(bio, -SET_ERROR(EIO));
+ goto out;
+ }
+
+ if (rw == WRITE) {
+ boolean_t need_sync = B_FALSE;
+
+ if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
+ BIO_END_IO(bio, -SET_ERROR(EROFS));
+ goto out;
+ }
+
+ /*
+ * To be released in the I/O function. See the comment on
+ * rangelock_enter() below.
+ */
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
+ /*
+ * Open a ZIL if this is the first time we have written to this
+ * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
+ * than zv_state_lock so that we don't need to acquire an
+ * additional lock in this path.
+ */
+ if (zv->zv_zilog == NULL) {
+ rw_exit(&zv->zv_suspend_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+ if (zv->zv_zilog == NULL) {
+ zv->zv_zilog = zil_open(zv->zv_objset,
+ zvol_get_data);
+ zv->zv_flags |= ZVOL_WRITTEN_TO;
+ }
+ rw_downgrade(&zv->zv_suspend_lock);
+ }
+
+ /* bio marked as FLUSH need to flush before write */
+ if (bio_is_flush(bio))
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+ /* Some requests are just for flush and nothing else. */
+ if (size == 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ BIO_END_IO(bio, 0);
+ goto out;
+ }
+
+ zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
+ zvr->zv = zv;
+ zvr->bio = bio;
+
+ /*
+ * To be released in the I/O function. Since the I/O functions
+ * are asynchronous, we take it here synchronously to make
+ * sure overlapped I/Os are properly ordered.
+ */
+ zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size,
+ RL_WRITER);
+ /*
+ * Sync writes and discards execute zil_commit() which may need
+ * to take a RL_READER lock on the whole block being modified
+ * via its zillog->zl_get_data(): to avoid circular dependency
+ * issues with taskq threads execute these requests
+ * synchronously here in zvol_request().
+ */
+ need_sync = bio_is_fua(bio) ||
+ zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+ if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
+ if (zvol_request_sync || need_sync ||
+ taskq_dispatch(zvol_taskq, zvol_discard, zvr,
+ TQ_SLEEP) == TASKQID_INVALID)
+ zvol_discard(zvr);
+ } else {
+ if (zvol_request_sync || need_sync ||
+ taskq_dispatch(zvol_taskq, zvol_write, zvr,
+ TQ_SLEEP) == TASKQID_INVALID)
+ zvol_write(zvr);
+ }
+ } else {
+ /*
+ * The SCST driver, and possibly others, may issue READ I/Os
+ * with a length of zero bytes. These empty I/Os contain no
+ * data and require no additional handling.
+ */
+ if (size == 0) {
+ BIO_END_IO(bio, 0);
+ goto out;
+ }
+
+ zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
+ zvr->zv = zv;
+ zvr->bio = bio;
+
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
+ zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size,
+ RL_READER);
+ if (zvol_request_sync || taskq_dispatch(zvol_taskq,
+ zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID)
+ zvol_read(zvr);
+ }
+
+out:
+ spl_fstrans_unmark(cookie);
+#ifdef HAVE_MAKE_REQUEST_FN_RET_INT
+ return (0);
+#elif defined(HAVE_MAKE_REQUEST_FN_RET_QC)
+ return (BLK_QC_T_NONE);
+#endif
+}
+
+static int
+zvol_open(struct block_device *bdev, fmode_t flag)
+{
+ zvol_state_t *zv;
+ int error = 0;
+ boolean_t drop_suspend = B_TRUE;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+ /*
+ * Obtain a copy of private_data under the zvol_state_lock to make
+ * sure that either the result of zvol free code path setting
+ * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
+ * is not called on this zv because of the positive zv_open_count.
+ */
+ zv = bdev->bd_disk->private_data;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ return (SET_ERROR(-ENXIO));
+ }
+
+ mutex_enter(&zv->zv_state_lock);
+ /*
+ * make sure zvol is not suspended during first open
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 0) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock));
+
+ if (zv->zv_open_count == 0) {
+ error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
+ if (error)
+ goto out_mutex;
+ }
+
+ if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+ error = -EROFS;
+ goto out_open_count;
+ }
+
+ zv->zv_open_count++;
+
+ mutex_exit(&zv->zv_state_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+
+ check_disk_change(bdev);
+
+ return (0);
+
+out_open_count:
+ if (zv->zv_open_count == 0)
+ zvol_last_close(zv);
+
+out_mutex:
+ mutex_exit(&zv->zv_state_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ if (error == -EINTR) {
+ error = -ERESTARTSYS;
+ schedule();
+ }
+ return (SET_ERROR(error));
+}
+
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
+static void
+#else
+static int
+#endif
+zvol_release(struct gendisk *disk, fmode_t mode)
+{
+ zvol_state_t *zv;
+ boolean_t drop_suspend = B_TRUE;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+ zv = disk->private_data;
+
+ mutex_enter(&zv->zv_state_lock);
+ ASSERT(zv->zv_open_count > 0);
+ /*
+ * make sure zvol is not suspended during last close
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 1) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 1) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock));
+
+ zv->zv_open_count--;
+ if (zv->zv_open_count == 0)
+ zvol_last_close(zv);
+
+ mutex_exit(&zv->zv_state_lock);
+
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+
+#ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
+ return (0);
+#endif
+}
+
+static int
+zvol_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ zvol_state_t *zv = bdev->bd_disk->private_data;
+ int error = 0;
+
+ ASSERT3U(zv->zv_open_count, >, 0);
+
+ switch (cmd) {
+ case BLKFLSBUF:
+ fsync_bdev(bdev);
+ invalidate_bdev(bdev);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
+ if (!(zv->zv_flags & ZVOL_RDONLY))
+ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+
+ rw_exit(&zv->zv_suspend_lock);
+ break;
+
+ case BLKZNAME:
+ mutex_enter(&zv->zv_state_lock);
+ error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
+ mutex_exit(&zv->zv_state_lock);
+ break;
+
+ default:
+ error = -ENOTTY;
+ break;
+ }
+
+ return (SET_ERROR(error));
+}
+
+#ifdef CONFIG_COMPAT
+static int
+zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned cmd, unsigned long arg)
+{
+ return (zvol_ioctl(bdev, mode, cmd, arg));
+}
+#else
+#define zvol_compat_ioctl NULL
+#endif
+
+/*
+ * Linux 2.6.38 preferred interface.
+ */
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
+static unsigned int
+zvol_check_events(struct gendisk *disk, unsigned int clearing)
+{
+ unsigned int mask = 0;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+
+ zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
+ zv->zv_changed = 0;
+ mutex_exit(&zv->zv_state_lock);
+ }
+
+ rw_exit(&zvol_state_lock);
+
+ return (mask);
+}
+#else
+static int
+zvol_media_changed(struct gendisk *disk)
+{
+ int changed = 0;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+
+ zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ changed = zv->zv_changed;
+ zv->zv_changed = 0;
+ mutex_exit(&zv->zv_state_lock);
+ }
+
+ rw_exit(&zvol_state_lock);
+
+ return (changed);
+}
+#endif
+
+static int
+zvol_revalidate_disk(struct gendisk *disk)
+{
+ rw_enter(&zvol_state_lock, RW_READER);
+
+ zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ set_capacity(zv->zv_zso->zvo_disk,
+ zv->zv_volsize >> SECTOR_BITS);
+ mutex_exit(&zv->zv_state_lock);
+ }
+
+ rw_exit(&zvol_state_lock);
+
+ return (0);
+}
+
+int
+zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
+{
+
+ revalidate_disk(zv->zv_zso->zvo_disk);
+ return (0);
+}
+
+static void
+zvol_clear_private(zvol_state_t *zv)
+{
+ /*
+ * Cleared while holding zvol_state_lock as a writer
+ * which will prevent zvol_open() from opening it.
+ */
+ zv->zv_zso->zvo_disk->private_data = NULL;
+}
+
+/*
+ * Provide a simple virtual geometry for legacy compatibility. For devices
+ * smaller than 1 MiB a small head and sector count is used to allow very
+ * tiny devices. For devices over 1 Mib a standard head and sector count
+ * is used to keep the cylinders count reasonable.
+ */
+static int
+zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ zvol_state_t *zv = bdev->bd_disk->private_data;
+ sector_t sectors;
+
+ ASSERT3U(zv->zv_open_count, >, 0);
+
+ sectors = get_capacity(zv->zv_zso->zvo_disk);
+
+ if (sectors > 2048) {
+ geo->heads = 16;
+ geo->sectors = 63;
+ } else {
+ geo->heads = 2;
+ geo->sectors = 4;
+ }
+
+ geo->start = 0;
+ geo->cylinders = sectors / (geo->heads * geo->sectors);
+
+ return (0);
+}
+
+/*
+ * Find a zvol_state_t given the full major+minor dev_t. If found,
+ * return with zv_state_lock taken, otherwise, return (NULL) without
+ * taking zv_state_lock.
+ */
+static zvol_state_t *
+zvol_find_by_dev(dev_t dev)
+{
+ zvol_state_t *zv;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+ for (zv = list_head(&zvol_state_list); zv != NULL;
+ zv = list_next(&zvol_state_list, zv)) {
+ mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_zso->zvo_dev == dev) {
+ rw_exit(&zvol_state_lock);
+ return (zv);
+ }
+ mutex_exit(&zv->zv_state_lock);
+ }
+ rw_exit(&zvol_state_lock);
+
+ return (NULL);
+}
+
+void
+zvol_validate_dev(zvol_state_t *zv)
+{
+ ASSERT3U(MINOR(zv->zv_zso->zvo_dev) & ZVOL_MINOR_MASK, ==, 0);
+}
+
+static struct kobject *
+zvol_probe(dev_t dev, int *part, void *arg)
+{
+ zvol_state_t *zv;
+ struct kobject *kobj;
+
+ zv = zvol_find_by_dev(dev);
+ kobj = zv ? get_disk_and_module(zv->zv_zso->zvo_disk) : NULL;
+ ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock));
+ if (zv)
+ mutex_exit(&zv->zv_state_lock);
+
+ return (kobj);
+}
+
+static struct block_device_operations zvol_ops = {
+ .open = zvol_open,
+ .release = zvol_release,
+ .ioctl = zvol_ioctl,
+ .compat_ioctl = zvol_compat_ioctl,
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
+ .check_events = zvol_check_events,
+#else
+ .media_changed = zvol_media_changed,
+#endif
+ .revalidate_disk = zvol_revalidate_disk,
+ .getgeo = zvol_getgeo,
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Allocate memory for a new zvol_state_t and setup the required
+ * request queue and generic disk structures for the block device.
+ */
+static zvol_state_t *
+zvol_alloc(dev_t dev, const char *name)
+{
+ zvol_state_t *zv;
+ uint64_t volmode;
+
+ if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
+ return (NULL);
+
+ if (volmode == ZFS_VOLMODE_DEFAULT)
+ volmode = zvol_volmode;
+
+ if (volmode == ZFS_VOLMODE_NONE)
+ return (NULL);
+
+ zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
+ zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
+
+ list_link_init(&zv->zv_next);
+
+ mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ zv->zv_zso->zvo_queue = blk_alloc_queue(GFP_ATOMIC);
+ if (zv->zv_zso->zvo_queue == NULL)
+ goto out_kmem;
+
+ blk_queue_make_request(zv->zv_zso->zvo_queue, zvol_request);
+ blk_queue_set_write_cache(zv->zv_zso->zvo_queue, B_TRUE, B_TRUE);
+
+ /* Limit read-ahead to a single page to prevent over-prefetching. */
+ blk_queue_set_read_ahead(zv->zv_zso->zvo_queue, 1);
+
+ /* Disable write merging in favor of the ZIO pipeline. */
+ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zv->zv_zso->zvo_queue);
+
+ zv->zv_zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+ if (zv->zv_zso->zvo_disk == NULL)
+ goto out_queue;
+
+ zv->zv_zso->zvo_queue->queuedata = zv;
+ zv->zv_zso->zvo_dev = dev;
+ zv->zv_open_count = 0;
+ strlcpy(zv->zv_name, name, MAXNAMELEN);
+
+ rangelock_init(&zv->zv_rangelock, NULL, NULL);
+ rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+
+ zv->zv_zso->zvo_disk->major = zvol_major;
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
+ zv->zv_zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;
+#endif
+
+ if (volmode == ZFS_VOLMODE_DEV) {
+ /*
+ * ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set
+ * gendisk->minors = 1 as noted in include/linux/genhd.h.
+ * Also disable extended partition numbers (GENHD_FL_EXT_DEVT)
+ * and suppresses partition scanning (GENHD_FL_NO_PART_SCAN)
+ * setting gendisk->flags accordingly.
+ */
+ zv->zv_zso->zvo_disk->minors = 1;
+#if defined(GENHD_FL_EXT_DEVT)
+ zv->zv_zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
+#endif
+#if defined(GENHD_FL_NO_PART_SCAN)
+ zv->zv_zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN;
+#endif
+ }
+ zv->zv_zso->zvo_disk->first_minor = (dev & MINORMASK);
+ zv->zv_zso->zvo_disk->fops = &zvol_ops;
+ zv->zv_zso->zvo_disk->private_data = zv;
+ zv->zv_zso->zvo_disk->queue = zv->zv_zso->zvo_queue;
+ snprintf(zv->zv_zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
+ ZVOL_DEV_NAME, (dev & MINORMASK));
+
+ return (zv);
+
+out_queue:
+ blk_cleanup_queue(zv->zv_zso->zvo_queue);
+out_kmem:
+ kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+ kmem_free(zv, sizeof (zvol_state_t));
+ return (NULL);
+}
+
+/*
+ * Cleanup then free a zvol_state_t which was created by zvol_alloc().
+ * At this time, the structure is not opened by anyone, is taken off
+ * the zvol_state_list, and has its private data set to NULL.
+ * The zvol_state_lock is dropped.
+ */
+static void
+zvol_free(zvol_state_t *zv)
+{
+
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(zv->zv_open_count == 0);
+ ASSERT(zv->zv_zso->zvo_disk->private_data == NULL);
+
+ rw_destroy(&zv->zv_suspend_lock);
+ rangelock_fini(&zv->zv_rangelock);
+
+ del_gendisk(zv->zv_zso->zvo_disk);
+ blk_cleanup_queue(zv->zv_zso->zvo_queue);
+ put_disk(zv->zv_zso->zvo_disk);
+
+ ida_simple_remove(&zvol_ida,
+ MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
+
+ mutex_destroy(&zv->zv_state_lock);
+ dataset_kstats_destroy(&zv->zv_zso->zvo_kstat);
+
+ kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+ kmem_free(zv, sizeof (zvol_state_t));
+}
+
+/*
+ * Create a block device minor node and setup the linkage between it
+ * and the specified volume. Once this function returns the block
+ * device is live and ready for use.
+ */
+static int
+zvol_create_minor(const char *name)
+{
+ zvol_state_t *zv;
+ objset_t *os;
+ dmu_object_info_t *doi;
+ uint64_t volsize;
+ uint64_t len;
+ unsigned minor = 0;
+ int error = 0;
+ int idx;
+ uint64_t hash = zvol_name_hash(name);
+
+ if (zvol_inhibit_dev)
+ return (0);
+
+ idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
+ if (idx < 0)
+ return (SET_ERROR(-idx));
+ minor = idx << ZVOL_MINOR_BITS;
+
+ zv = zvol_find_by_name_hash(name, hash, RW_NONE);
+ if (zv) {
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ mutex_exit(&zv->zv_state_lock);
+ ida_simple_remove(&zvol_ida, idx);
+ return (SET_ERROR(EEXIST));
+ }
+
+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+
+ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
+ if (error)
+ goto out_doi;
+
+ error = dmu_object_info(os, ZVOL_OBJ, doi);
+ if (error)
+ goto out_dmu_objset_disown;
+
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+ if (error)
+ goto out_dmu_objset_disown;
+
+ zv = zvol_alloc(MKDEV(zvol_major, minor), name);
+ if (zv == NULL) {
+ error = SET_ERROR(EAGAIN);
+ goto out_dmu_objset_disown;
+ }
+ zv->zv_hash = hash;
+
+ if (dmu_objset_is_snapshot(os))
+ zv->zv_flags |= ZVOL_RDONLY;
+
+ zv->zv_volblocksize = doi->doi_data_block_size;
+ zv->zv_volsize = volsize;
+ zv->zv_objset = os;
+
+ set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
+
+ blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
+ (DMU_MAX_ACCESS / 4) >> 9);
+ blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
+ blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
+ blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
+ zv->zv_volblocksize);
+ blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
+ blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
+ (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
+ blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
+ zv->zv_volblocksize);
+ blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
+#ifdef QUEUE_FLAG_NONROT
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
+#endif
+#ifdef QUEUE_FLAG_ADD_RANDOM
+ blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
+#endif
+ /* This flag was introduced in kernel version 4.12. */
+#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
+ blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
+#endif
+
+ if (spa_writeable(dmu_objset_spa(os))) {
+ if (zil_replay_disable)
+ zil_destroy(dmu_objset_zil(os), B_FALSE);
+ else
+ zil_replay(os, zv, zvol_replay_vector);
+ }
+ ASSERT3P(zv->zv_zso->zvo_kstat.dk_kstats, ==, NULL);
+ dataset_kstats_create(&zv->zv_zso->zvo_kstat, zv->zv_objset);
+
+ /*
+ * When udev detects the addition of the device it will immediately
+ * invoke blkid(8) to determine the type of content on the device.
+ * Prefetching the blocks commonly scanned by blkid(8) will speed
+ * up this process.
+ */
+ len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
+ if (len > 0) {
+ dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
+ dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+
+ zv->zv_objset = NULL;
+out_dmu_objset_disown:
+ dmu_objset_disown(os, B_TRUE, FTAG);
+out_doi:
+ kmem_free(doi, sizeof (dmu_object_info_t));
+
+ /*
+ * Keep in mind that once add_disk() is called, the zvol is
+ * announced to the world, and zvol_open()/zvol_release() can
+ * be called at any time. Incidentally, add_disk() itself calls
+ * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
+ * directly as well.
+ */
+ if (error == 0) {
+ rw_enter(&zvol_state_lock, RW_WRITER);
+ zvol_insert(zv);
+ rw_exit(&zvol_state_lock);
+ add_disk(zv->zv_zso->zvo_disk);
+ } else {
+ ida_simple_remove(&zvol_ida, idx);
+ }
+
+ return (SET_ERROR(error));
+}
+
+static void
+zvol_rename_minor(zvol_state_t *zv, const char *newname)
+{
+ int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
+
+ ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
+
+ /* move to new hashtable entry */
+ zv->zv_hash = zvol_name_hash(zv->zv_name);
+ hlist_del(&zv->zv_hlink);
+ hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
+
+ /*
+ * The block device's read-only state is briefly changed causing
+ * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
+ * the name change and fixes the symlinks. This does not change
+ * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
+ * changes. This would normally be done using kobject_uevent() but
+ * that is a GPL-only symbol which is why we need this workaround.
+ */
+ set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
+ set_disk_ro(zv->zv_zso->zvo_disk, readonly);
+}
+
+static void
+zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
+{
+
+ set_disk_ro(zv->zv_zso->zvo_disk, flags);
+}
+
+static void
+zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
+{
+
+ set_capacity(zv->zv_zso->zvo_disk, capacity);
+}
+
+const static zvol_platform_ops_t zvol_linux_ops = {
+ .zv_free = zvol_free,
+ .zv_rename_minor = zvol_rename_minor,
+ .zv_create_minor = zvol_create_minor,
+ .zv_update_volsize = zvol_update_volsize,
+ .zv_clear_private = zvol_clear_private,
+ .zv_is_zvol = zvol_is_zvol_impl,
+ .zv_set_disk_ro = zvol_set_disk_ro_impl,
+ .zv_set_capacity = zvol_set_capacity_impl,
+};
+
+int
+zvol_init(void)
+{
+ int error;
+ int threads = MIN(MAX(zvol_threads, 1), 1024);
+
+ error = register_blkdev(zvol_major, ZVOL_DRIVER);
+ if (error) {
+ printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
+ return (error);
+ }
+ zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
+ threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+ if (zvol_taskq == NULL) {
+ unregister_blkdev(zvol_major, ZVOL_DRIVER);
+ return (-ENOMEM);
+ }
+ zvol_init_impl();
+ blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
+ THIS_MODULE, zvol_probe, NULL, NULL);
+
+ ida_init(&zvol_ida);
+ zvol_register_ops(&zvol_linux_ops);
+ return (0);
+}
+
+void
+zvol_fini(void)
+{
+ zvol_remove_minors_impl(NULL);
+
+ zvol_fini_impl();
+ blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
+ unregister_blkdev(zvol_major, ZVOL_DRIVER);
+ taskq_destroy(zvol_taskq);
+ ida_destroy(&zvol_ida);
+}
+
+/* BEGIN CSTYLED */
+module_param(zvol_inhibit_dev, uint, 0644);
+MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
+
+module_param(zvol_major, uint, 0444);
+MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
+
+module_param(zvol_threads, uint, 0444);
+MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
+
+module_param(zvol_request_sync, uint, 0644);
+MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
+
+module_param(zvol_max_discard_blocks, ulong, 0444);
+MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
+
+module_param(zvol_prefetch_bytes, uint, 0644);
+MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
+
+module_param(zvol_volmode, uint, 0644);
+MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
+/* END CSTYLED */
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 469137ecf..de57b170f 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -2856,7 +2856,7 @@ dmu_objset_find_impl(spa_t *spa, const char *name,
* See comment above dmu_objset_find_impl().
*/
int
-dmu_objset_find(char *name, int func(const char *, void *), void *arg,
+dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
int flags)
{
spa_t *spa;
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index fce074147..39718a9a8 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -4136,7 +4136,7 @@ static int
zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
zfsvfs_t *zfsvfs;
- zvol_state_t *zv;
+ zvol_state_handle_t *zv;
char *target = NULL;
int error;
@@ -4861,7 +4861,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
if (error == 0) {
zfsvfs_t *zfsvfs = NULL;
- zvol_state_t *zv = NULL;
+ zvol_state_handle_t *zv = NULL;
if (getzfsvfs(tofs, &zfsvfs) == 0) {
/* online recv */
@@ -7671,7 +7671,7 @@ _init(void)
{
int error;
- if ((error = -zvol_init()) != 0)
+ if ((error = zvol_init()) != 0)
return (error);
spa_init(FREAD | FWRITE);
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 840b8d008..9b26084bb 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -68,10 +68,6 @@
* allocated and placed on zvol_state_list, and then other minor operations
* for this zvol are going to proceed in the order of issue.
*
- * It is also worth keeping in mind that once add_disk() is called, the zvol is
- * announced to the world, and zvol_open()/zvol_release() can be called at any
- * time. Incidentally, add_disk() itself calls zvol_open()->zvol_first_open()
- * and zvol_release()->zvol_last_close() directly as well.
*/
#include <sys/dataset_kstats.h>
@@ -89,52 +85,16 @@
#include <sys/spa_impl.h>
#include <sys/zvol.h>
-#include <linux/blkdev_compat.h>
-#include <linux/task_io_accounting_ops.h>
+#include <sys/zvol_impl.h>
+
unsigned int zvol_inhibit_dev = 0;
-unsigned int zvol_major = ZVOL_MAJOR;
-unsigned int zvol_threads = 32;
-unsigned int zvol_request_sync = 0;
-unsigned int zvol_prefetch_bytes = (128 * 1024);
-unsigned long zvol_max_discard_blocks = 16384;
unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
-static taskq_t *zvol_taskq;
-static krwlock_t zvol_state_lock;
-static list_t zvol_state_list;
-
-#define ZVOL_HT_SIZE 1024
-static struct hlist_head *zvol_htable;
-#define ZVOL_HT_HEAD(hash) (&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)])
-
-static struct ida zvol_ida;
-
-/*
- * The in-core state of each volume.
- */
-struct zvol_state {
- char zv_name[MAXNAMELEN]; /* name */
- uint64_t zv_volsize; /* advertised space */
- uint64_t zv_volblocksize; /* volume block size */
- objset_t *zv_objset; /* objset handle */
- uint32_t zv_flags; /* ZVOL_* flags */
- uint32_t zv_open_count; /* open counts */
- uint32_t zv_changed; /* disk changed */
- zilog_t *zv_zilog; /* ZIL handle */
- rangelock_t zv_rangelock; /* for range locking */
- dnode_t *zv_dn; /* dnode hold */
- dev_t zv_dev; /* device id */
- struct gendisk *zv_disk; /* generic disk */
- struct request_queue *zv_queue; /* request queue */
- dataset_kstats_t zv_kstat; /* zvol kstats */
- list_node_t zv_next; /* next zvol_state_t linkage */
- uint64_t zv_hash; /* name hash */
- struct hlist_node zv_hlink; /* hash link */
- kmutex_t zv_state_lock; /* protects zvol_state_t */
- atomic_t zv_suspend_ref; /* refcount for suspend */
- krwlock_t zv_suspend_lock; /* suspend lock */
-};
+struct hlist_head *zvol_htable;
+list_t zvol_state_list;
+krwlock_t zvol_state_lock;
+const zvol_platform_ops_t *ops;
typedef enum {
ZVOL_ASYNC_CREATE_MINORS,
@@ -154,19 +114,12 @@ typedef struct {
uint64_t value;
} zvol_task_t;
-#define ZVOL_RDONLY 0x1
-/*
- * Whether the zvol has been written to (as opposed to ZVOL_RDONLY, which
- * specifies whether or not the zvol _can_ be written to)
- */
-#define ZVOL_WRITTEN_TO 0x2
-
-static uint64_t
+uint64_t
zvol_name_hash(const char *name)
{
int i;
uint64_t crc = -1ULL;
- uint8_t *p = (uint8_t *)name;
+ const uint8_t *p = (const uint8_t *)name;
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) {
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF];
@@ -175,38 +128,13 @@ zvol_name_hash(const char *name)
}
/*
- * Find a zvol_state_t given the full major+minor dev_t. If found,
- * return with zv_state_lock taken, otherwise, return (NULL) without
- * taking zv_state_lock.
- */
-static zvol_state_t *
-zvol_find_by_dev(dev_t dev)
-{
- zvol_state_t *zv;
-
- rw_enter(&zvol_state_lock, RW_READER);
- for (zv = list_head(&zvol_state_list); zv != NULL;
- zv = list_next(&zvol_state_list, zv)) {
- mutex_enter(&zv->zv_state_lock);
- if (zv->zv_dev == dev) {
- rw_exit(&zvol_state_lock);
- return (zv);
- }
- mutex_exit(&zv->zv_state_lock);
- }
- rw_exit(&zvol_state_lock);
-
- return (NULL);
-}
-
-/*
* Find a zvol_state_t given the name and hash generated by zvol_name_hash.
* If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
* return (NULL) without the taking locks. The zv_suspend_lock is always taken
* before zv_state_lock. The mode argument indicates the mode (including none)
* for zv_suspend_lock to be taken.
*/
-static zvol_state_t *
+zvol_state_t *
zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
{
zvol_state_t *zv;
@@ -258,29 +186,6 @@ zvol_find_by_name(const char *name, int mode)
return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode));
}
-
-/*
- * Given a path, return TRUE if path is a ZVOL.
- */
-boolean_t
-zvol_is_zvol(const char *device)
-{
- struct block_device *bdev;
- unsigned int major;
-
- bdev = vdev_lookup_bdev(device);
- if (IS_ERR(bdev))
- return (B_FALSE);
-
- major = MAJOR(bdev->bd_dev);
- bdput(bdev);
-
- if (major == zvol_major)
- return (B_TRUE);
-
- return (B_FALSE);
-}
-
/*
* ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
*/
@@ -407,7 +312,6 @@ int
zvol_set_volsize(const char *name, uint64_t volsize)
{
objset_t *os = NULL;
- struct gendisk *disk = NULL;
uint64_t readonly;
int error;
boolean_t owned = B_FALSE;
@@ -450,7 +354,6 @@ zvol_set_volsize(const char *name, uint64_t volsize)
if (error == 0 && zv != NULL) {
zv->zv_volsize = volsize;
zv->zv_changed = 1;
- disk = zv->zv_disk;
}
out:
kmem_free(doi, sizeof (dmu_object_info_t));
@@ -466,8 +369,8 @@ out:
if (zv != NULL)
mutex_exit(&zv->zv_state_lock);
- if (disk != NULL)
- revalidate_disk(disk);
+ if (error == 0 && zv != NULL)
+ ops->zv_update_volsize(zv, volsize);
return (SET_ERROR(error));
}
@@ -657,7 +560,7 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
*/
ssize_t zvol_immediate_write_sz = 32768;
-static void
+void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
uint64_t size, int sync)
{
@@ -717,92 +620,10 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
}
}
-typedef struct zv_request {
- zvol_state_t *zv;
- struct bio *bio;
- locked_range_t *lr;
-} zv_request_t;
-
-static void
-uio_from_bio(uio_t *uio, struct bio *bio)
-{
- uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
- uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
- uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;
- uio->uio_segflg = UIO_BVEC;
- uio->uio_limit = MAXOFFSET_T;
- uio->uio_resid = BIO_BI_SIZE(bio);
- uio->uio_skip = BIO_BI_SKIP(bio);
-}
-
-static void
-zvol_write(void *arg)
-{
- int error = 0;
-
- zv_request_t *zvr = arg;
- struct bio *bio = zvr->bio;
- uio_t uio = { { 0 }, 0 };
- uio_from_bio(&uio, bio);
-
- zvol_state_t *zv = zvr->zv;
- ASSERT(zv && zv->zv_open_count > 0);
- ASSERT(zv->zv_zilog != NULL);
-
- ssize_t start_resid = uio.uio_resid;
- unsigned long start_jif = jiffies;
- blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
- &zv->zv_disk->part0);
-
- boolean_t sync =
- bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
-
- uint64_t volsize = zv->zv_volsize;
- while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
- uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
- uint64_t off = uio.uio_loffset;
- dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
-
- if (bytes > volsize - off) /* don't write past the end */
- bytes = volsize - off;
-
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
-
- /* This will only fail for ENOSPC */
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- break;
- }
- error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
- if (error == 0) {
- zvol_log_write(zv, tx, off, bytes, sync);
- }
- dmu_tx_commit(tx);
-
- if (error)
- break;
- }
- rangelock_exit(zvr->lr);
-
- int64_t nwritten = start_resid - uio.uio_resid;
- dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
- task_io_account_write(nwritten);
-
- if (sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
-
- rw_exit(&zv->zv_suspend_lock);
- blk_generic_end_io_acct(zv->zv_queue, WRITE, &zv->zv_disk->part0,
- start_jif);
- BIO_END_IO(bio, -error);
- kmem_free(zvr, sizeof (zv_request_t));
-}
-
/*
* Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
*/
-static void
+void
zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
boolean_t sync)
{
@@ -823,119 +644,6 @@ zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
zil_itx_assign(zilog, itx, tx);
}
-static void
-zvol_discard(void *arg)
-{
- zv_request_t *zvr = arg;
- struct bio *bio = zvr->bio;
- zvol_state_t *zv = zvr->zv;
- uint64_t start = BIO_BI_SECTOR(bio) << 9;
- uint64_t size = BIO_BI_SIZE(bio);
- uint64_t end = start + size;
- boolean_t sync;
- int error = 0;
- dmu_tx_t *tx;
- unsigned long start_jif;
-
- ASSERT(zv && zv->zv_open_count > 0);
- ASSERT(zv->zv_zilog != NULL);
-
- start_jif = jiffies;
- blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
- &zv->zv_disk->part0);
-
- sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
-
- if (end > zv->zv_volsize) {
- error = SET_ERROR(EIO);
- goto unlock;
- }
-
- /*
- * Align the request to volume block boundaries when a secure erase is
- * not required. This will prevent dnode_free_range() from zeroing out
- * the unaligned parts which is slow (read-modify-write) and useless
- * since we are not freeing any space by doing so.
- */
- if (!bio_is_secure_erase(bio)) {
- start = P2ROUNDUP(start, zv->zv_volblocksize);
- end = P2ALIGN(end, zv->zv_volblocksize);
- size = end - start;
- }
-
- if (start >= end)
- goto unlock;
-
- tx = dmu_tx_create(zv->zv_objset);
- dmu_tx_mark_netfree(tx);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error != 0) {
- dmu_tx_abort(tx);
- } else {
- zvol_log_truncate(zv, tx, start, size, B_TRUE);
- dmu_tx_commit(tx);
- error = dmu_free_long_range(zv->zv_objset,
- ZVOL_OBJ, start, size);
- }
-unlock:
- rangelock_exit(zvr->lr);
-
- if (error == 0 && sync)
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
-
- rw_exit(&zv->zv_suspend_lock);
- blk_generic_end_io_acct(zv->zv_queue, WRITE, &zv->zv_disk->part0,
- start_jif);
- BIO_END_IO(bio, -error);
- kmem_free(zvr, sizeof (zv_request_t));
-}
-
-static void
-zvol_read(void *arg)
-{
- int error = 0;
-
- zv_request_t *zvr = arg;
- struct bio *bio = zvr->bio;
- uio_t uio = { { 0 }, 0 };
- uio_from_bio(&uio, bio);
-
- zvol_state_t *zv = zvr->zv;
- ASSERT(zv && zv->zv_open_count > 0);
-
- ssize_t start_resid = uio.uio_resid;
- unsigned long start_jif = jiffies;
- blk_generic_start_io_acct(zv->zv_queue, READ, bio_sectors(bio),
- &zv->zv_disk->part0);
-
- uint64_t volsize = zv->zv_volsize;
- while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
- uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
-
- /* don't read past the end */
- if (bytes > volsize - uio.uio_loffset)
- bytes = volsize - uio.uio_loffset;
-
- error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
- if (error) {
- /* convert checksum errors into IO errors */
- if (error == ECKSUM)
- error = SET_ERROR(EIO);
- break;
- }
- }
- rangelock_exit(zvr->lr);
-
- int64_t nread = start_resid - uio.uio_resid;
- dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
- task_io_account_read(nread);
-
- rw_exit(&zv->zv_suspend_lock);
- blk_generic_end_io_acct(zv->zv_queue, READ, &zv->zv_disk->part0,
- start_jif);
- BIO_END_IO(bio, -error);
- kmem_free(zvr, sizeof (zv_request_t));
-}
/* ARGSUSED */
static void
@@ -952,7 +660,7 @@ zvol_get_done(zgd_t *zgd, int error)
/*
* Get data to generate a TX_WRITE intent log record.
*/
-static int
+int
zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
zvol_state_t *zv = arg;
@@ -1017,141 +725,14 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
return (SET_ERROR(error));
}
-static MAKE_REQUEST_FN_RET
-zvol_request(struct request_queue *q, struct bio *bio)
-{
- zvol_state_t *zv = q->queuedata;
- fstrans_cookie_t cookie = spl_fstrans_mark();
- uint64_t offset = BIO_BI_SECTOR(bio) << 9;
- uint64_t size = BIO_BI_SIZE(bio);
- int rw = bio_data_dir(bio);
- zv_request_t *zvr;
-
- if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
- printk(KERN_INFO
- "%s: bad access: offset=%llu, size=%lu\n",
- zv->zv_disk->disk_name,
- (long long unsigned)offset,
- (long unsigned)size);
-
- BIO_END_IO(bio, -SET_ERROR(EIO));
- goto out;
- }
-
- if (rw == WRITE) {
- boolean_t need_sync = B_FALSE;
-
- if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
- BIO_END_IO(bio, -SET_ERROR(EROFS));
- goto out;
- }
-
- /*
- * To be released in the I/O function. See the comment on
- * rangelock_enter() below.
- */
- rw_enter(&zv->zv_suspend_lock, RW_READER);
-
- /*
- * Open a ZIL if this is the first time we have written to this
- * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
- * than zv_state_lock so that we don't need to acquire an
- * additional lock in this path.
- */
- if (zv->zv_zilog == NULL) {
- rw_exit(&zv->zv_suspend_lock);
- rw_enter(&zv->zv_suspend_lock, RW_WRITER);
- if (zv->zv_zilog == NULL) {
- zv->zv_zilog = zil_open(zv->zv_objset,
- zvol_get_data);
- zv->zv_flags |= ZVOL_WRITTEN_TO;
- }
- rw_downgrade(&zv->zv_suspend_lock);
- }
-
- /* bio marked as FLUSH need to flush before write */
- if (bio_is_flush(bio))
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
-
- /* Some requests are just for flush and nothing else. */
- if (size == 0) {
- rw_exit(&zv->zv_suspend_lock);
- BIO_END_IO(bio, 0);
- goto out;
- }
-
- zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
- zvr->zv = zv;
- zvr->bio = bio;
-
- /*
- * To be released in the I/O function. Since the I/O functions
- * are asynchronous, we take it here synchronously to make
- * sure overlapped I/Os are properly ordered.
- */
- zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size,
- RL_WRITER);
- /*
- * Sync writes and discards execute zil_commit() which may need
- * to take a RL_READER lock on the whole block being modified
- * via its zillog->zl_get_data(): to avoid circular dependency
- * issues with taskq threads execute these requests
- * synchronously here in zvol_request().
- */
- need_sync = bio_is_fua(bio) ||
- zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
- if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
- if (zvol_request_sync || need_sync ||
- taskq_dispatch(zvol_taskq, zvol_discard, zvr,
- TQ_SLEEP) == TASKQID_INVALID)
- zvol_discard(zvr);
- } else {
- if (zvol_request_sync || need_sync ||
- taskq_dispatch(zvol_taskq, zvol_write, zvr,
- TQ_SLEEP) == TASKQID_INVALID)
- zvol_write(zvr);
- }
- } else {
- /*
- * The SCST driver, and possibly others, may issue READ I/Os
- * with a length of zero bytes. These empty I/Os contain no
- * data and require no additional handling.
- */
- if (size == 0) {
- BIO_END_IO(bio, 0);
- goto out;
- }
-
- zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
- zvr->zv = zv;
- zvr->bio = bio;
-
- rw_enter(&zv->zv_suspend_lock, RW_READER);
-
- zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size,
- RL_READER);
- if (zvol_request_sync || taskq_dispatch(zvol_taskq,
- zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID)
- zvol_read(zvr);
- }
-
-out:
- spl_fstrans_unmark(cookie);
-#ifdef HAVE_MAKE_REQUEST_FN_RET_INT
- return (0);
-#elif defined(HAVE_MAKE_REQUEST_FN_RET_QC)
- return (BLK_QC_T_NONE);
-#endif
-}
-
/*
* The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
*/
-static void
+
+void
zvol_insert(zvol_state_t *zv)
{
ASSERT(RW_WRITE_HELD(&zvol_state_lock));
- ASSERT3U(MINOR(zv->zv_dev) & ZVOL_MINOR_MASK, ==, 0);
list_insert_head(&zvol_state_list, zv);
hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
}
@@ -1196,15 +777,15 @@ zvol_setup_zv(zvol_state_t *zv)
if (error)
return (SET_ERROR(error));
- set_capacity(zv->zv_disk, volsize >> 9);
+ ops->zv_set_capacity(zv, volsize >> 9);
zv->zv_volsize = volsize;
if (ro || dmu_objset_is_snapshot(os) ||
!spa_writeable(dmu_objset_spa(os))) {
- set_disk_ro(zv->zv_disk, 1);
+ ops->zv_set_disk_ro(zv, 1);
zv->zv_flags |= ZVOL_RDONLY;
} else {
- set_disk_ro(zv->zv_disk, 0);
+ ops->zv_set_disk_ro(zv, 0);
zv->zv_flags &= ~ZVOL_RDONLY;
}
return (0);
@@ -1313,7 +894,7 @@ zvol_resume(zvol_state_t *zv)
return (SET_ERROR(error));
}
-static int
+int
zvol_first_open(zvol_state_t *zv, boolean_t readonly)
{
objset_t *os;
@@ -1344,7 +925,7 @@ zvol_first_open(zvol_state_t *zv, boolean_t readonly)
if (!mutex_owned(&spa_namespace_lock)) {
locked = mutex_tryenter(&spa_namespace_lock);
if (!locked)
- return (-SET_ERROR(ERESTARTSYS));
+ return (SET_ERROR(EINTR));
}
ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
@@ -1364,10 +945,10 @@ zvol_first_open(zvol_state_t *zv, boolean_t readonly)
out_mutex:
if (locked)
mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(-error));
+ return (SET_ERROR(error));
}
-static void
+void
zvol_last_close(zvol_state_t *zv)
{
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
@@ -1379,578 +960,6 @@ zvol_last_close(zvol_state_t *zv)
zv->zv_objset = NULL;
}
-static int
-zvol_open(struct block_device *bdev, fmode_t flag)
-{
- zvol_state_t *zv;
- int error = 0;
- boolean_t drop_suspend = B_TRUE;
-
- rw_enter(&zvol_state_lock, RW_READER);
- /*
- * Obtain a copy of private_data under the zvol_state_lock to make
- * sure that either the result of zvol free code path setting
- * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
- * is not called on this zv because of the positive zv_open_count.
- */
- zv = bdev->bd_disk->private_data;
- if (zv == NULL) {
- rw_exit(&zvol_state_lock);
- return (SET_ERROR(-ENXIO));
- }
-
- mutex_enter(&zv->zv_state_lock);
- /*
- * make sure zvol is not suspended during first open
- * (hold zv_suspend_lock) and respect proper lock acquisition
- * ordering - zv_suspend_lock before zv_state_lock
- */
- if (zv->zv_open_count == 0) {
- if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
- mutex_exit(&zv->zv_state_lock);
- rw_enter(&zv->zv_suspend_lock, RW_READER);
- mutex_enter(&zv->zv_state_lock);
- /* check to see if zv_suspend_lock is needed */
- if (zv->zv_open_count != 0) {
- rw_exit(&zv->zv_suspend_lock);
- drop_suspend = B_FALSE;
- }
- }
- } else {
- drop_suspend = B_FALSE;
- }
- rw_exit(&zvol_state_lock);
-
- ASSERT(MUTEX_HELD(&zv->zv_state_lock));
- ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock));
-
- if (zv->zv_open_count == 0) {
- error = zvol_first_open(zv, !(flag & FMODE_WRITE));
- if (error)
- goto out_mutex;
- }
-
- if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
- error = -EROFS;
- goto out_open_count;
- }
-
- zv->zv_open_count++;
-
- mutex_exit(&zv->zv_state_lock);
- if (drop_suspend)
- rw_exit(&zv->zv_suspend_lock);
-
- check_disk_change(bdev);
-
- return (0);
-
-out_open_count:
- if (zv->zv_open_count == 0)
- zvol_last_close(zv);
-
-out_mutex:
- mutex_exit(&zv->zv_state_lock);
- if (drop_suspend)
- rw_exit(&zv->zv_suspend_lock);
- if (error == -ERESTARTSYS)
- schedule();
-
- return (SET_ERROR(error));
-}
-
-#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
-static void
-#else
-static int
-#endif
-zvol_release(struct gendisk *disk, fmode_t mode)
-{
- zvol_state_t *zv;
- boolean_t drop_suspend = B_TRUE;
-
- rw_enter(&zvol_state_lock, RW_READER);
- zv = disk->private_data;
-
- mutex_enter(&zv->zv_state_lock);
- ASSERT(zv->zv_open_count > 0);
- /*
- * make sure zvol is not suspended during last close
- * (hold zv_suspend_lock) and respect proper lock acquisition
- * ordering - zv_suspend_lock before zv_state_lock
- */
- if (zv->zv_open_count == 1) {
- if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
- mutex_exit(&zv->zv_state_lock);
- rw_enter(&zv->zv_suspend_lock, RW_READER);
- mutex_enter(&zv->zv_state_lock);
- /* check to see if zv_suspend_lock is needed */
- if (zv->zv_open_count != 1) {
- rw_exit(&zv->zv_suspend_lock);
- drop_suspend = B_FALSE;
- }
- }
- } else {
- drop_suspend = B_FALSE;
- }
- rw_exit(&zvol_state_lock);
-
- ASSERT(MUTEX_HELD(&zv->zv_state_lock));
- ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock));
-
- zv->zv_open_count--;
- if (zv->zv_open_count == 0)
- zvol_last_close(zv);
-
- mutex_exit(&zv->zv_state_lock);
-
- if (drop_suspend)
- rw_exit(&zv->zv_suspend_lock);
-
-#ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
- return (0);
-#endif
-}
-
-static int
-zvol_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
-{
- zvol_state_t *zv = bdev->bd_disk->private_data;
- int error = 0;
-
- ASSERT3U(zv->zv_open_count, >, 0);
-
- switch (cmd) {
- case BLKFLSBUF:
- fsync_bdev(bdev);
- invalidate_bdev(bdev);
- rw_enter(&zv->zv_suspend_lock, RW_READER);
-
- if (!(zv->zv_flags & ZVOL_RDONLY))
- txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
-
- rw_exit(&zv->zv_suspend_lock);
- break;
-
- case BLKZNAME:
- mutex_enter(&zv->zv_state_lock);
- error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
- mutex_exit(&zv->zv_state_lock);
- break;
-
- default:
- error = -ENOTTY;
- break;
- }
-
- return (SET_ERROR(error));
-}
-
-#ifdef CONFIG_COMPAT
-static int
-zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned cmd, unsigned long arg)
-{
- return (zvol_ioctl(bdev, mode, cmd, arg));
-}
-#else
-#define zvol_compat_ioctl NULL
-#endif
-
-/*
- * Linux 2.6.38 preferred interface.
- */
-#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
-static unsigned int
-zvol_check_events(struct gendisk *disk, unsigned int clearing)
-{
- unsigned int mask = 0;
-
- rw_enter(&zvol_state_lock, RW_READER);
-
- zvol_state_t *zv = disk->private_data;
- if (zv != NULL) {
- mutex_enter(&zv->zv_state_lock);
- mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
- zv->zv_changed = 0;
- mutex_exit(&zv->zv_state_lock);
- }
-
- rw_exit(&zvol_state_lock);
-
- return (mask);
-}
-#else
-static int zvol_media_changed(struct gendisk *disk)
-{
- int changed = 0;
-
- rw_enter(&zvol_state_lock, RW_READER);
-
- zvol_state_t *zv = disk->private_data;
- if (zv != NULL) {
- mutex_enter(&zv->zv_state_lock);
- changed = zv->zv_changed;
- zv->zv_changed = 0;
- mutex_exit(&zv->zv_state_lock);
- }
-
- rw_exit(&zvol_state_lock);
-
- return (changed);
-}
-#endif
-
-static int zvol_revalidate_disk(struct gendisk *disk)
-{
- rw_enter(&zvol_state_lock, RW_READER);
-
- zvol_state_t *zv = disk->private_data;
- if (zv != NULL) {
- mutex_enter(&zv->zv_state_lock);
- set_capacity(zv->zv_disk, zv->zv_volsize >> SECTOR_BITS);
- mutex_exit(&zv->zv_state_lock);
- }
-
- rw_exit(&zvol_state_lock);
-
- return (0);
-}
-
-/*
- * Provide a simple virtual geometry for legacy compatibility. For devices
- * smaller than 1 MiB a small head and sector count is used to allow very
- * tiny devices. For devices over 1 Mib a standard head and sector count
- * is used to keep the cylinders count reasonable.
- */
-static int
-zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
- zvol_state_t *zv = bdev->bd_disk->private_data;
- sector_t sectors;
-
- ASSERT3U(zv->zv_open_count, >, 0);
-
- sectors = get_capacity(zv->zv_disk);
-
- if (sectors > 2048) {
- geo->heads = 16;
- geo->sectors = 63;
- } else {
- geo->heads = 2;
- geo->sectors = 4;
- }
-
- geo->start = 0;
- geo->cylinders = sectors / (geo->heads * geo->sectors);
-
- return (0);
-}
-
-static struct kobject *
-zvol_probe(dev_t dev, int *part, void *arg)
-{
- zvol_state_t *zv;
- struct kobject *kobj;
-
- zv = zvol_find_by_dev(dev);
- kobj = zv ? get_disk_and_module(zv->zv_disk) : NULL;
- ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock));
- if (zv)
- mutex_exit(&zv->zv_state_lock);
-
- return (kobj);
-}
-
-static struct block_device_operations zvol_ops = {
- .open = zvol_open,
- .release = zvol_release,
- .ioctl = zvol_ioctl,
- .compat_ioctl = zvol_compat_ioctl,
-#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
- .check_events = zvol_check_events,
-#else
- .media_changed = zvol_media_changed,
-#endif
- .revalidate_disk = zvol_revalidate_disk,
- .getgeo = zvol_getgeo,
- .owner = THIS_MODULE,
-};
-
-/*
- * Allocate memory for a new zvol_state_t and setup the required
- * request queue and generic disk structures for the block device.
- */
-static zvol_state_t *
-zvol_alloc(dev_t dev, const char *name)
-{
- zvol_state_t *zv;
- uint64_t volmode;
-
- if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
- return (NULL);
-
- if (volmode == ZFS_VOLMODE_DEFAULT)
- volmode = zvol_volmode;
-
- if (volmode == ZFS_VOLMODE_NONE)
- return (NULL);
-
- zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
-
- list_link_init(&zv->zv_next);
-
- mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
-
- zv->zv_queue = blk_alloc_queue(GFP_ATOMIC);
- if (zv->zv_queue == NULL)
- goto out_kmem;
-
- blk_queue_make_request(zv->zv_queue, zvol_request);
- blk_queue_set_write_cache(zv->zv_queue, B_TRUE, B_TRUE);
-
- /* Limit read-ahead to a single page to prevent over-prefetching. */
- blk_queue_set_read_ahead(zv->zv_queue, 1);
-
- /* Disable write merging in favor of the ZIO pipeline. */
- blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zv->zv_queue);
-
- zv->zv_disk = alloc_disk(ZVOL_MINORS);
- if (zv->zv_disk == NULL)
- goto out_queue;
-
- zv->zv_queue->queuedata = zv;
- zv->zv_dev = dev;
- zv->zv_open_count = 0;
- strlcpy(zv->zv_name, name, MAXNAMELEN);
-
- rangelock_init(&zv->zv_rangelock, NULL, NULL);
- rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
-
- zv->zv_disk->major = zvol_major;
-#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
- zv->zv_disk->events = DISK_EVENT_MEDIA_CHANGE;
-#endif
-
- if (volmode == ZFS_VOLMODE_DEV) {
- /*
- * ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set
- * gendisk->minors = 1 as noted in include/linux/genhd.h.
- * Also disable extended partition numbers (GENHD_FL_EXT_DEVT)
- * and suppresses partition scanning (GENHD_FL_NO_PART_SCAN)
- * setting gendisk->flags accordingly.
- */
- zv->zv_disk->minors = 1;
-#if defined(GENHD_FL_EXT_DEVT)
- zv->zv_disk->flags &= ~GENHD_FL_EXT_DEVT;
-#endif
-#if defined(GENHD_FL_NO_PART_SCAN)
- zv->zv_disk->flags |= GENHD_FL_NO_PART_SCAN;
-#endif
- }
- zv->zv_disk->first_minor = (dev & MINORMASK);
- zv->zv_disk->fops = &zvol_ops;
- zv->zv_disk->private_data = zv;
- zv->zv_disk->queue = zv->zv_queue;
- snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "%s%d",
- ZVOL_DEV_NAME, (dev & MINORMASK));
-
- return (zv);
-
-out_queue:
- blk_cleanup_queue(zv->zv_queue);
-out_kmem:
- kmem_free(zv, sizeof (zvol_state_t));
-
- return (NULL);
-}
-
-/*
- * Cleanup then free a zvol_state_t which was created by zvol_alloc().
- * At this time, the structure is not opened by anyone, is taken off
- * the zvol_state_list, and has its private data set to NULL.
- * The zvol_state_lock is dropped.
- */
-static void
-zvol_free(void *arg)
-{
- zvol_state_t *zv = arg;
-
- ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
- ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
- ASSERT(zv->zv_open_count == 0);
- ASSERT(zv->zv_disk->private_data == NULL);
-
- rw_destroy(&zv->zv_suspend_lock);
- rangelock_fini(&zv->zv_rangelock);
-
- del_gendisk(zv->zv_disk);
- blk_cleanup_queue(zv->zv_queue);
- put_disk(zv->zv_disk);
-
- ida_simple_remove(&zvol_ida, MINOR(zv->zv_dev) >> ZVOL_MINOR_BITS);
-
- mutex_destroy(&zv->zv_state_lock);
- dataset_kstats_destroy(&zv->zv_kstat);
-
- kmem_free(zv, sizeof (zvol_state_t));
-}
-
-/*
- * Create a block device minor node and setup the linkage between it
- * and the specified volume. Once this function returns the block
- * device is live and ready for use.
- */
-static int
-zvol_create_minor_impl(const char *name)
-{
- zvol_state_t *zv;
- objset_t *os;
- dmu_object_info_t *doi;
- uint64_t volsize;
- uint64_t len;
- unsigned minor = 0;
- int error = 0;
- int idx;
- uint64_t hash = zvol_name_hash(name);
-
- if (zvol_inhibit_dev)
- return (0);
-
- idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
- if (idx < 0)
- return (SET_ERROR(-idx));
- minor = idx << ZVOL_MINOR_BITS;
-
- zv = zvol_find_by_name_hash(name, hash, RW_NONE);
- if (zv) {
- ASSERT(MUTEX_HELD(&zv->zv_state_lock));
- mutex_exit(&zv->zv_state_lock);
- ida_simple_remove(&zvol_ida, idx);
- return (SET_ERROR(EEXIST));
- }
-
- doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
-
- error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
- if (error)
- goto out_doi;
-
- error = dmu_object_info(os, ZVOL_OBJ, doi);
- if (error)
- goto out_dmu_objset_disown;
-
- error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
- if (error)
- goto out_dmu_objset_disown;
-
- zv = zvol_alloc(MKDEV(zvol_major, minor), name);
- if (zv == NULL) {
- error = SET_ERROR(EAGAIN);
- goto out_dmu_objset_disown;
- }
- zv->zv_hash = hash;
-
- if (dmu_objset_is_snapshot(os))
- zv->zv_flags |= ZVOL_RDONLY;
-
- zv->zv_volblocksize = doi->doi_data_block_size;
- zv->zv_volsize = volsize;
- zv->zv_objset = os;
-
- set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
-
- blk_queue_max_hw_sectors(zv->zv_queue, (DMU_MAX_ACCESS / 4) >> 9);
- blk_queue_max_segments(zv->zv_queue, UINT16_MAX);
- blk_queue_max_segment_size(zv->zv_queue, UINT_MAX);
- blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
- blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
- blk_queue_max_discard_sectors(zv->zv_queue,
- (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
- blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
- blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_queue);
-#ifdef QUEUE_FLAG_NONROT
- blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_queue);
-#endif
-#ifdef QUEUE_FLAG_ADD_RANDOM
- blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue);
-#endif
- /* This flag was introduced in kernel version 4.12. */
-#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
- blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_queue);
-#endif
-
- if (spa_writeable(dmu_objset_spa(os))) {
- if (zil_replay_disable)
- zil_destroy(dmu_objset_zil(os), B_FALSE);
- else
- zil_replay(os, zv, zvol_replay_vector);
- }
- ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
- dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
-
- /*
- * When udev detects the addition of the device it will immediately
- * invoke blkid(8) to determine the type of content on the device.
- * Prefetching the blocks commonly scanned by blkid(8) will speed
- * up this process.
- */
- len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
- if (len > 0) {
- dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
- dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
- ZIO_PRIORITY_SYNC_READ);
- }
-
- zv->zv_objset = NULL;
-out_dmu_objset_disown:
- dmu_objset_disown(os, B_TRUE, FTAG);
-out_doi:
- kmem_free(doi, sizeof (dmu_object_info_t));
-
- if (error == 0) {
- rw_enter(&zvol_state_lock, RW_WRITER);
- zvol_insert(zv);
- rw_exit(&zvol_state_lock);
- add_disk(zv->zv_disk);
- } else {
- ida_simple_remove(&zvol_ida, idx);
- }
-
- return (SET_ERROR(error));
-}
-
-/*
- * Rename a block device minor mode for the specified volume.
- */
-static void
-zvol_rename_minor(zvol_state_t *zv, const char *newname)
-{
- int readonly = get_disk_ro(zv->zv_disk);
-
- ASSERT(RW_LOCK_HELD(&zvol_state_lock));
- ASSERT(MUTEX_HELD(&zv->zv_state_lock));
-
- strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
-
- /* move to new hashtable entry */
- zv->zv_hash = zvol_name_hash(zv->zv_name);
- hlist_del(&zv->zv_hlink);
- hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
-
- /*
- * The block device's read-only state is briefly changed causing
- * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
- * the name change and fixes the symlinks. This does not change
- * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
- * changes. This would normally be done using kobject_uevent() but
- * that is a GPL-only symbol which is why we need this workaround.
- */
- set_disk_ro(zv->zv_disk, !readonly);
- set_disk_ro(zv->zv_disk, readonly);
-}
-
typedef struct minors_job {
list_t *list;
list_node_t link;
@@ -2060,7 +1069,7 @@ zvol_create_minors_cb(const char *dsname, void *arg)
* traverse snapshots only, do not traverse children,
* and skip the 'dsname'
*/
- error = dmu_objset_find((char *)dsname,
+ error = dmu_objset_find(dsname,
zvol_create_snap_minor_cb, (void *)job,
DS_FIND_SNAPSHOTS);
}
@@ -2124,7 +1133,7 @@ zvol_create_minors_impl(const char *name)
&snapdev, NULL);
if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
- error = zvol_create_minor_impl(name);
+ error = ops->zv_create_minor(name);
} else {
cookie = spl_fstrans_mark();
error = dmu_objset_find(parent, zvol_create_minors_cb,
@@ -2142,7 +1151,7 @@ zvol_create_minors_impl(const char *name)
while ((job = list_head(&minors_list)) != NULL) {
list_remove(&minors_list, job);
if (!job->error)
- zvol_create_minor_impl(job->name);
+ ops->zv_create_minor(job->name);
strfree(job->name);
kmem_free(job, sizeof (minors_job_t));
}
@@ -2155,7 +1164,8 @@ zvol_create_minors_impl(const char *name)
/*
* Remove minors for specified dataset including children and snapshots.
*/
-static void
+
+void
zvol_remove_minors_impl(const char *name)
{
zvol_state_t *zv, *zv_next;
@@ -2197,14 +1207,14 @@ zvol_remove_minors_impl(const char *name)
* Cleared while holding zvol_state_lock as a writer
* which will prevent zvol_open() from opening it.
*/
- zv->zv_disk->private_data = NULL;
+ ops->zv_clear_private(zv);
/* Drop zv_state_lock before zvol_free() */
mutex_exit(&zv->zv_state_lock);
/* Try parallel zv_free, if failed do it in place */
- t = taskq_dispatch(system_taskq, zvol_free, zv,
- TQ_SLEEP);
+ t = taskq_dispatch(system_taskq,
+ (task_func_t *)ops->zv_free, zv, TQ_SLEEP);
if (t == TASKQID_INVALID)
list_insert_head(&free_list, zv);
else
@@ -2218,7 +1228,7 @@ zvol_remove_minors_impl(const char *name)
/* Drop zvol_state_lock before calling zvol_free() */
while ((zv = list_head(&free_list)) != NULL) {
list_remove(&free_list, zv);
- zvol_free(zv);
+ ops->zv_free(zv);
}
if (tid != TASKQID_INVALID)
@@ -2254,12 +1264,7 @@ zvol_remove_minor_impl(const char *name)
}
zvol_remove(zv);
- /*
- * Cleared while holding zvol_state_lock as a writer
- * which will prevent zvol_open() from opening it.
- */
- zv->zv_disk->private_data = NULL;
-
+ ops->zv_clear_private(zv);
mutex_exit(&zv->zv_state_lock);
break;
} else {
@@ -2271,7 +1276,7 @@ zvol_remove_minor_impl(const char *name)
rw_exit(&zvol_state_lock);
if (zv != NULL)
- zvol_free(zv);
+ ops->zv_free(zv);
}
/*
@@ -2297,14 +1302,14 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
mutex_enter(&zv->zv_state_lock);
if (strcmp(zv->zv_name, oldname) == 0) {
- zvol_rename_minor(zv, newname);
+ ops->zv_rename_minor(zv, newname);
} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
(zv->zv_name[oldnamelen] == '/' ||
zv->zv_name[oldnamelen] == '@')) {
char *name = kmem_asprintf("%s%c%s", newname,
zv->zv_name[oldnamelen],
zv->zv_name + oldnamelen + 1);
- zvol_rename_minor(zv, name);
+ ops->zv_rename_minor(zv, name);
strfree(name);
}
@@ -2328,7 +1333,7 @@ zvol_set_snapdev_cb(const char *dsname, void *param)
switch (arg->snapdev) {
case ZFS_SNAPDEV_VISIBLE:
- (void) zvol_create_minor_impl(dsname);
+ (void) ops->zv_create_minor(dsname);
break;
case ZFS_SNAPDEV_HIDDEN:
(void) zvol_remove_minor_impl(dsname);
@@ -2378,14 +1383,14 @@ zvol_set_volmode_impl(char *name, uint64_t volmode)
case ZFS_VOLMODE_GEOM:
case ZFS_VOLMODE_DEV:
(void) zvol_remove_minor_impl(name);
- (void) zvol_create_minor_impl(name);
+ (void) ops->zv_create_minor(name);
break;
case ZFS_VOLMODE_DEFAULT:
(void) zvol_remove_minor_impl(name);
if (zvol_volmode == ZFS_VOLMODE_NONE)
break;
else /* if zvol_volmode is invalid defaults to "geom" */
- (void) zvol_create_minor_impl(name);
+ (void) ops->zv_create_minor(name);
break;
}
@@ -2680,92 +1685,40 @@ zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
taskq_wait_id(spa->spa_zvol_taskq, id);
}
+boolean_t
+zvol_is_zvol(const char *name)
+{
+
+ return (ops->zv_is_zvol(name));
+}
+
+void
+zvol_register_ops(const zvol_platform_ops_t *zvol_ops)
+{
+ ops = zvol_ops;
+}
+
int
-zvol_init(void)
+zvol_init_impl(void)
{
- int threads = MIN(MAX(zvol_threads, 1), 1024);
- int i, error;
+ int i;
list_create(&zvol_state_list, sizeof (zvol_state_t),
offsetof(zvol_state_t, zv_next));
rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL);
- ida_init(&zvol_ida);
-
- zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
- threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
- if (zvol_taskq == NULL) {
- printk(KERN_INFO "ZFS: taskq_create() failed\n");
- error = -ENOMEM;
- goto out;
- }
zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head),
KM_SLEEP);
- if (!zvol_htable) {
- error = -ENOMEM;
- goto out_taskq;
- }
for (i = 0; i < ZVOL_HT_SIZE; i++)
INIT_HLIST_HEAD(&zvol_htable[i]);
- error = register_blkdev(zvol_major, ZVOL_DRIVER);
- if (error) {
- printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
- goto out_free;
- }
-
- blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
- THIS_MODULE, zvol_probe, NULL, NULL);
-
return (0);
-
-out_free:
- kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
-out_taskq:
- taskq_destroy(zvol_taskq);
-out:
- ida_destroy(&zvol_ida);
- rw_destroy(&zvol_state_lock);
- list_destroy(&zvol_state_list);
-
- return (SET_ERROR(error));
}
void
-zvol_fini(void)
+zvol_fini_impl(void)
{
- zvol_remove_minors_impl(NULL);
-
- blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
- unregister_blkdev(zvol_major, ZVOL_DRIVER);
kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
-
- taskq_destroy(zvol_taskq);
list_destroy(&zvol_state_list);
rw_destroy(&zvol_state_lock);
-
- ida_destroy(&zvol_ida);
}
-
-/* BEGIN CSTYLED */
-module_param(zvol_inhibit_dev, uint, 0644);
-MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
-
-module_param(zvol_major, uint, 0444);
-MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
-
-module_param(zvol_threads, uint, 0444);
-MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
-
-module_param(zvol_request_sync, uint, 0644);
-MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
-
-module_param(zvol_max_discard_blocks, ulong, 0444);
-MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
-
-module_param(zvol_prefetch_bytes, uint, 0644);
-MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
-
-module_param(zvol_volmode, uint, 0644);
-MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
-/* END CSTYLED */