-rw-r--r--	module/os/linux/zfs/zvol_os.c	122
1 file changed, 78 insertions(+), 44 deletions(-)
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index ce719734c..9439954b8 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -18,6 +18,9 @@
*
* CDDL HEADER END
*/
+/*
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ */
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
@@ -57,7 +60,7 @@ static struct ida zvol_ida;
typedef struct zv_request {
zvol_state_t *zv;
struct bio *bio;
- zfs_locked_range_t *lr;
+ taskq_ent_t ent;
} zv_request_t;
/*
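The hunk above swaps the stored range lock for an embedded taskq_ent_t. With a pre-allocated entry, dispatch via taskq_dispatch_ent() cannot fail the way taskq_dispatch() can under memory pressure, which is why the fallback-to-synchronous checks disappear later in this diff. A minimal sketch of the pattern, with hypothetical demo_* names (only taskq_init_ent(), taskq_dispatch_ent(), and the kmem allocator come from the real SPL API):

#include <sys/taskq.h>
#include <sys/kmem.h>

typedef struct demo_req {
	struct bio *bio;
	taskq_ent_t ent;	/* pre-allocated dispatch entry */
} demo_req_t;

static void
demo_worker(void *arg)
{
	demo_req_t *req = arg;
	/* ... process req->bio, then BIO_END_IO() ... */
	kmem_free(req, sizeof (demo_req_t));
}

static void
demo_dispatch(taskq_t *tq, struct bio *bio)
{
	demo_req_t *req = kmem_alloc(sizeof (demo_req_t), KM_SLEEP);

	req->bio = bio;
	taskq_init_ent(&req->ent);
	/* Cannot fail: the entry lives in the request itself. */
	taskq_dispatch_ent(tq, demo_worker, req, 0, &req->ent);
}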
@@ -108,6 +111,18 @@ zvol_write(void *arg)
ASSERT(zv && zv->zv_open_count > 0);
ASSERT(zv->zv_zilog != NULL);
+ /* bios marked as FLUSH need to flush before write */
+ if (bio_is_flush(bio))
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+ /* Some requests are just for flush and nothing else. */
+ if (uio.uio_resid == 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ BIO_END_IO(bio, 0);
+ kmem_free(zvr, sizeof (zv_request_t));
+ return;
+ }
+
ssize_t start_resid = uio.uio_resid;
unsigned long start_jif = jiffies;
blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
@@ -116,6 +131,9 @@ zvol_write(void *arg)
boolean_t sync =
bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
+ uio.uio_loffset, uio.uio_resid, RL_WRITER);
+
uint64_t volsize = zv->zv_volsize;
while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
@@ -142,7 +160,7 @@ zvol_write(void *arg)
if (error)
break;
}
- zfs_rangelock_exit(zvr->lr);
+ zfs_rangelock_exit(lr);
int64_t nwritten = start_resid - uio.uio_resid;
dataset_kstats_update_write_kstats(&zv->zv_zso->zvo_kstat, nwritten);
@@ -201,6 +219,9 @@ zvol_discard(void *arg)
if (start >= end)
goto unlock;
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
+ start, size, RL_WRITER);
+
tx = dmu_tx_create(zv->zv_objset);
dmu_tx_mark_netfree(tx);
error = dmu_tx_assign(tx, TXG_WAIT);
@@ -212,12 +233,12 @@ zvol_discard(void *arg)
error = dmu_free_long_range(zv->zv_objset,
ZVOL_OBJ, start, size);
}
-unlock:
- zfs_rangelock_exit(zvr->lr);
+ zfs_rangelock_exit(lr);
if (error == 0 && sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
+unlock:
rw_exit(&zv->zv_suspend_lock);
blk_generic_end_io_acct(zv->zv_zso->zvo_queue, WRITE,
&zv->zv_zso->zvo_disk->part0, start_jif);
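Note the label shuffle in the two zvol_discard hunks above: the range lock is now taken after the start >= end early-out, so the goto unlock path must bypass both the lock release and the zil_commit(). Condensed, the resulting flow is (a sketch of the surrounding function, not a verbatim excerpt):

	if (start >= end)
		goto unlock;	/* nothing to free */

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);

	/* ... dmu_tx_assign() and dmu_free_long_range() ... */

	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
unlock:
	rw_exit(&zv->zv_suspend_lock);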
@@ -243,6 +264,9 @@ zvol_read(void *arg)
blk_generic_start_io_acct(zv->zv_zso->zvo_queue, READ, bio_sectors(bio),
&zv->zv_zso->zvo_disk->part0);
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
+ uio.uio_loffset, uio.uio_resid, RL_READER);
+
uint64_t volsize = zv->zv_volsize;
while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
@@ -259,7 +283,7 @@ zvol_read(void *arg)
break;
}
}
- zfs_rangelock_exit(zvr->lr);
+ zfs_rangelock_exit(lr);
int64_t nread = start_resid - uio.uio_resid;
dataset_kstats_update_read_kstats(&zv->zv_zso->zvo_kstat, nread);
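All three workers (zvol_write, zvol_discard, zvol_read) now follow the same bracket: acquire the range lock inside the taskq callback, immediately around the DMU access, instead of in zvol_request(). zfs_rangelock_enter() still blocks until no conflicting range is held, so overlapping I/Os cannot interleave mid-operation; only the thread that does the waiting has changed. A hedged sketch of the bracketing (the demo_ name is hypothetical):

static void
demo_read_range(zvol_state_t *zv, uint64_t off, uint64_t len)
{
	/* Taken in the worker, not in zvol_request(). */
	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    off, len, RL_READER);

	/* ... dmu_read_uio_dnode() loop, as in zvol_read() above ... */

	zfs_rangelock_exit(lr);
}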
@@ -294,16 +318,15 @@ zvol_request(struct request_queue *q, struct bio *bio)
}
if (rw == WRITE) {
- boolean_t need_sync = B_FALSE;
-
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
BIO_END_IO(bio, -SET_ERROR(EROFS));
goto out;
}
/*
- * To be released in the I/O function. See the comment on
- * rangelock_enter() below.
+ * Prevents the zvol from being suspended, or the ZIL being
+ * concurrently opened. Will be released after the i/o
+ * completes.
*/
rw_enter(&zv->zv_suspend_lock, RW_READER);
@@ -324,47 +347,55 @@ zvol_request(struct request_queue *q, struct bio *bio)
rw_downgrade(&zv->zv_suspend_lock);
}
- /* bio marked as FLUSH need to flush before write */
- if (bio_is_flush(bio))
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
-
- /* Some requests are just for flush and nothing else. */
- if (size == 0) {
- rw_exit(&zv->zv_suspend_lock);
- BIO_END_IO(bio, 0);
- goto out;
- }
-
zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
zvr->zv = zv;
zvr->bio = bio;
+ taskq_init_ent(&zvr->ent);
/*
- * To be released in the I/O function. Since the I/O functions
- * are asynchronous, we take it here synchronously to make
- * sure overlapped I/Os are properly ordered.
+ * We don't want this thread to be blocked waiting for i/o to
+ * complete, so we instead wait from a taskq callback. The
+ * i/o may be a ZIL write (via zil_commit()), or a read of an
+ * indirect block, or a read of a data block (if this is a
+ * partial-block write). We will indicate that the i/o is
+ * complete by calling BIO_END_IO() from the taskq callback.
+ *
+ * This design allows the calling thread to continue and
+ * initiate more concurrent operations by calling
+ * zvol_request() again. There are typically only a small
+ * number of threads available to call zvol_request() (e.g.
+ * one per iSCSI target), so keeping the latency of
+ * zvol_request() low is important for performance.
+ *
+ * The zvol_request_sync module parameter allows this
+ * behavior to be altered, for performance evaluation
+ * purposes. If the callback blocks, setting
+ * zvol_request_sync=1 will result in much worse performance.
+ *
+ * We can have up to zvol_threads concurrent i/o's being
+ * processed for all zvols on the system. This is typically
+ * a vast improvement over the zvol_request_sync=1 behavior
+ * of one i/o at a time per zvol. However, an even better
+ * design would be for zvol_request() to initiate the zio
+ * directly, and then be notified by the zio_done callback,
+ * which would call BIO_END_IO(). Unfortunately, the DMU/ZIL
+ * interfaces lack this functionality (they block waiting for
+ * the i/o to complete).
*/
- zvr->lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size,
- RL_WRITER);
- /*
- * Sync writes and discards execute zil_commit() which may need
- * to take a RL_READER lock on the whole block being modified
- * via its zillog->zl_get_data(): to avoid circular dependency
- * issues with taskq threads execute these requests
- * synchronously here in zvol_request().
- */
- need_sync = bio_is_fua(bio) ||
- zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
- if (zvol_request_sync || need_sync ||
- taskq_dispatch(zvol_taskq, zvol_discard, zvr,
- TQ_SLEEP) == TASKQID_INVALID)
+ if (zvol_request_sync) {
zvol_discard(zvr);
+ } else {
+ taskq_dispatch_ent(zvol_taskq,
+ zvol_discard, zvr, 0, &zvr->ent);
+ }
} else {
- if (zvol_request_sync || need_sync ||
- taskq_dispatch(zvol_taskq, zvol_write, zvr,
- TQ_SLEEP) == TASKQID_INVALID)
+ if (zvol_request_sync) {
zvol_write(zvr);
+ } else {
+ taskq_dispatch_ent(zvol_taskq,
+ zvol_write, zvr, 0, &zvr->ent);
+ }
}
} else {
/*
@@ -380,14 +411,17 @@ zvol_request(struct request_queue *q, struct bio *bio)
zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
zvr->zv = zv;
zvr->bio = bio;
+ taskq_init_ent(&zvr->ent);
rw_enter(&zv->zv_suspend_lock, RW_READER);
- zvr->lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size,
- RL_READER);
- if (zvol_request_sync || taskq_dispatch(zvol_taskq,
- zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID)
+ /* See comment in WRITE case above. */
+ if (zvol_request_sync) {
zvol_read(zvr);
+ } else {
+ taskq_dispatch_ent(zvol_taskq,
+ zvol_read, zvr, 0, &zvr->ent);
+ }
}
out:
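For reference, the zvol_request_sync and zvol_threads knobs discussed in the new comment are ordinary module parameters declared elsewhere in zvol_os.c. The sketch below shows the shape of those declarations; the default values and permission bits are assumptions about this vintage of the code, not part of this diff:

static unsigned int zvol_request_sync = 0;	/* process bios synchronously */
static unsigned int zvol_threads = 32;		/* zvol_taskq thread count */

module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");

module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");

If those declarations hold, writing 1 to /sys/module/zfs/parameters/zvol_request_sync at runtime restores the old one-request-at-a-time behavior for performance comparison, while zvol_threads is fixed once the module is loaded.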