about | summary | refs | log | tree | commit | diff | stats
path: root/module/zfs/zvol.c
diff options
context:
space:
mode:
authorEtienne Dechamps <[email protected]>2011-09-05 11:11:38 +0200
committerBrian Behlendorf <[email protected]>2012-02-07 16:23:06 -0800
commitb18019d2d810585185493c62e9567fa85e51692c (patch)
treedf0d885190bdb964f934576578eee72481449b75 /module/zfs/zvol.c
parent56c34bac44d47898809c46db3e5444511bbe0ef6 (diff)
Fix synchronicity for ZVOLs.
zvol_write() assumes that the write request must be written to stable storage if rq_is_sync() is true. Unfortunately, this assumption is incorrect. Indeed, "sync" does *not* mean what we think it means in the context of the Linux block layer. This is well explained in linux/fs.h: WRITE: A normal async write. Device will be plugged. WRITE_SYNC: Synchronous write. Identical to WRITE, but passes down the hint that someone will be waiting on this IO shortly. WRITE_FLUSH: Like WRITE_SYNC but with preceding cache flush. WRITE_FUA: Like WRITE_SYNC but data is guaranteed to be on non-volatile media on completion. In other words, SYNC does not *mean* that the write must be on stable storage on completion. It just means that someone is waiting on us to complete the write request. Thus triggering a ZIL commit for each SYNC write request on a ZVOL is unnecessary and harmful for performance. To make matters worse, ZVOL users have no way to express that they actually want data to be written to stable storage, which means the ZIL is broken for ZVOLs. The request for stable storage is expressed by the FUA flag, so we must commit the ZIL after the write if the FUA flag is set. In addition, we must commit the ZIL before the write if the FLUSH flag is set. Also, we must inform the block layer that we actually support FLUSH and FUA. Signed-off-by: Brian Behlendorf <[email protected]>
Diffstat (limited to 'module/zfs/zvol.c')
-rw-r--r--module/zfs/zvol.c30
1 file changed, 27 insertions, 3 deletions
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 9dda04077..0aaa268fa 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -534,6 +534,17 @@ zvol_write(void *arg)
dmu_tx_t *tx;
rl_t *rl;
+ if (req->cmd_flags & VDEV_REQ_FLUSH)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+ /*
+ * Some requests are just for flush and nothing else.
+ */
+ if (size == 0) {
+ blk_end_request(req, 0, size);
+ return;
+ }
+
rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
tx = dmu_tx_create(zv->zv_objset);
@@ -550,12 +561,14 @@ zvol_write(void *arg)
error = dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx);
if (error == 0)
- zvol_log_write(zv, tx, offset, size, rq_is_sync(req));
+ zvol_log_write(zv, tx, offset, size,
+ req->cmd_flags & VDEV_REQ_FUA);
dmu_tx_commit(tx);
zfs_range_unlock(rl);
- if (rq_is_sync(req) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)
+ if ((req->cmd_flags & VDEV_REQ_FUA) ||
+ zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
blk_end_request(req, -error, size);
@@ -578,6 +591,11 @@ zvol_read(void *arg)
int error;
rl_t *rl;
+ if (size == 0) {
+ blk_end_request(req, 0, size);
+ return;
+ }
+
rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
error = dmu_read_req(zv->zv_objset, ZVOL_OBJ, req);
@@ -627,7 +645,7 @@ zvol_request(struct request_queue *q)
while ((req = blk_fetch_request(q)) != NULL) {
size = blk_rq_bytes(req);
- if (blk_rq_pos(req) + blk_rq_sectors(req) >
+ if (size != 0 && blk_rq_pos(req) + blk_rq_sectors(req) >
get_capacity(zv->zv_disk)) {
printk(KERN_INFO
"%s: bad access: block=%llu, count=%lu\n",
@@ -1062,6 +1080,12 @@ zvol_alloc(dev_t dev, const char *name)
if (zv->zv_queue == NULL)
goto out_kmem;
+#ifdef HAVE_BLK_QUEUE_FLUSH
+ blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA);
+#else
+ blk_queue_ordered(zv->zv_queue, QUEUE_ORDERED_DRAIN, NULL);
+#endif /* HAVE_BLK_QUEUE_FLUSH */
+
zv->zv_disk = alloc_disk(ZVOL_MINORS);
if (zv->zv_disk == NULL)
goto out_queue;