-rw-r--r--  module/zfs/vdev_disk.c    4
-rw-r--r--  module/zfs/zvol.c       198
2 files changed, 116 insertions, 86 deletions
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index b329ef3c2..1419ae6ad 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -23,7 +23,7 @@
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Rewritten for Linux by Brian Behlendorf <[email protected]>.
* LLNL-CODE-403049.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -56,7 +56,7 @@ typedef struct dio_request {
} dio_request_t;
-#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
+#if defined(HAVE_OPEN_BDEV_EXCLUSIVE) || defined(HAVE_BLKDEV_GET_BY_PATH)
static fmode_t
vdev_bdev_mode(int smode)
{
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 9fd689fbd..a77339d7f 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -36,7 +36,7 @@
*
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
*/
/*
@@ -155,6 +155,11 @@ typedef struct {
} zvol_task_t;
#define ZVOL_RDONLY 0x1
+/*
+ * Whether the zvol has been written to (as opposed to ZVOL_RDONLY, which
+ * specifies whether or not the zvol _can_ be written to)
+ */
+#define ZVOL_WRITTEN_TO 0x2
static uint64_t
zvol_name_hash(const char *name)
@@ -742,6 +747,7 @@ zvol_write(void *arg)
zvol_state_t *zv = zvr->zv;
ASSERT(zv && zv->zv_open_count > 0);
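+ /* The ZIL is opened lazily on first write in zvol_request(). */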
+ ASSERT(zv->zv_zilog != NULL);
ssize_t start_resid = uio.uio_resid;
unsigned long start_jif = jiffies;
@@ -832,6 +838,7 @@ zvol_discard(void *arg)
unsigned long start_jif;
ASSERT(zv && zv->zv_open_count > 0);
+ ASSERT(zv->zv_zilog != NULL);
start_jif = jiffies;
blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
@@ -930,6 +937,86 @@ zvol_read(void *arg)
kmem_free(zvr, sizeof (zv_request_t));
}
+/* ARGSUSED */
+static void
+zvol_get_done(zgd_t *zgd, int error)
+{
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ rangelock_exit(zgd->zgd_lr);
+
+ kmem_free(zgd, sizeof (zgd_t));
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+static int
+zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+ zvol_state_t *zv = arg;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error;
+
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(zio, !=, NULL);
+ ASSERT3U(size, !=, 0);
+
+ zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_lwb = lwb;
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
+ RL_READER);
+ error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ } else { /* indirect write */
+ /*
+ * We have to lock the whole block to ensure that no one can
+ * change the data while it is being written out and its
+ * checksum is being calculated. Unlike zfs_get_data, we need
+ * not re-check the blocksize after taking the lock, because a
+ * zvol's blocksize cannot change.
+ */
+ size = zv->zv_volblocksize;
+ offset = P2ALIGN_TYPED(offset, size, uint64_t);
+ zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
+ RL_READER);
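+ /*
+ * Hold the buffer with zgd as the tag; zvol_get_done()
+ * releases it via dmu_buf_rele().
+ */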
+ error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+ if (error == 0) {
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db != NULL);
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
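+ /*
+ * zvol_get_done() is passed as the callback and will
+ * release the zgd when the sync completes, so on
+ * success we must not free it here.
+ */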
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zvol_get_done, zgd);
+
+ if (error == 0)
+ return (0);
+ }
+ }
+
+ zvol_get_done(zgd, error);
+
+ return (SET_ERROR(error));
+}
+
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
{
@@ -965,6 +1052,23 @@ zvol_request(struct request_queue *q, struct bio *bio)
*/
rw_enter(&zv->zv_suspend_lock, RW_READER);
+ /*
+ * Open a ZIL if this is the first time we have written to this
+ * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
+ * than zv_state_lock so that we don't need to acquire an
+ * additional lock in this path.
+ */
+ if (zv->zv_zilog == NULL) {
+ rw_exit(&zv->zv_suspend_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_WRITER);
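+ /*
+ * Re-check under the writer lock: another thread may
+ * have opened the ZIL while the lock was dropped.
+ */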
+ if (zv->zv_zilog == NULL) {
+ zv->zv_zilog = zil_open(zv->zv_objset,
+ zvol_get_data);
+ zv->zv_flags |= ZVOL_WRITTEN_TO;
+ }
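+ /*
+ * Downgrade to a reader so the rest of the request
+ * path holds zv_suspend_lock as usual.
+ */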
+ rw_downgrade(&zv->zv_suspend_lock);
+ }
+
/* A bio marked as FLUSH needs to flush before the write */
if (bio_is_flush(bio))
zil_commit(zv->zv_zilog, ZVOL_OBJ);
@@ -1040,86 +1144,6 @@ out:
#endif
}
-/* ARGSUSED */
-static void
-zvol_get_done(zgd_t *zgd, int error)
-{
- if (zgd->zgd_db)
- dmu_buf_rele(zgd->zgd_db, zgd);
-
- rangelock_exit(zgd->zgd_lr);
-
- kmem_free(zgd, sizeof (zgd_t));
-}
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-static int
-zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
-{
- zvol_state_t *zv = arg;
- uint64_t offset = lr->lr_offset;
- uint64_t size = lr->lr_length;
- dmu_buf_t *db;
- zgd_t *zgd;
- int error;
-
- ASSERT3P(lwb, !=, NULL);
- ASSERT3P(zio, !=, NULL);
- ASSERT3U(size, !=, 0);
-
- zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_lwb = lwb;
-
- /*
- * Write records come in two flavors: immediate and indirect.
- * For small writes it's cheaper to store the data with the
- * log record (immediate); for large writes it's cheaper to
- * sync the data and get a pointer to it (indirect) so that
- * we don't have to write the data twice.
- */
- if (buf != NULL) { /* immediate write */
- zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
- RL_READER);
- error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
- DMU_READ_NO_PREFETCH);
- } else { /* indirect write */
- /*
- * Have to lock the whole block to ensure when it's written out
- * and its checksum is being calculated that no one can change
- * the data. Contrarily to zfs_get_data we need not re-check
- * blocksize after we get the lock because it cannot be changed.
- */
- size = zv->zv_volblocksize;
- offset = P2ALIGN_TYPED(offset, size, uint64_t);
- zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
- RL_READER);
- error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
- DMU_READ_NO_PREFETCH);
- if (error == 0) {
- blkptr_t *bp = &lr->lr_blkptr;
-
- zgd->zgd_db = db;
- zgd->zgd_bp = bp;
-
- ASSERT(db != NULL);
- ASSERT(db->db_offset == offset);
- ASSERT(db->db_size == size);
-
- error = dmu_sync(zio, lr->lr_common.lrc_txg,
- zvol_get_done, zgd);
-
- if (error == 0)
- return (0);
- }
- }
-
- zvol_get_done(zgd, error);
-
- return (SET_ERROR(error));
-}
-
/*
* The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
*/
@@ -1157,6 +1181,9 @@ zvol_setup_zv(zvol_state_t *zv)
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));
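+ /* The ZIL is opened lazily on first write; see zvol_request(). */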
+ zv->zv_zilog = NULL;
+ zv->zv_flags &= ~ZVOL_WRITTEN_TO;
+
error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
if (error)
return (SET_ERROR(error));
@@ -1171,7 +1198,6 @@ zvol_setup_zv(zvol_state_t *zv)
set_capacity(zv->zv_disk, volsize >> 9);
zv->zv_volsize = volsize;
- zv->zv_zilog = zil_open(os, zvol_get_data);
if (ro || dmu_objset_is_snapshot(os) ||
!spa_writeable(dmu_objset_spa(os))) {
@@ -1194,7 +1220,11 @@ zvol_shutdown_zv(zvol_state_t *zv)
ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
RW_LOCK_HELD(&zv->zv_suspend_lock));
- zil_close(zv->zv_zilog);
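+ /*
+ * Since the ZIL is opened lazily, zv_zilog is NULL unless
+ * the zvol has been written to.
+ */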
+ if (zv->zv_flags & ZVOL_WRITTEN_TO) {
+ ASSERT(zv->zv_zilog != NULL);
+ zil_close(zv->zv_zilog);
+ }
+
zv->zv_zilog = NULL;
dnode_rele(zv->zv_dn, FTAG);
@@ -1204,7 +1234,7 @@ zvol_shutdown_zv(zvol_state_t *zv)
* Evict cached data. We must write out any dirty data before
* disowning the dataset.
*/
- if (!(zv->zv_flags & ZVOL_RDONLY))
+ if (zv->zv_flags & ZVOL_WRITTEN_TO)
txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
(void) dmu_objset_evict_dbufs(zv->zv_objset);
}