 module/zfs/vdev_disk.c |   4
 module/zfs/zvol.c      | 198
 2 files changed, 116 insertions(+), 86 deletions(-)
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index b329ef3c2..1419ae6ad 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -23,7 +23,7 @@
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
  * LLNL-CODE-403049.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -56,7 +56,7 @@ typedef struct dio_request {
 } dio_request_t;
 
 
-#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
+#if defined(HAVE_OPEN_BDEV_EXCLUSIVE) || defined(HAVE_BLKDEV_GET_BY_PATH)
 static fmode_t
 vdev_bdev_mode(int smode)
 {
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 9fd689fbd..a77339d7f 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -36,7 +36,7 @@
  *
  * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  */
 
 /*
@@ -155,6 +155,11 @@ typedef struct {
 } zvol_task_t;
 
 #define	ZVOL_RDONLY	0x1
+/*
+ * Whether the zvol has been written to (as opposed to ZVOL_RDONLY, which
+ * specifies whether or not the zvol _can_ be written to)
+ */
+#define	ZVOL_WRITTEN_TO	0x2
 
 static uint64_t
 zvol_name_hash(const char *name)
@@ -742,6 +747,7 @@ zvol_write(void *arg)
 	zvol_state_t *zv = zvr->zv;
 
 	ASSERT(zv && zv->zv_open_count > 0);
+	ASSERT(zv->zv_zilog != NULL);
 
 	ssize_t start_resid = uio.uio_resid;
 	unsigned long start_jif = jiffies;
@@ -832,6 +838,7 @@ zvol_discard(void *arg)
 	unsigned long start_jif;
 
 	ASSERT(zv && zv->zv_open_count > 0);
+	ASSERT(zv->zv_zilog != NULL);
 
 	start_jif = jiffies;
 	blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
@@ -930,6 +937,86 @@ zvol_read(void *arg)
 	kmem_free(zvr, sizeof (zv_request_t));
 }
 
+/* ARGSUSED */
+static void
+zvol_get_done(zgd_t *zgd, int error)
+{
+	if (zgd->zgd_db)
+		dmu_buf_rele(zgd->zgd_db, zgd);
+
+	rangelock_exit(zgd->zgd_lr);
+
+	kmem_free(zgd, sizeof (zgd_t));
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+static int
+zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+	zvol_state_t *zv = arg;
+	uint64_t offset = lr->lr_offset;
+	uint64_t size = lr->lr_length;
+	dmu_buf_t *db;
+	zgd_t *zgd;
+	int error;
+
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3P(zio, !=, NULL);
+	ASSERT3U(size, !=, 0);
+
+	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+	zgd->zgd_lwb = lwb;
+
+	/*
+	 * Write records come in two flavors: immediate and indirect.
+	 * For small writes it's cheaper to store the data with the
+	 * log record (immediate); for large writes it's cheaper to
+	 * sync the data and get a pointer to it (indirect) so that
+	 * we don't have to write the data twice.
+	 */
+	if (buf != NULL) { /* immediate write */
+		zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
+		    RL_READER);
+		error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
+		    DMU_READ_NO_PREFETCH);
+	} else { /* indirect write */
+		/*
+		 * Have to lock the whole block to ensure when it's written out
+		 * and its checksum is being calculated that no one can change
+		 * the data. Contrarily to zfs_get_data we need not re-check
+		 * blocksize after we get the lock because it cannot be changed.
+		 */
+		size = zv->zv_volblocksize;
+		offset = P2ALIGN_TYPED(offset, size, uint64_t);
+		zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
+		    RL_READER);
+		error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
+		    DMU_READ_NO_PREFETCH);
+		if (error == 0) {
+			blkptr_t *bp = &lr->lr_blkptr;
+
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
+
+			ASSERT(db != NULL);
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    zvol_get_done, zgd);
+
+			if (error == 0)
+				return (0);
+		}
+	}
+
+	zvol_get_done(zgd, error);
+
+	return (SET_ERROR(error));
+}
+
 static MAKE_REQUEST_FN_RET
 zvol_request(struct request_queue *q, struct bio *bio)
 {
@@ -965,6 +1052,23 @@ zvol_request(struct request_queue *q, struct bio *bio)
 		 */
 		rw_enter(&zv->zv_suspend_lock, RW_READER);
 
+		/*
+		 * Open a ZIL if this is the first time we have written to this
+		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
+		 * than zv_state_lock so that we don't need to acquire an
+		 * additional lock in this path.
+		 */
+		if (zv->zv_zilog == NULL) {
+			rw_exit(&zv->zv_suspend_lock);
+			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+			if (zv->zv_zilog == NULL) {
+				zv->zv_zilog = zil_open(zv->zv_objset,
+				    zvol_get_data);
+				zv->zv_flags |= ZVOL_WRITTEN_TO;
+			}
+			rw_downgrade(&zv->zv_suspend_lock);
+		}
+
 		/* bio marked as FLUSH need to flush before write */
 		if (bio_is_flush(bio))
 			zil_commit(zv->zv_zilog, ZVOL_OBJ);
@@ -1040,86 +1144,6 @@ out:
 #endif
 }
 
-/* ARGSUSED */
-static void
-zvol_get_done(zgd_t *zgd, int error)
-{
-	if (zgd->zgd_db)
-		dmu_buf_rele(zgd->zgd_db, zgd);
-
-	rangelock_exit(zgd->zgd_lr);
-
-	kmem_free(zgd, sizeof (zgd_t));
-}
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-static int
-zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
-{
-	zvol_state_t *zv = arg;
-	uint64_t offset = lr->lr_offset;
-	uint64_t size = lr->lr_length;
-	dmu_buf_t *db;
-	zgd_t *zgd;
-	int error;
-
-	ASSERT3P(lwb, !=, NULL);
-	ASSERT3P(zio, !=, NULL);
-	ASSERT3U(size, !=, 0);
-
-	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
-	zgd->zgd_lwb = lwb;
-
-	/*
-	 * Write records come in two flavors: immediate and indirect.
-	 * For small writes it's cheaper to store the data with the
-	 * log record (immediate); for large writes it's cheaper to
-	 * sync the data and get a pointer to it (indirect) so that
-	 * we don't have to write the data twice.
-	 */
-	if (buf != NULL) { /* immediate write */
-		zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
-		    RL_READER);
-		error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
-		    DMU_READ_NO_PREFETCH);
-	} else { /* indirect write */
-		/*
-		 * Have to lock the whole block to ensure when it's written out
-		 * and its checksum is being calculated that no one can change
-		 * the data. Contrarily to zfs_get_data we need not re-check
-		 * blocksize after we get the lock because it cannot be changed.
-		 */
-		size = zv->zv_volblocksize;
-		offset = P2ALIGN_TYPED(offset, size, uint64_t);
-		zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
-		    RL_READER);
-		error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
-		    DMU_READ_NO_PREFETCH);
-		if (error == 0) {
-			blkptr_t *bp = &lr->lr_blkptr;
-
-			zgd->zgd_db = db;
-			zgd->zgd_bp = bp;
-
-			ASSERT(db != NULL);
-			ASSERT(db->db_offset == offset);
-			ASSERT(db->db_size == size);
-
-			error = dmu_sync(zio, lr->lr_common.lrc_txg,
-			    zvol_get_done, zgd);
-
-			if (error == 0)
-				return (0);
-		}
-	}
-
-	zvol_get_done(zgd, error);
-
-	return (SET_ERROR(error));
-}
-
 /*
  * The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
  */
@@ -1157,6 +1181,9 @@ zvol_setup_zv(zvol_state_t *zv)
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 	ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));
 
+	zv->zv_zilog = NULL;
+	zv->zv_flags &= ~ZVOL_WRITTEN_TO;
+
 	error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
 	if (error)
 		return (SET_ERROR(error));
@@ -1171,7 +1198,6 @@ zvol_setup_zv(zvol_state_t *zv)
 
 	set_capacity(zv->zv_disk, volsize >> 9);
 	zv->zv_volsize = volsize;
-	zv->zv_zilog = zil_open(os, zvol_get_data);
 
 	if (ro || dmu_objset_is_snapshot(os) ||
 	    !spa_writeable(dmu_objset_spa(os))) {
@@ -1194,7 +1220,11 @@ zvol_shutdown_zv(zvol_state_t *zv)
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
 	    RW_LOCK_HELD(&zv->zv_suspend_lock));
 
-	zil_close(zv->zv_zilog);
+	if (zv->zv_flags & ZVOL_WRITTEN_TO) {
+		ASSERT(zv->zv_zilog != NULL);
+		zil_close(zv->zv_zilog);
+	}
+	zv->zv_zilog = NULL;
 
 	dnode_rele(zv->zv_dn, FTAG);
 
@@ -1204,7 +1234,7 @@ zvol_shutdown_zv(zvol_state_t *zv)
 	 * Evict cached data. We must write out any dirty data before
 	 * disowning the dataset.
 	 */
-	if (!(zv->zv_flags & ZVOL_RDONLY))
+	if (zv->zv_flags & ZVOL_WRITTEN_TO)
 		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 	(void) dmu_objset_evict_dbufs(zv->zv_objset);
 }
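
For readers following the zvol_request() hunk above: the patch opens the ZIL lazily, on the first write, using a double-checked locking pattern built on zv_suspend_lock. Below is a minimal user-space sketch of that pattern, assuming POSIX rwlocks. The names zvol_like, open_log(), and first_write_path() are hypothetical stand-ins for zvol_state_t, zil_open(), and the write path in zvol_request(); note that POSIX rwlocks cannot downgrade a write lock in place, so where the kernel code calls rw_downgrade() the sketch has to unlock and re-lock.

/*
 * lazy_zil_sketch.c -- illustrative only; not OpenZFS code.
 * Build: cc lazy_zil_sketch.c -o lazy_zil_sketch -lpthread
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define	WRITTEN_TO	0x2		/* mirrors the new ZVOL_WRITTEN_TO flag */

struct zvol_like {			/* hypothetical stand-in */
	pthread_rwlock_t suspend_lock;	/* plays the role of zv_suspend_lock */
	void *zilog;			/* plays the role of zv->zv_zilog */
	unsigned flags;
};

static void *
open_log(void)				/* stands in for zil_open() */
{
	return (malloc(1));
}

/*
 * Write path: normally runs with only the read lock held, and pays the
 * write-lock cost exactly once, on the first write.
 */
static void
first_write_path(struct zvol_like *zv)
{
	pthread_rwlock_rdlock(&zv->suspend_lock);

	if (zv->zilog == NULL) {
		/*
		 * Upgrade to the write lock.  The read lock must be dropped
		 * first, so another writer can slip in here -- hence the
		 * second NULL check below.
		 */
		pthread_rwlock_unlock(&zv->suspend_lock);
		pthread_rwlock_wrlock(&zv->suspend_lock);
		if (zv->zilog == NULL) {
			zv->zilog = open_log();
			zv->flags |= WRITTEN_TO;
		}
		/*
		 * The kernel code calls rw_downgrade() here, keeping the
		 * lock held throughout; POSIX rwlocks cannot do that, so
		 * this sketch unlocks and re-locks.  Safe either way:
		 * once set, zilog stays set until shutdown.
		 */
		pthread_rwlock_unlock(&zv->suspend_lock);
		pthread_rwlock_rdlock(&zv->suspend_lock);
	}

	/* ... issue the write under the read lock ... */
	pthread_rwlock_unlock(&zv->suspend_lock);
}

int
main(void)
{
	struct zvol_like zv = { .zilog = NULL, .flags = 0 };

	pthread_rwlock_init(&zv.suspend_lock, NULL);
	first_write_path(&zv);
	printf("zilog open: %s  flags: 0x%x\n",
	    zv.zilog != NULL ? "yes" : "no", zv.flags);
	free(zv.zilog);
	pthread_rwlock_destroy(&zv.suspend_lock);
	return (0);
}

The second NULL check is the load-bearing part: a competing writer may already have opened the log while the lock was dropped, and opening it twice would leak it. Setting the pointer before downgrading means any later thread that sees zilog != NULL under the read lock can use it without taking another lock, which is exactly why the patch guards zv_zilog with zv_suspend_lock instead of zv_state_lock.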