summaryrefslogtreecommitdiffstats
path: root/module/zfs
diff options
context:
space:
mode:
authorGeorge Wilson <[email protected]>2013-05-10 12:47:54 -0700
committerBrian Behlendorf <[email protected]>2013-11-05 12:14:21 -0800
commit03c6040bee6c87a9413b7da41d9f580f79a8ab62 (patch)
tree86f8f3e6220c87da8e1b075fba858848f59a9c81 /module/zfs
parent831baf06efb3023ddee7ed41800d3b44521bf2ee (diff)
Illumos #3236
3236 zio nop-write Reviewed by: Matt Ahrens <[email protected]> Reviewed by: Adam Leventhal <[email protected]> Reviewed by: Christopher Siden <[email protected]> Approved by: Garrett D'Amore <[email protected]> References: illumos/illumos-gate@80901aea8e78a2c20751f61f01bebd1d5b5c2ba5 https://www.illumos.org/issues/3236 Porting Notes 1. This patch is being merged dispite an increased instance of https://www.illumos.org/issues/3113 being triggered by ztest. Ported-by: Brian Behlendorf <[email protected]> Issue #1489
Diffstat (limited to 'module/zfs')
-rw-r--r--module/zfs/arc.c6
-rw-r--r--module/zfs/dbuf.c19
-rw-r--r--module/zfs/dmu.c138
-rw-r--r--module/zfs/dsl_pool.c1
-rw-r--r--module/zfs/spa_misc.c1
-rw-r--r--module/zfs/zfs_vnops.c6
-rw-r--r--module/zfs/zio.c105
-rw-r--r--module/zfs/zvol.c13
8 files changed, 234 insertions, 55 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index c66ff009d..04f87a810 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -3708,6 +3708,12 @@ arc_write_done(zio_t *zio)
arc_hdr_destroy(exists);
exists = buf_hash_insert(hdr, &hash_lock);
ASSERT3P(exists, ==, NULL);
+ } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+ /* nopwrite */
+ ASSERT(zio->io_prop.zp_nopwrite);
+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+ panic("bad nopwrite, hdr=%p exists=%p",
+ (void *)hdr, (void *)exists);
} else {
/* Dedup */
ASSERT(hdr->b_datacnt == 1);
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 95c7b3297..cfa16b7c0 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -823,13 +823,15 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
ASSERT(db->db_data_pending != dr);
/* free this block */
- if (!BP_IS_HOLE(bp)) {
+ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
spa_t *spa;
DB_GET_SPA(&spa, db);
zio_free(spa, txg, bp);
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ dr->dt.dl.dr_nopwrite = B_FALSE;
+
/*
* Release the already-written buffer, so we leave it in
* a consistent dirty state. Note that all callers are
@@ -2269,6 +2271,13 @@ dmu_buf_freeable(dmu_buf_t *dbuf)
return (res);
}
+blkptr_t *
+dmu_buf_get_blkptr(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ return (dbi->db_blkptr);
+}
+
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
@@ -2622,7 +2631,11 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT0(zio->io_error);
ASSERT(db->db_blkptr == bp);
- if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ /*
+ * For nopwrites and rewrites we ensure that the bp matches our
+ * original and bypass all the accounting.
+ */
+ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
ASSERT(BP_EQUAL(bp, bp_orig));
} else {
objset_t *os;
@@ -2822,7 +2835,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
- dr->dt.dl.dr_copies);
+ dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
mutex_exit(&db->db_mtx);
} else if (db->db_state == DB_NOFILL) {
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 34f3eeef9..1e16b1eb0 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -41,12 +41,18 @@
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
#include <sys/sa.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <sys/zfs_znode.h>
#endif
+/*
+ * Enable/disable nopwrite feature.
+ */
+int zfs_nopwrite_enabled = 1;
+
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ DMU_BSWAP_UINT8, TRUE, "unallocated" },
{ DMU_BSWAP_ZAP, TRUE, "object directory" },
@@ -1473,6 +1479,16 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
if (zio->io_error == 0) {
+ dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
+ if (dr->dt.dl.dr_nopwrite) {
+ ASSERTV(blkptr_t *bp = zio->io_bp);
+ ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
+ ASSERTV(uint8_t chksum = BP_GET_CHECKSUM(bp_orig));
+
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
+ ASSERT(zio_checksum_table[chksum].ci_dedup);
+ }
dr->dt.dl.dr_overridden_by = *zio->io_bp;
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
@@ -1494,11 +1510,22 @@ dmu_sync_late_arrival_done(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
dmu_sync_arg_t *dsa = zio->io_private;
+ ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
- ASSERT(zio->io_bp->blk_birth == zio->io_txg);
- ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
- zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+ /*
+ * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
+ * then there is nothing to do here. Otherwise, free the
+ * newly allocated block in this txg.
+ */
+ if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ } else {
+ ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
+ ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+ ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+ zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+ }
}
dmu_tx_commit(dsa->dsa_tx);
@@ -1544,7 +1571,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
*
* Return values:
*
- * EEXIST: this txg has already been synced, so there's nothing to to.
+ * EEXIST: this txg has already been synced, so there's nothing to do.
* The caller should not log the write.
*
* ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
@@ -1576,7 +1603,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
dnode_t *dn;
ASSERT(pio != NULL);
- ASSERT(BP_IS_HOLE(bp));
ASSERT(txg != 0);
SET_BOOKMARK(&zb, ds->ds_object,
@@ -1631,6 +1657,23 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
return (SET_ERROR(ENOENT));
}
+ ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
+
+ /*
+ * Assume the on-disk data is X, the current syncing data is Y,
+ * and the current in-memory data is Z (currently in dmu_sync).
+ * X and Z are identical but Y is has been modified. Normally,
+ * when X and Z are the same we will perform a nopwrite but if Y
+ * is different we must disable nopwrite since the resulting write
+ * of Y to disk can free the block containing X. If we allowed a
+ * nopwrite to occur the block pointing to Z would reference a freed
+ * block. Since this is a rare case we simplify this by disabling
+ * nopwrite if the current dmu_sync-ing dbuf has been modified in
+ * a previous transaction.
+ */
+ if (dr->dr_next)
+ zp.zp_nopwrite = B_FALSE;
+
ASSERT(dr->dr_txg == txg);
if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
@@ -1715,15 +1758,27 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
enum zio_checksum checksum = os->os_checksum;
enum zio_compress compress = os->os_compress;
enum zio_checksum dedup_checksum = os->os_dedup_checksum;
- boolean_t dedup;
+ boolean_t dedup = B_FALSE;
+ boolean_t nopwrite = B_FALSE;
boolean_t dedup_verify = os->os_dedup_verify;
int copies = os->os_copies;
/*
- * Determine checksum setting.
+ * We maintain different write policies for each of the following
+ * types of data:
+ * 1. metadata
+ * 2. preallocated blocks (i.e. level-0 blocks of a dump device)
+ * 3. all other level 0 blocks
*/
if (ismd) {
/*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
+ ZIO_COMPRESS_LZJB;
+
+ /*
* Metadata always gets checksummed. If the data
* checksum is multi-bit correctable, and it's not a
* ZBT-style checksum, then it's suitable for metadata
@@ -1733,45 +1788,47 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
if (zio_checksum_table[checksum].ci_correctable < 1 ||
zio_checksum_table[checksum].ci_eck)
checksum = ZIO_CHECKSUM_FLETCHER_4;
- } else {
- checksum = zio_checksum_select(dn->dn_checksum, checksum);
- }
+ } else if (wp & WP_NOFILL) {
+ ASSERT(level == 0);
- /*
- * Determine compression setting.
- */
- if (ismd) {
/*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
+ * If we're writing preallocated blocks, we aren't actually
+ * writing them so don't set any policy properties. These
+ * blocks are currently only used by an external subsystem
+ * outside of zfs (i.e. dump) and not written by the zio
+ * pipeline.
*/
- compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
- ZIO_COMPRESS_LZJB;
+ compress = ZIO_COMPRESS_OFF;
+ checksum = ZIO_CHECKSUM_OFF;
} else {
compress = zio_compress_select(dn->dn_compress, compress);
- }
- /*
- * Determine dedup setting. If we are in dmu_sync(), we won't
- * actually dedup now because that's all done in syncing context;
- * but we do want to use the dedup checkum. If the checksum is not
- * strong enough to ensure unique signatures, force dedup_verify.
- */
- dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
- if (dedup) {
- checksum = dedup_checksum;
- if (!zio_checksum_table[checksum].ci_dedup)
- dedup_verify = 1;
- }
+ checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
+ zio_checksum_select(dn->dn_checksum, checksum) :
+ dedup_checksum;
- if (wp & WP_DMU_SYNC)
- dedup = 0;
+ /*
+ * Determine dedup setting. If we are in dmu_sync(),
+ * we won't actually dedup now because that's all
+ * done in syncing context; but we do want to use the
+ * dedup checkum. If the checksum is not strong
+ * enough to ensure unique signatures, force
+ * dedup_verify.
+ */
+ if (dedup_checksum != ZIO_CHECKSUM_OFF) {
+ dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
+ if (!zio_checksum_table[checksum].ci_dedup)
+ dedup_verify = B_TRUE;
+ }
- if (wp & WP_NOFILL) {
- ASSERT(!ismd && level == 0);
- checksum = ZIO_CHECKSUM_OFF;
- compress = ZIO_COMPRESS_OFF;
- dedup = B_FALSE;
+ /*
+ * Enable nopwrite if we have a cryptographically secure
+ * checksum that has no known collisions (i.e. SHA-256)
+ * and compression is enabled. We don't enable nopwrite if
+ * dedup is enabled as the two features are mutually exclusive.
+ */
+ nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup &&
+ compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
}
zp->zp_checksum = checksum;
@@ -1781,6 +1838,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
+ zp->zp_nopwrite = nopwrite;
}
int
@@ -2005,4 +2063,8 @@ EXPORT_SYMBOL(dmu_ot);
module_param(zfs_mdcomp_disable, int, 0644);
MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable meta data compression");
+
+module_param(zfs_nopwrite_enabled, int, 0644);
+MODULE_PARM_DESC(zfs_nopwrite_enabled, "Enable NOP writes");
+
#endif
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 72e819f6b..e7127c535 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -421,7 +421,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
* clean up our in-memory structures accumulated while syncing:
*
* - move dead blocks from the pending deadlist to the on-disk deadlist
- * - clean up zil records
* - release hold from dsl_dataset_dirty()
*/
while ((ds = list_remove_head(&synced_datasets))) {
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 3b7922b6c..7bdfbf738 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -245,7 +245,6 @@ int spa_mode_global;
* Secondly, the value determines if an I/O is considered "hung".
* Any I/O that has not completed in zfs_deadman_synctime is considered
* "hung" resulting in a zevent being posted.
- * 1000 zfs_txg_synctime_ms (i.e. 1000 seconds).
*/
unsigned long zfs_deadman_synctime = 1000ULL;
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index e6c1711ac..84b4fe81f 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -1050,6 +1050,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
DMU_READ_NO_PREFETCH);
if (error == 0) {
+ blkptr_t *obp = dmu_buf_get_blkptr(db);
+ if (obp) {
+ ASSERT(BP_IS_HOLE(bp));
+ *bp = *obp;
+ }
+
zgd->zgd_db = db;
zgd->zgd_bp = bp;
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index a24f2bef3..1f803a44b 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -723,9 +723,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
DMU_OT_IS_VALID(zp->zp_type) &&
zp->zp_level < 32 &&
zp->zp_copies > 0 &&
- zp->zp_copies <= spa_max_replication(spa) &&
- zp->zp_dedup <= 1 &&
- zp->zp_dedup_verify <= 1);
+ zp->zp_copies <= spa_max_replication(spa));
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
@@ -753,13 +751,20 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
}
void
-zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+ /*
+ * We must reset the io_prop to match the values that existed
+ * when the bp was first written by dmu_sync() keeping in mind
+ * that nopwrite and dedup are mutually exclusive.
+ */
+ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
+ zio->io_prop.zp_nopwrite = nopwrite;
zio->io_prop.zp_copies = copies;
zio->io_bp_override = bp;
}
@@ -1051,6 +1056,19 @@ zio_write_bp_init(zio_t *zio)
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ /*
+ * If we've been overridden and nopwrite is set then
+ * set the flag accordingly to indicate that a nopwrite
+ * has already occurred.
+ */
+ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
+ ASSERT(!zp->zp_dedup);
+ zio->io_flags |= ZIO_FLAG_NOPWRITE;
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ ASSERT(!zp->zp_nopwrite);
+
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
return (ZIO_PIPELINE_CONTINUE);
@@ -1138,6 +1156,11 @@ zio_write_bp_init(zio_t *zio)
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
}
+ if (zp->zp_nopwrite) {
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
+ }
}
return (ZIO_PIPELINE_CONTINUE);
@@ -1404,6 +1427,7 @@ zio_reexecute(zio_t *pio)
pio->io_stage = pio->io_orig_stage;
pio->io_pipeline = pio->io_orig_pipeline;
pio->io_reexecute = 0;
+ pio->io_flags |= ZIO_FLAG_REEXECUTED;
pio->io_error = 0;
for (w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_state[w] = 0;
@@ -1887,8 +1911,9 @@ zio_write_gang_block(zio_t *pio)
zp.zp_type = DMU_OT_NONE;
zp.zp_level = 0;
zp.zp_copies = gio->io_prop.zp_copies;
- zp.zp_dedup = 0;
- zp.zp_dedup_verify = 0;
+ zp.zp_dedup = B_FALSE;
+ zp.zp_dedup_verify = B_FALSE;
+ zp.zp_nopwrite = B_FALSE;
zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
@@ -1913,6 +1938,62 @@ zio_write_gang_block(zio_t *pio)
}
/*
+ * The zio_nop_write stage in the pipeline determines if allocating
+ * a new bp is necessary. By leveraging a cryptographically secure checksum,
+ * such as SHA256, we can compare the checksums of the new data and the old
+ * to determine if allocating a new block is required. The nopwrite
+ * feature can handle writes in either syncing or open context (i.e. zil
+ * writes) and as a result is mutually exclusive with dedup.
+ */
+static int
+zio_nop_write(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ zio_prop_t *zp = &zio->io_prop;
+
+ ASSERT(BP_GET_LEVEL(bp) == 0);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ ASSERT(zp->zp_nopwrite);
+ ASSERT(!zp->zp_dedup);
+ ASSERT(zio->io_bp_override == NULL);
+ ASSERT(IO_IS_ALLOCATING(zio));
+
+ /*
+ * Check to see if the original bp and the new bp have matching
+ * characteristics (i.e. same checksum, compression algorithms, etc).
+ * If they don't then just continue with the pipeline which will
+ * allocate a new bp.
+ */
+ if (BP_IS_HOLE(bp_orig) ||
+ !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
+ BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
+ BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
+ BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
+ zp->zp_copies != BP_GET_NDVAS(bp_orig))
+ return (ZIO_PIPELINE_CONTINUE);
+
+ /*
+ * If the checksums match then reset the pipeline so that we
+ * avoid allocating a new bp and issuing any I/O.
+ */
+ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
+ ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
+ ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
+ ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
+ ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
+ ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
+ sizeof (uint64_t)) == 0);
+
+ *bp = *bp_orig;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ zio->io_flags |= ZIO_FLAG_NOPWRITE;
+ }
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
* ==========================================================================
* Dedup
* ==========================================================================
@@ -2186,7 +2267,7 @@ zio_ddt_write(zio_t *zio)
zio->io_stage = ZIO_STAGE_OPEN;
BP_ZERO(bp);
} else {
- zp->zp_dedup = 0;
+ zp->zp_dedup = B_FALSE;
}
zio->io_pipeline = ZIO_WRITE_PIPELINE;
ddt_exit(ddt);
@@ -2815,7 +2896,8 @@ zio_ready(zio_t *zio)
if (zio->io_ready) {
ASSERT(IO_IS_ALLOCATING(zio));
- ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
+ ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
+ (zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
zio->io_ready(zio);
@@ -2893,6 +2975,8 @@ zio_done(zio_t *zio)
ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
(BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp)));
}
+ if (zio->io_flags & ZIO_FLAG_NOPWRITE)
+ VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
}
/*
@@ -3015,7 +3099,7 @@ zio_done(zio_t *zio)
if ((zio->io_error || zio->io_reexecute) &&
IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
- !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
+ !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
zio_gang_tree_free(&zio->io_gang_tree);
@@ -3112,7 +3196,7 @@ zio_done(zio_t *zio)
}
if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
- !BP_IS_HOLE(zio->io_bp)) {
+ !BP_IS_HOLE(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
}
@@ -3159,6 +3243,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_issue_async,
zio_write_bp_init,
zio_checksum_generate,
+ zio_nop_write,
zio_ddt_read_start,
zio_ddt_read_done,
zio_ddt_write,
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 59822a6cd..79c56cd78 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -35,6 +35,7 @@
* needs to be run before opening and using a device.
*/
+#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
@@ -815,8 +816,10 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
zvol_state_t *zv = arg;
objset_t *os = zv->zv_objset;
+ uint64_t object = ZVOL_OBJ;
uint64_t offset = lr->lr_offset;
uint64_t size = lr->lr_length;
+ blkptr_t *bp = &lr->lr_blkptr;
dmu_buf_t *db;
zgd_t *zgd;
int error;
@@ -836,14 +839,20 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
- error = dmu_read(os, ZVOL_OBJ, offset, size, buf,
+ error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
} else {
size = zv->zv_volblocksize;
offset = P2ALIGN_TYPED(offset, size, uint64_t);
- error = dmu_buf_hold(os, ZVOL_OBJ, offset, zgd, &db,
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
if (error == 0) {
+ blkptr_t *obp = dmu_buf_get_blkptr(db);
+ if (obp) {
+ ASSERT(BP_IS_HOLE(bp));
+ *bp = *obp;
+ }
+
zgd->zgd_db = db;
zgd->zgd_bp = &lr->lr_blkptr;