diff options
author | George Wilson <[email protected]> | 2013-05-10 12:47:54 -0700 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2013-11-05 12:14:21 -0800 |
commit | 03c6040bee6c87a9413b7da41d9f580f79a8ab62 (patch) | |
tree | 86f8f3e6220c87da8e1b075fba858848f59a9c81 /module/zfs/dmu.c | |
parent | 831baf06efb3023ddee7ed41800d3b44521bf2ee (diff) |
Illumos #3236
3236 zio nop-write
Reviewed by: Matt Ahrens <[email protected]>
Reviewed by: Adam Leventhal <[email protected]>
Reviewed by: Christopher Siden <[email protected]>
Approved by: Garrett D'Amore <[email protected]>
References:
illumos/illumos-gate@80901aea8e78a2c20751f61f01bebd1d5b5c2ba5
https://www.illumos.org/issues/3236
Porting Notes
1. This patch is being merged dispite an increased instance of
https://www.illumos.org/issues/3113 being triggered by ztest.
Ported-by: Brian Behlendorf <[email protected]>
Issue #1489
Diffstat (limited to 'module/zfs/dmu.c')
-rw-r--r-- | module/zfs/dmu.c | 138 |
1 files changed, 100 insertions, 38 deletions
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 34f3eeef9..1e16b1eb0 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -41,12 +41,18 @@ #include <sys/zfs_ioctl.h> #include <sys/zap.h> #include <sys/zio_checksum.h> +#include <sys/zio_compress.h> #include <sys/sa.h> #ifdef _KERNEL #include <sys/vmsystm.h> #include <sys/zfs_znode.h> #endif +/* + * Enable/disable nopwrite feature. + */ +int zfs_nopwrite_enabled = 1; + const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { DMU_BSWAP_UINT8, TRUE, "unallocated" }, { DMU_BSWAP_ZAP, TRUE, "object directory" }, @@ -1473,6 +1479,16 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { + dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); + if (dr->dt.dl.dr_nopwrite) { + ASSERTV(blkptr_t *bp = zio->io_bp); + ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig); + ASSERTV(uint8_t chksum = BP_GET_CHECKSUM(bp_orig)); + + ASSERT(BP_EQUAL(bp, bp_orig)); + ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); + ASSERT(zio_checksum_table[chksum].ci_dedup); + } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; @@ -1494,11 +1510,22 @@ dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; + ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig); if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { - ASSERT(zio->io_bp->blk_birth == zio->io_txg); - ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); - zio_free(zio->io_spa, zio->io_txg, zio->io_bp); + /* + * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE) + * then there is nothing to do here. Otherwise, free the + * newly allocated block in this txg. + */ + if (zio->io_flags & ZIO_FLAG_NOPWRITE) { + ASSERT(BP_EQUAL(bp, bp_orig)); + } else { + ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); + ASSERT(zio->io_bp->blk_birth == zio->io_txg); + ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); + zio_free(zio->io_spa, zio->io_txg, zio->io_bp); + } } dmu_tx_commit(dsa->dsa_tx); @@ -1544,7 +1571,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, * * Return values: * - * EEXIST: this txg has already been synced, so there's nothing to to. + * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. @@ -1576,7 +1603,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dnode_t *dn; ASSERT(pio != NULL); - ASSERT(BP_IS_HOLE(bp)); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, @@ -1631,6 +1657,23 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) return (SET_ERROR(ENOENT)); } + ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); + + /* + * Assume the on-disk data is X, the current syncing data is Y, + * and the current in-memory data is Z (currently in dmu_sync). + * X and Z are identical but Y is has been modified. Normally, + * when X and Z are the same we will perform a nopwrite but if Y + * is different we must disable nopwrite since the resulting write + * of Y to disk can free the block containing X. If we allowed a + * nopwrite to occur the block pointing to Z would reference a freed + * block. Since this is a rare case we simplify this by disabling + * nopwrite if the current dmu_sync-ing dbuf has been modified in + * a previous transaction. + */ + if (dr->dr_next) + zp.zp_nopwrite = B_FALSE; + ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { @@ -1715,15 +1758,27 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; enum zio_checksum dedup_checksum = os->os_dedup_checksum; - boolean_t dedup; + boolean_t dedup = B_FALSE; + boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; int copies = os->os_copies; /* - * Determine checksum setting. + * We maintain different write policies for each of the following + * types of data: + * 1. metadata + * 2. preallocated blocks (i.e. level-0 blocks of a dump device) + * 3. all other level 0 blocks */ if (ismd) { /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : + ZIO_COMPRESS_LZJB; + + /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata @@ -1733,45 +1788,47 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) if (zio_checksum_table[checksum].ci_correctable < 1 || zio_checksum_table[checksum].ci_eck) checksum = ZIO_CHECKSUM_FLETCHER_4; - } else { - checksum = zio_checksum_select(dn->dn_checksum, checksum); - } + } else if (wp & WP_NOFILL) { + ASSERT(level == 0); - /* - * Determine compression setting. - */ - if (ismd) { /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. + * If we're writing preallocated blocks, we aren't actually + * writing them so don't set any policy properties. These + * blocks are currently only used by an external subsystem + * outside of zfs (i.e. dump) and not written by the zio + * pipeline. */ - compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : - ZIO_COMPRESS_LZJB; + compress = ZIO_COMPRESS_OFF; + checksum = ZIO_CHECKSUM_OFF; } else { compress = zio_compress_select(dn->dn_compress, compress); - } - /* - * Determine dedup setting. If we are in dmu_sync(), we won't - * actually dedup now because that's all done in syncing context; - * but we do want to use the dedup checkum. If the checksum is not - * strong enough to ensure unique signatures, force dedup_verify. - */ - dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF); - if (dedup) { - checksum = dedup_checksum; - if (!zio_checksum_table[checksum].ci_dedup) - dedup_verify = 1; - } + checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? + zio_checksum_select(dn->dn_checksum, checksum) : + dedup_checksum; - if (wp & WP_DMU_SYNC) - dedup = 0; + /* + * Determine dedup setting. If we are in dmu_sync(), + * we won't actually dedup now because that's all + * done in syncing context; but we do want to use the + * dedup checkum. If the checksum is not strong + * enough to ensure unique signatures, force + * dedup_verify. + */ + if (dedup_checksum != ZIO_CHECKSUM_OFF) { + dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; + if (!zio_checksum_table[checksum].ci_dedup) + dedup_verify = B_TRUE; + } - if (wp & WP_NOFILL) { - ASSERT(!ismd && level == 0); - checksum = ZIO_CHECKSUM_OFF; - compress = ZIO_COMPRESS_OFF; - dedup = B_FALSE; + /* + * Enable nopwrite if we have a cryptographically secure + * checksum that has no known collisions (i.e. SHA-256) + * and compression is enabled. We don't enable nopwrite if + * dedup is enabled as the two features are mutually exclusive. + */ + nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup && + compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } zp->zp_checksum = checksum; @@ -1781,6 +1838,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; + zp->zp_nopwrite = nopwrite; } int @@ -2005,4 +2063,8 @@ EXPORT_SYMBOL(dmu_ot); module_param(zfs_mdcomp_disable, int, 0644); MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable meta data compression"); + +module_param(zfs_nopwrite_enabled, int, 0644); +MODULE_PARM_DESC(zfs_nopwrite_enabled, "Enable NOP writes"); + #endif |