diff options
author | Matthew Ahrens <[email protected]> | 2014-06-05 13:19:08 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2014-08-01 14:28:05 -0700 |
commit | 9b67f605601c77c814037613d8129562db642a29 (patch) | |
tree | 21a3270ed7eda24858e56a9584f64f6359f4b28f /module/zfs | |
parent | faf0f58c69607a15e2d1563567afb815842805de (diff) |
Illumos 4757, 4913
4757 ZFS embedded-data block pointers ("zero block compression")
4913 zfs release should not be subject to space checks
Reviewed by: Adam Leventhal <[email protected]>
Reviewed by: Max Grossman <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Christopher Siden <[email protected]>
Reviewed by: Dan McDonald <[email protected]>
Approved by: Dan McDonald <[email protected]>
References:
https://www.illumos.org/issues/4757
https://www.illumos.org/issues/4913
https://github.com/illumos/illumos-gate/commit/5d7b4d4
Porting notes:
For compatibility with the fastpath code the zio_done() function
needed to be updated. Because embedded-data block pointers do
not require DVAs to be allocated the associated vdevs will not
be marked and therefore should not be unmarked.
Ported by: Tim Chase <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #2544
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/Makefile.in | 1 | ||||
-rw-r--r-- | module/zfs/arc.c | 95 | ||||
-rw-r--r-- | module/zfs/blkptr.c | 121 | ||||
-rw-r--r-- | module/zfs/bpobj.c | 41 | ||||
-rw-r--r-- | module/zfs/dbuf.c | 61 | ||||
-rw-r--r-- | module/zfs/dmu.c | 85 | ||||
-rw-r--r-- | module/zfs/dmu_objset.c | 32 | ||||
-rw-r--r-- | module/zfs/dmu_send.c | 215 | ||||
-rw-r--r-- | module/zfs/dmu_traverse.c | 2 | ||||
-rw-r--r-- | module/zfs/dnode.c | 4 | ||||
-rw-r--r-- | module/zfs/dnode_sync.c | 7 | ||||
-rw-r--r-- | module/zfs/dsl_dataset.c | 2 | ||||
-rw-r--r-- | module/zfs/dsl_destroy.c | 5 | ||||
-rw-r--r-- | module/zfs/dsl_scan.c | 7 | ||||
-rw-r--r-- | module/zfs/dsl_userhold.c | 3 | ||||
-rw-r--r-- | module/zfs/metaslab.c | 2 | ||||
-rw-r--r-- | module/zfs/spa.c | 12 | ||||
-rw-r--r-- | module/zfs/spa_misc.c | 9 | ||||
-rw-r--r-- | module/zfs/zfeature_common.c | 5 | ||||
-rw-r--r-- | module/zfs/zfs_ioctl.c | 11 | ||||
-rw-r--r-- | module/zfs/zil.c | 9 | ||||
-rw-r--r-- | module/zfs/zio.c | 87 | ||||
-rw-r--r-- | module/zfs/zio_checksum.c | 2 | ||||
-rw-r--r-- | module/zfs/zio_compress.c | 19 |
24 files changed, 670 insertions, 167 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 56ecd4918..48e7e97e9 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -5,6 +5,7 @@ EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@ obj-$(CONFIG_ZFS) := $(MODULE).o $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o +$(MODULE)-objs += @top_srcdir@/module/zfs/blkptr.o $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o $(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 7f6df0ae8..ec006cb0f 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -812,8 +812,10 @@ buf_discard_identity(arc_buf_hdr_t *hdr) } static arc_buf_hdr_t * -buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) +buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { + const dva_t *dva = BP_IDENTITY(bp); + uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *buf; @@ -845,6 +847,8 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) arc_buf_hdr_t *fbuf; uint32_t i; + ASSERT(!DVA_IS_EMPTY(&buf->b_dva)); + ASSERT(buf->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(buf)); *lockp = hash_lock; mutex_enter(hash_lock); @@ -3034,10 +3038,10 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) static void arc_read_done(zio_t *zio) { - arc_buf_hdr_t *hdr, *found; + arc_buf_hdr_t *hdr; arc_buf_t *buf; arc_buf_t *abuf; /* buffer we're assigning to callback */ - kmutex_t *hash_lock; + kmutex_t *hash_lock = NULL; arc_callback_t *callback_list, *acb; int freeable = FALSE; @@ -3052,12 +3056,24 @@ arc_read_done(zio_t *zio) * reason for it not to be found is if we were freed during the * read. */ - found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, - &hash_lock); - - ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || - (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || - (found == hdr && HDR_L2_READING(hdr))); + if (HDR_IN_HASH_TABLE(hdr)) { + arc_buf_hdr_t *found; + + ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); + ASSERT3U(hdr->b_dva.dva_word[0], ==, + BP_IDENTITY(zio->io_bp)->dva_word[0]); + ASSERT3U(hdr->b_dva.dva_word[1], ==, + BP_IDENTITY(zio->io_bp)->dva_word[1]); + + found = buf_hash_find(hdr->b_spa, zio->io_bp, + &hash_lock); + + ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && + hash_lock == NULL) || + (found == hdr && + DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || + (found == hdr && HDR_L2_READING(hdr))); + } hdr->b_flags &= ~ARC_L2_EVICTED; if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) @@ -3181,17 +3197,26 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) { - arc_buf_hdr_t *hdr; + arc_buf_hdr_t *hdr = NULL; arc_buf_t *buf = NULL; - kmutex_t *hash_lock; + kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); int rc = 0; + ASSERT(!BP_IS_EMBEDDED(bp) || + BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); + top: - hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), - &hash_lock); - if (hdr && hdr->b_datacnt > 0) { + if (!BP_IS_EMBEDDED(bp)) { + /* + * Embedded BP's have no DVA and require no I/O to "read". + * Create an anonymous arc buf to back it. + */ + hdr = buf_hash_find(guid, bp, &hash_lock); + } + + if (hdr != NULL && hdr->b_datacnt > 0) { *arc_flags |= ARC_CACHED; @@ -3265,7 +3290,7 @@ top: done(NULL, buf, private); } else { uint64_t size = BP_GET_LSIZE(bp); - arc_callback_t *acb; + arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; @@ -3274,15 +3299,17 @@ top: if (hdr == NULL) { /* this block is not in the cache */ - arc_buf_hdr_t *exists; + arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); buf = arc_buf_alloc(spa, size, private, type); hdr = buf->b_hdr; - hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(bp); - hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; - exists = buf_hash_insert(hdr, &hash_lock); - if (exists) { + if (!BP_IS_EMBEDDED(bp)) { + hdr->b_dva = *BP_IDENTITY(bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(bp); + hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; + exists = buf_hash_insert(hdr, &hash_lock); + } + if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); @@ -3354,7 +3381,8 @@ top: vd = NULL; } - mutex_exit(hash_lock); + if (hash_lock != NULL) + mutex_exit(hash_lock); /* * At this point, we have a level 1 cache miss. Try again in @@ -3526,8 +3554,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp) kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); - hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), - &hash_lock); + ASSERT(!BP_IS_EMBEDDED(bp)); + + hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; if (HDR_BUF_AVAILABLE(hdr)) { @@ -3854,7 +3883,7 @@ arc_write_done(zio_t *zio) ASSERT(hdr->b_acb == NULL); if (zio->io_error == 0) { - if (BP_IS_HOLE(zio->io_bp)) { + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); @@ -3866,10 +3895,10 @@ arc_write_done(zio_t *zio) } /* - * If the block to be written was all-zero, we may have - * compressed it away. In this case no write was performed - * so there will be no dva/birth/checksum. The buffer must - * therefore remain anonymous (and uncached). + * If the block to be written was all-zero or compressed enough to be + * embedded in the BP, no write was performed so there will be no + * dva/birth/checksum. The buffer must therefore remain anonymous + * (and uncached). */ if (!BUF_EMPTY(hdr)) { arc_buf_hdr_t *exists; @@ -5219,7 +5248,7 @@ static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) { void *cdata; - size_t csize, len; + size_t csize, len, rounded; ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); ASSERT(l2hdr->b_tmp_cdata != NULL); @@ -5229,6 +5258,12 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, cdata, l2hdr->b_asize); + rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); + if (rounded > csize) { + bzero((char *)cdata + csize, rounded - csize); + csize = rounded; + } + if (csize == 0) { /* zero block, indicate that there's nothing to write */ zio_data_buf_free(cdata, len); diff --git a/module/zfs/blkptr.c b/module/zfs/blkptr.c new file mode 100644 index 000000000..d56e19996 --- /dev/null +++ b/module/zfs/blkptr.c @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/zio_compress.h> + +/* + * Embedded-data Block Pointers + * + * Normally, block pointers point (via their DVAs) to a block which holds data. + * If the data that we need to store is very small, this is an inefficient + * use of space, because a block must be at minimum 1 sector (typically 512 + * bytes or 4KB). Additionally, reading these small blocks tends to generate + * more random reads. + * + * Embedded-data Block Pointers allow small pieces of data (the "payload", + * up to 112 bytes) to be stored in the block pointer itself, instead of + * being pointed to. The "Pointer" part of this name is a bit of a + * misnomer, as nothing is pointed to. + * + * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to + * be embedded in the block pointer. The logic for this is handled in + * the SPA, by the zio pipeline. Therefore most code outside the zio + * pipeline doesn't need special-cases to handle these block pointers. + * + * See spa.h for details on the exact layout of embedded block pointers. + */ + +void +encode_embedded_bp_compressed(blkptr_t *bp, void *data, + enum zio_compress comp, int uncompressed_size, int compressed_size) +{ + uint64_t *bp64 = (uint64_t *)bp; + uint64_t w = 0; + uint8_t *data8 = data; + int i; + + ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE); + ASSERT(uncompressed_size == compressed_size || + comp != ZIO_COMPRESS_OFF); + ASSERT3U(comp, >=, ZIO_COMPRESS_OFF); + ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); + + bzero(bp, sizeof (*bp)); + BP_SET_EMBEDDED(bp, B_TRUE); + BP_SET_COMPRESS(bp, comp); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + BPE_SET_LSIZE(bp, uncompressed_size); + BPE_SET_PSIZE(bp, compressed_size); + + /* + * Encode the byte array into the words of the block pointer. + * First byte goes into low bits of first word (little endian). + */ + for (i = 0; i < compressed_size; i++) { + BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]); + if (i % sizeof (w) == sizeof (w) - 1) { + /* we've reached the end of a word */ + ASSERT3P(bp64, <, bp + 1); + *bp64 = w; + bp64++; + if (!BPE_IS_PAYLOADWORD(bp, bp64)) + bp64++; + w = 0; + } + } + /* write last partial word */ + if (bp64 < (uint64_t *)(bp + 1)) + *bp64 = w; +} + +/* + * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be + * more than BPE_PAYLOAD_SIZE bytes). + */ +void +decode_embedded_bp_compressed(const blkptr_t *bp, void *buf) +{ + int psize; + uint8_t *buf8 = buf; + uint64_t w = 0; + const uint64_t *bp64 = (const uint64_t *)bp; + int i; + + ASSERT(BP_IS_EMBEDDED(bp)); + + psize = BPE_GET_PSIZE(bp); + + /* + * Decode the words of the block pointer into the byte array. + * Low bits of first word are the first byte (little endian). + */ + for (i = 0; i < psize; i++) { + if (i % sizeof (w) == 0) { + /* beginning of a word */ + ASSERT3P(bp64, <, bp + 1); + w = *bp64; + bp64++; + if (!BPE_IS_PAYLOADWORD(bp, bp64)) + bp64++; + } + buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY); + } +} diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index eb3233b10..7c8f932f5 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -192,6 +192,13 @@ bpobj_close(bpobj_t *bpo) mutex_destroy(&bpo->bpo_lock); } +static boolean_t +bpobj_hasentries(bpobj_t *bpo) +{ + return (bpo->bpo_phys->bpo_num_blkptrs != 0 || + (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0)); +} + static int bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, boolean_t free) @@ -332,9 +339,11 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, out: /* If there are no entries, there should be no bytes. */ - ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 || - (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) || - bpo->bpo_phys->bpo_bytes == 0); + if (!bpobj_hasentries(bpo)) { + ASSERT0(bpo->bpo_phys->bpo_bytes); + ASSERT0(bpo->bpo_phys->bpo_comp); + ASSERT0(bpo->bpo_phys->bpo_uncomp); + } mutex_exit(&bpo->bpo_lock); return (err); @@ -378,7 +387,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); - if (used == 0) { + if (!bpobj_hasentries(&subbpo)) { /* No point in having an empty subobj. */ bpobj_close(&subbpo); bpobj_free(bpo->bpo_os, subobj, tx); @@ -453,13 +462,29 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) ASSERT(!BP_IS_HOLE(bp)); ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); + if (BP_IS_EMBEDDED(bp)) { + /* + * The bpobj will compress better without the payload. + * + * Note that we store EMBEDDED bp's because they have an + * uncompressed size, which must be accounted for. An + * alternative would be to add their size to bpo_uncomp + * without storing the bp, but that would create additional + * complications: bpo_uncomp would be inconsistent with the + * set of BP's stored, and bpobj_iterate() wouldn't visit + * all the space accounted for in the bpobj. + */ + bzero(&stored_bp, sizeof (stored_bp)); + stored_bp.blk_prop = bp->blk_prop; + stored_bp.blk_birth = bp->blk_birth; + } else if (!BP_GET_DEDUP(bp)) { + /* The bpobj will compress better without the checksum */ + bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); + } + /* We never need the fill count. */ stored_bp.blk_fill = 0; - /* The bpobj will compress better if we can leave off the checksum */ - if (!BP_GET_DEDUP(bp)) - bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); - mutex_enter(&bpo->bpo_lock); offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index c6e7197b6..1e5fac78e 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -40,6 +40,8 @@ #include <sys/dmu_zfetch.h> #include <sys/sa.h> #include <sys/sa_impl.h> +#include <sys/zfeature.h> +#include <sys/blkptr.h> #include <sys/range_tree.h> struct dbuf_hold_impl_data { @@ -1492,6 +1494,38 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_exit(&db->db_mtx); } +void +dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, + bp_embedded_type_t etype, enum zio_compress comp, + int uncompressed_size, int compressed_size, int byteorder, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + struct dirty_leaf *dl; + dmu_object_type_t type; + + DB_DNODE_ENTER(db); + type = DB_DNODE(db)->dn_type; + DB_DNODE_EXIT(db); + + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + dmu_buf_will_not_fill(dbuf, tx); + + ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); + dl = &db->db_last_dirty->dt.dl; + encode_embedded_bp_compressed(&dl->dr_overridden_by, + data, comp, uncompressed_size, compressed_size); + BPE_SET_ETYPE(&dl->dr_overridden_by, etype); + BP_SET_TYPE(&dl->dr_overridden_by, type); + BP_SET_LEVEL(&dl->dr_overridden_by, 0); + BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); + + dl->dr_override_state = DR_OVERRIDDEN; + dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; +} + /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. @@ -1885,7 +1919,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) } if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) { - if (bp && !BP_IS_HOLE(bp)) { + if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; @@ -2575,7 +2609,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) uint64_t fill = 0; int i; - ASSERT(db->db_blkptr == bp); + ASSERT3P(db->db_blkptr, ==, bp); DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -2587,7 +2621,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && - BP_GET_TYPE(bp) == dn->dn_bonustype)); + BP_GET_TYPE(bp) == dn->dn_bonustype) || + BP_IS_EMBEDDED(bp)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); } @@ -2628,12 +2663,13 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; - fill += ibp->blk_fill; + fill += BP_GET_FILL(ibp); } } DB_DNODE_EXIT(db); - bp->blk_fill = fill; + if (!BP_IS_EMBEDDED(bp)) + bp->blk_fill = fill; mutex_exit(&db->db_mtx); } @@ -2745,7 +2781,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); - arc_set_callback(db->db_buf, dbuf_do_evict, db); + if (!arc_released(db->db_buf)) + arc_set_callback(db->db_buf, dbuf_do_evict, db); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); @@ -2871,10 +2908,16 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); DB_DNODE_EXIT(db); - if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - ASSERT(db->db_state != DB_NOFILL); + if (db->db_level == 0 && + dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* + * The BP for this block has been provided by open context + * (by dmu_sync() or dmu_buf_write_embedded()). + */ + void *contents = (data != NULL) ? data->b_data : NULL; + dr->dr_zio = zio_write(zio, os->os_spa, txg, - db->db_blkptr, data->b_data, arc_buf_size(data), &zp, + db->db_blkptr, contents, db->db.db_size, &zp, dbuf_write_override_ready, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index d8e5739f4..305973abf 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -124,17 +124,13 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { }; int -dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **dbp, int flags) +dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; - int db_flags = DB_RF_CANFAIL; - - if (flags & DMU_READ_NO_PREFETCH) - db_flags |= DB_RF_NOPREFETCH; err = dnode_hold(os, object, FTAG, &dn); if (err) @@ -143,18 +139,37 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + if (db == NULL) { - err = SET_ERROR(EIO); - } else { + *dbp = NULL; + return (SET_ERROR(EIO)); + } + + *dbp = &db->db; + return (err); +} + +int +dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **dbp, int flags) +{ + int err; + int db_flags = DB_RF_CANFAIL; + + if (flags & DMU_READ_NO_PREFETCH) + db_flags |= DB_RF_NOPREFETCH; + + err = dmu_buf_hold_noread(os, object, offset, tag, dbp); + if (err == 0) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); - if (err) { + if (err != 0) { dbuf_rele(db, tag); - db = NULL; + *dbp = NULL; } } - dnode_rele(dn, FTAG); - *dbp = &db->db; /* NULL db plus first field offset is NULL */ return (err); } @@ -852,6 +867,25 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } +void +dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, + void *data, uint8_t etype, uint8_t comp, int uncompressed_size, + int compressed_size, int byteorder, dmu_tx_t *tx) +{ + dmu_buf_t *db; + + ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); + ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); + VERIFY0(dmu_buf_hold_noread(os, object, offset, + FTAG, &db)); + + dmu_buf_write_embedded(db, + data, (bp_embedded_type_t)etype, (enum zio_compress)comp, + uncompressed_size, compressed_size, byteorder, tx); + + dmu_buf_rele(db, FTAG); +} + /* * DMU support for xuio */ @@ -1393,7 +1427,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) * block size still needs to be known for replay. */ BP_SET_LSIZE(bp, db->db_size); - } else { + } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); bp->blk_fill = 1; } @@ -1664,9 +1698,15 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, { dnode_t *dn; - /* XXX assumes dnode_hold will not get an i/o error */ - (void) dnode_hold(os, object, FTAG, &dn); - ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); + /* + * Send streams include each object's checksum function. This + * check ensures that the receiving system can understand the + * checksum function transmitted. + */ + ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); + + VERIFY0(dnode_hold(os, object, FTAG, &dn)); + ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); @@ -1678,9 +1718,14 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, { dnode_t *dn; - /* XXX assumes dnode_hold will not get an i/o error */ - (void) dnode_hold(os, object, FTAG, &dn); - ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); + /* + * Send streams include each object's compression function. This + * check ensures that the receiving system can understand the + * compression function transmitted. + */ + ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); + + VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); @@ -1843,7 +1888,7 @@ __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (i = 0; i < dnp->dn_nblkptr; i++) - doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; + doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); } void diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index b82783098..238892cf4 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -337,7 +337,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, * default (fletcher2/off). Snapshots don't need to know about * checksum/compression/copies. */ - if (ds) { + if (ds != NULL) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); @@ -390,7 +390,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, kmem_free(os, sizeof (objset_t)); return (err); } - } else if (ds == NULL) { + } else { /* It's the meta-objset. */ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; os->os_compress = ZIO_COMPRESS_LZJB; @@ -434,17 +434,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, &os->os_groupused_dnode); } - /* - * We should be the only thread trying to do this because we - * have ds_opening_lock - */ - if (ds) { - mutex_enter(&ds->ds_lock); - ASSERT(ds->ds_objset == NULL); - ds->ds_objset = os; - mutex_exit(&ds->ds_lock); - } - *osp = os; return (0); } @@ -455,11 +444,19 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) int err = 0; mutex_enter(&ds->ds_opening_lock); - *osp = ds->ds_objset; - if (*osp == NULL) { + if (ds->ds_objset == NULL) { + objset_t *os; err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), - ds, dsl_dataset_get_blkptr(ds), osp); + ds, dsl_dataset_get_blkptr(ds), &os); + + if (err == 0) { + mutex_enter(&ds->ds_lock); + ASSERT(ds->ds_objset == NULL); + ds->ds_objset = os; + mutex_exit(&ds->ds_lock); + } } + *osp = ds->ds_objset; mutex_exit(&ds->ds_opening_lock); return (err); } @@ -981,6 +978,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; + ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3P(bp, ==, os->os_rootbp); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); ASSERT0(BP_GET_LEVEL(bp)); @@ -993,7 +991,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) */ bp->blk_fill = 0; for (i = 0; i < dnp->dn_nblkptr; i++) - bp->blk_fill += dnp->dn_blkptr[i].blk_fill; + bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]); } /* ARGSUSED */ diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 885da23ba..39f282ce6 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -50,7 +50,9 @@ #include <sys/zfs_onexit.h> #include <sys/dmu_send.h> #include <sys/dsl_destroy.h> +#include <sys/blkptr.h> #include <sys/dsl_bookmark.h> +#include <sys/zfeature.h> /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ int zfs_send_corrupt_data = B_FALSE; @@ -197,7 +199,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, } static int -dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, +dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) { struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); @@ -232,13 +234,22 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, drrw->drr_offset = offset; drrw->drr_length = blksz; drrw->drr_toguid = dsp->dsa_toguid; - drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); - if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) - drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; - DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); - DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); - DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); - drrw->drr_key.ddk_cksum = bp->blk_cksum; + if (BP_IS_EMBEDDED(bp)) { + /* + * There's no pre-computed checksum of embedded BP's, so + * (like fletcher4-checkummed blocks) userland will have + * to compute a dedup-capable checksum itself. + */ + drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; + } else { + drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); + if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) + drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; + DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); + drrw->drr_key.ddk_cksum = bp->blk_cksum; + } if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); @@ -248,6 +259,43 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, } static int +dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, + int blksz, const blkptr_t *bp) +{ + char buf[BPE_PAYLOAD_SIZE]; + struct drr_write_embedded *drrw = + &(dsp->dsa_drr->drr_u.drr_write_embedded); + + if (dsp->dsa_pending_op != PENDING_NONE) { + if (dump_bytes(dsp, dsp->dsa_drr, + sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + dsp->dsa_pending_op = PENDING_NONE; + } + + ASSERT(BP_IS_EMBEDDED(bp)); + + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; + drrw->drr_object = object; + drrw->drr_offset = offset; + drrw->drr_length = blksz; + drrw->drr_toguid = dsp->dsa_toguid; + drrw->drr_compression = BP_GET_COMPRESS(bp); + drrw->drr_etype = BPE_GET_ETYPE(bp); + drrw->drr_lsize = BPE_GET_LSIZE(bp); + drrw->drr_psize = BPE_GET_PSIZE(bp); + + decode_embedded_bp_compressed(bp, buf); + + if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) + return (EINTR); + return (0); +} + +static int dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) { struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); @@ -367,6 +415,33 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) return (0); } +static boolean_t +backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) +{ + if (!BP_IS_EMBEDDED(bp)) + return (B_FALSE); + + /* + * Compression function must be legacy, or explicitly enabled. + */ + if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && + !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))) + return (B_FALSE); + + /* + * Embed type must be explicitly enabled. + */ + switch (BPE_GET_ETYPE(bp)) { + case BP_EMBEDDED_TYPE_DATA: + if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) + return (B_TRUE); + break; + default: + return (B_FALSE); + } + return (B_FALSE); +} + #define BP_SPAN(dnp, level) \ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) @@ -435,11 +510,17 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); + } else if (backup_do_embed(dsp, bp)) { + /* it's an embedded level-0 block of a regular object */ + int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + err = dump_write_embedded(dsp, zb->zb_object, + zb->zb_blkid * blksz, blksz, bp); } else { /* it's a level-0 block of a regular object */ uint32_t aflags = ARC_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); + ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT0(zb->zb_level); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, @@ -458,7 +539,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } } - err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz, + err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz, blksz, bp, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); } @@ -472,14 +553,15 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, - zfs_bookmark_phys_t *fromzb, boolean_t is_clone, int outfd, - vnode_t *vp, offset_t *off) + zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, + int outfd, vnode_t *vp, offset_t *off) { objset_t *os; dmu_replay_record_t *drr; dmu_sendarg_t *dsp; int err; uint64_t fromtxg = 0; + uint64_t featureflags = 0; err = dmu_objset_from_ds(ds, &os); if (err != 0) { @@ -502,13 +584,23 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, return (SET_ERROR(EINVAL)); } if (version >= ZPL_VERSION_SA) { - DMU_SET_FEATUREFLAGS( - drr->drr_u.drr_begin.drr_versioninfo, - DMU_BACKUP_FEATURE_SA_SPILL); + featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } } #endif + if (embedok && + spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { + featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) + featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; + } else { + embedok = B_FALSE; + } + + DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, + featureflags); + drr->drr_u.drr_begin.drr_creation_time = ds->ds_phys->ds_creation_time; drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); @@ -540,6 +632,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_incremental = (fromzb != NULL); + dsp->dsa_featureflags = featureflags; mutex_enter(&ds->ds_sendstream_lock); list_insert_head(&ds->ds_sendstreams, dsp); @@ -591,7 +684,7 @@ out: int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, - int outfd, vnode_t *vp, offset_t *off) + boolean_t embedok, int outfd, vnode_t *vp, offset_t *off) { dsl_pool_t *dp; dsl_dataset_t *ds; @@ -625,10 +718,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, zb.zbm_guid = fromds->ds_phys->ds_guid; is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); - err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, + err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, outfd, vp, off); } else { - err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, + err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, outfd, vp, off); } dsl_dataset_rele(ds, FTAG); @@ -636,7 +729,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, } int -dmu_send(const char *tosnap, const char *fromsnap, +dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, int outfd, vnode_t *vp, offset_t *off) { dsl_pool_t *dp; @@ -703,10 +796,10 @@ dmu_send(const char *tosnap, const char *fromsnap, dsl_pool_rele(dp, FTAG); return (err); } - err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, + err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, outfd, vp, off); } else { - err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, + err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, outfd, vp, off); } if (owned) @@ -861,6 +954,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) uint64_t fromguid = drrb->drr_fromguid; int flags = drrb->drr_flags; int error; + uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; @@ -874,11 +968,22 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) return (SET_ERROR(EINVAL)); /* Verify pool version supports SA if SA_SPILL feature set */ - if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_SA_SPILL) && - spa_version(dp->dp_spa) < SPA_VERSION_SA) { + if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(dp->dp_spa) < SPA_VERSION_SA) + return (SET_ERROR(ENOTSUP)); + + /* + * The receiving code doesn't know how to translate a WRITE_EMBEDDED + * record to a plan WRITE record, so the pool must have the + * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED + * records. Same with WRITE_EMBEDDED records that use LZ4 compression. + */ + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); - } error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { @@ -1153,7 +1258,6 @@ backup_byteswap(dmu_replay_record_t *drr) break; case DRR_OBJECT: DO64(drr_object.drr_object); - /* DO64(drr_object.drr_allocation_txg); */ DO32(drr_object.drr_type); DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); @@ -1191,6 +1295,14 @@ backup_byteswap(dmu_replay_record_t *drr) DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); DO64(drr_write_byref.drr_key.ddk_prop); break; + case DRR_WRITE_EMBEDDED: + DO64(drr_write_embedded.drr_object); + DO64(drr_write_embedded.drr_offset); + DO64(drr_write_embedded.drr_length); + DO64(drr_write_embedded.drr_toguid); + DO32(drr_write_embedded.drr_lsize); + DO32(drr_write_embedded.drr_psize); + break; case DRR_FREE: DO64(drr_free.drr_object); DO64(drr_free.drr_offset); @@ -1380,7 +1492,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os, int err; guid_map_entry_t gmesrch; guid_map_entry_t *gmep; - avl_index_t where; + avl_index_t where; objset_t *ref_os = NULL; dmu_buf_t *dbp; @@ -1405,7 +1517,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os, err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); - if (err) + if (err != 0) return (err); tx = dmu_tx_create(os); @@ -1425,6 +1537,48 @@ restore_write_byref(struct restorearg *ra, objset_t *os, } static int +restore_write_embedded(struct restorearg *ra, objset_t *os, + struct drr_write_embedded *drrwnp) +{ + dmu_tx_t *tx; + int err; + void *data; + + if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) + return (EINVAL); + + if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) + return (EINVAL); + + if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) + return (EINVAL); + if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) + return (EINVAL); + + data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8)); + if (data == NULL) + return (ra->err); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, drrwnp->drr_object, + drrwnp->drr_offset, drrwnp->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + return (err); + } + + dmu_write_embedded(os, drrwnp->drr_object, + drrwnp->drr_offset, data, drrwnp->drr_etype, + drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, + ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); + + dmu_tx_commit(tx); + return (0); +} + +static int restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) { dmu_tx_t *tx; @@ -1618,6 +1772,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, ra.err = restore_write_byref(&ra, os, &drrwbr); break; } + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded drrwe = + drr->drr_u.drr_write_embedded; + ra.err = restore_write_embedded(&ra, os, &drrwe); + break; + } case DRR_FREE: { struct drr_free drrf = drr->drr_u.drr_free; diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 8c44e1771..e086e2487 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -463,7 +463,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (pfd->pd_cancel) return (SET_ERROR(EINTR)); - if (BP_IS_HOLE(bp) || + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) || BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 5aee10409..436816497 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1814,8 +1814,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, *offset = *offset >> span; for (i = BF64_GET(*offset, 0, epbs); i >= 0 && i < epb; i += inc) { - if (bp[i].blk_fill >= minfill && - bp[i].blk_fill <= maxfill && + if (BP_GET_FILL(&bp[i]) >= minfill && + BP_GET_FILL(&bp[i]) <= maxfill && (hole || bp[i].blk_birth > txg)) break; if (inc > 0 || *offset > 0) diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 238920060..676578859 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -237,8 +237,6 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) } #endif -#define ALL -1 - static void free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) @@ -601,11 +599,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnp->dn_bonustype = dn->dn_bonustype; dnp->dn_bonuslen = dn->dn_bonuslen; } - ASSERT(dnp->dn_nlevels > 1 || BP_IS_HOLE(&dnp->dn_blkptr[0]) || + BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) || BP_GET_LSIZE(&dnp->dn_blkptr[0]) == dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + ASSERT(dnp->dn_nlevels < 2 || + BP_IS_HOLE(&dnp->dn_blkptr[0]) || + BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift); if (dn->dn_next_type[txgoff] != 0) { dnp->dn_type = dn->dn_type; diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 2fe6b858d..e23dd6b06 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -1525,7 +1525,7 @@ dsl_dataset_space(dsl_dataset_t *ds, else *availbytesp = 0; } - *usedobjsp = ds->ds_phys->ds_bp.blk_fill; + *usedobjsp = BP_GET_FILL(&ds->ds_phys->ds_bp); *availobjsp = DN_MAX_OBJECT - *usedobjsp; } diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 113b7261b..dc49fd558 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -546,7 +546,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; - if (BP_IS_HOLE(bp)) + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (zb->zb_level == ZB_ZIL_LEVEL) { @@ -596,6 +596,7 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) uint64_t count; objset_t *mos; + ASSERT(!dsl_dataset_is_snapshot(ds)); if (dsl_dataset_is_snapshot(ds)) return (SET_ERROR(EINVAL)); @@ -708,7 +709,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) ds->ds_prev->ds_phys->ds_num_children == 2 && ds->ds_prev->ds_userrefs == 0); - /* Remove our reservation */ + /* Remove our reservation. */ if (ds->ds_reserved != 0) { dsl_dataset_set_refreservation_sync_impl(ds, (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index bf06b51ee..f03f1f54b 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -1515,6 +1515,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) } if (err == ERESTART) return; + /* finished; verify that space accounting went to zero */ + ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes); + ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes); + ASSERT0(dp->dp_free_dir->dd_phys->dd_uncompressed_bytes); } if (scn->scn_phys.scn_state != DSS_SCANNING) @@ -1700,6 +1704,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, count_block(dp->dp_blkstats, bp); + if (BP_IS_EMBEDDED(bp)) + return (0); + ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { zio_flags |= ZIO_FLAG_SCRUB; diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c index e24ed6444..93fd5d9e1 100644 --- a/module/zfs/dsl_userhold.c +++ b/module/zfs/dsl_userhold.c @@ -610,8 +610,7 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, KM_PUSHPAGE)); error = dsl_sync_task(pool, dsl_dataset_user_release_check, - dsl_dataset_user_release_sync, &ddura, - fnvlist_num_pairs(holds)); + dsl_dataset_user_release_sync, &ddura, 0); fnvlist_free(ddura.ddura_todelete); fnvlist_free(ddura.ddura_chkholds); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 2dfdafb3a..9c09837d5 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -2236,6 +2236,7 @@ metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) vdev_t *vd; ASSERT(!BP_IS_HOLE(bp)); + ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(psize > 0); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -2259,6 +2260,7 @@ metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) vdev_t *vd; ASSERT(!BP_IS_HOLE(bp)); + ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(psize > 0); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 5bf59315a..b9fa45f82 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1872,7 +1872,7 @@ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { - if (!BP_IS_HOLE(bp)) { + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); void *data = zio_data_buf_alloc(size); @@ -2423,9 +2423,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, - &spa->spa_feat_enabled_txg_obj) != 0) { + &spa->spa_feat_enabled_txg_obj) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } } spa->spa_is_initializing = B_TRUE; @@ -5333,11 +5332,6 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) ASSERT(!locked); ASSERT(vd == vd->vdev_top); - /* - * XXX - Once we have bp-rewrite this should - * become the common case. - */ - mg = vd->vdev_mg; /* @@ -6487,7 +6481,7 @@ spa_upgrade(spa_t *spa, uint64_t version) * possible. */ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); - ASSERT(version >= spa->spa_uberblock.ub_version); + ASSERT3U(version, >=, spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 02ccb13a2..1bed90027 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1293,7 +1293,10 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, sizeof (type)); } - checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; + if (!BP_IS_EMBEDDED(bp)) { + checksum = + zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; + } compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } @@ -1588,7 +1591,7 @@ bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) uint64_t dsize = 0; int d; - for (d = 0; d < SPA_DVAS_PER_BP; d++) + for (d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); return (dsize); @@ -1602,7 +1605,7 @@ bp_get_dsize(spa_t *spa, const blkptr_t *bp) spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (d = 0; d < SPA_DVAS_PER_BP; d++) + for (d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); spa_config_exit(spa, SCL_VDEV, FTAG); diff --git a/module/zfs/zfeature_common.c b/module/zfs/zfeature_common.c index 2ea220245..f99e5349e 100644 --- a/module/zfs/zfeature_common.c +++ b/module/zfs/zfeature_common.c @@ -213,4 +213,9 @@ zpool_feature_init(void) "\"zfs bookmark\" command", B_TRUE, B_FALSE, B_FALSE, bookmarks_deps); } + + zfeature_register(SPA_FEATURE_EMBEDDED_DATA, + "com.delphix:embedded_data", "embedded_data", + "Blocks which compress very well use even less space.", + B_FALSE, B_TRUE, B_TRUE, NULL); } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 409d2c737..491ba492c 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -4238,6 +4238,7 @@ out: * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. + * zc_flags if =1, WRITE_EMBEDDED records are permitted * * outputs: * zc_objset_type estimated size, if zc_guid is set @@ -4248,6 +4249,7 @@ zfs_ioc_send(zfs_cmd_t *zc) int error; offset_t off; boolean_t estimate = (zc->zc_guid != 0); + boolean_t embedok = (zc->zc_flags & 0x1); if (zc->zc_obj != 0) { dsl_pool_t *dp; @@ -4308,7 +4310,7 @@ zfs_ioc_send(zfs_cmd_t *zc) off = fp->f_offset; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, - zc->zc_fromobj, zc->zc_cookie, fp->f_vnode, &off); + zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; @@ -5174,6 +5176,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from + * (optional) "embedok" -> (value ignored) + * presence indicates DRR_WRITE_EMBEDDED records are permitted * } * * outnvl is unused @@ -5187,6 +5191,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) char *fromname = NULL; int fd; file_t *fp; + boolean_t embedok; error = nvlist_lookup_int32(innvl, "fd", &fd); if (error != 0) @@ -5194,11 +5199,13 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); + embedok = nvlist_exists(innvl, "embedok"); + if ((fp = getf(fd)) == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; - error = dmu_send(snapname, fromname, fd, fp->f_vnode, &off); + error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; diff --git a/module/zfs/zil.c b/module/zfs/zil.c index b5ee395d1..7c3c6592b 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -159,10 +159,15 @@ int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_bp_tree; - const dva_t *dva = BP_IDENTITY(bp); + const dva_t *dva; zil_bp_node_t *zn; avl_index_t where; + if (BP_IS_EMBEDDED(bp)) + return (0); + + dva = BP_IDENTITY(bp); + if (avl_find(t, dva, &where) != NULL) return (SET_ERROR(EEXIST)); @@ -863,7 +868,7 @@ zil_lwb_write_done(zio_t *zio) ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); ASSERT(!BP_IS_GANG(zio->io_bp)); ASSERT(!BP_IS_HOLE(zio->io_bp)); - ASSERT(zio->io_bp->blk_fill == 0); + ASSERT(BP_GET_FILL(zio->io_bp) == 0); /* * Ensure the lwb buffer pointer is cleared before releasing diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 6352ab3a3..ad97ef5db 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -36,6 +36,7 @@ #include <sys/dmu_objset.h> #include <sys/arc.h> #include <sys/ddt.h> +#include <sys/blkptr.h> #include <sys/zfeature.h> /* @@ -243,7 +244,7 @@ zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE | KM_NODEBUG)); } @@ -711,6 +712,16 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio->io_physdone = physdone; zio->io_prop = *zp; + /* + * Data can be NULL if we are going to call zio_write_override() to + * provide the already-allocated BP. But we may need the data to + * verify a dedup hit (if requested). In this case, don't try to + * dedup (just take the already-allocated BP verbatim). + */ + if (data == NULL && zio->io_prop.zp_dedup_verify) { + zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; + } + return (zio); } @@ -750,6 +761,14 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { + + /* + * The check for EMBEDDED is a performance optimization. We + * process the free here (by ignoring it) rather than + * putting it on the list and then processing it in zio_free_sync(). + */ + if (BP_IS_EMBEDDED(bp)) + return; metaslab_check_free(spa, bp); /* @@ -774,13 +793,13 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_t *zio; enum zio_stage stage = ZIO_FREE_PIPELINE; - dprintf_bp(bp, "freeing in txg %llu, pass %u", - (longlong_t)txg, spa->spa_sync_pass); - ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); + if (BP_IS_EMBEDDED(bp)) + return (zio_null(pio, spa, NULL, NULL, NULL, 0)); + metaslab_check_free(spa, bp); arc_freed(spa, bp); @@ -805,6 +824,11 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, { zio_t *zio; + dprintf_bp(bp, "claiming in txg %llu", txg); + + if (BP_IS_EMBEDDED(bp)) + return (zio_null(pio, spa, NULL, NULL, NULL, 0)); + /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that @@ -1011,12 +1035,20 @@ zio_read_bp_init(zio_t *zio) if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW)) { - uint64_t psize = BP_GET_PSIZE(bp); + uint64_t psize = + BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); void *cbuf = zio_buf_alloc(psize); zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } + if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + decode_embedded_bp_compressed(bp, zio->io_data); + } else { + ASSERT(!BP_IS_EMBEDDED(bp)); + } + if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; @@ -1060,6 +1092,9 @@ zio_write_bp_init(zio_t *zio) *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + if (BP_IS_EMBEDDED(bp)) + return (ZIO_PIPELINE_CONTINUE); + /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite @@ -1108,7 +1143,7 @@ zio_write_bp_init(zio_t *zio) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ - ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), + ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } @@ -1118,9 +1153,38 @@ zio_write_bp_init(zio_t *zio) if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); + } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && + zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && + spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { + encode_embedded_bp_compressed(bp, + cbuf, compress, lsize, psize); + BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); + BP_SET_TYPE(bp, zio->io_prop.zp_type); + BP_SET_LEVEL(bp, zio->io_prop.zp_level); + zio_buf_free(cbuf, lsize); + bp->blk_birth = zio->io_txg; + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + ASSERT(spa_feature_is_active(spa, + SPA_FEATURE_EMBEDDED_DATA)); + return (ZIO_PIPELINE_CONTINUE); } else { - ASSERT(psize < lsize); - zio_push_transform(zio, cbuf, psize, lsize, NULL); + /* + * Round up compressed size to MINBLOCKSIZE and + * zero the tail. + */ + size_t rounded = + P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE); + if (rounded > psize) { + bzero((char *)cbuf + psize, rounded - psize); + psize = rounded; + } + if (psize == lsize) { + compress = ZIO_COMPRESS_OFF; + zio_buf_free(cbuf, lsize); + } else { + zio_push_transform(zio, cbuf, + psize, lsize, NULL); + } } } @@ -2873,7 +2937,7 @@ zio_checksum_verified(zio_t *zio) /* * ========================================================================== * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. - * An error of 0 indictes success. ENXIO indicates whole-device failure, + * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. @@ -2979,7 +3043,7 @@ zio_done(zio_t *zio) for (w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); - if (zio->io_bp != NULL) { + if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { ASSERT(zio->io_bp->blk_pad[0] == 0); ASSERT(zio->io_bp->blk_pad[1] == 0); ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, @@ -3216,7 +3280,8 @@ zio_done(zio_t *zio) } if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && - !BP_IS_HOLE(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { + !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) && + !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); } diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index bc7331764..3a5c73a6a 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -126,7 +126,7 @@ zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, static void zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp) { - dva_t *dva = BP_IDENTITY(bp); + const dva_t *dva = BP_IDENTITY(bp); uint64_t txg = BP_PHYSICAL_BIRTH(bp); ASSERT(BP_IS_GANG(bp)); diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index 5b63f0aa0..074462349 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -80,7 +80,7 @@ size_t zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) { uint64_t *word, *word_end; - size_t c_len, d_len, r_len; + size_t c_len, d_len; zio_compress_info_t *ci = &zio_compress_table[c]; ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); @@ -102,28 +102,13 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) return (s_len); /* Compress at least 12.5% */ - d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE); - if (d_len == 0) - return (s_len); - + d_len = s_len - (s_len >> 3); c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level); if (c_len > d_len) return (s_len); - /* - * Cool. We compressed at least as much as we were hoping to. - * For both security and repeatability, pad out the last sector. - */ - r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE); - if (r_len > c_len) { - bzero((char *)dst + c_len, r_len - c_len); - c_len = r_len; - } - ASSERT3U(c_len, <=, d_len); - ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0); - return (c_len); } |