author    Matthew Ahrens <[email protected]>  2014-06-05 13:19:08 -0800
committer Brian Behlendorf <[email protected]>  2014-08-01 14:28:05 -0700
commit    9b67f605601c77c814037613d8129562db642a29
tree      21a3270ed7eda24858e56a9584f64f6359f4b28f
parent    faf0f58c69607a15e2d1563567afb815842805de
Illumos 4757, 4913
4757 ZFS embedded-data block pointers ("zero block compression")
4913 zfs release should not be subject to space checks

Reviewed by: Adam Leventhal <[email protected]>
Reviewed by: Max Grossman <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Christopher Siden <[email protected]>
Reviewed by: Dan McDonald <[email protected]>
Approved by: Dan McDonald <[email protected]>

References:
  https://www.illumos.org/issues/4757
  https://www.illumos.org/issues/4913
  https://github.com/illumos/illumos-gate/commit/5d7b4d4

Porting notes:

For compatibility with the fastpath code the zio_done() function needed
to be updated. Because embedded-data block pointers do not require DVAs
to be allocated, the associated vdevs will not be marked and therefore
should not be unmarked.

Ported by: Tim Chase <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #2544
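A rough stand-alone sketch of the space argument behind the feature: a
blkptr_t is 128 bytes on disk, and when a block's compressed payload fits
inside the pointer (up to 112 bytes, per the new blkptr.c comment below),
no block needs to be allocated at all, whereas a normal allocation costs
at least one SPA_MINBLOCKSIZE sector. The 16-byte reservation here stands
in for blk_prop and blk_birth; this is an illustration, not the on-disk
definition (see spa.h for that).

#include <stdio.h>

#define	BP_SIZE			128	/* sizeof (blkptr_t) on disk */
#define	BPE_NON_PAYLOAD		16	/* blk_prop + blk_birth keep meaning */
#define	SPA_MINBLOCKSIZE	512	/* smallest allocatable sector */

int
main(void)
{
	int payload = BP_SIZE - BPE_NON_PAYLOAD;

	(void) printf("embedded payload capacity: %d bytes\n", payload);
	(void) printf("allocation avoided per embedded block: >= %d bytes\n",
	    SPA_MINBLOCKSIZE);
	return (0);
}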
Diffstat (limited to 'module')
-rw-r--r--  module/zfs/Makefile.in        |    1
-rw-r--r--  module/zfs/arc.c              |   95
-rw-r--r--  module/zfs/blkptr.c           |  121
-rw-r--r--  module/zfs/bpobj.c            |   41
-rw-r--r--  module/zfs/dbuf.c             |   61
-rw-r--r--  module/zfs/dmu.c              |   85
-rw-r--r--  module/zfs/dmu_objset.c       |   32
-rw-r--r--  module/zfs/dmu_send.c         |  215
-rw-r--r--  module/zfs/dmu_traverse.c     |    2
-rw-r--r--  module/zfs/dnode.c            |    4
-rw-r--r--  module/zfs/dnode_sync.c       |    7
-rw-r--r--  module/zfs/dsl_dataset.c      |    2
-rw-r--r--  module/zfs/dsl_destroy.c      |    5
-rw-r--r--  module/zfs/dsl_scan.c         |    7
-rw-r--r--  module/zfs/dsl_userhold.c     |    3
-rw-r--r--  module/zfs/metaslab.c         |    2
-rw-r--r--  module/zfs/spa.c              |   12
-rw-r--r--  module/zfs/spa_misc.c         |    9
-rw-r--r--  module/zfs/zfeature_common.c  |    5
-rw-r--r--  module/zfs/zfs_ioctl.c        |   11
-rw-r--r--  module/zfs/zil.c              |    9
-rw-r--r--  module/zfs/zio.c              |   87
-rw-r--r--  module/zfs/zio_checksum.c     |    2
-rw-r--r--  module/zfs/zio_compress.c     |   19
24 files changed, 670 insertions, 167 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index 56ecd4918..48e7e97e9 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -5,6 +5,7 @@ EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@
obj-$(CONFIG_ZFS) := $(MODULE).o
$(MODULE)-objs += @top_srcdir@/module/zfs/arc.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/blkptr.o
$(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o
$(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o
$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 7f6df0ae8..ec006cb0f 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -812,8 +812,10 @@ buf_discard_identity(arc_buf_hdr_t *hdr)
}
static arc_buf_hdr_t *
-buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
+buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
+ const dva_t *dva = BP_IDENTITY(bp);
+ uint64_t birth = BP_PHYSICAL_BIRTH(bp);
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
arc_buf_hdr_t *buf;
@@ -845,6 +847,8 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
arc_buf_hdr_t *fbuf;
uint32_t i;
+ ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
+ ASSERT(buf->b_birth != 0);
ASSERT(!HDR_IN_HASH_TABLE(buf));
*lockp = hash_lock;
mutex_enter(hash_lock);
@@ -3034,10 +3038,10 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
static void
arc_read_done(zio_t *zio)
{
- arc_buf_hdr_t *hdr, *found;
+ arc_buf_hdr_t *hdr;
arc_buf_t *buf;
arc_buf_t *abuf; /* buffer we're assigning to callback */
- kmutex_t *hash_lock;
+ kmutex_t *hash_lock = NULL;
arc_callback_t *callback_list, *acb;
int freeable = FALSE;
@@ -3052,12 +3056,24 @@ arc_read_done(zio_t *zio)
* reason for it not to be found is if we were freed during the
* read.
*/
- found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
- &hash_lock);
-
- ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
- (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
- (found == hdr && HDR_L2_READING(hdr)));
+ if (HDR_IN_HASH_TABLE(hdr)) {
+ arc_buf_hdr_t *found;
+
+ ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
+ ASSERT3U(hdr->b_dva.dva_word[0], ==,
+ BP_IDENTITY(zio->io_bp)->dva_word[0]);
+ ASSERT3U(hdr->b_dva.dva_word[1], ==,
+ BP_IDENTITY(zio->io_bp)->dva_word[1]);
+
+ found = buf_hash_find(hdr->b_spa, zio->io_bp,
+ &hash_lock);
+
+ ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
+ hash_lock == NULL) ||
+ (found == hdr &&
+ DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+ (found == hdr && HDR_L2_READING(hdr)));
+ }
hdr->b_flags &= ~ARC_L2_EVICTED;
if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
@@ -3181,17 +3197,26 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
const zbookmark_t *zb)
{
- arc_buf_hdr_t *hdr;
+ arc_buf_hdr_t *hdr = NULL;
arc_buf_t *buf = NULL;
- kmutex_t *hash_lock;
+ kmutex_t *hash_lock = NULL;
zio_t *rzio;
uint64_t guid = spa_load_guid(spa);
int rc = 0;
+ ASSERT(!BP_IS_EMBEDDED(bp) ||
+ BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
+
top:
- hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
- &hash_lock);
- if (hdr && hdr->b_datacnt > 0) {
+ if (!BP_IS_EMBEDDED(bp)) {
+ /*
+ * Embedded BP's have no DVA and require no I/O to "read".
+ * Create an anonymous arc buf to back it.
+ */
+ hdr = buf_hash_find(guid, bp, &hash_lock);
+ }
+
+ if (hdr != NULL && hdr->b_datacnt > 0) {
*arc_flags |= ARC_CACHED;
@@ -3265,7 +3290,7 @@ top:
done(NULL, buf, private);
} else {
uint64_t size = BP_GET_LSIZE(bp);
- arc_callback_t *acb;
+ arc_callback_t *acb;
vdev_t *vd = NULL;
uint64_t addr = 0;
boolean_t devw = B_FALSE;
@@ -3274,15 +3299,17 @@ top:
if (hdr == NULL) {
/* this block is not in the cache */
- arc_buf_hdr_t *exists;
+ arc_buf_hdr_t *exists = NULL;
arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
buf = arc_buf_alloc(spa, size, private, type);
hdr = buf->b_hdr;
- hdr->b_dva = *BP_IDENTITY(bp);
- hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
- hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
- exists = buf_hash_insert(hdr, &hash_lock);
- if (exists) {
+ if (!BP_IS_EMBEDDED(bp)) {
+ hdr->b_dva = *BP_IDENTITY(bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
+ hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
+ exists = buf_hash_insert(hdr, &hash_lock);
+ }
+ if (exists != NULL) {
/* somebody beat us to the hash insert */
mutex_exit(hash_lock);
buf_discard_identity(hdr);
@@ -3354,7 +3381,8 @@ top:
vd = NULL;
}
- mutex_exit(hash_lock);
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
/*
* At this point, we have a level 1 cache miss. Try again in
@@ -3526,8 +3554,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp)
kmutex_t *hash_lock;
uint64_t guid = spa_load_guid(spa);
- hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
- &hash_lock);
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
+ hdr = buf_hash_find(guid, bp, &hash_lock);
if (hdr == NULL)
return;
if (HDR_BUF_AVAILABLE(hdr)) {
@@ -3854,7 +3883,7 @@ arc_write_done(zio_t *zio)
ASSERT(hdr->b_acb == NULL);
if (zio->io_error == 0) {
- if (BP_IS_HOLE(zio->io_bp)) {
+ if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
buf_discard_identity(hdr);
} else {
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
@@ -3866,10 +3895,10 @@ arc_write_done(zio_t *zio)
}
/*
- * If the block to be written was all-zero, we may have
- * compressed it away. In this case no write was performed
- * so there will be no dva/birth/checksum. The buffer must
- * therefore remain anonymous (and uncached).
+ * If the block to be written was all-zero or compressed enough to be
+ * embedded in the BP, no write was performed so there will be no
+ * dva/birth/checksum. The buffer must therefore remain anonymous
+ * (and uncached).
*/
if (!BUF_EMPTY(hdr)) {
arc_buf_hdr_t *exists;
@@ -5219,7 +5248,7 @@ static boolean_t
l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
{
void *cdata;
- size_t csize, len;
+ size_t csize, len, rounded;
ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
ASSERT(l2hdr->b_tmp_cdata != NULL);
@@ -5229,6 +5258,12 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
cdata, l2hdr->b_asize);
+ rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
+ if (rounded > csize) {
+ bzero((char *)cdata + csize, rounded - csize);
+ csize = rounded;
+ }
+
if (csize == 0) {
/* zero block, indicate that there's nothing to write */
zio_data_buf_free(cdata, len);
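The new P2ROUNDUP/bzero pair in l2arc_compress_buf() above exists because
zio_compress_data() no longer pads its output to SPA_MINBLOCKSIZE (see the
zio_compress.c hunk at the end of this diff), so callers that write the
compressed buffer to disk must round up and zero the tail themselves. A
minimal user-space sketch of that pattern:

#include <stdio.h>
#include <string.h>

#define	SPA_MINBLOCKSIZE 512

/*
 * Round a compressed size up to the minimum device block and zero the
 * slack so the on-disk tail is deterministic. Mirrors the P2ROUNDUP +
 * bzero added above; the mask trick requires a power-of-2 alignment,
 * which SPA_MINBLOCKSIZE is.
 */
static size_t
pad_to_minblock(char *buf, size_t csize)
{
	size_t rounded = (csize + SPA_MINBLOCKSIZE - 1) &
	    ~((size_t)SPA_MINBLOCKSIZE - 1);

	if (rounded > csize)
		memset(buf + csize, 0, rounded - csize);
	return (rounded);
}

int
main(void)
{
	char buf[1024];

	(void) printf("%zu\n", pad_to_minblock(buf, 700));	/* 1024 */
	return (0);
}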
diff --git a/module/zfs/blkptr.c b/module/zfs/blkptr.c
new file mode 100644
index 000000000..d56e19996
--- /dev/null
+++ b/module/zfs/blkptr.c
@@ -0,0 +1,121 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+
+/*
+ * Embedded-data Block Pointers
+ *
+ * Normally, block pointers point (via their DVAs) to a block which holds data.
+ * If the data that we need to store is very small, this is an inefficient
+ * use of space, because a block must be at minimum 1 sector (typically 512
+ * bytes or 4KB). Additionally, reading these small blocks tends to generate
+ * more random reads.
+ *
+ * Embedded-data Block Pointers allow small pieces of data (the "payload",
+ * up to 112 bytes) to be stored in the block pointer itself, instead of
+ * being pointed to. The "Pointer" part of this name is a bit of a
+ * misnomer, as nothing is pointed to.
+ *
+ * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to
+ * be embedded in the block pointer. The logic for this is handled in
+ * the SPA, by the zio pipeline. Therefore most code outside the zio
+ * pipeline doesn't need special-cases to handle these block pointers.
+ *
+ * See spa.h for details on the exact layout of embedded block pointers.
+ */
+
+void
+encode_embedded_bp_compressed(blkptr_t *bp, void *data,
+ enum zio_compress comp, int uncompressed_size, int compressed_size)
+{
+ uint64_t *bp64 = (uint64_t *)bp;
+ uint64_t w = 0;
+ uint8_t *data8 = data;
+ int i;
+
+ ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE);
+ ASSERT(uncompressed_size == compressed_size ||
+ comp != ZIO_COMPRESS_OFF);
+ ASSERT3U(comp, >=, ZIO_COMPRESS_OFF);
+ ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+
+ bzero(bp, sizeof (*bp));
+ BP_SET_EMBEDDED(bp, B_TRUE);
+ BP_SET_COMPRESS(bp, comp);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ BPE_SET_LSIZE(bp, uncompressed_size);
+ BPE_SET_PSIZE(bp, compressed_size);
+
+ /*
+ * Encode the byte array into the words of the block pointer.
+ * First byte goes into low bits of first word (little endian).
+ */
+ for (i = 0; i < compressed_size; i++) {
+ BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]);
+ if (i % sizeof (w) == sizeof (w) - 1) {
+ /* we've reached the end of a word */
+ ASSERT3P(bp64, <, bp + 1);
+ *bp64 = w;
+ bp64++;
+ if (!BPE_IS_PAYLOADWORD(bp, bp64))
+ bp64++;
+ w = 0;
+ }
+ }
+ /* write last partial word */
+ if (bp64 < (uint64_t *)(bp + 1))
+ *bp64 = w;
+}
+
+/*
+ * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
+ * more than BPE_PAYLOAD_SIZE bytes).
+ */
+void
+decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
+{
+ int psize;
+ uint8_t *buf8 = buf;
+ uint64_t w = 0;
+ const uint64_t *bp64 = (const uint64_t *)bp;
+ int i;
+
+ ASSERT(BP_IS_EMBEDDED(bp));
+
+ psize = BPE_GET_PSIZE(bp);
+
+ /*
+ * Decode the words of the block pointer into the byte array.
+ * Low bits of first word are the first byte (little endian).
+ */
+ for (i = 0; i < psize; i++) {
+ if (i % sizeof (w) == 0) {
+ /* beginning of a word */
+ ASSERT3P(bp64, <, bp + 1);
+ w = *bp64;
+ bp64++;
+ if (!BPE_IS_PAYLOADWORD(bp, bp64))
+ bp64++;
+ }
+ buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
+ }
+}
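As a stand-alone illustration of the packing scheme in
encode_embedded_bp_compressed()/decode_embedded_bp_compressed() above —
bytes go into 64-bit words low-order first — the following user-space
sketch round-trips a payload. It uses a contiguous word array and so
omits the BPE_IS_PAYLOADWORD skip over the blk_prop and blk_birth words
that the real code performs:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	NBBY		8
#define	PAYLOAD_WORDS	14	/* 112 bytes == BPE_PAYLOAD_SIZE */

static void
pack(uint64_t *words, const uint8_t *data, int len)
{
	int i;

	memset(words, 0, PAYLOAD_WORDS * sizeof (uint64_t));
	for (i = 0; i < len; i++)
		words[i / 8] |= (uint64_t)data[i] << ((i % 8) * NBBY);
}

static void
unpack(const uint64_t *words, uint8_t *data, int len)
{
	int i;

	for (i = 0; i < len; i++)
		data[i] = (uint8_t)(words[i / 8] >> ((i % 8) * NBBY));
}

int
main(void)
{
	uint8_t in[13] = "hello, world", out[13];
	uint64_t words[PAYLOAD_WORDS];

	pack(words, in, sizeof (in));
	unpack(words, out, sizeof (out));
	assert(memcmp(in, out, sizeof (in)) == 0);
	(void) printf("round-trip ok\n");
	return (0);
}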
diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c
index eb3233b10..7c8f932f5 100644
--- a/module/zfs/bpobj.c
+++ b/module/zfs/bpobj.c
@@ -192,6 +192,13 @@ bpobj_close(bpobj_t *bpo)
mutex_destroy(&bpo->bpo_lock);
}
+static boolean_t
+bpobj_hasentries(bpobj_t *bpo)
+{
+ return (bpo->bpo_phys->bpo_num_blkptrs != 0 ||
+ (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0));
+}
+
static int
bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
boolean_t free)
@@ -332,9 +339,11 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
out:
/* If there are no entries, there should be no bytes. */
- ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
- (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
- bpo->bpo_phys->bpo_bytes == 0);
+ if (!bpobj_hasentries(bpo)) {
+ ASSERT0(bpo->bpo_phys->bpo_bytes);
+ ASSERT0(bpo->bpo_phys->bpo_comp);
+ ASSERT0(bpo->bpo_phys->bpo_uncomp);
+ }
mutex_exit(&bpo->bpo_lock);
return (err);
@@ -378,7 +387,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
- if (used == 0) {
+ if (!bpobj_hasentries(&subbpo)) {
/* No point in having an empty subobj. */
bpobj_close(&subbpo);
bpobj_free(bpo->bpo_os, subobj, tx);
@@ -453,13 +462,29 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
ASSERT(!BP_IS_HOLE(bp));
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+ if (BP_IS_EMBEDDED(bp)) {
+ /*
+ * The bpobj will compress better without the payload.
+ *
+ * Note that we store EMBEDDED bp's because they have an
+ * uncompressed size, which must be accounted for. An
+ * alternative would be to add their size to bpo_uncomp
+ * without storing the bp, but that would create additional
+ * complications: bpo_uncomp would be inconsistent with the
+ * set of BP's stored, and bpobj_iterate() wouldn't visit
+ * all the space accounted for in the bpobj.
+ */
+ bzero(&stored_bp, sizeof (stored_bp));
+ stored_bp.blk_prop = bp->blk_prop;
+ stored_bp.blk_birth = bp->blk_birth;
+ } else if (!BP_GET_DEDUP(bp)) {
+ /* The bpobj will compress better without the checksum */
+ bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+ }
+
/* We never need the fill count. */
stored_bp.blk_fill = 0;
- /* The bpobj will compress better if we can leave off the checksum */
- if (!BP_GET_DEDUP(bp))
- bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
-
mutex_enter(&bpo->bpo_lock);
offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
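The effect of the bpobj_enqueue() change above is easiest to see as a
transformation on the stored record: embedded BPs keep only blk_prop and
blk_birth (storing the payload would hurt the bpobj's own compression),
non-dedup BPs lose their checksum, and every BP loses its fill count. A
sketch against a simplified, hypothetical blkptr layout (field names
follow the real one, the layout does not):

#include <stdint.h>
#include <string.h>

typedef struct bp {
	uint64_t blk_dva[6];
	uint64_t blk_prop;	/* type/level/size/compress/embedded bits */
	uint64_t blk_pad[2];
	uint64_t blk_birth;
	uint64_t blk_fill;
	uint64_t blk_cksum[4];
} bp_t;

static bp_t
stored_form(const bp_t *bp, int is_embedded, int is_dedup)
{
	bp_t stored = *bp;

	if (is_embedded) {
		/* keep only the accounting-relevant fields */
		memset(&stored, 0, sizeof (stored));
		stored.blk_prop = bp->blk_prop;
		stored.blk_birth = bp->blk_birth;
	} else if (!is_dedup) {
		/* checksum is never needed again; zeros compress well */
		memset(stored.blk_cksum, 0, sizeof (stored.blk_cksum));
	}
	stored.blk_fill = 0;	/* fill count is never needed */
	return (stored);
}

int
main(void)
{
	bp_t bp = { .blk_prop = 42, .blk_birth = 7, .blk_fill = 1,
	    .blk_cksum = { 1, 2, 3, 4 } };
	bp_t s = stored_form(&bp, 1, 0);

	return (s.blk_prop == 42 && s.blk_birth == 7 &&
	    s.blk_fill == 0 ? 0 : 1);
}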
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index c6e7197b6..1e5fac78e 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -40,6 +40,8 @@
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/blkptr.h>
#include <sys/range_tree.h>
struct dbuf_hold_impl_data {
@@ -1492,6 +1494,38 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_exit(&db->db_mtx);
}
+void
+dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+ bp_embedded_type_t etype, enum zio_compress comp,
+ int uncompressed_size, int compressed_size, int byteorder,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+ struct dirty_leaf *dl;
+ dmu_object_type_t type;
+
+ DB_DNODE_ENTER(db);
+ type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ ASSERT0(db->db_level);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ dmu_buf_will_not_fill(dbuf, tx);
+
+ ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+ dl = &db->db_last_dirty->dt.dl;
+ encode_embedded_bp_compressed(&dl->dr_overridden_by,
+ data, comp, uncompressed_size, compressed_size);
+ BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
+ BP_SET_TYPE(&dl->dr_overridden_by, type);
+ BP_SET_LEVEL(&dl->dr_overridden_by, 0);
+ BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
+
+ dl->dr_override_state = DR_OVERRIDDEN;
+ dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
+}
+
/*
* Directly assign a provided arc buf to a given dbuf if it's not referenced
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
@@ -1885,7 +1919,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
}
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
- if (bp && !BP_IS_HOLE(bp)) {
+ if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
@@ -2575,7 +2609,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
uint64_t fill = 0;
int i;
- ASSERT(db->db_blkptr == bp);
+ ASSERT3P(db->db_blkptr, ==, bp);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
@@ -2587,7 +2621,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID &&
- BP_GET_TYPE(bp) == dn->dn_bonustype));
+ BP_GET_TYPE(bp) == dn->dn_bonustype) ||
+ BP_IS_EMBEDDED(bp));
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
}
@@ -2628,12 +2663,13 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
if (BP_IS_HOLE(ibp))
continue;
- fill += ibp->blk_fill;
+ fill += BP_GET_FILL(ibp);
}
}
DB_DNODE_EXIT(db);
- bp->blk_fill = fill;
+ if (!BP_IS_EMBEDDED(bp))
+ bp->blk_fill = fill;
mutex_exit(&db->db_mtx);
}
@@ -2745,7 +2781,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
db->db.db_size);
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ if (!arc_released(db->db_buf))
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
DB_DNODE_EXIT(db);
mutex_destroy(&dr->dt.di.dr_mtx);
@@ -2871,10 +2908,16 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
DB_DNODE_EXIT(db);
- if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- ASSERT(db->db_state != DB_NOFILL);
+ if (db->db_level == 0 &&
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ /*
+ * The BP for this block has been provided by open context
+ * (by dmu_sync() or dmu_buf_write_embedded()).
+ */
+ void *contents = (data != NULL) ? data->b_data : NULL;
+
dr->dr_zio = zio_write(zio, os->os_spa, txg,
- db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
+ db->db_blkptr, contents, db->db.db_size, &zp,
dbuf_write_override_ready, NULL, dbuf_write_override_done,
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx);
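dmu_buf_write_embedded() above relies on the same override mechanism that
dmu_sync() uses: open context pre-computes the block pointer, stamps it
into the dirty record as dr_overridden_by, and sets DR_OVERRIDDEN, so at
sync time dbuf_write() issues the zio with the ready-made BP instead of
re-running compression and allocation. A toy model of that handshake —
all types here are hypothetical stand-ins, not the real dbuf structures:

#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t blk_prop, blk_birth; } bp_t;

typedef enum { DR_NOT_OVERRIDDEN, DR_OVERRIDDEN } override_state_t;

typedef struct {
	bp_t		dr_overridden_by;
	override_state_t dr_override_state;
	uint64_t	dr_txg;
} dirty_leaf_t;

/* open context: provide a finished BP for this dirty record */
static void
write_override(dirty_leaf_t *dl, bp_t bp, uint64_t txg)
{
	dl->dr_overridden_by = bp;
	dl->dr_overridden_by.blk_birth = txg;
	dl->dr_override_state = DR_OVERRIDDEN;
	dl->dr_txg = txg;
}

/* sync context: honor the override instead of writing data */
static void
sync_leaf(const dirty_leaf_t *dl)
{
	if (dl->dr_override_state == DR_OVERRIDDEN)
		(void) printf("using provided bp, birth %llu\n",
		    (unsigned long long)dl->dr_overridden_by.blk_birth);
	else
		(void) printf("normal write path\n");
}

int
main(void)
{
	dirty_leaf_t dl = { { 0, 0 }, DR_NOT_OVERRIDDEN, 0 };
	bp_t bp = { 1, 0 };

	write_override(&dl, bp, 100);
	sync_leaf(&dl);
	return (0);
}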
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index d8e5739f4..305973abf 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -124,17 +124,13 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
};
int
-dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **dbp, int flags)
+dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp)
{
dnode_t *dn;
uint64_t blkid;
dmu_buf_impl_t *db;
int err;
- int db_flags = DB_RF_CANFAIL;
-
- if (flags & DMU_READ_NO_PREFETCH)
- db_flags |= DB_RF_NOPREFETCH;
err = dnode_hold(os, object, FTAG, &dn);
if (err)
@@ -143,18 +139,37 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+
if (db == NULL) {
- err = SET_ERROR(EIO);
- } else {
+ *dbp = NULL;
+ return (SET_ERROR(EIO));
+ }
+
+ *dbp = &db->db;
+ return (err);
+}
+
+int
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp, int flags)
+{
+ int err;
+ int db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+
+ err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
+ if (err == 0) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
err = dbuf_read(db, NULL, db_flags);
- if (err) {
+ if (err != 0) {
dbuf_rele(db, tag);
- db = NULL;
+ *dbp = NULL;
}
}
- dnode_rele(dn, FTAG);
- *dbp = &db->db; /* NULL db plus first field offset is NULL */
return (err);
}
@@ -852,6 +867,25 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+ void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+ int compressed_size, int byteorder, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+
+ ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
+ ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+ VERIFY0(dmu_buf_hold_noread(os, object, offset,
+ FTAG, &db));
+
+ dmu_buf_write_embedded(db,
+ data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
+ uncompressed_size, compressed_size, byteorder, tx);
+
+ dmu_buf_rele(db, FTAG);
+}
+
/*
* DMU support for xuio
*/
@@ -1393,7 +1427,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
* block size still needs to be known for replay.
*/
BP_SET_LSIZE(bp, db->db_size);
- } else {
+ } else if (!BP_IS_EMBEDDED(bp)) {
ASSERT(BP_GET_LEVEL(bp) == 0);
bp->blk_fill = 1;
}
@@ -1664,9 +1698,15 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
{
dnode_t *dn;
- /* XXX assumes dnode_hold will not get an i/o error */
- (void) dnode_hold(os, object, FTAG, &dn);
- ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+ /*
+ * Send streams include each object's checksum function. This
+ * check ensures that the receiving system can understand the
+ * checksum function transmitted.
+ */
+ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+
+ VERIFY0(dnode_hold(os, object, FTAG, &dn));
+ ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
dn->dn_checksum = checksum;
dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG);
@@ -1678,9 +1718,14 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
{
dnode_t *dn;
- /* XXX assumes dnode_hold will not get an i/o error */
- (void) dnode_hold(os, object, FTAG, &dn);
- ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
+ /*
+ * Send streams include each object's compression function. This
+ * check ensures that the receiving system can understand the
+ * compression function transmitted.
+ */
+ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
+
+ VERIFY0(dnode_hold(os, object, FTAG, &dn));
dn->dn_compress = compress;
dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG);
@@ -1843,7 +1888,7 @@ __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
doi->doi_fill_count = 0;
for (i = 0; i < dnp->dn_nblkptr; i++)
- doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
+ doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
}
void
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index b82783098..238892cf4 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -337,7 +337,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
* default (fletcher2/off). Snapshots don't need to know about
* checksum/compression/copies.
*/
- if (ds) {
+ if (ds != NULL) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
primary_cache_changed_cb, os);
@@ -390,7 +390,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
kmem_free(os, sizeof (objset_t));
return (err);
}
- } else if (ds == NULL) {
+ } else {
/* It's the meta-objset. */
os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
os->os_compress = ZIO_COMPRESS_LZJB;
@@ -434,17 +434,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
&os->os_groupused_dnode);
}
- /*
- * We should be the only thread trying to do this because we
- * have ds_opening_lock
- */
- if (ds) {
- mutex_enter(&ds->ds_lock);
- ASSERT(ds->ds_objset == NULL);
- ds->ds_objset = os;
- mutex_exit(&ds->ds_lock);
- }
-
*osp = os;
return (0);
}
@@ -455,11 +444,19 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
int err = 0;
mutex_enter(&ds->ds_opening_lock);
- *osp = ds->ds_objset;
- if (*osp == NULL) {
+ if (ds->ds_objset == NULL) {
+ objset_t *os;
err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
- ds, dsl_dataset_get_blkptr(ds), osp);
+ ds, dsl_dataset_get_blkptr(ds), &os);
+
+ if (err == 0) {
+ mutex_enter(&ds->ds_lock);
+ ASSERT(ds->ds_objset == NULL);
+ ds->ds_objset = os;
+ mutex_exit(&ds->ds_lock);
+ }
}
+ *osp = ds->ds_objset;
mutex_exit(&ds->ds_opening_lock);
return (err);
}
@@ -981,6 +978,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
objset_t *os = arg;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
+ ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT3P(bp, ==, os->os_rootbp);
ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
ASSERT0(BP_GET_LEVEL(bp));
@@ -993,7 +991,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
*/
bp->blk_fill = 0;
for (i = 0; i < dnp->dn_nblkptr; i++)
- bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+ bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
}
/* ARGSUSED */
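The dmu_objset_from_ds() rework above moves publication of ds->ds_objset
out of dmu_objset_open_impl() and makes it conditional on success, so a
failed open can no longer leave a half-initialized pointer visible;
ds_opening_lock still serializes openers. The pattern, reduced to a
stand-alone pthreads sketch (open_objset() is a hypothetical stand-in for
the real open, which can fail):

#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t opening_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t ds_lock = PTHREAD_MUTEX_INITIALIZER;
static void *ds_objset;		/* the published pointer */

static void *
open_objset(void)
{
	static int dummy;

	return (&dummy);	/* stand-in; may return NULL on failure */
}

/*
 * Open once, publish on success: the result becomes visible to other
 * threads only after the open has fully succeeded.
 */
static int
objset_from_ds(void **osp)
{
	int err = 0;

	(void) pthread_mutex_lock(&opening_lock);
	if (ds_objset == NULL) {
		void *os = open_objset();

		if (os != NULL) {
			(void) pthread_mutex_lock(&ds_lock);
			ds_objset = os;
			(void) pthread_mutex_unlock(&ds_lock);
		} else {
			err = -1;
		}
	}
	*osp = ds_objset;
	(void) pthread_mutex_unlock(&opening_lock);
	return (err);
}

int
main(void)
{
	void *os;

	return (objset_from_ds(&os));
}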
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 885da23ba..39f282ce6 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -50,7 +50,9 @@
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
+#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
+#include <sys/zfeature.h>
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;
@@ -197,7 +199,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
}
static int
-dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
+dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
@@ -232,13 +234,22 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_offset = offset;
drrw->drr_length = blksz;
drrw->drr_toguid = dsp->dsa_toguid;
- drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
- if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
- drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
- DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
- DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
- DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
- drrw->drr_key.ddk_cksum = bp->blk_cksum;
+ if (BP_IS_EMBEDDED(bp)) {
+ /*
+ * There's no pre-computed checksum of embedded BP's, so
+ * (like fletcher4-checksummed blocks) userland will have
+ * to compute a dedup-capable checksum itself.
+ */
+ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
+ } else {
+ drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
+ if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
+ drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
+ DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+ drrw->drr_key.ddk_cksum = bp->blk_cksum;
+ }
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (SET_ERROR(EINTR));
@@ -248,6 +259,43 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
}
static int
+dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
+ int blksz, const blkptr_t *bp)
+{
+ char buf[BPE_PAYLOAD_SIZE];
+ struct drr_write_embedded *drrw =
+ &(dsp->dsa_drr->drr_u.drr_write_embedded);
+
+ if (dsp->dsa_pending_op != PENDING_NONE) {
+ if (dump_bytes(dsp, dsp->dsa_drr,
+ sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ dsp->dsa_pending_op = PENDING_NONE;
+ }
+
+ ASSERT(BP_IS_EMBEDDED(bp));
+
+ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+ dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
+ drrw->drr_object = object;
+ drrw->drr_offset = offset;
+ drrw->drr_length = blksz;
+ drrw->drr_toguid = dsp->dsa_toguid;
+ drrw->drr_compression = BP_GET_COMPRESS(bp);
+ drrw->drr_etype = BPE_GET_ETYPE(bp);
+ drrw->drr_lsize = BPE_GET_LSIZE(bp);
+ drrw->drr_psize = BPE_GET_PSIZE(bp);
+
+ decode_embedded_bp_compressed(bp, buf);
+
+ if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
+ return (EINTR);
+ return (0);
+}
+
+static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
@@ -367,6 +415,33 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
return (0);
}
+static boolean_t
+backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
+{
+ if (!BP_IS_EMBEDDED(bp))
+ return (B_FALSE);
+
+ /*
+ * Compression function must be legacy, or explicitly enabled.
+ */
+ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
+ !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)))
+ return (B_FALSE);
+
+ /*
+ * Embed type must be explicitly enabled.
+ */
+ switch (BPE_GET_ETYPE(bp)) {
+ case BP_EMBEDDED_TYPE_DATA:
+ if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
+ return (B_TRUE);
+ break;
+ default:
+ return (B_FALSE);
+ }
+ return (B_FALSE);
+}
+
#define BP_SPAN(dnp, level) \
(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
@@ -435,11 +510,17 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf);
+ } else if (backup_do_embed(dsp, bp)) {
+ /* it's an embedded level-0 block of a regular object */
+ int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ err = dump_write_embedded(dsp, zb->zb_object,
+ zb->zb_blkid * blksz, blksz, bp);
} else { /* it's a level-0 block of a regular object */
uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
+ ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT0(zb->zb_level);
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
@@ -458,7 +539,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
}
}
- err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
+ err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
blksz, bp, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf);
}
@@ -472,14 +553,15 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
- zfs_bookmark_phys_t *fromzb, boolean_t is_clone, int outfd,
- vnode_t *vp, offset_t *off)
+ zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
+ int outfd, vnode_t *vp, offset_t *off)
{
objset_t *os;
dmu_replay_record_t *drr;
dmu_sendarg_t *dsp;
int err;
uint64_t fromtxg = 0;
+ uint64_t featureflags = 0;
err = dmu_objset_from_ds(ds, &os);
if (err != 0) {
@@ -502,13 +584,23 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
return (SET_ERROR(EINVAL));
}
if (version >= ZPL_VERSION_SA) {
- DMU_SET_FEATUREFLAGS(
- drr->drr_u.drr_begin.drr_versioninfo,
- DMU_BACKUP_FEATURE_SA_SPILL);
+ featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
}
}
#endif
+ if (embedok &&
+ spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
+ featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+ featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
+ } else {
+ embedok = B_FALSE;
+ }
+
+ DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
+ featureflags);
+
drr->drr_u.drr_begin.drr_creation_time =
ds->ds_phys->ds_creation_time;
drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
@@ -540,6 +632,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
dsp->dsa_pending_op = PENDING_NONE;
dsp->dsa_incremental = (fromzb != NULL);
+ dsp->dsa_featureflags = featureflags;
mutex_enter(&ds->ds_sendstream_lock);
list_insert_head(&ds->ds_sendstreams, dsp);
@@ -591,7 +684,7 @@ out:
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
- int outfd, vnode_t *vp, offset_t *off)
+ boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
@@ -625,10 +718,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
zb.zbm_guid = fromds->ds_phys->ds_guid;
is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG);
- err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
outfd, vp, off);
} else {
- err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
outfd, vp, off);
}
dsl_dataset_rele(ds, FTAG);
@@ -636,7 +729,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
}
int
-dmu_send(const char *tosnap, const char *fromsnap,
+dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
@@ -703,10 +796,10 @@ dmu_send(const char *tosnap, const char *fromsnap,
dsl_pool_rele(dp, FTAG);
return (err);
}
- err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
outfd, vp, off);
} else {
- err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
outfd, vp, off);
}
if (owned)
@@ -861,6 +954,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
uint64_t fromguid = drrb->drr_fromguid;
int flags = drrb->drr_flags;
int error;
+ uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
dsl_dataset_t *ds;
const char *tofs = drba->drba_cookie->drc_tofs;
@@ -874,11 +968,22 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
return (SET_ERROR(EINVAL));
/* Verify pool version supports SA if SA_SPILL feature set */
- if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
- DMU_BACKUP_FEATURE_SA_SPILL) &&
- spa_version(dp->dp_spa) < SPA_VERSION_SA) {
+ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+ spa_version(dp->dp_spa) < SPA_VERSION_SA)
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * The receiving code doesn't know how to translate a WRITE_EMBEDDED
+ * record to a plain WRITE record, so the pool must have the
+ * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
+ * records. Same with WRITE_EMBEDDED records that use LZ4 compression.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
- }
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
@@ -1153,7 +1258,6 @@ backup_byteswap(dmu_replay_record_t *drr)
break;
case DRR_OBJECT:
DO64(drr_object.drr_object);
- /* DO64(drr_object.drr_allocation_txg); */
DO32(drr_object.drr_type);
DO32(drr_object.drr_bonustype);
DO32(drr_object.drr_blksz);
@@ -1191,6 +1295,14 @@ backup_byteswap(dmu_replay_record_t *drr)
DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
DO64(drr_write_byref.drr_key.ddk_prop);
break;
+ case DRR_WRITE_EMBEDDED:
+ DO64(drr_write_embedded.drr_object);
+ DO64(drr_write_embedded.drr_offset);
+ DO64(drr_write_embedded.drr_length);
+ DO64(drr_write_embedded.drr_toguid);
+ DO32(drr_write_embedded.drr_lsize);
+ DO32(drr_write_embedded.drr_psize);
+ break;
case DRR_FREE:
DO64(drr_free.drr_object);
DO64(drr_free.drr_offset);
@@ -1380,7 +1492,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
int err;
guid_map_entry_t gmesrch;
guid_map_entry_t *gmep;
- avl_index_t where;
+ avl_index_t where;
objset_t *ref_os = NULL;
dmu_buf_t *dbp;
@@ -1405,7 +1517,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
- if (err)
+ if (err != 0)
return (err);
tx = dmu_tx_create(os);
@@ -1425,6 +1537,48 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
}
static int
+restore_write_embedded(struct restorearg *ra, objset_t *os,
+ struct drr_write_embedded *drrwnp)
+{
+ dmu_tx_t *tx;
+ int err;
+ void *data;
+
+ if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
+ return (EINVAL);
+
+ if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE)
+ return (EINVAL);
+
+ if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES)
+ return (EINVAL);
+ if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
+ return (EINVAL);
+
+ data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8));
+ if (data == NULL)
+ return (ra->err);
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_write(tx, drrwnp->drr_object,
+ drrwnp->drr_offset, drrwnp->drr_length);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ dmu_write_embedded(os, drrwnp->drr_object,
+ drrwnp->drr_offset, data, drrwnp->drr_etype,
+ drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
+ ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{
dmu_tx_t *tx;
@@ -1618,6 +1772,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
ra.err = restore_write_byref(&ra, os, &drrwbr);
break;
}
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded drrwe =
+ drr->drr_u.drr_write_embedded;
+ ra.err = restore_write_embedded(&ra, os, &drrwe);
+ break;
+ }
case DRR_FREE:
{
struct drr_free drrf = drr->drr_u.drr_free;
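Whether a given embedded BP may be sent as a DRR_WRITE_EMBEDDED record
(backup_do_embed() above) comes down to two gates: the embed type must be
enabled in the stream's feature flags, and a non-legacy compression
function (LZ4) additionally requires its own flag. A stand-alone
restatement of that logic — the bit values are illustrative, not the real
DMU_BACKUP_FEATURE_* values:

#include <stdint.h>
#include <stdio.h>

#define	F_EMBED_DATA		(1ULL << 0)	/* illustrative bits */
#define	F_EMBED_DATA_LZ4	(1ULL << 1)

typedef struct {
	int	embedded;	/* BP_IS_EMBEDDED(bp) */
	int	legacy_comp;	/* comp < ZIO_COMPRESS_LEGACY_FUNCTIONS */
	int	etype_data;	/* BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA */
} bp_view_t;

static int
do_embed(const bp_view_t *bp, uint64_t featureflags)
{
	if (!bp->embedded)
		return (0);
	if (!bp->legacy_comp && !(featureflags & F_EMBED_DATA_LZ4))
		return (0);
	if (bp->etype_data && (featureflags & F_EMBED_DATA))
		return (1);
	return (0);
}

int
main(void)
{
	bp_view_t bp = { .embedded = 1, .legacy_comp = 0, .etype_data = 1 };

	(void) printf("%d\n", do_embed(&bp, F_EMBED_DATA));	/* 0 */
	(void) printf("%d\n",
	    do_embed(&bp, F_EMBED_DATA | F_EMBED_DATA_LZ4));	/* 1 */
	return (0);
}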
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index 8c44e1771..e086e2487 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -463,7 +463,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (pfd->pd_cancel)
return (SET_ERROR(EINTR));
- if (BP_IS_HOLE(bp) ||
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
!((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 5aee10409..436816497 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1814,8 +1814,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
*offset = *offset >> span;
for (i = BF64_GET(*offset, 0, epbs);
i >= 0 && i < epb; i += inc) {
- if (bp[i].blk_fill >= minfill &&
- bp[i].blk_fill <= maxfill &&
+ if (BP_GET_FILL(&bp[i]) >= minfill &&
+ BP_GET_FILL(&bp[i]) <= maxfill &&
(hole || bp[i].blk_birth > txg))
break;
if (inc > 0 || *offset > 0)
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 238920060..676578859 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -237,8 +237,6 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
}
#endif
-#define ALL -1
-
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
dmu_tx_t *tx)
@@ -601,11 +599,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnp->dn_bonustype = dn->dn_bonustype;
dnp->dn_bonuslen = dn->dn_bonuslen;
}
-
ASSERT(dnp->dn_nlevels > 1 ||
BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT(dnp->dn_nlevels < 2 ||
+ BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
if (dn->dn_next_type[txgoff] != 0) {
dnp->dn_type = dn->dn_type;
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 2fe6b858d..e23dd6b06 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -1525,7 +1525,7 @@ dsl_dataset_space(dsl_dataset_t *ds,
else
*availbytesp = 0;
}
- *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
+ *usedobjsp = BP_GET_FILL(&ds->ds_phys->ds_bp);
*availobjsp = DN_MAX_OBJECT - *usedobjsp;
}
diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c
index 113b7261b..dc49fd558 100644
--- a/module/zfs/dsl_destroy.c
+++ b/module/zfs/dsl_destroy.c
@@ -546,7 +546,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
struct killarg *ka = arg;
dmu_tx_t *tx = ka->tx;
- if (BP_IS_HOLE(bp))
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
if (zb->zb_level == ZB_ZIL_LEVEL) {
@@ -596,6 +596,7 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
uint64_t count;
objset_t *mos;
+ ASSERT(!dsl_dataset_is_snapshot(ds));
if (dsl_dataset_is_snapshot(ds))
return (SET_ERROR(EINVAL));
@@ -708,7 +709,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
ds->ds_prev->ds_phys->ds_num_children == 2 &&
ds->ds_prev->ds_userrefs == 0);
- /* Remove our reservation */
+ /* Remove our reservation. */
if (ds->ds_reserved != 0) {
dsl_dataset_set_refreservation_sync_impl(ds,
(ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index bf06b51ee..f03f1f54b 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -1515,6 +1515,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
}
if (err == ERESTART)
return;
+ /* finished; verify that space accounting went to zero */
+ ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes);
+ ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes);
+ ASSERT0(dp->dp_free_dir->dd_phys->dd_uncompressed_bytes);
}
if (scn->scn_phys.scn_state != DSS_SCANNING)
@@ -1700,6 +1704,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
count_block(dp->dp_blkstats, bp);
+ if (BP_IS_EMBEDDED(bp))
+ return (0);
+
ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
zio_flags |= ZIO_FLAG_SCRUB;
diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c
index e24ed6444..93fd5d9e1 100644
--- a/module/zfs/dsl_userhold.c
+++ b/module/zfs/dsl_userhold.c
@@ -610,8 +610,7 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
KM_PUSHPAGE));
error = dsl_sync_task(pool, dsl_dataset_user_release_check,
- dsl_dataset_user_release_sync, &ddura,
- fnvlist_num_pairs(holds));
+ dsl_dataset_user_release_sync, &ddura, 0);
fnvlist_free(ddura.ddura_todelete);
fnvlist_free(ddura.ddura_chkholds);
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 2dfdafb3a..9c09837d5 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -2236,6 +2236,7 @@ metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
vdev_t *vd;
ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(psize > 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
@@ -2259,6 +2260,7 @@ metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
vdev_t *vd;
ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(psize > 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 5bf59315a..b9fa45f82 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1872,7 +1872,7 @@ static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
- if (!BP_IS_HOLE(bp)) {
+ if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
zio_t *rio = arg;
size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size);
@@ -2423,9 +2423,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
- &spa->spa_feat_enabled_txg_obj) != 0) {
+ &spa->spa_feat_enabled_txg_obj) != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
}
spa->spa_is_initializing = B_TRUE;
@@ -5333,11 +5332,6 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
ASSERT(!locked);
ASSERT(vd == vd->vdev_top);
- /*
- * XXX - Once we have bp-rewrite this should
- * become the common case.
- */
-
mg = vd->vdev_mg;
/*
@@ -6487,7 +6481,7 @@ spa_upgrade(spa_t *spa, uint64_t version)
* possible.
*/
ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
- ASSERT(version >= spa->spa_uberblock.ub_version);
+ ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
spa->spa_uberblock.ub_version = version;
vdev_config_dirty(spa->spa_root_vdev);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 02ccb13a2..1bed90027 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1293,7 +1293,10 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
sizeof (type));
}
- checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+ if (!BP_IS_EMBEDDED(bp)) {
+ checksum =
+ zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+ }
compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
}
@@ -1588,7 +1591,7 @@ bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
uint64_t dsize = 0;
int d;
- for (d = 0; d < SPA_DVAS_PER_BP; d++)
+ for (d = 0; d < BP_GET_NDVAS(bp); d++)
dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
return (dsize);
@@ -1602,7 +1605,7 @@ bp_get_dsize(spa_t *spa, const blkptr_t *bp)
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- for (d = 0; d < SPA_DVAS_PER_BP; d++)
+ for (d = 0; d < BP_GET_NDVAS(bp); d++)
dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
spa_config_exit(spa, SCL_VDEV, FTAG);
diff --git a/module/zfs/zfeature_common.c b/module/zfs/zfeature_common.c
index 2ea220245..f99e5349e 100644
--- a/module/zfs/zfeature_common.c
+++ b/module/zfs/zfeature_common.c
@@ -213,4 +213,9 @@ zpool_feature_init(void)
"\"zfs bookmark\" command",
B_TRUE, B_FALSE, B_FALSE, bookmarks_deps);
}
+
+ zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
+ "com.delphix:embedded_data", "embedded_data",
+ "Blocks which compress very well use even less space.",
+ B_FALSE, B_TRUE, B_TRUE, NULL);
}
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 409d2c737..491ba492c 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -4238,6 +4238,7 @@ out:
* zc_fromobj objsetid of incremental fromsnap (may be zero)
* zc_guid if set, estimate size of stream only. zc_cookie is ignored.
* output size in zc_objset_type.
+ * zc_flags if =1, WRITE_EMBEDDED records are permitted
*
* outputs:
* zc_objset_type estimated size, if zc_guid is set
@@ -4248,6 +4249,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
int error;
offset_t off;
boolean_t estimate = (zc->zc_guid != 0);
+ boolean_t embedok = (zc->zc_flags & 0x1);
if (zc->zc_obj != 0) {
dsl_pool_t *dp;
@@ -4308,7 +4310,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
off = fp->f_offset;
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
- zc->zc_fromobj, zc->zc_cookie, fp->f_vnode, &off);
+ zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
@@ -5174,6 +5176,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
* innvl: {
* "fd" -> file descriptor to write stream to (int32)
* (optional) "fromsnap" -> full snap name to send an incremental from
+ * (optional) "embedok" -> (value ignored)
+ * presence indicates DRR_WRITE_EMBEDDED records are permitted
* }
*
* outnvl is unused
@@ -5187,6 +5191,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
char *fromname = NULL;
int fd;
file_t *fp;
+ boolean_t embedok;
error = nvlist_lookup_int32(innvl, "fd", &fd);
if (error != 0)
@@ -5194,11 +5199,13 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
+ embedok = nvlist_exists(innvl, "embedok");
+
if ((fp = getf(fd)) == NULL)
return (SET_ERROR(EBADF));
off = fp->f_offset;
- error = dmu_send(snapname, fromname, fd, fp->f_vnode, &off);
+ error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
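On the new-style ioctl path above, the kernel only checks
nvlist_exists(innvl, "embedok") and ignores the value, so a userland
caller just adds a boolean nvpair. A sketch of how that innvl might be
assembled with libnvpair (the exact userland plumbing, via libzfs_core,
is outside this diff):

#include <libnvpair.h>
#include <stddef.h>

/*
 * Build the innvl for the new-style send ioctl as documented above;
 * returns NULL on allocation failure. The presence of "embedok" is
 * all that the kernel-side zfs_ioc_send_new() looks for.
 */
static nvlist_t *
build_send_args(int fd, const char *fromsnap, boolean_t embedok)
{
	nvlist_t *args;

	if (nvlist_alloc(&args, NV_UNIQUE_NAME, 0) != 0)
		return (NULL);
	if (nvlist_add_int32(args, "fd", fd) != 0)
		goto fail;
	if (fromsnap != NULL &&
	    nvlist_add_string(args, "fromsnap", fromsnap) != 0)
		goto fail;
	if (embedok && nvlist_add_boolean(args, "embedok") != 0)
		goto fail;
	return (args);
fail:
	nvlist_free(args);
	return (NULL);
}

int
main(void)
{
	nvlist_t *args = build_send_args(1, NULL, B_TRUE);

	nvlist_free(args);
	return (args == NULL);
}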
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index b5ee395d1..7c3c6592b 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -159,10 +159,15 @@ int
zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
avl_tree_t *t = &zilog->zl_bp_tree;
- const dva_t *dva = BP_IDENTITY(bp);
+ const dva_t *dva;
zil_bp_node_t *zn;
avl_index_t where;
+ if (BP_IS_EMBEDDED(bp))
+ return (0);
+
+ dva = BP_IDENTITY(bp);
+
if (avl_find(t, dva, &where) != NULL)
return (SET_ERROR(EEXIST));
@@ -863,7 +868,7 @@ zil_lwb_write_done(zio_t *zio)
ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
ASSERT(!BP_IS_GANG(zio->io_bp));
ASSERT(!BP_IS_HOLE(zio->io_bp));
- ASSERT(zio->io_bp->blk_fill == 0);
+ ASSERT(BP_GET_FILL(zio->io_bp) == 0);
/*
* Ensure the lwb buffer pointer is cleared before releasing
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 6352ab3a3..ad97ef5db 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -36,6 +36,7 @@
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
+#include <sys/blkptr.h>
#include <sys/zfeature.h>
/*
@@ -243,7 +244,7 @@ zio_buf_alloc(size_t size)
{
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
- ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+ ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE | KM_NODEBUG));
}
@@ -711,6 +712,16 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_physdone = physdone;
zio->io_prop = *zp;
+ /*
+ * Data can be NULL if we are going to call zio_write_override() to
+ * provide the already-allocated BP. But we may need the data to
+ * verify a dedup hit (if requested). In this case, don't try to
+ * dedup (just take the already-allocated BP verbatim).
+ */
+ if (data == NULL && zio->io_prop.zp_dedup_verify) {
+ zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
+ }
+
return (zio);
}
@@ -750,6 +761,14 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
+
+ /*
+ * The check for EMBEDDED is a performance optimization. We
+ * process the free here (by ignoring it) rather than
+ * putting it on the list and then processing it in zio_free_sync().
+ */
+ if (BP_IS_EMBEDDED(bp))
+ return;
metaslab_check_free(spa, bp);
/*
@@ -774,13 +793,13 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio_t *zio;
enum zio_stage stage = ZIO_FREE_PIPELINE;
- dprintf_bp(bp, "freeing in txg %llu, pass %u",
- (longlong_t)txg, spa->spa_sync_pass);
-
ASSERT(!BP_IS_HOLE(bp));
ASSERT(spa_syncing_txg(spa) == txg);
ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
+ if (BP_IS_EMBEDDED(bp))
+ return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
metaslab_check_free(spa, bp);
arc_freed(spa, bp);
@@ -805,6 +824,11 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
{
zio_t *zio;
+ dprintf_bp(bp, "claiming in txg %llu", txg);
+
+ if (BP_IS_EMBEDDED(bp))
+ return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
/*
* A claim is an allocation of a specific block. Claims are needed
* to support immediate writes in the intent log. The issue is that
@@ -1011,12 +1035,20 @@ zio_read_bp_init(zio_t *zio)
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL &&
!(zio->io_flags & ZIO_FLAG_RAW)) {
- uint64_t psize = BP_GET_PSIZE(bp);
+ uint64_t psize =
+ BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
void *cbuf = zio_buf_alloc(psize);
zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
}
+ if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ decode_embedded_bp_compressed(bp, zio->io_data);
+ } else {
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ }
+
if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
@@ -1060,6 +1092,9 @@ zio_write_bp_init(zio_t *zio)
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ if (BP_IS_EMBEDDED(bp))
+ return (ZIO_PIPELINE_CONTINUE);
+
/*
* If we've been overridden and nopwrite is set then
* set the flag accordingly to indicate that a nopwrite
@@ -1108,7 +1143,7 @@ zio_write_bp_init(zio_t *zio)
compress = ZIO_COMPRESS_OFF;
/* Make sure someone doesn't change their mind on overwrites */
- ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
+ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
spa_max_replication(spa)) == BP_GET_NDVAS(bp));
}
@@ -1118,9 +1153,38 @@ zio_write_bp_init(zio_t *zio)
if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
+ } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
+ zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
+ encode_embedded_bp_compressed(bp,
+ cbuf, compress, lsize, psize);
+ BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
+ BP_SET_TYPE(bp, zio->io_prop.zp_type);
+ BP_SET_LEVEL(bp, zio->io_prop.zp_level);
+ zio_buf_free(cbuf, lsize);
+ bp->blk_birth = zio->io_txg;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ ASSERT(spa_feature_is_active(spa,
+ SPA_FEATURE_EMBEDDED_DATA));
+ return (ZIO_PIPELINE_CONTINUE);
} else {
- ASSERT(psize < lsize);
- zio_push_transform(zio, cbuf, psize, lsize, NULL);
+ /*
+ * Round up compressed size to MINBLOCKSIZE and
+ * zero the tail.
+ */
+ size_t rounded =
+ P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
+ if (rounded > psize) {
+ bzero((char *)cbuf + psize, rounded - psize);
+ psize = rounded;
+ }
+ if (psize == lsize) {
+ compress = ZIO_COMPRESS_OFF;
+ zio_buf_free(cbuf, lsize);
+ } else {
+ zio_push_transform(zio, cbuf,
+ psize, lsize, NULL);
+ }
}
}
@@ -2873,7 +2937,7 @@ zio_checksum_verified(zio_t *zio)
/*
* ==========================================================================
* Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
- * An error of 0 indictes success. ENXIO indicates whole-device failure,
+ * An error of 0 indicates success. ENXIO indicates whole-device failure,
* which may be transient (e.g. unplugged) or permament. ECKSUM and EIO
* indicate errors that are specific to one I/O, and most likely permanent.
* Any other error is presumed to be worse because we weren't expecting it.
@@ -2979,7 +3043,7 @@ zio_done(zio_t *zio)
for (w = 0; w < ZIO_WAIT_TYPES; w++)
ASSERT(zio->io_children[c][w] == 0);
- if (zio->io_bp != NULL) {
+ if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
ASSERT(zio->io_bp->blk_pad[0] == 0);
ASSERT(zio->io_bp->blk_pad[1] == 0);
ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
@@ -3216,7 +3280,8 @@ zio_done(zio_t *zio)
}
if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
- !BP_IS_HOLE(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
+ !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
+ !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
}
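Putting the zio_write_bp_init() pieces above together, the write path now
classifies a compressed candidate in this order: nothing saved, write
uncompressed; small enough, level 0, no fill, no dedup, feature enabled,
embed; otherwise round up to SPA_MINBLOCKSIZE (padding moved here from
zio_compress_data()) and, if rounding ate the whole gain, fall back to
uncompressed. A simplified stand-alone restatement of that ordering:

#include <stddef.h>
#include <stdio.h>

#define	BPE_PAYLOAD_SIZE	112
#define	SPA_MINBLOCKSIZE	512

typedef enum {
	WR_UNCOMPRESSED,	/* compression bought nothing */
	WR_EMBEDDED,		/* payload lives in the BP itself */
	WR_COMPRESSED		/* normal allocated, compressed write */
} write_kind_t;

static write_kind_t
classify(size_t psize, size_t lsize, int dedup, int level, int has_fill,
    int embed_enabled)
{
	if (psize == 0 || psize == lsize)
		return (WR_UNCOMPRESSED);
	if (!dedup && psize <= BPE_PAYLOAD_SIZE && level == 0 &&
	    !has_fill && embed_enabled)
		return (WR_EMBEDDED);
	/* round up to the minimum block; the tail would be zeroed */
	psize = (psize + SPA_MINBLOCKSIZE - 1) &
	    ~((size_t)SPA_MINBLOCKSIZE - 1);
	return (psize == lsize ? WR_UNCOMPRESSED : WR_COMPRESSED);
}

int
main(void)
{
	/* 100 bytes out of 4096, level-0 file data: embeds */
	(void) printf("%d\n", classify(100, 4096, 0, 0, 0, 1));
	/* 400 bytes out of 512: rounds back up to 512, no gain */
	(void) printf("%d\n", classify(400, 512, 0, 0, 0, 0));
	return (0);
}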
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index bc7331764..3a5c73a6a 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -126,7 +126,7 @@ zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
{
- dva_t *dva = BP_IDENTITY(bp);
+ const dva_t *dva = BP_IDENTITY(bp);
uint64_t txg = BP_PHYSICAL_BIRTH(bp);
ASSERT(BP_IS_GANG(bp));
diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c
index 5b63f0aa0..074462349 100644
--- a/module/zfs/zio_compress.c
+++ b/module/zfs/zio_compress.c
@@ -80,7 +80,7 @@ size_t
zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
{
uint64_t *word, *word_end;
- size_t c_len, d_len, r_len;
+ size_t c_len, d_len;
zio_compress_info_t *ci = &zio_compress_table[c];
ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
@@ -102,28 +102,13 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
return (s_len);
/* Compress at least 12.5% */
- d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE);
- if (d_len == 0)
- return (s_len);
-
+ d_len = s_len - (s_len >> 3);
c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
if (c_len > d_len)
return (s_len);
- /*
- * Cool. We compressed at least as much as we were hoping to.
- * For both security and repeatability, pad out the last sector.
- */
- r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE);
- if (r_len > c_len) {
- bzero((char *)dst + c_len, r_len - c_len);
- c_len = r_len;
- }
-
ASSERT3U(c_len, <=, d_len);
- ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0);
-
return (c_len);
}
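With the P2ALIGN gone, the 12.5% target is now exact — d_len = s_len -
(s_len >> 3) — and padding is the caller's job. One consequence worth
spelling out: a 512-byte block used to get d_len = P2ALIGN(448, 512) = 0
and was never compressed at all; now it gets d_len = 448, which is what
lets tiny blocks compress far enough to embed. A quick check of the
arithmetic:

#include <stdio.h>

#define	SPA_MINBLOCKSIZE 512

int
main(void)
{
	size_t sizes[] = { 512, 4096, 131072 };
	int i;

	for (i = 0; i < 3; i++) {
		size_t s_len = sizes[i];
		size_t new_d_len = s_len - (s_len >> 3);
		size_t old_d_len = new_d_len &
		    ~((size_t)SPA_MINBLOCKSIZE - 1);	/* old P2ALIGN */

		(void) printf("s_len %6zu: old d_len %6zu, new d_len %6zu\n",
		    s_len, old_d_len, new_d_len);
	}
	return (0);
}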