Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/ddt.c        328
-rw-r--r--  module/zfs/ddt_stats.c   20
-rw-r--r--  module/zfs/ddt_zap.c      6
-rw-r--r--  module/zfs/dsl_scan.c    14
-rw-r--r--  module/zfs/zio.c        380
5 files changed, 578 insertions, 170 deletions
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 213e04239..59526394b 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -75,12 +75,19 @@
* fill the BP with the DVAs from the entry, increment the refcount and cause
* the write IO to return immediately.
*
- * Each ddt_phys_t slot in the entry represents a separate dedup block for the
- * same content/checksum. The slot is selected based on the zp_copies parameter
- * the block is written with, that is, the number of DVAs in the block. The
- * "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto"
- * feature. These are no longer written, and will be freed if encountered on
- * old pools.
+ * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup
+ * block for the same content/checksum. The slot is selected based on the
+ * zp_copies parameter the block is written with, that is, the number of DVAs
+ * in the block. The "ditto" slot (DDT_PHYS_DITTO) was used by the
+ * now-removed "dedupditto" feature. These are no longer written, and will be
+ * freed if encountered on old pools.
+ *
+ * If the "fast_dedup" feature is enabled, new dedup tables will be created
+ * with the "flat phys" option. In this mode, there is only one ddt_phys_t
+ * slot. If a write is issued for an entry that exists, but the entry has
+ * fewer DVAs than the write requires, then only enough new DVAs to make up
+ * the shortfall are allocated and written. The existing entry is then
+ * extended (ddt_phys_extend()) with the new DVAs.
*
* ## Lifetime of an entry
*
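For orientation, the union the changes below index into presumably looks something like the following, inferred from the accessors used in this patch (ddp_flat, ddp_trad[], ddp_class_start); the authoritative definition lives in the DDT headers:

    typedef union {
            struct {        /* traditional: one slot per copies= class */
                    dva_t           ddp_dva[SPA_DVAS_PER_BP];
                    uint64_t        ddp_refcnt;
                    uint64_t        ddp_phys_birth;
            } ddp_trad[DDT_PHYS_MAX];
            struct {        /* DDT_FLAG_FLAT: a single slot */
                    dva_t           ddp_dva[SPA_DVAS_PER_BP];
                    uint64_t        ddp_refcnt;
                    uint64_t        ddp_phys_birth;
                    uint64_t        ddp_class_start;
            } ddp_flat;
    } ddt_univ_phys_t;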
@@ -130,6 +137,16 @@
* from the alternate block. If the block is actually damaged, this will invoke
* the pool's "self-healing" mechanism, and repair the block.
*
+ * If the "fast_dedup" feature is enabled, the "flat phys" option will be in
+ * use, so there is only ever one ddt_phys_t slot. The repair process will
+ * still happen in this case, though it is unlikely to succeed as there will
+ * usually be no other equivalent blocks to fall back on (though there might
+ * be, if this was an early version of a dedup'd block that has since been
+ * extended).
+ *
+ * Note that this repair mechanism is in addition to and separate from the
+ * regular OpenZFS scrub and self-healing mechanisms.
+ *
* ## Scanning (scrub/resilver)
*
* If dedup is active, the scrub machinery will walk the dedup table first, and
@@ -162,10 +179,15 @@
c == ZIO_CHECKSUM_BLAKE3)
static kmem_cache_t *ddt_cache;
-static kmem_cache_t *ddt_entry_cache;
-#define DDT_ENTRY_SIZE \
- (sizeof (ddt_entry_t) + sizeof (ddt_phys_t) * DDT_PHYS_MAX)
+static kmem_cache_t *ddt_entry_flat_cache;
+static kmem_cache_t *ddt_entry_trad_cache;
+
+#define DDT_ENTRY_FLAT_SIZE (sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE)
+#define DDT_ENTRY_TRAD_SIZE (sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE)
+
+#define DDT_ENTRY_SIZE(ddt) \
+ _DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE)
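DDT_ENTRY_SIZE() evidently selects between the two entry sizes based on the table's flags. A plausible shape for the helper, consistent with its use here and with the DDT_PHYS_SIZE() calls below (a sketch, not necessarily the header's exact text):

    #define _DDT_PHYS_SWITCH(ddt, flat, trad) \
            (((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))

    /* per-entry phys size, as used by the object ops below */
    #define DDT_PHYS_SIZE(ddt) \
            _DDT_PHYS_SWITCH(ddt, DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE)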
/*
* Enable/disable prefetching of dedup-ed blocks which are going to be freed.
@@ -195,7 +217,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = {
*/
static const uint64_t ddt_version_flags[] = {
[DDT_VERSION_LEGACY] = 0,
- [DDT_VERSION_FDT] = 0,
+ [DDT_VERSION_FDT] = DDT_FLAG_FLAT,
};
/* Dummy version to signal that configure is still necessary */
@@ -346,7 +368,7 @@ ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
ddt->ddt_object[type][class], &dde->dde_key,
- dde->dde_phys, sizeof (ddt_phys_t) * DDT_NPHYS(ddt)));
+ dde->dde_phys, DDT_PHYS_SIZE(ddt)));
}
static int
@@ -388,8 +410,8 @@ ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
ASSERT(ddt_object_exists(ddt, type, class));
return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
- ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys,
- sizeof (ddt_phys_t) * DDT_NPHYS(ddt), tx));
+ ddt->ddt_object[type][class], &dde->dde_key,
+ dde->dde_phys, DDT_PHYS_SIZE(ddt), tx));
}
static int
@@ -410,11 +432,10 @@ ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key,
- ddlwe->ddlwe_phys, sizeof (ddlwe->ddlwe_phys));
+ &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
if (error == 0) {
ddlwe->ddlwe_type = type;
ddlwe->ddlwe_class = class;
- ddlwe->ddlwe_nphys = DDT_NPHYS(ddt);
return (0);
}
return (error);
@@ -451,13 +472,25 @@ ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
}
void
-ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
+ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
+ blkptr_t *bp, uint64_t txg)
{
ASSERT3U(txg, !=, 0);
+ ASSERT3U(v, <, DDT_PHYS_NONE);
+ uint64_t phys_birth;
+ const dva_t *dvap;
+
+ if (v == DDT_PHYS_FLAT) {
+ phys_birth = ddp->ddp_flat.ddp_phys_birth;
+ dvap = ddp->ddp_flat.ddp_dva;
+ } else {
+ phys_birth = ddp->ddp_trad[v].ddp_phys_birth;
+ dvap = ddp->ddp_trad[v].ddp_dva;
+ }
for (int d = 0; d < SPA_DVAS_PER_BP; d++)
- bp->blk_dva[d] = ddp->ddp_dva[d];
- BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
+ bp->blk_dva[d] = dvap[d];
+ BP_SET_BIRTH(bp, txg, phys_birth);
}
/*
@@ -465,13 +498,13 @@ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
* will be missing the salt / IV required to do a full decrypting read.
*/
void
-ddt_bp_create(enum zio_checksum checksum,
- const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
+ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
+ const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp)
{
BP_ZERO(bp);
if (ddp != NULL)
- ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
+ ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v));
bp->blk_cksum = ddk->ddk_cksum;
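The ddt_phys_variant_t threaded through these functions is, judging by the comparisons against DDT_PHYS_NONE and the DDT_PHYS_VARIANT(ddt, p) lookups later in the patch, roughly the following (a sketch; the real definitions live in the headers):

    typedef enum {
            DDT_PHYS_DITTO = 0,     /* legacy ditto slot, no longer written */
            DDT_PHYS_SINGLE = 1,    /* traditional slots, indexed by copies= */
            DDT_PHYS_DOUBLE = 2,
            DDT_PHYS_TRIPLE = 3,
            DDT_PHYS_FLAT = 4,      /* the one flat slot */
            DDT_PHYS_NONE = 5       /* "no slot", e.g. lookup failure */
    } ddt_phys_variant_t;

    /* map a slot index to the right variant for this table's layout */
    #define DDT_PHYS_VARIANT(ddt, p) \
            (((ddt)->ddt_flags & DDT_FLAG_FLAT) ? \
                DDT_PHYS_FLAT : (ddt_phys_variant_t)(p))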
@@ -502,42 +535,101 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
}
void
-ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
+ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp)
{
- ASSERT0(ddp->ddp_phys_birth);
+ ASSERT3U(v, <, DDT_PHYS_NONE);
+ int bp_ndvas = BP_GET_NDVAS(bp);
+ int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ?
+ SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
+ dva_t *dvas = (v == DDT_PHYS_FLAT) ?
+ ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
+
+ int s = 0, d = 0;
+ while (s < bp_ndvas && d < ddp_max_dvas) {
+ if (DVA_IS_VALID(&dvas[d])) {
+ d++;
+ continue;
+ }
+ dvas[d] = bp->blk_dva[s];
+ s++; d++;
+ }
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
- ddp->ddp_dva[d] = bp->blk_dva[d];
- ddp->ddp_phys_birth = BP_GET_BIRTH(bp);
+ /*
+ * If the caller offered us more DVAs than we can fit, something has
+ * gone wrong in their accounting. zio_ddt_write() should never ask for
+ * more than we need.
+ */
+ ASSERT3U(s, ==, bp_ndvas);
+
+ if (BP_IS_ENCRYPTED(bp))
+ dvas[2] = bp->blk_dva[2];
+
+ if (ddt_phys_birth(ddp, v) == 0) {
+ if (v == DDT_PHYS_FLAT)
+ ddp->ddp_flat.ddp_phys_birth = BP_GET_BIRTH(bp);
+ else
+ ddp->ddp_trad[v].ddp_phys_birth = BP_GET_BIRTH(bp);
+ }
}
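A worked example of the fill loop in ddt_phys_extend(), assuming an unencrypted entry that already holds one DVA and a BP carrying two newly allocated ones:

    /*
     * before:  entry dvas = { A, -, - }      bp dvas = { B, C }
     *
     * s=0,d=0: dvas[0] == A is valid, skip     (d -> 1)
     * s=0,d=1: dvas[1] is empty, take B        (s -> 1, d -> 2)
     * s=1,d=2: dvas[2] is empty, take C        (s -> 2, d -> 3)
     *
     * after:   entry dvas = { A, B, C }. s == bp_ndvas, so the accounting
     * assertion holds, and ddp_phys_birth is untouched because the entry
     * already had a birth txg.
     */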
void
-ddt_phys_clear(ddt_phys_t *ddp)
+ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
+ ddt_phys_variant_t v)
{
- memset(ddp, 0, sizeof (*ddp));
+ ASSERT3U(v, <, DDT_PHYS_NONE);
+
+ if (v == DDT_PHYS_FLAT)
+ dst->ddp_flat = src->ddp_flat;
+ else
+ dst->ddp_trad[v] = src->ddp_trad[v];
}
void
-ddt_phys_addref(ddt_phys_t *ddp)
+ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
{
- ddp->ddp_refcnt++;
+ ASSERT3U(v, <, DDT_PHYS_NONE);
+
+ if (v == DDT_PHYS_FLAT)
+ memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE);
+ else
+ memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX);
}
void
-ddt_phys_decref(ddt_phys_t *ddp)
+ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
{
- if (ddp) {
- ASSERT3U(ddp->ddp_refcnt, >, 0);
- ddp->ddp_refcnt--;
- }
+ ASSERT3U(v, <, DDT_PHYS_NONE);
+
+ if (v == DDT_PHYS_FLAT)
+ ddp->ddp_flat.ddp_refcnt++;
+ else
+ ddp->ddp_trad[v].ddp_refcnt++;
+}
+
+uint64_t
+ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
+{
+ ASSERT3U(v, <, DDT_PHYS_NONE);
+
+ uint64_t *refcntp;
+
+ if (v == DDT_PHYS_FLAT)
+ refcntp = &ddp->ddp_flat.ddp_refcnt;
+ else
+ refcntp = &ddp->ddp_trad[v].ddp_refcnt;
+
+ ASSERT3U(*refcntp, >, 0);
+ (*refcntp)--;
+ return (*refcntp);
}
static void
-ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
+ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp,
+ ddt_phys_variant_t v, uint64_t txg)
{
blkptr_t blk;
- ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
/*
* We clear the dedup bit so that zio_free() will actually free the
@@ -545,20 +637,67 @@ ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
*/
BP_SET_DEDUP(&blk, 0);
- ddt_phys_clear(ddp);
+ ddt_phys_clear(ddp, v);
zio_free(ddt->ddt_spa, txg, &blk);
}
-ddt_phys_t *
+uint64_t
+ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
+{
+ ASSERT3U(v, <, DDT_PHYS_NONE);
+
+ if (v == DDT_PHYS_FLAT)
+ return (ddp->ddp_flat.ddp_phys_birth);
+ else
+ return (ddp->ddp_trad[v].ddp_phys_birth);
+}
+
+int
+ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
+ boolean_t encrypted)
+{
+ ASSERT3U(v, <, DDT_PHYS_NONE);
+
+ const dva_t *dvas = (v == DDT_PHYS_FLAT) ?
+ ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva;
+
+ return (DVA_IS_VALID(&dvas[0]) +
+ DVA_IS_VALID(&dvas[1]) +
+ DVA_IS_VALID(&dvas[2]) * !encrypted);
+}
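The "* !encrypted" term is needed because encrypted blocks keep their salt/IV in the third DVA slot rather than a data copy, so it must not be counted. An illustration with hypothetical values:

    /*
     * entry dvas = { A, B, <salt/IV> }        (an encrypted dedup block)
     *
     * ddt_phys_dva_count(ddp, v, B_TRUE)  == 2   (slot 2 ignored)
     * ddt_phys_dva_count(ddp, v, B_FALSE) == 3   (plain block using all
     *                                             three slots)
     */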
+
+ddt_phys_variant_t
ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp)
{
- for (int p = 0; p < DDT_NPHYS(ddt); p++) {
- ddt_phys_t *ddp = (ddt_phys_t *)&dde->dde_phys[p];
- if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
- BP_GET_BIRTH(bp) == ddp->ddp_phys_birth)
- return (ddp);
+ const ddt_univ_phys_t *ddp = dde->dde_phys;
+
+ if (ddt->ddt_flags & DDT_FLAG_FLAT) {
+ if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) &&
+ BP_GET_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) {
+ return (DDT_PHYS_FLAT);
+ }
+ } else /* traditional phys */ {
+ for (int p = 0; p < DDT_PHYS_MAX; p++) {
+ if (DVA_EQUAL(BP_IDENTITY(bp),
+ &ddp->ddp_trad[p].ddp_dva[0]) &&
+ BP_GET_BIRTH(bp) ==
+ ddp->ddp_trad[p].ddp_phys_birth) {
+ return (p);
+ }
+ }
}
- return (NULL);
+ return (DDT_PHYS_NONE);
+}
+
+uint64_t
+ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
+{
+ ASSERT3U(v, <, DDT_PHYS_NONE);
+
+ if (v == DDT_PHYS_FLAT)
+ return (ddp->ddp_flat.ddp_refcnt);
+ else
+ return (ddp->ddp_trad[v].ddp_refcnt);
}
uint64_t
@@ -566,10 +705,11 @@ ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde)
{
uint64_t refcnt = 0;
- for (int p = 0; p < DDT_NPHYS(ddt); p++) {
- if (DDT_PHYS_IS_DITTO(ddt, p))
- continue;
- refcnt += dde->dde_phys[p].ddp_refcnt;
+ if (ddt->ddt_flags & DDT_FLAG_FLAT) {
+ refcnt = dde->dde_phys->ddp_flat.ddp_refcnt;
+ } else {
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+ refcnt += dde->dde_phys->ddp_trad[p].ddp_refcnt;
}
return (refcnt);
@@ -599,24 +739,33 @@ ddt_init(void)
{
ddt_cache = kmem_cache_create("ddt_cache",
sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
- ddt_entry_cache = kmem_cache_create("ddt_entry_cache",
- DDT_ENTRY_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
+ ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache",
+ DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
+ ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache",
+ DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
}
void
ddt_fini(void)
{
- kmem_cache_destroy(ddt_entry_cache);
+ kmem_cache_destroy(ddt_entry_trad_cache);
+ kmem_cache_destroy(ddt_entry_flat_cache);
kmem_cache_destroy(ddt_cache);
}
static ddt_entry_t *
-ddt_alloc(const ddt_key_t *ddk)
+ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk)
{
ddt_entry_t *dde;
- dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
- memset(dde, 0, DDT_ENTRY_SIZE);
+ if (ddt->ddt_flags & DDT_FLAG_FLAT) {
+ dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP);
+ memset(dde, 0, DDT_ENTRY_FLAT_SIZE);
+ } else {
+ dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP);
+ memset(dde, 0, DDT_ENTRY_TRAD_SIZE);
+ }
+
cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
dde->dde_key = *ddk;
@@ -647,7 +796,8 @@ ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
}
cv_destroy(&dde->dde_cv);
- kmem_cache_free(ddt_entry_cache, dde);
+ kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
+ ddt_entry_flat_cache : ddt_entry_trad_cache, dde);
}
void
@@ -793,7 +943,12 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
}
/* Time to make a new entry. */
- dde = ddt_alloc(&search);
+ dde = ddt_alloc(ddt, &search);
+
+ /* Record the time this entry entered this class (used by ddt prune) */
+ if (ddt->ddt_flags & DDT_FLAG_FLAT)
+ dde->dde_phys->ddp_flat.ddp_class_start = gethrestime_sec();
+
avl_insert(&ddt->ddt_tree, dde, where);
/*
@@ -1206,7 +1361,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
ddt_key_fill(&ddk, bp);
- dde = ddt_alloc(&ddk);
+ dde = ddt_alloc(ddt, &ddk);
ddt_alloc_entry_io(dde);
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
@@ -1222,7 +1377,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
}
}
- memset(dde->dde_phys, 0, sizeof (ddt_phys_t) * DDT_NPHYS(ddt));
+ memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt));
return (dde);
}
@@ -1265,13 +1420,26 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
ddt_repair_entry_done, rdde, rio->io_flags);
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
- ddt_phys_t *ddp = &dde->dde_phys[p];
- ddt_phys_t *rddp = &rdde->dde_phys[p];
- if (ddp->ddp_phys_birth == 0 ||
- ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
- memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+ ddt_univ_phys_t *ddp = dde->dde_phys;
+ ddt_univ_phys_t *rddp = rdde->dde_phys;
+ ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+ uint64_t phys_birth = ddt_phys_birth(ddp, v);
+ const dva_t *dvas, *rdvas;
+
+ if (ddt->ddt_flags & DDT_FLAG_FLAT) {
+ dvas = ddp->ddp_flat.ddp_dva;
+ rdvas = rddp->ddp_flat.ddp_dva;
+ } else {
+ dvas = ddp->ddp_trad[p].ddp_dva;
+ rdvas = rddp->ddp_trad[p].ddp_dva;
+ }
+
+ if (phys_birth == 0 ||
+ phys_birth != ddt_phys_birth(rddp, v) ||
+ memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP))
continue;
- ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk),
NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
@@ -1297,7 +1465,8 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio)
rdde_next = AVL_NEXT(t, rdde);
avl_remove(&ddt->ddt_repair_tree, rdde);
ddt_exit(ddt);
- ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
+ ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL,
+ DDT_PHYS_NONE, &blk);
dde = ddt_repair_start(ddt, &blk);
ddt_repair_entry(ddt, dde, rdde, rio);
ddt_repair_done(ddt, dde);
@@ -1322,9 +1491,12 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
ASSERT(dde->dde_io == NULL ||
dde->dde_io->dde_lead_zio[p] == NULL);
- ddt_phys_t *ddp = &dde->dde_phys[p];
- if (ddp->ddp_phys_birth == 0) {
- ASSERT0(ddp->ddp_refcnt);
+ ddt_univ_phys_t *ddp = dde->dde_phys;
+ ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+ uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
+
+ if (ddt_phys_birth(ddp, v) == 0) {
+ ASSERT0(phys_refcnt);
continue;
}
if (DDT_PHYS_IS_DITTO(ddt, p)) {
@@ -1332,12 +1504,12 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
* Note, we no longer create DDT-DITTO blocks, but we
* don't want to leak any written by older software.
*/
- ddt_phys_free(ddt, ddk, ddp, txg);
+ ddt_phys_free(ddt, ddk, ddp, v, txg);
continue;
}
- if (ddp->ddp_refcnt == 0)
- ddt_phys_free(ddt, ddk, ddp, txg);
- total_refcnt += ddp->ddp_refcnt;
+ if (phys_refcnt == 0)
+ ddt_phys_free(ddt, ddk, ddp, v, txg);
+ total_refcnt += phys_refcnt;
}
if (total_refcnt > 1)
@@ -1371,7 +1543,7 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
ddt_lightweight_entry_t ddlwe;
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
dsl_scan_ddt_entry(dp->dp_scan,
- ddt->ddt_checksum, &ddlwe, tx);
+ ddt->ddt_checksum, ddt, &ddlwe, tx);
}
}
}
@@ -1536,12 +1708,10 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
}
if (dde->dde_type < DDT_TYPES) {
- ddt_phys_t *ddp;
-
ASSERT3S(dde->dde_class, <, DDT_CLASSES);
int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
- ddp = &dde->dde_phys[p];
+ ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
/*
* This entry already existed (dde_type is real), so it must
@@ -1553,9 +1723,9 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
* likely further action is required to fill out the DDT entry,
* and this is a place that is likely to be missed in testing.
*/
- ASSERT3U(ddp->ddp_refcnt, >, 0);
+ ASSERT3U(ddt_phys_refcnt(dde->dde_phys, v), >, 0);
- ddt_phys_addref(ddp);
+ ddt_phys_addref(dde->dde_phys, v);
result = B_TRUE;
} else {
/*
diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c
index 5449eca3a..6da77bbca 100644
--- a/module/zfs/ddt_stats.c
+++ b/module/zfs/ddt_stats.c
@@ -43,18 +43,22 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
memset(dds, 0, sizeof (*dds));
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
- ddt_phys_t *ddp = &dde->dde_phys[p];
+ const ddt_univ_phys_t *ddp = dde->dde_phys;
+ ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
- uint64_t dsize = 0;
- uint64_t refcnt = ddp->ddp_refcnt;
-
- if (ddp->ddp_phys_birth == 0)
+ if (ddt_phys_birth(ddp, v) == 0)
continue;
- int ndvas = DDK_GET_CRYPT(&dde->dde_key) ?
- SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
+ int ndvas = ddt_phys_dva_count(ddp, v,
+ DDK_GET_CRYPT(&dde->dde_key));
+ const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
+ ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva;
+
+ uint64_t dsize = 0;
for (int d = 0; d < ndvas; d++)
- dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+ dsize += dva_get_dsize_sync(spa, &dvas[d]);
+
+ uint64_t refcnt = ddt_phys_refcnt(ddp, v);
dds->dds_blocks += 1;
dds->dds_lsize += lsize;
diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c
index 8f1bbeeec..4e01624f3 100644
--- a/module/zfs/ddt_zap.c
+++ b/module/zfs/ddt_zap.c
@@ -109,7 +109,7 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
static int
ddt_zap_lookup(objset_t *os, uint64_t object,
- const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize)
+ const ddt_key_t *ddk, void *phys, size_t psize)
{
uchar_t *cbuf;
uint64_t one, csize;
@@ -156,7 +156,7 @@ ddt_zap_prefetch_all(objset_t *os, uint64_t object)
static int
ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk,
- const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx)
+ const void *phys, size_t psize, dmu_tx_t *tx)
{
const size_t cbuf_size = psize + 1;
@@ -182,7 +182,7 @@ ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk,
static int
ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk,
- ddt_phys_t *phys, size_t psize)
+ void *phys, size_t psize)
{
zap_cursor_t zc;
zap_attribute_t za;
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index dec0eb28d..daf1bd5d6 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -2929,7 +2929,7 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
void
dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
- ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
+ ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
(void) tx;
const ddt_key_t *ddk = &ddlwe->ddlwe_key;
@@ -2953,13 +2953,13 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
if (scn->scn_done_txg != 0)
return;
- for (int p = 0; p < ddlwe->ddlwe_nphys; p++) {
- ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p];
+ for (int p = 0; p < DDT_NPHYS(ddt); p++) {
+ ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+ uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v);
- if (ddp->ddp_phys_birth == 0 ||
- ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
+ if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg)
continue;
- ddt_bp_create(checksum, ddk, ddp, &bp);
+ ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp);
scn->scn_visited_this_txg++;
scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
@@ -3022,7 +3022,7 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
ASSERT(avl_first(&ddt->ddt_tree) == NULL);
- dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &ddlwe, tx);
+ dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx);
n++;
if (dsl_scan_check_suspend(scn, NULL))
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 1ca71c738..1f3acb9b9 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3256,14 +3256,16 @@ zio_ddt_child_read_done(zio_t *zio)
blkptr_t *bp = zio->io_bp;
ddt_t *ddt;
ddt_entry_t *dde = zio->io_private;
- ddt_phys_t *ddp;
zio_t *pio = zio_unique_parent(zio);
mutex_enter(&pio->io_lock);
ddt = ddt_select(zio->io_spa, bp);
- ddp = ddt_phys_select(ddt, dde, bp);
- if (zio->io_error == 0)
- ddt_phys_clear(ddp); /* this ddp doesn't need repair */
+
+ if (zio->io_error == 0) {
+ ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
+ /* this phys variant doesn't need repair */
+ ddt_phys_clear(dde->dde_phys, v);
+ }
if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL)
dde->dde_io->dde_repair_abd = zio->io_abd;
@@ -3284,21 +3286,25 @@ zio_ddt_read_start(zio_t *zio)
if (zio->io_child_error[ZIO_CHILD_DDT]) {
ddt_t *ddt = ddt_select(zio->io_spa, bp);
ddt_entry_t *dde = ddt_repair_start(ddt, bp);
- ddt_phys_t *ddp_self = ddt_phys_select(ddt, dde, bp);
+ ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp);
+ ddt_univ_phys_t *ddp = dde->dde_phys;
blkptr_t blk;
ASSERT(zio->io_vsd == NULL);
zio->io_vsd = dde;
- if (ddp_self == NULL)
+ if (v_self == DDT_PHYS_NONE)
return (zio);
+ /* issue I/O for the other copies */
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
- ddt_phys_t *ddp = &dde->dde_phys[p];
- if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
+ ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+
+ if (ddt_phys_birth(ddp, v) == 0 || v == v_self)
continue;
- ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
- &blk);
+
+ ddt_bp_create(ddt->ddt_checksum, &dde->dde_key,
+ ddp, v, &blk);
zio_nowait(zio_read(zio, zio->io_spa, &blk,
abd_alloc_for_io(zio->io_size, B_TRUE),
zio->io_size, zio_ddt_child_read_done, dde,
@@ -3378,30 +3384,32 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
if (DDT_PHYS_IS_DITTO(ddt, p))
continue;
+ if (dde->dde_io == NULL)
+ continue;
+
zio_t *lio = dde->dde_io->dde_lead_zio[p];
+ if (lio == NULL)
+ continue;
- if (lio != NULL && do_raw) {
+ if (do_raw)
return (lio->io_size != zio->io_size ||
abd_cmp(zio->io_abd, lio->io_abd) != 0);
- } else if (lio != NULL) {
- return (lio->io_orig_size != zio->io_orig_size ||
- abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
- }
+
+ return (lio->io_orig_size != zio->io_orig_size ||
+ abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
}
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
- if (DDT_PHYS_IS_DITTO(ddt, p))
- continue;
+ ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+ uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v);
- ddt_phys_t *ddp = &dde->dde_phys[p];
-
- if (ddp->ddp_phys_birth != 0 && do_raw) {
+ if (phys_birth != 0 && do_raw) {
blkptr_t blk = *zio->io_bp;
uint64_t psize;
abd_t *tmpabd;
int error;
- ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+ ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
psize = BP_GET_PSIZE(&blk);
if (psize != zio->io_size)
@@ -3424,13 +3432,13 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
abd_free(tmpabd);
ddt_enter(ddt);
return (error != 0);
- } else if (ddp->ddp_phys_birth != 0) {
+ } else if (phys_birth != 0) {
arc_buf_t *abuf = NULL;
arc_flags_t aflags = ARC_FLAG_WAIT;
blkptr_t blk = *zio->io_bp;
int error;
- ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+ ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
return (B_TRUE);
@@ -3458,52 +3466,87 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
}
static void
-zio_ddt_child_write_ready(zio_t *zio)
+zio_ddt_child_write_done(zio_t *zio)
{
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
ddt_entry_t *dde = zio->io_private;
- zio_t *pio;
- if (zio->io_error)
- return;
+ zio_link_t *zl = NULL;
+ ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
- ddt_phys_t *ddp = &dde->dde_phys[p];
+ ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+ ddt_univ_phys_t *ddp = dde->dde_phys;
ddt_enter(ddt);
- ASSERT(dde->dde_io->dde_lead_zio[p] == zio);
+ /* we're the lead, so once we're done there's no one else outstanding */
+ if (dde->dde_io->dde_lead_zio[p] == zio)
+ dde->dde_io->dde_lead_zio[p] = NULL;
- ddt_phys_fill(ddp, zio->io_bp);
+ ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys;
- zio_link_t *zl = NULL;
- while ((pio = zio_walk_parents(zio, &zl)) != NULL)
- ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
+ if (zio->io_error != 0) {
+ /*
+ * The write failed, so we're about to abort the entire IO
+ * chain. We need to revert the entry back to what it was at
+ * the last time it was successfully extended.
+ */
+ ddt_phys_copy(ddp, orig, v);
+ ddt_phys_clear(orig, v);
+
+ ddt_exit(ddt);
+ return;
+ }
+
+ /*
+ * We've successfully added new DVAs to the entry. Clear the saved
+ * state or, if there's still outstanding IO, remember it so we can
+ * revert to a known good state if that IO fails.
+ */
+ if (dde->dde_io->dde_lead_zio[p] == NULL)
+ ddt_phys_clear(orig, v);
+ else
+ ddt_phys_copy(orig, ddp, v);
+
+ /*
+ * Add references for all dedup writes that were waiting on the
+ * physical one, skipping any other physical writes that are waiting.
+ */
+ zio_t *pio;
+ zl = NULL;
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
+ if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
+ ddt_phys_addref(ddp, v);
+ }
ddt_exit(ddt);
}
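Together with the save performed in zio_ddt_write() below, the error and success paths above implement a one-deep undo log in dde_orig_phys; schematically:

    /*
     * lead write issued:  orig <- entry    (saved in zio_ddt_write())
     * child READY:        entry extended with the newly allocated DVAs
     * child DONE, error:  entry <- orig, orig <- 0    (revert)
     * child DONE, ok:
     *   no newer lead:    orig <- 0                   (commit)
     *   newer lead:       orig <- entry               (new checkpoint)
     */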
static void
-zio_ddt_child_write_done(zio_t *zio)
+zio_ddt_child_write_ready(zio_t *zio)
{
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
ddt_entry_t *dde = zio->io_private;
+ zio_link_t *zl = NULL;
+ ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
+
int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
- ddt_phys_t *ddp = &dde->dde_phys[p];
+ ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+
+ if (zio->io_error != 0)
+ return;
ddt_enter(ddt);
- ASSERT(ddp->ddp_refcnt == 0);
- ASSERT(dde->dde_io->dde_lead_zio[p] == zio);
- dde->dde_io->dde_lead_zio[p] = NULL;
+ ddt_phys_extend(dde->dde_phys, v, zio->io_bp);
- if (zio->io_error == 0) {
- zio_link_t *zl = NULL;
- while (zio_walk_parents(zio, &zl) != NULL)
- ddt_phys_addref(ddp);
- } else {
- ddt_phys_clear(ddp);
+ zio_t *pio;
+ zl = NULL;
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
+ if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
+ ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg);
}
ddt_exit(ddt);
@@ -3516,7 +3559,6 @@ zio_ddt_write(zio_t *zio)
blkptr_t *bp = zio->io_bp;
uint64_t txg = zio->io_txg;
zio_prop_t *zp = &zio->io_prop;
- zio_t *cio = NULL;
ddt_t *ddt = ddt_select(spa, bp);
ddt_entry_t *dde;
@@ -3537,9 +3579,6 @@ zio_ddt_write(zio_t *zio)
return (zio);
}
- int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies);
- ddt_phys_t *ddp = &dde->dde_phys[p];
-
if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
/*
* If we're using a weak checksum, upgrade to a strong checksum
@@ -3563,31 +3602,227 @@ zio_ddt_write(zio_t *zio)
return (zio);
}
- ddt_alloc_entry_io(dde);
+ int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies);
+ ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+ ddt_univ_phys_t *ddp = dde->dde_phys;
- if (ddp->ddp_phys_birth != 0 || dde->dde_io->dde_lead_zio[p] != NULL) {
- if (ddp->ddp_phys_birth != 0)
- ddt_bp_fill(ddp, bp, txg);
- if (dde->dde_io->dde_lead_zio[p] != NULL)
- zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
- else
- ddt_phys_addref(ddp);
- } else if (zio->io_bp_override) {
- ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
- ASSERT(BP_EQUAL(bp, zio->io_bp_override));
- ddt_phys_fill(ddp, bp);
- ddt_phys_addref(ddp);
+ /*
+ * In the common cases, at this point we have a regular BP with no
+ * allocated DVAs, and the corresponding DDT entry for its checksum.
+ * Our goal is to fill the BP with enough DVAs to satisfy its copies=
+ * requirement.
+ *
+ * One of three things needs to happen to fulfill this:
+ *
+ * - if the DDT entry has enough DVAs to satisfy the BP, we just copy
+ * them out of the entry and return;
+ *
+ * - if the DDT entry has no DVAs (i.e. it's brand new), then we have to
+ * issue the write as normal so that DVAs can be allocated and the
+ * data lands on disk. We then copy the DVAs into the DDT entry on
+ * return.
+ *
+ * - if the DDT entry has some DVAs, but too few, we have to issue the
+ * write, adjusted to allocate fewer copies. When it returns, we
+ * add the new DVAs to the DDT entry, and update the BP to have the
+ * full amount it originally requested.
+ *
+ * In all cases, if there's already a writing IO in flight, we need to
+ * defer the action until after the write is done. If our action is to
+ * write, we need to adjust our request for additional DVAs to match
+ * what will be in the DDT entry after it completes. In this way every
+ * IO can be guaranteed to receive enough DVAs simply by joining the
+ * end of the chain and letting the sequence play out.
+ */
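Condensed into a decision table (have = DVAs already in the entry, need = zp_copies of this IO, parent = zp_copies of the lead zio's parent, i.e. its original request):

    /*
     * no write in flight:
     *   have >= need    -> fill BP from the entry, take a ref, return
     *   have <  need    -> issue a child write for (need - have) copies
     *
     * write in flight:
     *   have >= need    -> fill BP, wait behind the lead zio
     *   parent >= need  -> wait behind the lead zio; it will cover us
     *   otherwise       -> issue a child write for (need - parent)
     *                      copies, chained ahead of the current lead
     */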
+
+ /*
+ * Number of DVAs in the DDT entry. If the BP is encrypted we ignore
+ * the third one as normal.
+ */
+ int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
+ IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
+
+ /* Number of DVAs requested by the IO. */
+ uint8_t need_dvas = zp->zp_copies;
+
+ /*
+ * What we do next depends on whether or not there's IO outstanding that
+ * will update this entry.
+ */
+ if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) {
+ /*
+ * No IO outstanding, so we only need to worry about ourselves.
+ */
+
+ /*
+ * Override BPs bring their own DVAs and their own problems.
+ */
+ if (zio->io_bp_override) {
+ /*
+ * For a brand-new entry, all the work has been done
+ * for us, and we can just fill it out from the provided
+ * block and leave.
+ */
+ if (have_dvas == 0) {
+ ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
+ ASSERT(BP_EQUAL(bp, zio->io_bp_override));
+ ddt_phys_extend(ddp, v, bp);
+ ddt_phys_addref(ddp, v);
+ ddt_exit(ddt);
+ return (zio);
+ }
+
+ /*
+ * If we already have this entry, then we want to treat
+ * it like a regular write. To do this we just wipe out
+ * the DVAs in the override BP and proceed as a regular
+ * write.
+ *
+ * Even if there are some DVAs in the entry, we still
+ * have to clear them out. We can't use them to fill
+ * out the dedup entry, as they are all referenced
+ * together by a bp already on disk, and will be freed
+ * as a group.
+ */
+ BP_ZERO_DVAS(bp);
+ BP_SET_BIRTH(bp, 0, 0);
+ }
+
+ /*
+ * If there are enough DVAs in the entry to service our request,
+ * then we can just use them as-is.
+ */
+ if (have_dvas >= need_dvas) {
+ ddt_bp_fill(ddp, v, bp, txg);
+ ddt_phys_addref(ddp, v);
+ ddt_exit(ddt);
+ return (zio);
+ }
+
+ /*
+ * Otherwise, we have to issue IO to fill the entry up to the
+ * amount we need.
+ */
+ need_dvas -= have_dvas;
} else {
- cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
- zio->io_orig_size, zio->io_orig_size, zp,
- zio_ddt_child_write_ready, NULL,
- zio_ddt_child_write_done, dde, zio->io_priority,
- ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+ /*
+ * There's a write in-flight. If there are already enough DVAs on
+ * the entry, then either there were already enough to start
+ * with, or the in-flight IO is between READY and DONE, and so
+ * has extended the entry with new DVAs. Either way, we don't
+ * need to do anything, we can just slot in behind it.
+ */
+
+ if (zio->io_bp_override) {
+ /*
+ * If there's a write out, then we're soon going to
+ * have our own copies of this block, so clear out the
+ * override block and treat it as a regular dedup
+ * write. See comment above.
+ */
+ BP_ZERO_DVAS(bp);
+ BP_SET_BIRTH(bp, 0, 0);
+ }
+
+ if (have_dvas >= need_dvas) {
+ /*
+ * A minor point: there might already be enough
+ * committed DVAs in the entry to service our request,
+ * but we don't know which are completed and which are
+ * allocated but not yet written. In this case, should
+ * the IO for the new DVAs fail, we will be on the end
+ * of the IO chain and will also receive an error, even
+ * though our request could have been serviced.
+ *
+ * This is an extremely rare case, as it requires the
+ * original block to be copied with a request for a
+ * larger number of DVAs, then copied again requesting
+ * the same (or already fulfilled) number of DVAs while
+ * the first request is active, and then that first
+ * request errors. In turn, the logic required to
+ * catch and handle it is complex. For now, I'm just
+ * not going to bother with it.
+ */
+
+ /*
+ * We always fill the bp here as we may have arrived
+ * after the in-flight write has passed READY, and so
+ * missed the BP fill done at its READY stage.
+ */
+ ddt_bp_fill(ddp, v, bp, txg);
+ zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
+ ddt_exit(ddt);
+ return (zio);
+ }
+
+ /*
+ * There's not enough in the entry yet, so we need to look at
+ * the write in-flight and see how many DVAs it will have once
+ * it completes.
+ *
+ * The in-flight write has potentially had its copies request
+ * reduced (if we're filling out an existing entry), so we need
+ * to reach in and get the original write to find out what it is
+ * expecting.
+ *
+ * Note that the parent of the lead zio will always have the
+ * highest zp_copies of any zio in the chain, because ones that
+ * can be serviced without additional IO are always added to
+ * the back of the chain.
+ */
+ zio_link_t *zl = NULL;
+ zio_t *pio =
+ zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl);
+ ASSERT(pio);
+ uint8_t parent_dvas = pio->io_prop.zp_copies;
+
+ if (parent_dvas >= need_dvas) {
+ zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
+ ddt_exit(ddt);
+ return (zio);
+ }
- zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
- dde->dde_io->dde_lead_zio[p] = cio;
+ /*
+ * Still not enough, so we will need to issue to get the
+ * shortfall.
+ */
+ need_dvas -= parent_dvas;
}
+ /*
+ * We need to write. We will create a new write with the copies
+ * property adjusted to match the number of DVAs we need to
+ * grow the DDT entry by to satisfy the request.
+ */
+ zio_prop_t czp = *zp;
+ czp.zp_copies = need_dvas;
+ zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
+ zio->io_orig_size, zio->io_orig_size, &czp,
+ zio_ddt_child_write_ready, NULL,
+ zio_ddt_child_write_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+ zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
+
+ /*
+ * We are the new lead zio, because our parent has the highest
+ * zp_copies that has been requested for this entry so far.
+ */
+ ddt_alloc_entry_io(dde);
+ if (dde->dde_io->dde_lead_zio[p] == NULL) {
+ /*
+ * First time out, take a copy of the stable entry to revert
+ * to if there's an error (see zio_ddt_child_write_done())
+ */
+ ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v);
+ } else {
+ /*
+ * Make the existing chain our child, because it cannot
+ * complete until we have.
+ */
+ zio_add_child(cio, dde->dde_io->dde_lead_zio[p]);
+ }
+ dde->dde_io->dde_lead_zio[p] = cio;
+
ddt_exit(ddt);
zio_nowait(cio);
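To make the chaining concrete, a hypothetical sequence of three dedup writes for the same block arriving in one txg, against an initially empty entry:

    /*
     * W1 copies=1: have=0, no lead   -> child write C1 for 1 DVA;
     *                                   C1 becomes the lead
     * W2 copies=3: have=0, parent=1  -> child write C2 for 3-1=2 DVAs;
     *                                   C1 becomes C2's child, C2 leads
     * W3 copies=2: have=0, parent=3  -> no new IO; W3 waits behind C2,
     *                                   whose ready/done callbacks fill
     *                                   W3's BP and take its reference
     */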
@@ -3603,8 +3838,7 @@ zio_ddt_free(zio_t *zio)
spa_t *spa = zio->io_spa;
blkptr_t *bp = zio->io_bp;
ddt_t *ddt = ddt_select(spa, bp);
- ddt_entry_t *dde;
- ddt_phys_t *ddp;
+ ddt_entry_t *dde = NULL;
ASSERT(BP_GET_DEDUP(bp));
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
@@ -3612,9 +3846,9 @@ zio_ddt_free(zio_t *zio)
ddt_enter(ddt);
freedde = dde = ddt_lookup(ddt, bp);
if (dde) {
- ddp = ddt_phys_select(ddt, dde, bp);
- if (ddp)
- ddt_phys_decref(ddp);
+ ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
+ if (v != DDT_PHYS_NONE)
+ ddt_phys_decref(dde->dde_phys, v);
}
ddt_exit(ddt);