-rw-r--r--  cmd/zdb/zdb.c                                  | 117
-rw-r--r--  module/zfs/dsl_deadlist.c                      |  65
-rw-r--r--  tests/runfiles/common.run                      |   4
-rwxr-xr-x  tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh |  88
4 files changed, 210 insertions, 64 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 84ae606b9..c548936bc 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -162,12 +162,6 @@ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
dmu_tx_t *tx);
typedef struct sublivelist_verify {
- /* all ALLOC'd blkptr_t in one sub-livelist */
- zfs_btree_t sv_all_allocs;
-
- /* all FREE'd blkptr_t in one sub-livelist */
- zfs_btree_t sv_all_frees;
-
/* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */
zfs_btree_t sv_pair;
@@ -226,29 +220,68 @@ typedef struct sublivelist_verify_block {
static void zdb_print_blkptr(const blkptr_t *bp, int flags);
+typedef struct sublivelist_verify_block_refcnt {
+ /* block pointer entry in livelist being verified */
+ blkptr_t svbr_blk;
+
+ /*
+ * Refcount gets incremented to 1 when we encounter the first
+ * FREE entry for the svbr_blk block pointer and a node for it
+ * is created in our ZDB verification/tracking metadata.
+ *
+ * As we encounter more FREE entries we increment this counter
+ * and similarly decrement it whenever we find the respective
+ * ALLOC entries for this block.
+ *
+ * When the refcount gets to 0 it means that all the FREE and
+ * ALLOC entries of this block have paired up and we no longer
+ * need to track it in our verification logic (i.e. the node
+ * containing this struct in our verification data structure
+ * should be freed).
+ *
+ * [refer to sublivelist_verify_blkptr() for the actual code]
+ */
+ uint32_t svbr_refcnt;
+} sublivelist_verify_block_refcnt_t;
+
+static int
+sublivelist_block_refcnt_compare(const void *larg, const void *rarg)
+{
+ const sublivelist_verify_block_refcnt_t *l = larg;
+ const sublivelist_verify_block_refcnt_t *r = rarg;
+ return (livelist_compare(&l->svbr_blk, &r->svbr_blk));
+}
+
static int
sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
dmu_tx_t *tx)
{
ASSERT3P(tx, ==, NULL);
struct sublivelist_verify *sv = arg;
- char blkbuf[BP_SPRINTF_LEN];
+ sublivelist_verify_block_refcnt_t current = {
+ .svbr_blk = *bp,
+
+ /*
+ * Start with 1 in case this is the first free entry.
+ * This field is not used for our B-Tree comparisons
+ * anyway.
+ */
+ .svbr_refcnt = 1,
+ };
+
zfs_btree_index_t where;
+ sublivelist_verify_block_refcnt_t *pair =
+ zfs_btree_find(&sv->sv_pair, &current, &where);
if (free) {
- zfs_btree_add(&sv->sv_pair, bp);
- /* Check if the FREE is a duplicate */
- if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) {
- snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
- free);
- (void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf);
+ if (pair == NULL) {
+ /* first free entry for this block pointer */
+ zfs_btree_add(&sv->sv_pair, &current);
} else {
- zfs_btree_add_idx(&sv->sv_all_frees, bp, &where);
+ pair->svbr_refcnt++;
}
} else {
- /* Check if the ALLOC has been freed */
- if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) {
- zfs_btree_remove_idx(&sv->sv_pair, &where);
- } else {
+ if (pair == NULL) {
+ /* block that is currently marked as allocated */
for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
if (DVA_IS_EMPTY(&bp->blk_dva[i]))
break;
@@ -263,16 +296,16 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
&svb, &where);
}
}
- }
- /* Check if the ALLOC is a duplicate */
- if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) {
- snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
- free);
- (void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf);
} else {
- zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where);
+ /* alloc matches a free entry */
+ pair->svbr_refcnt--;
+ if (pair->svbr_refcnt == 0) {
+ /* all allocs and frees have been matched */
+ zfs_btree_remove_idx(&sv->sv_pair, &where);
+ }
}
}
+
return (0);
}
@@ -280,32 +313,22 @@ static int
sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
{
int err;
- char blkbuf[BP_SPRINTF_LEN];
struct sublivelist_verify *sv = args;
- zfs_btree_create(&sv->sv_all_allocs, livelist_compare,
- sizeof (blkptr_t));
-
- zfs_btree_create(&sv->sv_all_frees, livelist_compare,
- sizeof (blkptr_t));
-
- zfs_btree_create(&sv->sv_pair, livelist_compare,
- sizeof (blkptr_t));
+ zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare,
+ sizeof (sublivelist_verify_block_refcnt_t));
err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
sv, NULL);
- zfs_btree_clear(&sv->sv_all_allocs);
- zfs_btree_destroy(&sv->sv_all_allocs);
-
- zfs_btree_clear(&sv->sv_all_frees);
- zfs_btree_destroy(&sv->sv_all_frees);
-
- blkptr_t *e;
+ sublivelist_verify_block_refcnt_t *e;
zfs_btree_index_t *cookie = NULL;
while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
- snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE);
- (void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf);
+ char blkbuf[BP_SPRINTF_LEN];
+ snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
+ &e->svbr_blk, B_TRUE);
+ (void) printf("\tERROR: %d unmatched FREE(s): %s\n",
+ e->svbr_refcnt, blkbuf);
}
zfs_btree_destroy(&sv->sv_pair);
@@ -614,10 +637,14 @@ mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
/*
* [Livelist Check]
* Iterate through all the sublivelists and:
- * - report leftover frees
- * - report double ALLOCs/FREEs
+ * - report leftover frees (**)
* - record leftover ALLOCs together with their TXG [see Cross Check]
*
+ * (**) Note: Double ALLOCs are valid in datasets that have dedup
+ * enabled. Similarly, double FREEs are allowed, but only
+ * if they pair up with a corresponding ALLOC entry once
+ * we are done with our sublivelist iteration.
+ *
* [Spacemap Check]
* for each metaslab:
* - iterate over spacemap and then the metaslab's entries in the
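
The zdb side of this change boils down to replacing the two dedicated
"all allocs"/"all frees" trees with a single tree of refcounted FREE
entries. As a rough illustration of that pairing discipline, here is a
minimal, self-contained C sketch; the uint64_t key and the flat table
are hypothetical stand-ins for the blkptr_t and the zfs_btree_t that
the real code uses (no bounds checking; illustration only):

/*
 * Sketch of refcount-based FREE/ALLOC pairing; not the zdb code.
 */
#include <stdio.h>
#include <stdint.h>

#define TABLE_SIZE 64

struct pair_entry {
	uint64_t key;		/* stand-in for svbr_blk */
	uint32_t refcnt;	/* stand-in for svbr_refcnt */
};

static struct pair_entry table[TABLE_SIZE];
static int nentries;

static struct pair_entry *
find_entry(uint64_t key)
{
	for (int i = 0; i < nentries; i++) {
		if (table[i].key == key)
			return (&table[i]);
	}
	return (NULL);
}

static void
record(uint64_t key, int is_free)
{
	struct pair_entry *e = find_entry(key);

	if (is_free) {
		if (e == NULL) {
			/* first FREE for this key: refcount starts at 1 */
			table[nentries].key = key;
			table[nentries].refcnt = 1;
			nentries++;
		} else {
			/* duplicate FREE (dedup case): bump the refcount */
			e->refcnt++;
		}
	} else if (e != NULL) {
		/* an ALLOC cancels one outstanding FREE */
		if (--e->refcnt == 0)
			*e = table[--nentries];	/* drop the entry */
	}
	/* an ALLOC with no pending FREE is simply still allocated */
}

int
main(void)
{
	record(42, 1);	/* FREE */
	record(42, 1);	/* second FREE of the same (deduped) block */
	record(42, 0);	/* ALLOC pairs with one FREE */
	record(42, 0);	/* ALLOC pairs with the other */
	(void) printf("unmatched FREEs: %d\n", nentries);	/* prints 0 */
	return (0);
}

Run as written, the sketch reports zero unmatched FREEs for the
dedup-style sequence FREE, FREE, ALLOC, ALLOC; the old
duplicate-detection code would have flagged that same sequence as an
error.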
diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c
index bad2d56ee..a77e38152 100644
--- a/module/zfs/dsl_deadlist.c
+++ b/module/zfs/dsl_deadlist.c
@@ -909,15 +909,16 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
}
typedef struct livelist_entry {
- const blkptr_t *le_bp;
+ blkptr_t le_bp;
+ uint32_t le_refcnt;
avl_node_t le_node;
} livelist_entry_t;
static int
livelist_compare(const void *larg, const void *rarg)
{
- const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp;
- const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp;
+ const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp;
+ const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp;
/* Sort them according to dva[0] */
uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
@@ -944,6 +945,11 @@ struct livelist_iter_arg {
 * Expects an AVL tree which is incrementally filled with FREE blkptrs
* and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
* corresponding FREE are stored in the supplied bplist.
+ *
+ * Note that multiple FREE and ALLOC entries for the same blkptr may
+ * be encountered when dedup is involved. For this reason we keep a
+ * refcount for all the FREE entries of each blkptr and ensure that
+ * each of those FREE entries has a corresponding ALLOC preceding it.
*/
static int
dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
@@ -957,23 +963,47 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
return (SET_ERROR(EINTR));
+
+ livelist_entry_t node;
+ node.le_bp = *bp;
+ livelist_entry_t *found = avl_find(avl, &node, NULL);
if (bp_freed) {
- livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t),
- KM_SLEEP);
- blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
- *temp_bp = *bp;
- node->le_bp = temp_bp;
- avl_add(avl, node);
- } else {
- livelist_entry_t node;
- node.le_bp = bp;
- livelist_entry_t *found = avl_find(avl, &node, NULL);
- if (found != NULL) {
- avl_remove(avl, found);
- kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t));
- kmem_free(found, sizeof (livelist_entry_t));
+ if (found == NULL) {
+ /* first free entry for this blkptr */
+ livelist_entry_t *e =
+ kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP);
+ e->le_bp = *bp;
+ e->le_refcnt = 1;
+ avl_add(avl, e);
} else {
+ /* dedup block free */
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT3U(BP_GET_CHECKSUM(bp), ==,
+ BP_GET_CHECKSUM(&found->le_bp));
+ ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt);
+ found->le_refcnt++;
+ }
+ } else {
+ if (found == NULL) {
+ /* block is currently marked as allocated */
bplist_append(to_free, bp);
+ } else {
+ /* alloc matches a free entry */
+ ASSERT3U(found->le_refcnt, !=, 0);
+ found->le_refcnt--;
+ if (found->le_refcnt == 0) {
+ /* all tracked free pairs have been matched */
+ avl_remove(avl, found);
+ kmem_free(found, sizeof (livelist_entry_t));
+ } else {
+ /*
+ * This is definitely a deduped blkptr so
+ * let's validate it.
+ */
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT3U(BP_GET_CHECKSUM(bp), ==,
+ BP_GET_CHECKSUM(&found->le_bp));
+ }
}
}
return (0);
@@ -999,6 +1029,7 @@ dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
};
int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
+ VERIFY0(avl_numnodes(&avl));
avl_destroy(&avl);
return (err);
}
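
On the kernel side, entries are matched purely by dva[0], so two FREEs
of the same deduped block land on the same AVL node, and the new
asserts cross-check the checksum field for that case. A small
illustrative C model of this identity follows; the bp_key fields are
hypothetical simplifications of DVA_GET_VDEV(), DVA_GET_OFFSET(), and
BP_GET_CHECKSUM():

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the identity fields of a blkptr_t. */
struct bp_key {
	uint64_t vdev;		/* DVA_GET_VDEV(&bp->blk_dva[0]) */
	uint64_t offset;	/* DVA_GET_OFFSET(&bp->blk_dva[0]) */
	uint64_t cksum;		/* BP_GET_CHECKSUM(bp) */
};

/* Shaped like livelist_compare(): order entries by dva[0] only. */
static int
bp_key_compare(const struct bp_key *l, const struct bp_key *r)
{
	if (l->vdev != r->vdev)
		return (l->vdev < r->vdev ? -1 : 1);
	if (l->offset != r->offset)
		return (l->offset < r->offset ? -1 : 1);
	return (0);
}

int
main(void)
{
	/* Two dedup'd copies of the same data share dva[0] ... */
	struct bp_key a = { .vdev = 0, .offset = 0x4000, .cksum = 7 };
	struct bp_key b = a;

	/* ... so their FREEs collapse into one refcounted AVL node ... */
	assert(bp_key_compare(&a, &b) == 0);

	/* ... and dsl_livelist_iterate() cross-checks the checksum. */
	assert(a.cksum == b.cksum);
	(void) printf("dedup FREEs map to a single livelist entry\n");
	return (0);
}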
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 703407a7d..dd25a55ed 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -166,8 +166,8 @@ tags = ['functional', 'cli_root', 'zfs_create']
[tests/functional/cli_root/zfs_destroy]
tests = ['zfs_clone_livelist_condense_and_disable',
- 'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos',
- 'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
+ 'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup',
+ 'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg',
'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos',
'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos',
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh
new file mode 100755
index 000000000..5f356967a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh
@@ -0,0 +1,88 @@
+#!/bin/ksh -p
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2021 by Delphix. All rights reserved.
+#
+
+# DESCRIPTION
+# Verify that zfs destroy works correctly for clones with livelists
+# that contain dedup blocks. This is a baseline regression test
+# created to ensure that past bugs we've encountered in the
+# interaction between dedup and the livelist logic don't resurface.
+
+# STRATEGY
+# 1. Create a clone from a test filesystem and enable dedup.
+# 2. Write some data and create a livelist.
+# 3. Copy the data within the clone to create dedup blocks.
+# 4. Remove some of the dedup data to create multiple free
+# entries for the same block pointers.
+# 5. Process all the livelist entries by destroying the clone.
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib
+
+function cleanup
+{
+ log_must zfs destroy -Rf $TESTPOOL/$TESTFS1
+ # Reset the minimum percent shared to 75
+ set_tunable32 LIVELIST_MIN_PERCENT_SHARED $ORIGINAL_MIN_SHARED
+}
+
+function test_dedup
+{
+ # Set a small percent shared threshold so the livelist is not disabled
+ set_tunable32 LIVELIST_MIN_PERCENT_SHARED 10
+ clone_dataset $TESTFS1 snap $TESTCLONE
+
+ # Enable dedup
+ log_must zfs set dedup=on $TESTPOOL/$TESTCLONE
+
+ # Create some data to be deduped
+ log_must dd if=/dev/urandom of="/$TESTPOOL/$TESTCLONE/data" bs=512 count=10k
+
+ # Create dedup blocks
+ # Note: We sync before and after so all dedup blocks belong to the
+ # same TXG, otherwise they won't look identical to the livelist
+ # iterator due to their logical birth TXG being different.
+ log_must zpool sync $TESTPOOL
+ log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-0
+ log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-1
+ log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-2
+ log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-3
+ log_must zpool sync $TESTPOOL
+ check_livelist_exists $TESTCLONE
+
+ # Introduce "double frees"
+ # We want consecutive FREEs of the same block, as this is what
+ # triggered panics in the past.
+ # Note: Similarly to the previous step, we sync before and after
+ # our deletions so all the entries end up in the same TXG.
+ log_must zpool sync $TESTPOOL
+ log_must rm /$TESTPOOL/$TESTCLONE/data-dup-2
+ log_must rm /$TESTPOOL/$TESTCLONE/data-dup-3
+ log_must zpool sync $TESTPOOL
+ check_livelist_exists $TESTCLONE
+
+ log_must zfs destroy $TESTPOOL/$TESTCLONE
+ check_livelist_gone
+}
+
+ORIGINAL_MIN_SHARED=$(get_tunable LIVELIST_MIN_PERCENT_SHARED)
+
+log_onexit cleanup
+log_must zfs create $TESTPOOL/$TESTFS1
+log_must mkfile 5m /$TESTPOOL/$TESTFS1/atestfile
+log_must zfs snapshot $TESTPOOL/$TESTFS1@snap
+test_dedup
+
+log_pass "Clone's livelist processes dedup blocks as expected."