summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--cmd/zdb/zdb.c654
-rw-r--r--cmd/ztest/ztest.c2
-rw-r--r--include/sys/metaslab.h3
-rw-r--r--include/sys/space_map.h9
-rw-r--r--man/man8/zdb.810
-rw-r--r--module/zfs/metaslab.c2
-rw-r--r--module/zfs/space_map.c28
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh2
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh4
9 files changed, 649 insertions, 65 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index a329e4a83..59b17132f 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -69,6 +69,7 @@
#include <sys/blkptr.h>
#include <sys/dsl_crypt.h>
#include <sys/dsl_scan.h>
+#include <sys/btree.h>
#include <zfs_comutil.h>
#include <libnvpair.h>
@@ -151,6 +152,571 @@ static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
boolean_t);
static void mos_obj_refd(uint64_t);
static void mos_obj_refd_multiple(uint64_t);
+static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
+ dmu_tx_t *tx);
+
+typedef struct sublivelist_verify {
+ /* all ALLOC'd blkptr_t in one sub-livelist */
+ zfs_btree_t sv_all_allocs;
+
+ /* all FREE'd blkptr_t in one sub-livelist */
+ zfs_btree_t sv_all_frees;
+
+ /* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */
+ zfs_btree_t sv_pair;
+
+ /* ALLOC's without a matching FREE, accumulates across sub-livelists */
+ zfs_btree_t sv_leftover;
+} sublivelist_verify_t;
+
+static int
+livelist_compare(const void *larg, const void *rarg)
+{
+ const blkptr_t *l = larg;
+ const blkptr_t *r = rarg;
+
+ /* Sort them according to dva[0] */
+ uint64_t l_dva0_vdev, r_dva0_vdev;
+ l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
+ r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
+ if (l_dva0_vdev < r_dva0_vdev)
+ return (-1);
+ else if (l_dva0_vdev > r_dva0_vdev)
+ return (+1);
+
+ /* if vdevs are equal, sort by offsets. */
+ uint64_t l_dva0_offset;
+ uint64_t r_dva0_offset;
+ l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
+ r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
+ if (l_dva0_offset < r_dva0_offset) {
+ return (-1);
+ } else if (l_dva0_offset > r_dva0_offset) {
+ return (+1);
+ }
+
+ /*
+ * Since we're storing blkptrs without cancelling FREE/ALLOC pairs,
+ * it's possible the offsets are equal. In that case, sort by txg
+ */
+ if (l->blk_birth < r->blk_birth) {
+ return (-1);
+ } else if (l->blk_birth > r->blk_birth) {
+ return (+1);
+ }
+ return (0);
+}
+
+typedef struct sublivelist_verify_block {
+ dva_t svb_dva;
+
+ /*
+ * We need this to check if the block marked as allocated
+ * in the livelist was freed (and potentially reallocated)
+ * in the metaslab spacemaps at a later TXG.
+ */
+ uint64_t svb_allocated_txg;
+} sublivelist_verify_block_t;
+
+static void zdb_print_blkptr(const blkptr_t *bp, int flags);
+
+static int
+sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
+ dmu_tx_t *tx)
+{
+ ASSERT3P(tx, ==, NULL);
+ struct sublivelist_verify *sv = arg;
+ char blkbuf[BP_SPRINTF_LEN];
+ zfs_btree_index_t where;
+ if (free) {
+ zfs_btree_add(&sv->sv_pair, bp);
+ /* Check if the FREE is a duplicate */
+ if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) {
+ snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
+ free);
+ (void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf);
+ } else {
+ zfs_btree_add_idx(&sv->sv_all_frees, bp, &where);
+ }
+ } else {
+ /* Check if the ALLOC has been freed */
+ if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) {
+ zfs_btree_remove_idx(&sv->sv_pair, &where);
+ } else {
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ if (DVA_IS_EMPTY(&bp->blk_dva[i]))
+ break;
+ sublivelist_verify_block_t svb = {
+ .svb_dva = bp->blk_dva[i],
+ .svb_allocated_txg = bp->blk_birth
+ };
+
+ if (zfs_btree_find(&sv->sv_leftover, &svb,
+ &where) == NULL) {
+ zfs_btree_add_idx(&sv->sv_leftover,
+ &svb, &where);
+ }
+ }
+ }
+ /* Check if the ALLOC is a duplicate */
+ if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) {
+ snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
+ free);
+ (void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf);
+ } else {
+ zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where);
+ }
+ }
+ return (0);
+}
+
+static int
+sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
+{
+ int err;
+ char blkbuf[BP_SPRINTF_LEN];
+ struct sublivelist_verify *sv = args;
+
+ zfs_btree_create(&sv->sv_all_allocs, livelist_compare,
+ sizeof (blkptr_t));
+
+ zfs_btree_create(&sv->sv_all_frees, livelist_compare,
+ sizeof (blkptr_t));
+
+ zfs_btree_create(&sv->sv_pair, livelist_compare,
+ sizeof (blkptr_t));
+
+ err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
+ sv, NULL);
+
+ zfs_btree_clear(&sv->sv_all_allocs);
+ zfs_btree_destroy(&sv->sv_all_allocs);
+
+ zfs_btree_clear(&sv->sv_all_frees);
+ zfs_btree_destroy(&sv->sv_all_frees);
+
+ blkptr_t *e;
+ zfs_btree_index_t *cookie = NULL;
+ while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
+ snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE);
+ (void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf);
+ }
+ zfs_btree_destroy(&sv->sv_pair);
+
+ return (err);
+}
+
+static int
+livelist_block_compare(const void *larg, const void *rarg)
+{
+ const sublivelist_verify_block_t *l = larg;
+ const sublivelist_verify_block_t *r = rarg;
+
+ if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva))
+ return (-1);
+ else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva))
+ return (+1);
+
+ if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva))
+ return (-1);
+ else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva))
+ return (+1);
+
+ if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva))
+ return (-1);
+ else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva))
+ return (+1);
+
+ return (0);
+}
+
+/*
+ * Check for errors in a livelist while tracking all unfreed ALLOCs in the
+ * sublivelist_verify_t: sv->sv_leftover
+ */
+static void
+livelist_verify(dsl_deadlist_t *dl, void *arg)
+{
+ sublivelist_verify_t *sv = arg;
+ dsl_deadlist_iterate(dl, sublivelist_verify_func, sv);
+}
+
+/*
+ * Check for errors in the livelist entry and discard the intermediary
+ * data structures
+ */
+/* ARGSUSED */
+static int
+sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
+{
+ sublivelist_verify_t sv;
+ zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
+ sizeof (sublivelist_verify_block_t));
+ int err = sublivelist_verify_func(&sv, dle);
+ zfs_btree_clear(&sv.sv_leftover);
+ zfs_btree_destroy(&sv.sv_leftover);
+ return (err);
+}
+
+typedef struct metaslab_verify {
+ /*
+ * Tree containing all the leftover ALLOCs from the livelists
+ * that are part of this metaslab.
+ */
+ zfs_btree_t mv_livelist_allocs;
+
+ /*
+ * Metaslab information.
+ */
+ uint64_t mv_vdid;
+ uint64_t mv_msid;
+ uint64_t mv_start;
+ uint64_t mv_end;
+
+ /*
+ * What's currently allocated for this metaslab.
+ */
+ range_tree_t *mv_allocated;
+} metaslab_verify_t;
+
+typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);
+
+typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg,
+ void *arg);
+
+typedef struct unflushed_iter_cb_arg {
+ spa_t *uic_spa;
+ uint64_t uic_txg;
+ void *uic_arg;
+ zdb_log_sm_cb_t uic_cb;
+} unflushed_iter_cb_arg_t;
+
+static int
+iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
+{
+ unflushed_iter_cb_arg_t *uic = arg;
+ return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
+}
+
+static void
+iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
+{
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+ sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+ space_map_t *sm = NULL;
+ VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
+ sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
+
+ unflushed_iter_cb_arg_t uic = {
+ .uic_spa = spa,
+ .uic_txg = sls->sls_txg,
+ .uic_arg = arg,
+ .uic_cb = cb
+ };
+ VERIFY0(space_map_iterate(sm, space_map_length(sm),
+ iterate_through_spacemap_logs_cb, &uic));
+ space_map_close(sm);
+ }
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+static void
+verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
+ uint64_t offset, uint64_t size)
+{
+ sublivelist_verify_block_t svb;
+ DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid);
+ DVA_SET_OFFSET(&svb.svb_dva, offset);
+ DVA_SET_ASIZE(&svb.svb_dva, size);
+ zfs_btree_index_t where;
+ uint64_t end_offset = offset + size;
+
+ /*
+ * Look for an exact match for spacemap entry in the livelist entries.
+ * Then, look for other livelist entries that fall within the range
+ * of the spacemap entry as it may have been condensed
+ */
+ sublivelist_verify_block_t *found =
+ zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where);
+ if (found == NULL) {
+ found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
+ }
+ for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid &&
+ DVA_GET_OFFSET(&found->svb_dva) < end_offset;
+ found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
+ if (found->svb_allocated_txg <= txg) {
+ (void) printf("ERROR: Livelist ALLOC [%llx:%llx] "
+ "from TXG %llx FREED at TXG %llx\n",
+ (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva),
+ (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),
+ (u_longlong_t)found->svb_allocated_txg,
+ (u_longlong_t)txg);
+ }
+ }
+}
+
+static int
+metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
+{
+ metaslab_verify_t *mv = arg;
+ uint64_t offset = sme->sme_offset;
+ uint64_t size = sme->sme_run;
+ uint64_t txg = sme->sme_txg;
+
+ if (sme->sme_type == SM_ALLOC) {
+ if (range_tree_contains(mv->mv_allocated,
+ offset, size)) {
+ (void) printf("ERROR: DOUBLE ALLOC: "
+ "%llu [%llx:%llx] "
+ "%llu:%llu LOG_SM\n",
+ (u_longlong_t)txg, (u_longlong_t)offset,
+ (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
+ (u_longlong_t)mv->mv_msid);
+ } else {
+ range_tree_add(mv->mv_allocated,
+ offset, size);
+ }
+ } else {
+ if (!range_tree_contains(mv->mv_allocated,
+ offset, size)) {
+ (void) printf("ERROR: DOUBLE FREE: "
+ "%llu [%llx:%llx] "
+ "%llu:%llu LOG_SM\n",
+ (u_longlong_t)txg, (u_longlong_t)offset,
+ (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
+ (u_longlong_t)mv->mv_msid);
+ } else {
+ range_tree_remove(mv->mv_allocated,
+ offset, size);
+ }
+ }
+
+ if (sme->sme_type != SM_ALLOC) {
+ /*
+ * If something is freed in the spacemap, verify that
+ * it is not listed as allocated in the livelist.
+ */
+ verify_livelist_allocs(mv, txg, offset, size);
+ }
+ return (0);
+}
+
+static int
+spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,
+ uint64_t txg, void *arg)
+{
+ metaslab_verify_t *mv = arg;
+ uint64_t offset = sme->sme_offset;
+ uint64_t vdev_id = sme->sme_vdev;
+
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+
+ /* skip indirect vdevs */
+ if (!vdev_is_concrete(vd))
+ return (0);
+
+ if (vdev_id != mv->mv_vdid)
+ return (0);
+
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ if (ms->ms_id != mv->mv_msid)
+ return (0);
+
+ if (txg < metaslab_unflushed_txg(ms))
+ return (0);
+
+
+ ASSERT3U(txg, ==, sme->sme_txg);
+ return (metaslab_spacemap_validation_cb(sme, mv));
+}
+
+static void
+spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)
+{
+ iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv);
+}
+
+static void
+spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv)
+{
+ if (sm == NULL)
+ return;
+
+ VERIFY0(space_map_iterate(sm, space_map_length(sm),
+ metaslab_spacemap_validation_cb, mv));
+}
+
+static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);
+
+/*
+ * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if
+ * they are part of that metaslab (mv_msid).
+ */
+static void
+mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
+{
+ zfs_btree_index_t where;
+ sublivelist_verify_block_t *svb;
+ ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);
+ for (svb = zfs_btree_first(&sv->sv_leftover, &where);
+ svb != NULL;
+ svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) {
+ if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid)
+ continue;
+
+ if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start &&
+ (DVA_GET_OFFSET(&svb->svb_dva) +
+ DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) {
+ (void) printf("ERROR: Found block that crosses "
+ "metaslab boundary: <%llu:%llx:%llx>\n",
+ (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
+ (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
+ (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
+ continue;
+ }
+
+ if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start)
+ continue;
+
+ if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end)
+ continue;
+
+ if ((DVA_GET_OFFSET(&svb->svb_dva) +
+ DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) {
+ (void) printf("ERROR: Found block that crosses "
+ "metaslab boundary: <%llu:%llx:%llx>\n",
+ (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
+ (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
+ (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
+ continue;
+ }
+
+ zfs_btree_add(&mv->mv_livelist_allocs, svb);
+ }
+
+ for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where);
+ svb != NULL;
+ svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
+ zfs_btree_remove(&sv->sv_leftover, svb);
+ }
+}
+
+/*
+ * [Livelist Check]
+ * Iterate through all the sublivelists and:
+ * - report leftover frees
+ * - report double ALLOCs/FREEs
+ * - record leftover ALLOCs together with their TXG [see Cross Check]
+ *
+ * [Spacemap Check]
+ * for each metaslab:
+ * - iterate over spacemap and then the metaslab's entries in the
+ * spacemap log, then report any double FREEs and ALLOCs (do not
+ * blow up).
+ *
+ * [Cross Check]
+ * After finishing the Livelist Check phase and while being in the
+ * Spacemap Check phase, we find all the recorded leftover ALLOCs
+ * of the livelist check that are part of the metaslab that we are
+ * currently looking at in the Spacemap Check. We report any entries
+ * that are marked as ALLOCs in the livelists but have been actually
+ * freed (and potentially allocated again) after their TXG stamp in
+ * the spacemaps. Also report any ALLOCs from the livelists that
+ * belong to indirect vdevs (e.g. their vdev completed removal).
+ *
+ * Note that this will miss Log Spacemap entries that cancelled each other
+ * out before being flushed to the metaslab, so we are not guaranteed
+ * to match all erroneous ALLOCs.
+ */
+static void
+livelist_metaslab_validate(spa_t *spa)
+{
+ (void) printf("Verifying deleted livelist entries\n");
+
+ sublivelist_verify_t sv;
+ zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
+ sizeof (sublivelist_verify_block_t));
+ iterate_deleted_livelists(spa, livelist_verify, &sv);
+
+ (void) printf("Verifying metaslab entries\n");
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ if (!vdev_is_concrete(vd))
+ continue;
+
+ for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) {
+ metaslab_t *m = vd->vdev_ms[mid];
+
+ (void) fprintf(stderr,
+ "\rverifying concrete vdev %llu, "
+ "metaslab %llu of %llu ...",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)mid,
+ (longlong_t)vd->vdev_ms_count);
+
+ uint64_t shift, start;
+ range_seg_type_t type =
+ metaslab_calculate_range_tree_type(vd, m,
+ &start, &shift);
+ metaslab_verify_t mv;
+ mv.mv_allocated = range_tree_create(NULL,
+ type, NULL, start, shift);
+ mv.mv_vdid = vd->vdev_id;
+ mv.mv_msid = m->ms_id;
+ mv.mv_start = m->ms_start;
+ mv.mv_end = m->ms_start + m->ms_size;
+ zfs_btree_create(&mv.mv_livelist_allocs,
+ livelist_block_compare,
+ sizeof (sublivelist_verify_block_t));
+
+ mv_populate_livelist_allocs(&mv, &sv);
+
+ spacemap_check_ms_sm(m->ms_sm, &mv);
+ spacemap_check_sm_log(spa, &mv);
+
+ range_tree_vacate(mv.mv_allocated, NULL, NULL);
+ range_tree_destroy(mv.mv_allocated);
+ zfs_btree_clear(&mv.mv_livelist_allocs);
+ zfs_btree_destroy(&mv.mv_livelist_allocs);
+ }
+ }
+ (void) fprintf(stderr, "\n");
+
+ /*
+ * If there are any segments in the leftover tree after we walked
+ * through all the metaslabs in the concrete vdevs then this means
+ * that we have segments in the livelists that belong to indirect
+ * vdevs and are marked as allocated.
+ */
+ if (zfs_btree_numnodes(&sv.sv_leftover) == 0) {
+ zfs_btree_destroy(&sv.sv_leftover);
+ return;
+ }
+ (void) printf("ERROR: Found livelist blocks marked as allocated "
+ "for indirect vdevs:\n");
+
+ zfs_btree_index_t *where = NULL;
+ sublivelist_verify_block_t *svb;
+ while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) !=
+ NULL) {
+ int vdev_id = DVA_GET_VDEV(&svb->svb_dva);
+ ASSERT3U(vdev_id, <, rvd->vdev_children);
+ vdev_t *vd = rvd->vdev_child[vdev_id];
+ ASSERT(!vdev_is_concrete(vd));
+ (void) printf("<%d:%llx:%llx> TXG %llx\n",
+ vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
+ (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva),
+ (u_longlong_t)svb->svb_allocated_txg);
+ }
+ (void) printf("\n");
+ zfs_btree_destroy(&sv.sv_leftover);
+}
/*
* These libumem hooks provide a reasonable set of defaults for the allocator's
@@ -172,7 +738,7 @@ static void
usage(void)
{
(void) fprintf(stderr,
- "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p <path> ...]] "
+ "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "
"[-I <inflight I/Os>]\n"
"\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
"\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
@@ -234,7 +800,9 @@ usage(void)
(void) fprintf(stderr, " -s report stats on zdb's I/O\n");
(void) fprintf(stderr, " -S simulate dedup to measure effect\n");
(void) fprintf(stderr, " -v verbose (applies to all "
- "others)\n\n");
+ "others)\n");
+ (void) fprintf(stderr, " -y perform livelist and metaslab "
+ "validation on any livelists being deleted\n\n");
(void) fprintf(stderr, " Below options are intended for use "
"with other options:\n");
(void) fprintf(stderr, " -A ignore assertions (-A), enable "
@@ -926,11 +1494,20 @@ dump_spacemap(objset_t *os, space_map_t *sm)
sizeof (word), &word, DMU_READ_PREFETCH));
if (sm_entry_is_debug(word)) {
- (void) printf("\t [%6llu] %s: txg %llu pass %llu\n",
- (u_longlong_t)entry_id,
- ddata[SM_DEBUG_ACTION_DECODE(word)],
- (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
- (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
+ uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
+ uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
+ if (de_txg == 0) {
+ (void) printf(
+ "\t [%6llu] PADDING\n",
+ (u_longlong_t)entry_id);
+ } else {
+ (void) printf(
+ "\t [%6llu] %s: txg %llu pass %llu\n",
+ (u_longlong_t)entry_id,
+ ddata[SM_DEBUG_ACTION_DECODE(word)],
+ (u_longlong_t)de_txg,
+ (u_longlong_t)de_sync_pass);
+ }
entry_id++;
continue;
}
@@ -2214,6 +2791,11 @@ verify_dd_livelist(objset_t *os)
ASSERT(!dmu_objset_is_snapshot(os));
if (!dsl_deadlist_is_open(&dd->dd_livelist))
return (0);
+
+ /* Iterate through the livelist to check for duplicates */
+ dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,
+ NULL);
+
dsl_pool_config_enter(dp, FTAG);
dsl_deadlist_space(&dd->dd_livelist, &ll_used,
&ll_comp, &ll_uncomp);
@@ -4652,50 +5234,6 @@ static metaslab_ops_t zdb_metaslab_ops = {
NULL /* alloc */
};
-typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme,
- uint64_t txg, void *arg);
-
-typedef struct unflushed_iter_cb_arg {
- spa_t *uic_spa;
- uint64_t uic_txg;
- void *uic_arg;
- zdb_log_sm_cb_t uic_cb;
-} unflushed_iter_cb_arg_t;
-
-static int
-iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
-{
- unflushed_iter_cb_arg_t *uic = arg;
- return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
-}
-
-static void
-iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
-{
- if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
- return;
-
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
- sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
- space_map_t *sm = NULL;
- VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
- sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
-
- unflushed_iter_cb_arg_t uic = {
- .uic_spa = spa,
- .uic_txg = sls->sls_txg,
- .uic_arg = arg,
- .uic_cb = cb
- };
-
- VERIFY0(space_map_iterate(sm, space_map_length(sm),
- iterate_through_spacemap_logs_cb, &uic));
- space_map_close(sm);
- }
- spa_config_exit(spa, SCL_CONFIG, FTAG);
-}
-
/* ARGSUSED */
static int
load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme,
@@ -5443,8 +5981,6 @@ count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
* Iterate over livelists which have been destroyed by the user but
* are still present in the MOS, waiting to be freed
*/
-typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);
-
static void
iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
{
@@ -5515,6 +6051,7 @@ dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
ASSERT3P(arg, ==, NULL);
global_feature_count[SPA_FEATURE_LIVELIST]++;
dump_blkptr_list(ll, "Deleted Livelist");
+ dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL);
}
/*
@@ -6780,6 +7317,10 @@ dump_zpool(spa_t *spa)
dsl_pool_t *dp = spa_get_dsl(spa);
int rc = 0;
+ if (dump_opt['y']) {
+ livelist_metaslab_validate(spa);
+ }
+
if (dump_opt['S']) {
dump_simulated_ddt(spa);
return;
@@ -6925,7 +7466,7 @@ static int flagbits[256];
static char flagbitstr[16];
static void
-zdb_print_blkptr(blkptr_t *bp, int flags)
+zdb_print_blkptr(const blkptr_t *bp, int flags)
{
char blkbuf[BP_SPRINTF_LEN];
@@ -7537,7 +8078,7 @@ main(int argc, char **argv)
zfs_btree_verify_intensity = 3;
while ((c = getopt(argc, argv,
- "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XY")) != -1) {
+ "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XYy")) != -1) {
switch (c) {
case 'b':
case 'c':
@@ -7556,6 +8097,7 @@ main(int argc, char **argv)
case 's':
case 'S':
case 'u':
+ case 'y':
dump_opt[c]++;
dump_all = 0;
break;
@@ -7698,7 +8240,7 @@ main(int argc, char **argv)
verbose = MAX(verbose, 1);
for (c = 0; c < 256; c++) {
- if (dump_all && strchr("AeEFklLOPRSX", c) == NULL)
+ if (dump_all && strchr("AeEFklLOPRSXy", c) == NULL)
dump_opt[c] = 1;
if (dump_opt[c])
dump_opt[c] += verbose;
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index ca38271cc..0a3653f7f 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -6469,7 +6469,7 @@ ztest_run_zdb(char *pool)
ztest_get_zdb_bin(bin, len);
(void) sprintf(zdb,
- "%s -bcc%s%s -G -d -Y -e -p %s %s",
+ "%s -bcc%s%s -G -d -Y -e -y -p %s %s",
bin,
ztest_opts.zo_verbose >= 3 ? "s" : "",
ztest_opts.zo_verbose >= 4 ? "v" : "",
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index f8d9c6a82..b3b7f8655 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -137,6 +137,9 @@ void metaslab_set_selected_txg(metaslab_t *, uint64_t);
extern int metaslab_debug_load;
+range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev,
+ metaslab_t *msp, uint64_t *start, uint64_t *shift);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/sys/space_map.h b/include/sys/space_map.h
index 81f56076a..cb81e710b 100644
--- a/include/sys/space_map.h
+++ b/include/sys/space_map.h
@@ -148,6 +148,15 @@ typedef struct space_map_entry {
uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */
uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */
uint64_t sme_run; /* max is 2^36; units of sm_shift */
+
+ /*
+ * The following fields are not part of the actual space map entry
+ * on-disk and they are populated with the values from the debug
+ * entry most recently visited starting from the beginning to the
+ * end of the space map.
+ */
+ uint64_t sme_txg;
+ uint64_t sme_sync_pass;
} space_map_entry_t;
#define SM_NO_VDEVID (1 << SPA_VDEVBITS)
diff --git a/man/man8/zdb.8 b/man/man8/zdb.8
index e8320c35b..56cb02dce 100644
--- a/man/man8/zdb.8
+++ b/man/man8/zdb.8
@@ -10,7 +10,7 @@
.\"
.\"
.\" Copyright 2012, Richard Lowe.
-.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012, 2019 by Delphix. All rights reserved.
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Lawrence Livermore National Security, LLC.
.\" Copyright (c) 2017 Intel Corporation.
@@ -23,7 +23,7 @@
.Nd display zpool debugging and consistency information
.Sh SYNOPSIS
.Nm
-.Op Fl AbcdDFGhikLMPsvXY
+.Op Fl AbcdDFGhikLMPsvXYy
.Op Fl e Oo Fl V Oc Op Fl p Ar path ...
.Op Fl I Ar inflight I/Os
.Oo Fl o Ar var Ns = Ns Ar value Oc Ns ...
@@ -403,6 +403,12 @@ but read transactions otherwise deemed too old.
Attempt all possible combinations when reconstructing indirect split blocks.
This flag disables the individual I/O deadman timer in order to allow as
much time as required for the attempted reconstruction.
+.It Fl y
+Perform validation for livelists that are being deleted.
+Scans through the livelist and metaslabs, checking for duplicate entries
+and compares the two, checking for potential double frees.
+If it encounters issues, warnings will be printed, but the command will not
+necessarily fail.
.El
.Pp
Specifying a display option more than once enables verbosity for only that
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 762038cb3..a935f33cb 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -2533,7 +2533,7 @@ metaslab_unload(metaslab_t *msp)
* the vdev_ms_shift - the vdev_ashift is less than 32, we can store
* the ranges using two uint32_ts, rather than two uint64_ts.
*/
-static range_seg_type_t
+range_seg_type_t
metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
uint64_t *start, uint64_t *shift)
{
diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c
index eb2c36942..25da0e63c 100644
--- a/module/zfs/space_map.c
+++ b/module/zfs/space_map.c
@@ -96,6 +96,7 @@ space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
ZIO_PRIORITY_SYNC_READ);
int error = 0;
+ uint64_t txg = 0, sync_pass = 0;
for (uint64_t block_base = 0; block_base < end && error == 0;
block_base += blksz) {
dmu_buf_t *db;
@@ -117,8 +118,29 @@ space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
block_cursor < block_end && error == 0; block_cursor++) {
uint64_t e = *block_cursor;
- if (sm_entry_is_debug(e)) /* Skip debug entries */
+ if (sm_entry_is_debug(e)) {
+ /*
+ * Debug entries are only needed to record the
+ * current TXG and sync pass if available.
+ *
+ * Note though that sometimes there can be
+ * debug entries that are used as padding
+ * at the end of space map blocks in-order
+ * to not split a double-word entry in the
+ * middle between two blocks. These entries
+ * have their TXG field set to 0 and we
+ * skip them without recording the TXG.
+ * [see comment in space_map_write_seg()]
+ */
+ uint64_t e_txg = SM_DEBUG_TXG_DECODE(e);
+ if (e_txg != 0) {
+ txg = e_txg;
+ sync_pass = SM_DEBUG_SYNCPASS_DECODE(e);
+ } else {
+ ASSERT0(SM_DEBUG_SYNCPASS_DECODE(e));
+ }
continue;
+ }
uint64_t raw_offset, raw_run, vdev_id;
maptype_t type;
@@ -158,7 +180,9 @@ space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
.sme_type = type,
.sme_vdev = vdev_id,
.sme_offset = entry_offset,
- .sme_run = entry_run
+ .sme_run = entry_run,
+ .sme_txg = txg,
+ .sme_sync_pass = sync_pass
};
error = callback(&sme, arg);
}
diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh
index 508f20adb..1d344cf2a 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh
@@ -57,7 +57,7 @@ set -A args "create" "add" "destroy" "import fakepool" \
"add raidz1 fakepool" "add raidz2 fakepool" \
"setvprop" "blah blah" "-%" "--?" "-*" "-=" \
"-a" "-f" "-g" "-j" "-n" "-o" "-p" "-p /tmp" "-r" \
- "-t" "-w" "-y" "-z" "-E" "-H" "-I" "-J" "-K" \
+ "-t" "-w" "-z" "-E" "-H" "-I" "-J" "-K" \
"-N" "-Q" "-R" "-T" "-W" "-Z"
log_assert "Execute zdb using invalid parameters."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh
index 4bb5c3c4a..4c2fc15ec 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh
@@ -58,7 +58,7 @@ function cleanup
function test_imported_pool
{
typeset -a args=("-A" "-b" "-C" "-c" "-d" "-D" "-G" "-h" "-i" "-L" \
- "-M" "-P" "-s" "-v" "-Y")
+ "-M" "-P" "-s" "-v" "-Y" "-y")
for i in ${args[@]}; do
log_must eval "zdb $i $TESTPOOL >/dev/null"
done
@@ -68,7 +68,7 @@ function test_exported_pool
{
log_must zpool export $TESTPOOL
typeset -a args=("-A" "-b" "-C" "-c" "-d" "-D" "-F" "-G" "-h" "-i" "-L" "-M" \
- "-P" "-s" "-v" "-X" "-Y")
+ "-P" "-s" "-v" "-X" "-Y" "-y")
for i in ${args[@]}; do
log_must eval "zdb -e $i $TESTPOOL >/dev/null"
done