diff options
-rw-r--r-- | cmd/zdb/zdb.c | 654 | ||||
-rw-r--r-- | cmd/ztest/ztest.c | 2 | ||||
-rw-r--r-- | include/sys/metaslab.h | 3 | ||||
-rw-r--r-- | include/sys/space_map.h | 9 | ||||
-rw-r--r-- | man/man8/zdb.8 | 10 | ||||
-rw-r--r-- | module/zfs/metaslab.c | 2 | ||||
-rw-r--r-- | module/zfs/space_map.c | 28 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh | 2 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh | 4 |
9 files changed, 649 insertions, 65 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index a329e4a83..59b17132f 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -69,6 +69,7 @@ #include <sys/blkptr.h> #include <sys/dsl_crypt.h> #include <sys/dsl_scan.h> +#include <sys/btree.h> #include <zfs_comutil.h> #include <libnvpair.h> @@ -151,6 +152,571 @@ static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, boolean_t); static void mos_obj_refd(uint64_t); static void mos_obj_refd_multiple(uint64_t); +static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, + dmu_tx_t *tx); + +typedef struct sublivelist_verify { + /* all ALLOC'd blkptr_t in one sub-livelist */ + zfs_btree_t sv_all_allocs; + + /* all FREE'd blkptr_t in one sub-livelist */ + zfs_btree_t sv_all_frees; + + /* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */ + zfs_btree_t sv_pair; + + /* ALLOC's without a matching FREE, accumulates across sub-livelists */ + zfs_btree_t sv_leftover; +} sublivelist_verify_t; + +static int +livelist_compare(const void *larg, const void *rarg) +{ + const blkptr_t *l = larg; + const blkptr_t *r = rarg; + + /* Sort them according to dva[0] */ + uint64_t l_dva0_vdev, r_dva0_vdev; + l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); + r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); + if (l_dva0_vdev < r_dva0_vdev) + return (-1); + else if (l_dva0_vdev > r_dva0_vdev) + return (+1); + + /* if vdevs are equal, sort by offsets. */ + uint64_t l_dva0_offset; + uint64_t r_dva0_offset; + l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); + r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); + if (l_dva0_offset < r_dva0_offset) { + return (-1); + } else if (l_dva0_offset > r_dva0_offset) { + return (+1); + } + + /* + * Since we're storing blkptrs without cancelling FREE/ALLOC pairs, + * it's possible the offsets are equal. 
In that case, sort by txg + */ + if (l->blk_birth < r->blk_birth) { + return (-1); + } else if (l->blk_birth > r->blk_birth) { + return (+1); + } + return (0); +} + +typedef struct sublivelist_verify_block { + dva_t svb_dva; + + /* + * We need this to check if the block marked as allocated + * in the livelist was freed (and potentially reallocated) + * in the metaslab spacemaps at a later TXG. + */ + uint64_t svb_allocated_txg; +} sublivelist_verify_block_t; + +static void zdb_print_blkptr(const blkptr_t *bp, int flags); + +static int +sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, + dmu_tx_t *tx) +{ + ASSERT3P(tx, ==, NULL); + struct sublivelist_verify *sv = arg; + char blkbuf[BP_SPRINTF_LEN]; + zfs_btree_index_t where; + if (free) { + zfs_btree_add(&sv->sv_pair, bp); + /* Check if the FREE is a duplicate */ + if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) { + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, + free); + (void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf); + } else { + zfs_btree_add_idx(&sv->sv_all_frees, bp, &where); + } + } else { + /* Check if the ALLOC has been freed */ + if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) { + zfs_btree_remove_idx(&sv->sv_pair, &where); + } else { + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + if (DVA_IS_EMPTY(&bp->blk_dva[i])) + break; + sublivelist_verify_block_t svb = { + .svb_dva = bp->blk_dva[i], + .svb_allocated_txg = bp->blk_birth + }; + + if (zfs_btree_find(&sv->sv_leftover, &svb, + &where) == NULL) { + zfs_btree_add_idx(&sv->sv_leftover, + &svb, &where); + } + } + } + /* Check if the ALLOC is a duplicate */ + if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) { + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, + free); + (void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf); + } else { + zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where); + } + } + return (0); +} + +static int +sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) 
+{ + int err; + char blkbuf[BP_SPRINTF_LEN]; + struct sublivelist_verify *sv = args; + + zfs_btree_create(&sv->sv_all_allocs, livelist_compare, + sizeof (blkptr_t)); + + zfs_btree_create(&sv->sv_all_frees, livelist_compare, + sizeof (blkptr_t)); + + zfs_btree_create(&sv->sv_pair, livelist_compare, + sizeof (blkptr_t)); + + err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, + sv, NULL); + + zfs_btree_clear(&sv->sv_all_allocs); + zfs_btree_destroy(&sv->sv_all_allocs); + + zfs_btree_clear(&sv->sv_all_frees); + zfs_btree_destroy(&sv->sv_all_frees); + + blkptr_t *e; + zfs_btree_index_t *cookie = NULL; + while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) { + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE); + (void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf); + } + zfs_btree_destroy(&sv->sv_pair); + + return (err); +} + +static int +livelist_block_compare(const void *larg, const void *rarg) +{ + const sublivelist_verify_block_t *l = larg; + const sublivelist_verify_block_t *r = rarg; + + if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva)) + return (-1); + else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva)) + return (+1); + + if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva)) + return (-1); + else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva)) + return (+1); + + if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva)) + return (-1); + else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva)) + return (+1); + + return (0); +} + +/* + * Check for errors in a livelist while tracking all unfreed ALLOCs in the + * sublivelist_verify_t: sv->sv_leftover + */ +static void +livelist_verify(dsl_deadlist_t *dl, void *arg) +{ + sublivelist_verify_t *sv = arg; + dsl_deadlist_iterate(dl, sublivelist_verify_func, sv); +} + +/* + * Check for errors in the livelist entry and discard the intermediary + * data structures + */ +/* ARGSUSED */ +static int 
+sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle) +{ + sublivelist_verify_t sv; + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + int err = sublivelist_verify_func(&sv, dle); + zfs_btree_clear(&sv.sv_leftover); + zfs_btree_destroy(&sv.sv_leftover); + return (err); +} + +typedef struct metaslab_verify { + /* + * Tree containing all the leftover ALLOCs from the livelists + * that are part of this metaslab. + */ + zfs_btree_t mv_livelist_allocs; + + /* + * Metaslab information. + */ + uint64_t mv_vdid; + uint64_t mv_msid; + uint64_t mv_start; + uint64_t mv_end; + + /* + * What's currently allocated for this metaslab. + */ + range_tree_t *mv_allocated; +} metaslab_verify_t; + +typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); + +typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg, + void *arg); + +typedef struct unflushed_iter_cb_arg { + spa_t *uic_spa; + uint64_t uic_txg; + void *uic_arg; + zdb_log_sm_cb_t uic_cb; +} unflushed_iter_cb_arg_t; + +static int +iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) +{ + unflushed_iter_cb_arg_t *uic = arg; + return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); +} + +static void +iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + VERIFY0(space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + unflushed_iter_cb_arg_t uic = { + .uic_spa = spa, + .uic_txg = sls->sls_txg, + .uic_arg = arg, + .uic_cb = cb + }; + VERIFY0(space_map_iterate(sm, space_map_length(sm), + iterate_through_spacemap_logs_cb, &uic)); + space_map_close(sm); + } + spa_config_exit(spa, 
SCL_CONFIG, FTAG); +} + +static void +verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg, + uint64_t offset, uint64_t size) +{ + sublivelist_verify_block_t svb = {{{0}}}; /* DVA_SET_*() are RMW */ + DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid); + DVA_SET_OFFSET(&svb.svb_dva, offset); + DVA_SET_ASIZE(&svb.svb_dva, size); + zfs_btree_index_t where; + uint64_t end_offset = offset + size; + + /* + * Look for an exact match for spacemap entry in the livelist entries. + * Then, look for other livelist entries that fall within the range + * of the spacemap entry as it may have been condensed + */ + sublivelist_verify_block_t *found = + zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where); + if (found == NULL) { + found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where); + } + for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid && + DVA_GET_OFFSET(&found->svb_dva) < end_offset; + found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { + if (found->svb_allocated_txg <= txg) { + (void) printf("ERROR: Livelist ALLOC [%llx:%llx] " + "from TXG %llx FREED at TXG %llx\n", + (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva), + (u_longlong_t)found->svb_allocated_txg, + (u_longlong_t)txg); + } + } +} + +static int +metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg) +{ + metaslab_verify_t *mv = arg; + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + uint64_t txg = sme->sme_txg; + + if (sme->sme_type == SM_ALLOC) { + if (range_tree_contains(mv->mv_allocated, + offset, size)) { + (void) printf("ERROR: DOUBLE ALLOC: " + "%llu [%llx:%llx] " + "%llu:%llu LOG_SM\n", + (u_longlong_t)txg, (u_longlong_t)offset, + (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, + (u_longlong_t)mv->mv_msid); + } else { + range_tree_add(mv->mv_allocated, + offset, size); + } + } else { + if (!range_tree_contains(mv->mv_allocated, + offset, size)) { + (void) printf("ERROR: DOUBLE FREE: " + "%llu [%llx:%llx] " + "%llu:%llu LOG_SM\n",
+ (u_longlong_t)txg, (u_longlong_t)offset, + (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, + (u_longlong_t)mv->mv_msid); + } else { + range_tree_remove(mv->mv_allocated, + offset, size); + } + } + + if (sme->sme_type != SM_ALLOC) { + /* + * If something is freed in the spacemap, verify that + * it is not listed as allocated in the livelist. + */ + verify_livelist_allocs(mv, txg, offset, size); + } + return (0); +} + +static int +spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + metaslab_verify_t *mv = arg; + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + + /* skip indirect vdevs */ + if (!vdev_is_concrete(vd)) + return (0); + + if (vdev_id != mv->mv_vdid) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + if (ms->ms_id != mv->mv_msid) + return (0); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + + ASSERT3U(txg, ==, sme->sme_txg); + return (metaslab_spacemap_validation_cb(sme, mv)); +} + +static void +spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv) +{ + iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv); +} + +static void +spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv) +{ + if (sm == NULL) + return; + + VERIFY0(space_map_iterate(sm, space_map_length(sm), + metaslab_spacemap_validation_cb, mv)); +} + +static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg); + +/* + * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if + * they are part of that metaslab (mv_msid). 
+ */ +static void +mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv) +{ + zfs_btree_index_t where; + sublivelist_verify_block_t *svb; + ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0); + for (svb = zfs_btree_first(&sv->sv_leftover, &where); + svb != NULL; + svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) { + if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid) + continue; + + if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start && + (DVA_GET_OFFSET(&svb->svb_dva) + + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) { + (void) printf("ERROR: Found block that crosses " + "metaslab boundary: <%llu:%llx:%llx>\n", + (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), + (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); + continue; + } + + if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start) + continue; + + if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end) + continue; + + if ((DVA_GET_OFFSET(&svb->svb_dva) + + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) { + (void) printf("ERROR: Found block that crosses " + "metaslab boundary: <%llu:%llx:%llx>\n", + (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), + (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); + continue; + } + + zfs_btree_add(&mv->mv_livelist_allocs, svb); + } + + for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where); + svb != NULL; + svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { + zfs_btree_remove(&sv->sv_leftover, svb); + } +} + +/* + * [Livelist Check] + * Iterate through all the sublivelists and: + * - report leftover frees + * - report double ALLOCs/FREEs + * - record leftover ALLOCs together with their TXG [see Cross Check] + * + * [Spacemap Check] + * for each metaslab: + * - iterate over spacemap and then the metaslab's entries in the + * spacemap log, then report any double FREEs and ALLOCs (do not + * blow up). 
+ * + * [Cross Check] + * After finishing the Livelist Check phase and while being in the + * Spacemap Check phase, we find all the recorded leftover ALLOCs + * of the livelist check that are part of the metaslab that we are + * currently looking at in the Spacemap Check. We report any entries + * that are marked as ALLOCs in the livelists but have been actually + * freed (and potentially allocated again) after their TXG stamp in + * the spacemaps. Also report any ALLOCs from the livelists that + * belong to indirect vdevs (e.g. their vdev completed removal). + * + * Note that this will miss Log Spacemap entries that cancelled each other + * out before being flushed to the metaslab, so we are not guaranteed + * to match all erroneous ALLOCs. + */ +static void +livelist_metaslab_validate(spa_t *spa) +{ + (void) printf("Verifying deleted livelist entries\n"); + + sublivelist_verify_t sv; + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + iterate_deleted_livelists(spa, livelist_verify, &sv); + + (void) printf("Verifying metaslab entries\n"); + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + if (!vdev_is_concrete(vd)) + continue; + + for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) { + metaslab_t *m = vd->vdev_ms[mid]; + + (void) fprintf(stderr, + "\rverifying concrete vdev %llu, " + "metaslab %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)mid, + (longlong_t)vd->vdev_ms_count); + + uint64_t shift, start; + range_seg_type_t type = + metaslab_calculate_range_tree_type(vd, m, + &start, &shift); + metaslab_verify_t mv; + mv.mv_allocated = range_tree_create(NULL, + type, NULL, start, shift); + mv.mv_vdid = vd->vdev_id; + mv.mv_msid = m->ms_id; + mv.mv_start = m->ms_start; + mv.mv_end = m->ms_start + m->ms_size; + zfs_btree_create(&mv.mv_livelist_allocs, + livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + + 
mv_populate_livelist_allocs(&mv, &sv); + + spacemap_check_ms_sm(m->ms_sm, &mv); + spacemap_check_sm_log(spa, &mv); + + range_tree_vacate(mv.mv_allocated, NULL, NULL); + range_tree_destroy(mv.mv_allocated); + zfs_btree_clear(&mv.mv_livelist_allocs); + zfs_btree_destroy(&mv.mv_livelist_allocs); + } + } + (void) fprintf(stderr, "\n"); + + /* + * If there are any segments in the leftover tree after we walked + * through all the metaslabs in the concrete vdevs then this means + * that we have segments in the livelists that belong to indirect + * vdevs and are marked as allocated. + */ + if (zfs_btree_numnodes(&sv.sv_leftover) == 0) { + zfs_btree_destroy(&sv.sv_leftover); + return; + } + (void) printf("ERROR: Found livelist blocks marked as allocated " + "for indirect vdevs:\n"); + + zfs_btree_index_t *where = NULL; + sublivelist_verify_block_t *svb; + while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) != + NULL) { + int vdev_id = DVA_GET_VDEV(&svb->svb_dva); + ASSERT3U(vdev_id, <, rvd->vdev_children); + vdev_t *vd = rvd->vdev_child[vdev_id]; + ASSERT(!vdev_is_concrete(vd)); + (void) printf("<%d:%llx:%llx> TXG %llx\n", + vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva), + (u_longlong_t)svb->svb_allocated_txg); + } + (void) printf("\n"); + zfs_btree_destroy(&sv.sv_leftover); +} /* * These libumem hooks provide a reasonable set of defaults for the allocator's @@ -172,7 +738,7 @@ static void usage(void) { (void) fprintf(stderr, - "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p <path> ...]] " + "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] " "[-I <inflight I/Os>]\n" "\t\t[-o <var>=<value>]... 
[-t <txg>] [-U <cache>] [-x <dumpdir>]\n" "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n" @@ -234,7 +800,9 @@ usage(void) (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); (void) fprintf(stderr, " -v verbose (applies to all " - "others)\n\n"); + "others)\n"); + (void) fprintf(stderr, " -y perform livelist and metaslab " + "validation on any livelists being deleted\n\n"); (void) fprintf(stderr, " Below options are intended for use " "with other options:\n"); (void) fprintf(stderr, " -A ignore assertions (-A), enable " @@ -926,11 +1494,20 @@ dump_spacemap(objset_t *os, space_map_t *sm) sizeof (word), &word, DMU_READ_PREFETCH)); if (sm_entry_is_debug(word)) { - (void) printf("\t [%6llu] %s: txg %llu pass %llu\n", - (u_longlong_t)entry_id, - ddata[SM_DEBUG_ACTION_DECODE(word)], - (u_longlong_t)SM_DEBUG_TXG_DECODE(word), - (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word)); + uint64_t de_txg = SM_DEBUG_TXG_DECODE(word); + uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word); + if (de_txg == 0) { + (void) printf( + "\t [%6llu] PADDING\n", + (u_longlong_t)entry_id); + } else { + (void) printf( + "\t [%6llu] %s: txg %llu pass %llu\n", + (u_longlong_t)entry_id, + ddata[SM_DEBUG_ACTION_DECODE(word)], + (u_longlong_t)de_txg, + (u_longlong_t)de_sync_pass); + } entry_id++; continue; } @@ -2214,6 +2791,11 @@ verify_dd_livelist(objset_t *os) ASSERT(!dmu_objset_is_snapshot(os)); if (!dsl_deadlist_is_open(&dd->dd_livelist)) return (0); + + /* Iterate through the livelist to check for duplicates */ + dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight, + NULL); + dsl_pool_config_enter(dp, FTAG); dsl_deadlist_space(&dd->dd_livelist, &ll_used, &ll_comp, &ll_uncomp); @@ -4652,50 +5234,6 @@ static metaslab_ops_t zdb_metaslab_ops = { NULL /* alloc */ }; -typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, - uint64_t txg, void *arg); - -typedef struct 
unflushed_iter_cb_arg { - spa_t *uic_spa; - uint64_t uic_txg; - void *uic_arg; - zdb_log_sm_cb_t uic_cb; -} unflushed_iter_cb_arg_t; - -static int -iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) -{ - unflushed_iter_cb_arg_t *uic = arg; - return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); -} - -static void -iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) -{ - if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) - return; - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); - sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { - space_map_t *sm = NULL; - VERIFY0(space_map_open(&sm, spa_meta_objset(spa), - sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); - - unflushed_iter_cb_arg_t uic = { - .uic_spa = spa, - .uic_txg = sls->sls_txg, - .uic_arg = arg, - .uic_cb = cb - }; - - VERIFY0(space_map_iterate(sm, space_map_length(sm), - iterate_through_spacemap_logs_cb, &uic)); - space_map_close(sm); - } - spa_config_exit(spa, SCL_CONFIG, FTAG); -} - /* ARGSUSED */ static int load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, @@ -5443,8 +5981,6 @@ count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) * Iterate over livelists which have been destroyed by the user but * are still present in the MOS, waiting to be freed */ -typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); - static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg) { @@ -5515,6 +6051,7 @@ dump_livelist_cb(dsl_deadlist_t *ll, void *arg) ASSERT3P(arg, ==, NULL); global_feature_count[SPA_FEATURE_LIVELIST]++; dump_blkptr_list(ll, "Deleted Livelist"); + dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL); } /* @@ -6780,6 +7317,10 @@ dump_zpool(spa_t *spa) dsl_pool_t *dp = spa_get_dsl(spa); int rc = 0; + if (dump_opt['y']) { + livelist_metaslab_validate(spa); + } + if (dump_opt['S']) { dump_simulated_ddt(spa); return; @@ 
-6925,7 +7466,7 @@ static int flagbits[256]; static char flagbitstr[16]; static void -zdb_print_blkptr(blkptr_t *bp, int flags) +zdb_print_blkptr(const blkptr_t *bp, int flags) { char blkbuf[BP_SPRINTF_LEN]; @@ -7537,7 +8078,7 @@ main(int argc, char **argv) zfs_btree_verify_intensity = 3; while ((c = getopt(argc, argv, - "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XY")) != -1) { + "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XYy")) != -1) { switch (c) { case 'b': case 'c': @@ -7556,6 +8097,7 @@ main(int argc, char **argv) case 's': case 'S': case 'u': + case 'y': dump_opt[c]++; dump_all = 0; break; @@ -7698,7 +8240,7 @@ main(int argc, char **argv) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { - if (dump_all && strchr("AeEFklLOPRSX", c) == NULL) + if (dump_all && strchr("AeEFklLOPRSXy", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index ca38271cc..0a3653f7f 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -6469,7 +6469,7 @@ ztest_run_zdb(char *pool) ztest_get_zdb_bin(bin, len); (void) sprintf(zdb, - "%s -bcc%s%s -G -d -Y -e -p %s %s", + "%s -bcc%s%s -G -d -Y -e -y -p %s %s", bin, ztest_opts.zo_verbose >= 3 ? "s" : "", ztest_opts.zo_verbose >= 4 ? 
"v" : "", diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index f8d9c6a82..b3b7f8655 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -137,6 +137,9 @@ void metaslab_set_selected_txg(metaslab_t *, uint64_t); extern int metaslab_debug_load; +range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev, + metaslab_t *msp, uint64_t *start, uint64_t *shift); + #ifdef __cplusplus } #endif diff --git a/include/sys/space_map.h b/include/sys/space_map.h index 81f56076a..cb81e710b 100644 --- a/include/sys/space_map.h +++ b/include/sys/space_map.h @@ -148,6 +148,15 @@ typedef struct space_map_entry { uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */ uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */ uint64_t sme_run; /* max is 2^36; units of sm_shift */ + + /* + * The following fields are not part of the actual space map entry + * on-disk and they are populated with the values from the debug + * entry most recently visited starting from the beginning to the + * end of the space map. + */ + uint64_t sme_txg; + uint64_t sme_sync_pass; } space_map_entry_t; #define SM_NO_VDEVID (1 << SPA_VDEVBITS) diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index e8320c35b..56cb02dce 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -10,7 +10,7 @@ .\" .\" .\" Copyright 2012, Richard Lowe. -.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012, 2019 by Delphix. All rights reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC. .\" Copyright (c) 2017 Intel Corporation. @@ -23,7 +23,7 @@ .Nd display zpool debugging and consistency information .Sh SYNOPSIS .Nm -.Op Fl AbcdDFGhikLMPsvXY +.Op Fl AbcdDFGhikLMPsvXYy .Op Fl e Oo Fl V Oc Op Fl p Ar path ... .Op Fl I Ar inflight I/Os .Oo Fl o Ar var Ns = Ns Ar value Oc Ns ... @@ -403,6 +403,12 @@ but read transactions otherwise deemed too old. 
Attempt all possible combinations when reconstructing indirect split blocks. This flag disables the individual I/O deadman timer in order to allow as much time as required for the attempted reconstruction. +.It Fl y +Perform validation for livelists that are being deleted. +Scans through the livelists and metaslabs, checking each for duplicate entries, +then compares the two to detect potential double frees. +If it encounters issues, warnings will be printed, but the command will not +necessarily fail. .El .Pp Specifying a display option more than once enables verbosity for only that diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 762038cb3..a935f33cb 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -2533,7 +2533,7 @@ metaslab_unload(metaslab_t *msp) * the vdev_ms_shift - the vdev_ashift is less than 32, we can store * the ranges using two uint32_ts, rather than two uint64_ts. */ -static range_seg_type_t +range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, uint64_t *start, uint64_t *shift) { diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index eb2c36942..25da0e63c 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -96,6 +96,7 @@ space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg) ZIO_PRIORITY_SYNC_READ); int error = 0; + uint64_t txg = 0, sync_pass = 0; for (uint64_t block_base = 0; block_base < end && error == 0; block_base += blksz) { dmu_buf_t *db; @@ -117,8 +118,29 @@ space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg) block_cursor < block_end && error == 0; block_cursor++) { uint64_t e = *block_cursor; - if (sm_entry_is_debug(e)) /* Skip debug entries */ + if (sm_entry_is_debug(e)) { + /* + * Debug entries are only needed to record the + * current TXG and sync pass if available.
+ * + * Note though that sometimes there can be + * debug entries that are used as padding + * at the end of space map blocks in order + * to not split a double-word entry in the + * middle between two blocks. These entries + * have their TXG field set to 0 and we + * skip them without recording the TXG. + * [see comment in space_map_write_seg()] + */ + uint64_t e_txg = SM_DEBUG_TXG_DECODE(e); + if (e_txg != 0) { + txg = e_txg; + sync_pass = SM_DEBUG_SYNCPASS_DECODE(e); + } else { + ASSERT0(SM_DEBUG_SYNCPASS_DECODE(e)); + } continue; + } uint64_t raw_offset, raw_run, vdev_id; maptype_t type; @@ -158,7 +180,9 @@ space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg) .sme_type = type, .sme_vdev = vdev_id, .sme_offset = entry_offset, - .sme_run = entry_run + .sme_run = entry_run, + .sme_txg = txg, + .sme_sync_pass = sync_pass }; error = callback(&sme, arg); } diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh index 508f20adb..1d344cf2a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh @@ -57,7 +57,7 @@ set -A args "create" "add" "destroy" "import fakepool" \ "add raidz1 fakepool" "add raidz2 fakepool" \ "setvprop" "blah blah" "-%" "--?" "-*" "-=" \ "-a" "-f" "-g" "-j" "-n" "-o" "-p" "-p /tmp" "-r" \ - "-t" "-w" "-z" "-E" "-H" "-I" "-J" "-K" \ + "-t" "-w" "-z" "-E" "-H" "-I" "-J" "-K" \ "-N" "-Q" "-R" "-T" "-W" "-Z" log_assert "Execute zdb using invalid parameters."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh index 4bb5c3c4a..4c2fc15ec 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh @@ -58,7 +58,7 @@ function cleanup function test_imported_pool { typeset -a args=("-A" "-b" "-C" "-c" "-d" "-D" "-G" "-h" "-i" "-L" \ - "-M" "-P" "-s" "-v" "-Y") + "-M" "-P" "-s" "-v" "-Y" "-y") for i in ${args[@]}; do log_must eval "zdb $i $TESTPOOL >/dev/null" done @@ -68,7 +68,7 @@ function test_exported_pool { log_must zpool export $TESTPOOL typeset -a args=("-A" "-b" "-C" "-c" "-d" "-D" "-F" "-G" "-h" "-i" "-L" "-M" \ - "-P" "-s" "-v" "-X" "-Y") + "-P" "-s" "-v" "-X" "-Y" "-y") for i in ${args[@]}; do log_must eval "zdb -e $i $TESTPOOL >/dev/null" done |