aboutsummaryrefslogtreecommitdiffstats
path: root/cmd/zdb/zdb.c
diff options
context:
space:
mode:
Diffstat (limited to 'cmd/zdb/zdb.c')
-rw-r--r--cmd/zdb/zdb.c853
1 files changed, 733 insertions, 120 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index faac43c79..d1e77cce7 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -131,7 +131,7 @@ static void
usage(void)
{
(void) fprintf(stderr,
- "Usage:\t%s [-AbcdDFGhiLMPsvX] [-e [-V] [-p <path> ...]] "
+ "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p <path> ...]] "
"[-I <inflight I/Os>]\n"
"\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
"\t\t[<poolname> [<object> ...]]\n"
@@ -168,6 +168,8 @@ usage(void)
(void) fprintf(stderr, " -h pool history\n");
(void) fprintf(stderr, " -i intent logs\n");
(void) fprintf(stderr, " -l read label contents\n");
+ (void) fprintf(stderr, " -k examine the checkpointed state "
+ "of the pool\n");
(void) fprintf(stderr, " -L disable leak tracking (do not "
"load spacemaps)\n");
(void) fprintf(stderr, " -m metaslabs\n");
@@ -731,6 +733,22 @@ get_prev_obsolete_spacemap_refcount(spa_t *spa)
}
static int
+get_checkpoint_refcount(vdev_t *vd)
+{
+ int refcount = 0;
+
+ if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
+ zap_contains(spa_meta_objset(vd->vdev_spa),
+ vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
+ refcount++;
+
+ for (uint64_t c = 0; c < vd->vdev_children; c++)
+ refcount += get_checkpoint_refcount(vd->vdev_child[c]);
+
+ return (refcount);
+}
+
+static int
verify_spacemap_refcounts(spa_t *spa)
{
uint64_t expected_refcount = 0;
@@ -743,6 +761,7 @@ verify_spacemap_refcounts(spa_t *spa)
actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
+ actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
if (expected_refcount != actual_refcount) {
(void) printf("space map refcount mismatch: expected %lld != "
@@ -816,8 +835,8 @@ static void
dump_metaslab_stats(metaslab_t *msp)
{
char maxbuf[32];
- range_tree_t *rt = msp->ms_tree;
- avl_tree_t *t = &msp->ms_size_tree;
+ range_tree_t *rt = msp->ms_allocatable;
+ avl_tree_t *t = &msp->ms_allocatable_by_size;
int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
/* max sure nicenum has enough space */
@@ -853,7 +872,7 @@ dump_metaslab(metaslab_t *msp)
metaslab_load_wait(msp);
if (!msp->ms_loaded) {
VERIFY0(metaslab_load(msp));
- range_tree_stat_verify(msp->ms_tree);
+ range_tree_stat_verify(msp->ms_allocatable);
}
dump_metaslab_stats(msp);
metaslab_unload(msp);
@@ -2420,6 +2439,8 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
(void) printf("\trootbp = %s\n", blkbuf);
}
+ (void) printf("\tcheckpoint_txg = %llu\n",
+ (u_longlong_t)ub->ub_checkpoint_txg);
(void) printf("%s", footer ? footer : "");
}
@@ -3129,6 +3150,7 @@ static const char *zdb_ot_extname[] = {
typedef struct zdb_cb {
zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
uint64_t zcb_removing_size;
+ uint64_t zcb_checkpoint_size;
uint64_t zcb_dedup_asize;
uint64_t zcb_dedup_blocks;
uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
@@ -3229,7 +3251,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
}
VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
- refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
+ refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
}
@@ -3388,7 +3410,7 @@ claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
ASSERT(vdev_is_concrete(vd));
VERIFY0(metaslab_claim_impl(vd, offset, size,
- spa_first_txg(vd->vdev_spa)));
+ spa_min_claim_txg(vd->vdev_spa)));
}
static void
@@ -3453,70 +3475,6 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
-/*
- * vm_idxp is an in-out parameter which (for indirect vdevs) is the
- * index in vim_entries that has the first entry in this metaslab. On
- * return, it will be set to the first entry after this metaslab.
- */
-static void
-zdb_leak_init_ms(metaslab_t *msp, uint64_t *vim_idxp)
-{
- metaslab_group_t *mg = msp->ms_group;
- vdev_t *vd = mg->mg_vd;
- vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
-
- mutex_enter(&msp->ms_lock);
- metaslab_unload(msp);
-
- /*
- * We don't want to spend the CPU manipulating the size-ordered
- * tree, so clear the range_tree ops.
- */
- msp->ms_tree->rt_ops = NULL;
-
- (void) fprintf(stderr,
- "\rloading vdev %llu of %llu, metaslab %llu of %llu ...",
- (longlong_t)vd->vdev_id,
- (longlong_t)rvd->vdev_children,
- (longlong_t)msp->ms_id,
- (longlong_t)vd->vdev_ms_count);
-
- /*
- * For leak detection, we overload the metaslab ms_tree to
- * contain allocated segments instead of free segments. As a
- * result, we can't use the normal metaslab_load/unload
- * interfaces.
- */
- if (vd->vdev_ops == &vdev_indirect_ops) {
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
- for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
- (*vim_idxp)++) {
- vdev_indirect_mapping_entry_phys_t *vimep =
- &vim->vim_entries[*vim_idxp];
- uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
- uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
- ASSERT3U(ent_offset, >=, msp->ms_start);
- if (ent_offset >= msp->ms_start + msp->ms_size)
- break;
-
- /*
- * Mappings do not cross metaslab boundaries,
- * because we create them by walking the metaslabs.
- */
- ASSERT3U(ent_offset + ent_len, <=,
- msp->ms_start + msp->ms_size);
- range_tree_add(msp->ms_tree, ent_offset, ent_len);
- }
- } else if (msp->ms_sm != NULL) {
- VERIFY0(space_map_load(msp->ms_sm, msp->ms_tree, SM_ALLOC));
- }
-
- if (!msp->ms_loaded) {
- msp->ms_loaded = B_TRUE;
- }
- mutex_exit(&msp->ms_lock);
-}
-
/* ARGSUSED */
static int
increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
@@ -3615,11 +3573,246 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
ASSERT(error == ENOENT);
}
+typedef struct checkpoint_sm_exclude_entry_arg {
+ vdev_t *cseea_vd;
+ uint64_t cseea_checkpoint_size;
+} checkpoint_sm_exclude_entry_arg_t;
+
+static int
+checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
+ void *arg)
+{
+ checkpoint_sm_exclude_entry_arg_t *cseea = arg;
+ vdev_t *vd = cseea->cseea_vd;
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ uint64_t end = offset + size;
+
+ ASSERT(type == SM_FREE);
+
+ /*
+ * Since the vdev_checkpoint_sm exists in the vdev level
+ * and the ms_sm space maps exist in the metaslab level,
+ * an entry in the checkpoint space map could theoretically
+ * cross the boundaries of the metaslab that it belongs.
+ *
+ * In reality, because of the way that we populate and
+ * manipulate the checkpoint's space maps currently,
+ * there shouldn't be any entries that cross metaslabs.
+ * Hence the assertion below.
+ *
+ * That said, there is no fundamental requirement that
+ * the checkpoint's space map entries should not cross
+ * metaslab boundaries. So if needed we could add code
+ * that handles metaslab-crossing segments in the future.
+ */
+ VERIFY3U(offset, >=, ms->ms_start);
+ VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+ /*
+ * By removing the entry from the allocated segments we
+ * also verify that the entry is there to begin with.
+ */
+ mutex_enter(&ms->ms_lock);
+ range_tree_remove(ms->ms_allocatable, offset, size);
+ mutex_exit(&ms->ms_lock);
+
+ cseea->cseea_checkpoint_size += size;
+ return (0);
+}
+
+static void
+zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
+{
+ spa_t *spa = vd->vdev_spa;
+ space_map_t *checkpoint_sm = NULL;
+ uint64_t checkpoint_sm_obj;
+
+ /*
+ * If there is no vdev_top_zap, we are in a pool whose
+ * version predates the pool checkpoint feature.
+ */
+ if (vd->vdev_top_zap == 0)
+ return;
+
+ /*
+ * If there is no reference of the vdev_checkpoint_sm in
+ * the vdev_top_zap, then one of the following scenarios
+ * is true:
+ *
+ * 1] There is no checkpoint
+ * 2] There is a checkpoint, but no checkpointed blocks
+ * have been freed yet
+ * 3] The current vdev is indirect
+ *
+ * In these cases we return immediately.
+ */
+ if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+ return;
+
+ VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
+ &checkpoint_sm_obj));
+
+ checkpoint_sm_exclude_entry_arg_t cseea;
+ cseea.cseea_vd = vd;
+ cseea.cseea_checkpoint_size = 0;
+
+ VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+ checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+ space_map_update(checkpoint_sm);
+
+ VERIFY0(space_map_iterate(checkpoint_sm,
+ checkpoint_sm_exclude_entry_cb, &cseea));
+ space_map_close(checkpoint_sm);
+
+ zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
+}
+
+static void
+zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
+ zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
+ }
+}
+
+static void
+load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *vd = rvd->vdev_child[i];
+
+ ASSERT3U(i, ==, vd->vdev_id);
+
+ if (vd->vdev_ops == &vdev_indirect_ops)
+ continue;
+
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ (void) fprintf(stderr,
+ "\rloading concrete vdev %llu, "
+ "metaslab %llu of %llu ...",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)msp->ms_id,
+ (longlong_t)vd->vdev_ms_count);
+
+ mutex_enter(&msp->ms_lock);
+ metaslab_unload(msp);
+
+ /*
+ * We don't want to spend the CPU manipulating the
+ * size-ordered tree, so clear the range_tree ops.
+ */
+ msp->ms_allocatable->rt_ops = NULL;
+
+ if (msp->ms_sm != NULL) {
+ VERIFY0(space_map_load(msp->ms_sm,
+ msp->ms_allocatable, maptype));
+ }
+ if (!msp->ms_loaded)
+ msp->ms_loaded = B_TRUE;
+ mutex_exit(&msp->ms_lock);
+ }
+ }
+}
+
+/*
+ * vm_idxp is an in-out parameter which (for indirect vdevs) is the
+ * index in vim_entries that has the first entry in this metaslab.
+ * On return, it will be set to the first entry after this metaslab.
+ */
+static void
+load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
+ uint64_t *vim_idxp)
+{
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ mutex_enter(&msp->ms_lock);
+ metaslab_unload(msp);
+
+ /*
+ * We don't want to spend the CPU manipulating the
+ * size-ordered tree, so clear the range_tree ops.
+ */
+ msp->ms_allocatable->rt_ops = NULL;
+
+ for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
+ (*vim_idxp)++) {
+ vdev_indirect_mapping_entry_phys_t *vimep =
+ &vim->vim_entries[*vim_idxp];
+ uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
+ uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
+ ASSERT3U(ent_offset, >=, msp->ms_start);
+ if (ent_offset >= msp->ms_start + msp->ms_size)
+ break;
+
+ /*
+ * Mappings do not cross metaslab boundaries,
+ * because we create them by walking the metaslabs.
+ */
+ ASSERT3U(ent_offset + ent_len, <=,
+ msp->ms_start + msp->ms_size);
+ range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
+ }
+
+ if (!msp->ms_loaded)
+ msp->ms_loaded = B_TRUE;
+ mutex_exit(&msp->ms_lock);
+}
+
+static void
+zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ ASSERT3U(c, ==, vd->vdev_id);
+
+ if (vd->vdev_ops != &vdev_indirect_ops)
+ continue;
+
+ /*
+ * Note: we don't check for mapping leaks on
+ * removing vdevs because their ms_allocatable's
+ * are used to look for leaks in allocated space.
+ */
+ zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
+
+ /*
+ * Normally, indirect vdevs don't have any
+ * metaslabs. We want to set them up for
+ * zio_claim().
+ */
+ VERIFY0(vdev_metaslab_init(vd, 0));
+
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t vim_idx = 0;
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+
+ (void) fprintf(stderr,
+ "\rloading indirect vdev %llu, "
+ "metaslab %llu of %llu ...",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)vd->vdev_ms[m]->ms_id,
+ (longlong_t)vd->vdev_ms_count);
+
+ load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
+ &vim_idx);
+ }
+ ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
+ }
+}
+
static void
zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
zcb->zcb_spa = spa;
- uint64_t c;
if (!dump_opt['L']) {
dsl_pool_t *dp = spa->spa_dsl_pool;
@@ -3627,7 +3820,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
/*
* We are going to be changing the meaning of the metaslab's
- * ms_tree. Ensure that the allocator doesn't try to
+ * ms_allocatable. Ensure that the allocator doesn't try to
* use the tree.
*/
spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
@@ -3637,38 +3830,37 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
UMEM_NOFAIL);
- for (c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
- uint64_t vim_idx = 0;
-
- ASSERT3U(c, ==, vd->vdev_id);
-
- /*
- * Note: we don't check for mapping leaks on
- * removing vdevs because their ms_tree's are
- * used to look for leaks in allocated space.
- */
- if (vd->vdev_ops == &vdev_indirect_ops) {
- zcb->zcb_vd_obsolete_counts[c] =
- zdb_load_obsolete_counts(vd);
+ /*
+ * For leak detection, we overload the ms_allocatable trees
+ * to contain allocated segments instead of free segments.
+ * As a result, we can't use the normal metaslab_load/unload
+ * interfaces.
+ */
+ zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
+ load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
- /*
- * Normally, indirect vdevs don't have any
- * metaslabs. We want to set them up for
- * zio_claim().
- */
- VERIFY0(vdev_metaslab_init(vd, 0));
- }
+ /*
+ * On load_concrete_ms_allocatable_trees() we loaded all the
+ * allocated entries from the ms_sm to the ms_allocatable for
+ * each metaslab. If the pool has a checkpoint or is in the
+ * middle of discarding a checkpoint, some of these blocks
+ * may have been freed but their ms_sm may not have been
+ * updated because they are referenced by the checkpoint. In
+ * order to avoid false-positives during leak-detection, we
+ * go through the vdev's checkpoint space map and exclude all
+ * its entries from their relevant ms_allocatable.
+ *
+ * We also aggregate the space held by the checkpoint and add
+ * it to zcb_checkpoint_size.
+ *
+ * Note that at this point we are also verifying that all the
+ * entries on the checkpoint_sm are marked as allocated in
+ * the ms_sm of their relevant metaslab.
+ * [see comment in checkpoint_sm_exclude_entry_cb()]
+ */
+ zdb_leak_init_exclude_checkpoint(spa, zcb);
- for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
- zdb_leak_init_ms(vd->vdev_ms[m], &vim_idx);
- }
- if (vd->vdev_ops == &vdev_indirect_ops) {
- ASSERT3U(vim_idx, ==,
- vdev_indirect_mapping_num_entries(
- vd->vdev_indirect_mapping));
- }
- }
+ /* for cleaner progress output */
(void) fprintf(stderr, "\n");
if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
@@ -3677,12 +3869,16 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
increment_indirect_mapping_cb, zcb, NULL);
}
+ } else {
+ /*
+ * If leak tracing is disabled, we still need to consider
+ * any checkpointed space in our space verification.
+ */
+ zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
}
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
zdb_ddt_leak_init(spa, zcb);
-
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
@@ -3709,7 +3905,7 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
for (uint64_t inner_offset = 0;
inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
inner_offset += 1 << vd->vdev_ashift) {
- if (range_tree_contains(msp->ms_tree,
+ if (range_tree_contains(msp->ms_allocatable,
offset + inner_offset, 1 << vd->vdev_ashift)) {
obsolete_bytes += 1 << vd->vdev_ashift;
}
@@ -3775,23 +3971,23 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
ASSERT3P(mg, ==, msp->ms_group);
/*
- * The ms_tree has been overloaded to
- * contain allocated segments. Now that we
- * finished traversing all blocks, any
- * block that remains in the ms_tree
+ * ms_allocatable has been overloaded
+ * to contain allocated segments. Now that
+ * we finished traversing all blocks, any
+ * block that remains in the ms_allocatable
* represents an allocated block that we
* did not claim during the traversal.
* Claimed blocks would have been removed
- * from the ms_tree. For indirect vdevs,
- * space remaining in the tree represents
- * parts of the mapping that are not
- * referenced, which is not a bug.
+ * from the ms_allocatable. For indirect
+ * vdevs, space remaining in the tree
+ * represents parts of the mapping that are
+ * not referenced, which is not a bug.
*/
if (vd->vdev_ops == &vdev_indirect_ops) {
- range_tree_vacate(msp->ms_tree,
+ range_tree_vacate(msp->ms_allocatable,
NULL, NULL);
} else {
- range_tree_vacate(msp->ms_tree,
+ range_tree_vacate(msp->ms_allocatable,
zdb_leak, vd);
}
@@ -3923,7 +4119,7 @@ dump_block_stats(spa_t *spa)
total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
- zcb.zcb_removing_size;
+ zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
if (total_found == total_alloc) {
if (!dump_opt['L'])
@@ -4332,6 +4528,390 @@ verify_device_removal_feature_counts(spa_t *spa)
return (ret);
}
+#define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
+/*
+ * Import the checkpointed state of the pool specified by the target
+ * parameter as readonly. The function also accepts a pool config
+ * as an optional parameter, else it attempts to infer the config by
+ * the name of the target pool.
+ *
+ * Note that the checkpointed state's pool name will be the name of
+ * the original pool with the above suffix appened to it. In addition,
+ * if the target is not a pool name (e.g. a path to a dataset) then
+ * the new_path parameter is populated with the updated path to
+ * reflect the fact that we are looking into the checkpointed state.
+ *
+ * The function returns a newly-allocated copy of the name of the
+ * pool containing the checkpointed state. When this copy is no
+ * longer needed it should be freed with free(3C). Same thing
+ * applies to the new_path parameter if allocated.
+ */
+static char *
+import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
+{
+ int error = 0;
+ char *poolname, *bogus_name = NULL;
+
+ /* If the target is not a pool, the extract the pool name */
+ char *path_start = strchr(target, '/');
+ if (path_start != NULL) {
+ size_t poolname_len = path_start - target;
+ poolname = strndup(target, poolname_len);
+ } else {
+ poolname = target;
+ }
+
+ if (cfg == NULL) {
+ error = spa_get_stats(poolname, &cfg, NULL, 0);
+ if (error != 0) {
+ fatal("Tried to read config of pool \"%s\" but "
+ "spa_get_stats() failed with error %d\n",
+ poolname, error);
+ }
+ }
+
+ if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1)
+ return (NULL);
+ fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
+
+ error = spa_import(bogus_name, cfg, NULL,
+ ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT);
+ if (error != 0) {
+ fatal("Tried to import pool \"%s\" but spa_import() failed "
+ "with error %d\n", bogus_name, error);
+ }
+
+ if (new_path != NULL && path_start != NULL) {
+ if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
+ if (path_start != NULL)
+ free(poolname);
+ return (NULL);
+ }
+ }
+
+ if (target != poolname)
+ free(poolname);
+
+ return (bogus_name);
+}
+
+typedef struct verify_checkpoint_sm_entry_cb_arg {
+ vdev_t *vcsec_vd;
+
+ /* the following fields are only used for printing progress */
+ uint64_t vcsec_entryid;
+ uint64_t vcsec_num_entries;
+} verify_checkpoint_sm_entry_cb_arg_t;
+
+#define ENTRIES_PER_PROGRESS_UPDATE 10000
+
+static int
+verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
+ void *arg)
+{
+ verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
+ vdev_t *vd = vcsec->vcsec_vd;
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ uint64_t end = offset + size;
+
+ ASSERT(type == SM_FREE);
+
+ if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
+ (void) fprintf(stderr,
+ "\rverifying vdev %llu, space map entry %llu of %llu ...",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)vcsec->vcsec_entryid,
+ (longlong_t)vcsec->vcsec_num_entries);
+ }
+ vcsec->vcsec_entryid++;
+
+ /*
+ * See comment in checkpoint_sm_exclude_entry_cb()
+ */
+ VERIFY3U(offset, >=, ms->ms_start);
+ VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+ /*
+ * The entries in the vdev_checkpoint_sm should be marked as
+ * allocated in the checkpointed state of the pool, therefore
+ * their respective ms_allocateable trees should not contain them.
+ */
+ mutex_enter(&ms->ms_lock);
+ range_tree_verify(ms->ms_allocatable, offset, size);
+ mutex_exit(&ms->ms_lock);
+
+ return (0);
+}
+
+/*
+ * Verify that all segments in the vdev_checkpoint_sm are allocated
+ * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
+ * ms_allocatable).
+ *
+ * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
+ * each vdev in the current state of the pool to the metaslab space maps
+ * (ms_sm) of the checkpointed state of the pool.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of the current spa_t. The entries of these ms_allocatable
+ * trees are cleared out and then repopulated from with the free
+ * entries of their respective ms_sm space maps.
+ */
+static void
+verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+ vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+ vdev_t *current_rvd = current->spa_root_vdev;
+
+ load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
+
+ for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
+ vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
+ vdev_t *current_vd = current_rvd->vdev_child[c];
+
+ space_map_t *checkpoint_sm = NULL;
+ uint64_t checkpoint_sm_obj;
+
+ if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * Since we don't allow device removal in a pool
+ * that has a checkpoint, we expect that all removed
+ * vdevs were removed from the pool before the
+ * checkpoint.
+ */
+ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+ continue;
+ }
+
+ /*
+ * If the checkpoint space map doesn't exist, then nothing
+ * here is checkpointed so there's nothing to verify.
+ */
+ if (current_vd->vdev_top_zap == 0 ||
+ zap_contains(spa_meta_objset(current),
+ current_vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+ continue;
+
+ VERIFY0(zap_lookup(spa_meta_objset(current),
+ current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+ sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+ VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
+ checkpoint_sm_obj, 0, current_vd->vdev_asize,
+ current_vd->vdev_ashift));
+ space_map_update(checkpoint_sm);
+
+ verify_checkpoint_sm_entry_cb_arg_t vcsec;
+ vcsec.vcsec_vd = ckpoint_vd;
+ vcsec.vcsec_entryid = 0;
+ vcsec.vcsec_num_entries =
+ space_map_length(checkpoint_sm) / sizeof (uint64_t);
+ VERIFY0(space_map_iterate(checkpoint_sm,
+ verify_checkpoint_sm_entry_cb, &vcsec));
+ dump_spacemap(current->spa_meta_objset, checkpoint_sm);
+ space_map_close(checkpoint_sm);
+ }
+
+ /*
+ * If we've added vdevs since we took the checkpoint, ensure
+ * that their checkpoint space maps are empty.
+ */
+ if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
+ for (uint64_t c = ckpoint_rvd->vdev_children;
+ c < current_rvd->vdev_children; c++) {
+ vdev_t *current_vd = current_rvd->vdev_child[c];
+ ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
+ }
+ }
+
+ /* for cleaner progress output */
+ (void) fprintf(stderr, "\n");
+}
+
+/*
+ * Verifies that all space that's allocated in the checkpoint is
+ * still allocated in the current version, by checking that everything
+ * in checkpoint's ms_allocatable (which is actually allocated, not
+ * allocatable/free) is not present in current's ms_allocatable.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of both spas when called. The entries of all ms_allocatable
+ * trees are cleared out and then repopulated from their respective
+ * ms_sm space maps. In the checkpointed state we load the allocated
+ * entries, and in the current state we load the free entries.
+ */
+static void
+verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+ vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+ vdev_t *current_rvd = current->spa_root_vdev;
+
+ load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
+ load_concrete_ms_allocatable_trees(current, SM_FREE);
+
+ for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
+ vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
+ vdev_t *current_vd = current_rvd->vdev_child[i];
+
+ if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * See comment in verify_checkpoint_vdev_spacemaps()
+ */
+ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+ continue;
+ }
+
+ for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
+ metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
+ metaslab_t *current_msp = current_vd->vdev_ms[m];
+
+ (void) fprintf(stderr,
+ "\rverifying vdev %llu of %llu, "
+ "metaslab %llu of %llu ...",
+ (longlong_t)current_vd->vdev_id,
+ (longlong_t)current_rvd->vdev_children,
+ (longlong_t)current_vd->vdev_ms[m]->ms_id,
+ (longlong_t)current_vd->vdev_ms_count);
+
+ /*
+ * We walk through the ms_allocatable trees that
+ * are loaded with the allocated blocks from the
+ * ms_sm spacemaps of the checkpoint. For each
+ * one of these ranges we ensure that none of them
+ * exists in the ms_allocatable trees of the
+ * current state which are loaded with the ranges
+ * that are currently free.
+ *
+ * This way we ensure that none of the blocks that
+ * are part of the checkpoint were freed by mistake.
+ */
+ range_tree_walk(ckpoint_msp->ms_allocatable,
+ (range_tree_func_t *)range_tree_verify,
+ current_msp->ms_allocatable);
+ }
+ }
+
+ /* for cleaner progress output */
+ (void) fprintf(stderr, "\n");
+}
+
+static void
+verify_checkpoint_blocks(spa_t *spa)
+{
+ spa_t *checkpoint_spa;
+ char *checkpoint_pool;
+ nvlist_t *config = NULL;
+ int error = 0;
+
+ /*
+ * We import the checkpointed state of the pool (under a different
+ * name) so we can do verification on it against the current state
+ * of the pool.
+ */
+ checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
+ NULL);
+ ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
+
+ error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
+ if (error != 0) {
+ fatal("Tried to open pool \"%s\" but spa_open() failed with "
+ "error %d\n", checkpoint_pool, error);
+ }
+
+ /*
+ * Ensure that ranges in the checkpoint space maps of each vdev
+ * are allocated according to the checkpointed state's metaslab
+ * space maps.
+ */
+ verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
+
+ /*
+ * Ensure that allocated ranges in the checkpoint's metaslab
+ * space maps remain allocated in the metaslab space maps of
+ * the current state.
+ */
+ verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
+
+ /*
+ * Once we are done, we get rid of the checkpointed state.
+ */
+ spa_close(checkpoint_spa, FTAG);
+ free(checkpoint_pool);
+}
+
+static void
+dump_leftover_checkpoint_blocks(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *vd = rvd->vdev_child[i];
+
+ space_map_t *checkpoint_sm = NULL;
+ uint64_t checkpoint_sm_obj;
+
+ if (vd->vdev_top_zap == 0)
+ continue;
+
+ if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+ continue;
+
+ VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+ sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+ VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+ checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+ space_map_update(checkpoint_sm);
+ dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
+ space_map_close(checkpoint_sm);
+ }
+}
+
+static int
+verify_checkpoint(spa_t *spa)
+{
+ uberblock_t checkpoint;
+ int error;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (0);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+ sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+ if (error == ENOENT) {
+ /*
+ * If the feature is active but the uberblock is missing
+ * then we must be in the middle of discarding the
+ * checkpoint.
+ */
+ (void) printf("\nPartially discarded checkpoint "
+ "state found:\n");
+ dump_leftover_checkpoint_blocks(spa);
+ return (0);
+ } else if (error != 0) {
+ (void) printf("lookup error %d when looking for "
+ "checkpointed uberblock in MOS\n", error);
+ return (error);
+ }
+ dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
+
+ if (checkpoint.ub_checkpoint_txg == 0) {
+ (void) printf("\nub_checkpoint_txg not set in checkpointed "
+ "uberblock\n");
+ error = 3;
+ }
+
+ if (error == 0)
+ verify_checkpoint_blocks(spa);
+
+ return (error);
+}
+
static void
dump_zpool(spa_t *spa)
{
@@ -4435,6 +5015,9 @@ dump_zpool(spa_t *spa)
if (dump_opt['h'])
dump_history(spa);
+ if (rc == 0 && !dump_opt['L'])
+ rc = verify_checkpoint(spa);
+
if (rc != 0) {
dump_debug_buffer();
exit(rc);
@@ -4879,6 +5462,7 @@ main(int argc, char **argv)
int rewind = ZPOOL_NEVER_REWIND;
char *spa_config_path_env;
boolean_t target_is_spa = B_TRUE;
+ nvlist_t *cfg = NULL;
(void) setrlimit(RLIMIT_NOFILE, &rl);
(void) enable_extended_FILE_stdio(-1, -1);
@@ -4895,7 +5479,7 @@ main(int argc, char **argv)
spa_config_path = spa_config_path_env;
while ((c = getopt(argc, argv,
- "AbcCdDeEFGhiI:lLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
+ "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
switch (c) {
case 'b':
case 'c':
@@ -4920,6 +5504,7 @@ main(int argc, char **argv)
case 'A':
case 'e':
case 'F':
+ case 'k':
case 'L':
case 'P':
case 'q':
@@ -5029,7 +5614,7 @@ main(int argc, char **argv)
verbose = MAX(verbose, 1);
for (c = 0; c < 256; c++) {
- if (dump_all && strchr("AeEFlLOPRSX", c) == NULL)
+ if (dump_all && strchr("AeEFklLOPRSX", c) == NULL)
dump_opt[c] = 1;
if (dump_opt[c])
dump_opt[c] += verbose;
@@ -5081,6 +5666,17 @@ main(int argc, char **argv)
error = 0;
target = argv[0];
+ char *checkpoint_pool = NULL;
+ char *checkpoint_target = NULL;
+ if (dump_opt['k']) {
+ checkpoint_pool = import_checkpointed_state(target, cfg,
+ &checkpoint_target);
+
+ if (checkpoint_target != NULL)
+ target = checkpoint_target;
+
+ }
+
if (strpbrk(target, "/@") != NULL) {
size_t targetlen;
@@ -5097,7 +5693,6 @@ main(int argc, char **argv)
if (dump_opt['e']) {
importargs_t args = { 0 };
- nvlist_t *cfg = NULL;
args.paths = nsearch;
args.path = searchdirs;
@@ -5121,6 +5716,7 @@ main(int argc, char **argv)
(void) printf("\nConfiguration for import:\n");
dump_nvlist(cfg, 8);
}
+
error = spa_import(target_pool, cfg, NULL,
flags | ZFS_IMPORT_SKIP_MMP);
}
@@ -5130,7 +5726,18 @@ main(int argc, char **argv)
free(target_pool);
if (error == 0) {
- if (target_is_spa || dump_opt['R']) {
+ if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
+ ASSERT(checkpoint_pool != NULL);
+ ASSERT(checkpoint_target == NULL);
+
+ error = spa_open(checkpoint_pool, &spa, FTAG);
+ if (error != 0) {
+ fatal("Tried to open pool \"%s\" but "
+ "spa_open() failed with error %d\n",
+ checkpoint_pool, error);
+ }
+
+ } else if (target_is_spa || dump_opt['R']) {
/*
* Disable the activity check to allow examination of
* active pools.
@@ -5216,6 +5823,12 @@ main(int argc, char **argv)
zdb_read_block(argv[i], spa);
}
+ if (dump_opt['k']) {
+ free(checkpoint_pool);
+ if (!target_is_spa)
+ free(checkpoint_target);
+ }
+
if (os != NULL)
close_objset(os, FTAG);
else