diff options
author | Serapheim Dimitropoulos <[email protected]> | 2016-12-16 14:11:29 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2018-06-26 10:07:42 -0700 |
commit | d2734cce68cf740e015312314415f9034c67851c (patch) | |
tree | b7a140a3cf2a19bb7c88f2d277f3b5a33c121cea /cmd/zdb | |
parent | 88eaf610d9c7056f0946e5090cba1e6288ff2b70 (diff) |
OpenZFS 9166 - zfs storage pool checkpoint
Details about the motivation of this feature and its usage can
be found in this blogpost:
https://sdimitro.github.io/post/zpool-checkpoint/
A lightning talk of this feature can be found here:
https://www.youtube.com/watch?v=fPQA8K40jAM
Implementation details can be found in big block comment of
spa_checkpoint.c
Side-changes that are relevant to this commit but not explained
elsewhere:
* renames members of "struct metaslab trees to be shorter without
losing meaning
* space_map_{alloc,truncate}() accept a block size as a
parameter. The reason is that in the current state all space
maps that we allocate through the DMU use a global tunable
(space_map_blksz) which defauls to 4KB. This is ok for metaslab
space maps in terms of bandwirdth since they are scattered all
over the disk. But for other space maps this default is probably
not what we want. Examples are device removal's vdev_obsolete_sm
or vdev_chedkpoint_sm from this review. Both of these have a
1:1 relationship with each vdev and could benefit from a bigger
block size.
Porting notes:
* The part of dsl_scan_sync() which handles async destroys has
been moved into the new dsl_process_async_destroys() function.
* Remove "VERIFY(!(flags & FWRITE))" in "kernel.c" so zhack can write
to block device backed pools.
* ZTS:
* Fix get_txg() in zpool_sync_001_pos due to "checkpoint_txg".
* Don't use large dd block sizes on /dev/urandom under Linux in
checkpoint_capacity.
* Adopt Delphix-OS's setting of 4 (spa_asize_inflation =
SPA_DVAS_PER_BP + 1) for the checkpoint_capacity test to speed
its attempts to fill the pool
* Create the base and nested pools with sync=disabled to speed up
the "setup" phase.
* Clear labels in test pool between checkpoint tests to avoid
duplicate pool issues.
* The import_rewind_device_replaced test has been marked as "known
to fail" for the reasons listed in its DISCLAIMER.
* New module parameters:
zfs_spa_discard_memory_limit,
zfs_remove_max_bytes_pause (not documented - debugging only)
vdev_max_ms_count (formerly metaslabs_per_vdev)
vdev_min_ms_count
Authored by: Serapheim Dimitropoulos <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: John Kennedy <[email protected]>
Reviewed by: Dan Kimmel <[email protected]>
Reviewed by: Brian Behlendorf <[email protected]>
Approved by: Richard Lowe <[email protected]>
Ported-by: Tim Chase <[email protected]>
Signed-off-by: Tim Chase <[email protected]>
OpenZFS-issue: https://illumos.org/issues/9166
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/7159fdb8
Closes #7570
Diffstat (limited to 'cmd/zdb')
-rw-r--r-- | cmd/zdb/zdb.c | 853 | ||||
-rw-r--r-- | cmd/zdb/zdb_il.c | 12 |
2 files changed, 742 insertions, 123 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index faac43c79..d1e77cce7 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -131,7 +131,7 @@ static void usage(void) { (void) fprintf(stderr, - "Usage:\t%s [-AbcdDFGhiLMPsvX] [-e [-V] [-p <path> ...]] " + "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p <path> ...]] " "[-I <inflight I/Os>]\n" "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n" "\t\t[<poolname> [<object> ...]]\n" @@ -168,6 +168,8 @@ usage(void) (void) fprintf(stderr, " -h pool history\n"); (void) fprintf(stderr, " -i intent logs\n"); (void) fprintf(stderr, " -l read label contents\n"); + (void) fprintf(stderr, " -k examine the checkpointed state " + "of the pool\n"); (void) fprintf(stderr, " -L disable leak tracking (do not " "load spacemaps)\n"); (void) fprintf(stderr, " -m metaslabs\n"); @@ -731,6 +733,22 @@ get_prev_obsolete_spacemap_refcount(spa_t *spa) } static int +get_checkpoint_refcount(vdev_t *vd) +{ + int refcount = 0; + + if (vd->vdev_top == vd && vd->vdev_top_zap != 0 && + zap_contains(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0) + refcount++; + + for (uint64_t c = 0; c < vd->vdev_children; c++) + refcount += get_checkpoint_refcount(vd->vdev_child[c]); + + return (refcount); +} + +static int verify_spacemap_refcounts(spa_t *spa) { uint64_t expected_refcount = 0; @@ -743,6 +761,7 @@ verify_spacemap_refcounts(spa_t *spa) actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); actual_refcount += get_prev_obsolete_spacemap_refcount(spa); + actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); if (expected_refcount != actual_refcount) { (void) printf("space map refcount mismatch: expected %lld != " @@ -816,8 +835,8 @@ static void dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; - range_tree_t *rt = msp->ms_tree; - avl_tree_t *t = &msp->ms_size_tree; + range_tree_t *rt = msp->ms_allocatable; + avl_tree_t *t = &msp->ms_allocatable_by_size; int free_pct = range_tree_space(rt) * 100 / msp->ms_size; /* max sure nicenum has enough space */ @@ -853,7 +872,7 @@ dump_metaslab(metaslab_t *msp) metaslab_load_wait(msp); if (!msp->ms_loaded) { VERIFY0(metaslab_load(msp)); - range_tree_stat_verify(msp->ms_tree); + range_tree_stat_verify(msp->ms_allocatable); } dump_metaslab_stats(msp); metaslab_unload(msp); @@ -2420,6 +2439,8 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer) snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } + (void) printf("\tcheckpoint_txg = %llu\n", + (u_longlong_t)ub->ub_checkpoint_txg); (void) printf("%s", footer ? footer : ""); } @@ -3129,6 +3150,7 @@ static const char *zdb_ot_extname[] = { typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_removing_size; + uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; @@ -3229,7 +3251,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, } VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, - refcnt ? 0 : spa_first_txg(zcb->zcb_spa), + refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } @@ -3388,7 +3410,7 @@ claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, ASSERT(vdev_is_concrete(vd)); VERIFY0(metaslab_claim_impl(vd, offset, size, - spa_first_txg(vd->vdev_spa))); + spa_min_claim_txg(vd->vdev_spa))); } static void @@ -3453,70 +3475,6 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) spa_config_exit(spa, SCL_CONFIG, FTAG); } -/* - * vm_idxp is an in-out parameter which (for indirect vdevs) is the - * index in vim_entries that has the first entry in this metaslab. On - * return, it will be set to the first entry after this metaslab. - */ -static void -zdb_leak_init_ms(metaslab_t *msp, uint64_t *vim_idxp) -{ - metaslab_group_t *mg = msp->ms_group; - vdev_t *vd = mg->mg_vd; - vdev_t *rvd = vd->vdev_spa->spa_root_vdev; - - mutex_enter(&msp->ms_lock); - metaslab_unload(msp); - - /* - * We don't want to spend the CPU manipulating the size-ordered - * tree, so clear the range_tree ops. - */ - msp->ms_tree->rt_ops = NULL; - - (void) fprintf(stderr, - "\rloading vdev %llu of %llu, metaslab %llu of %llu ...", - (longlong_t)vd->vdev_id, - (longlong_t)rvd->vdev_children, - (longlong_t)msp->ms_id, - (longlong_t)vd->vdev_ms_count); - - /* - * For leak detection, we overload the metaslab ms_tree to - * contain allocated segments instead of free segments. As a - * result, we can't use the normal metaslab_load/unload - * interfaces. - */ - if (vd->vdev_ops == &vdev_indirect_ops) { - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); - (*vim_idxp)++) { - vdev_indirect_mapping_entry_phys_t *vimep = - &vim->vim_entries[*vim_idxp]; - uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); - uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); - ASSERT3U(ent_offset, >=, msp->ms_start); - if (ent_offset >= msp->ms_start + msp->ms_size) - break; - - /* - * Mappings do not cross metaslab boundaries, - * because we create them by walking the metaslabs. - */ - ASSERT3U(ent_offset + ent_len, <=, - msp->ms_start + msp->ms_size); - range_tree_add(msp->ms_tree, ent_offset, ent_len); - } - } else if (msp->ms_sm != NULL) { - VERIFY0(space_map_load(msp->ms_sm, msp->ms_tree, SM_ALLOC)); - } - - if (!msp->ms_loaded) { - msp->ms_loaded = B_TRUE; - } - mutex_exit(&msp->ms_lock); -} - /* ARGSUSED */ static int increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) @@ -3615,11 +3573,246 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) ASSERT(error == ENOENT); } +typedef struct checkpoint_sm_exclude_entry_arg { + vdev_t *cseea_vd; + uint64_t cseea_checkpoint_size; +} checkpoint_sm_exclude_entry_arg_t; + +static int +checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size, + void *arg) +{ + checkpoint_sm_exclude_entry_arg_t *cseea = arg; + vdev_t *vd = cseea->cseea_vd; + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + uint64_t end = offset + size; + + ASSERT(type == SM_FREE); + + /* + * Since the vdev_checkpoint_sm exists in the vdev level + * and the ms_sm space maps exist in the metaslab level, + * an entry in the checkpoint space map could theoretically + * cross the boundaries of the metaslab that it belongs. + * + * In reality, because of the way that we populate and + * manipulate the checkpoint's space maps currently, + * there shouldn't be any entries that cross metaslabs. + * Hence the assertion below. + * + * That said, there is no fundamental requirement that + * the checkpoint's space map entries should not cross + * metaslab boundaries. So if needed we could add code + * that handles metaslab-crossing segments in the future. + */ + VERIFY3U(offset, >=, ms->ms_start); + VERIFY3U(end, <=, ms->ms_start + ms->ms_size); + + /* + * By removing the entry from the allocated segments we + * also verify that the entry is there to begin with. + */ + mutex_enter(&ms->ms_lock); + range_tree_remove(ms->ms_allocatable, offset, size); + mutex_exit(&ms->ms_lock); + + cseea->cseea_checkpoint_size += size; + return (0); +} + +static void +zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) +{ + spa_t *spa = vd->vdev_spa; + space_map_t *checkpoint_sm = NULL; + uint64_t checkpoint_sm_obj; + + /* + * If there is no vdev_top_zap, we are in a pool whose + * version predates the pool checkpoint feature. + */ + if (vd->vdev_top_zap == 0) + return; + + /* + * If there is no reference of the vdev_checkpoint_sm in + * the vdev_top_zap, then one of the following scenarios + * is true: + * + * 1] There is no checkpoint + * 2] There is a checkpoint, but no checkpointed blocks + * have been freed yet + * 3] The current vdev is indirect + * + * In these cases we return immediately. + */ + if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) + return; + + VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, + &checkpoint_sm_obj)); + + checkpoint_sm_exclude_entry_arg_t cseea; + cseea.cseea_vd = vd; + cseea.cseea_checkpoint_size = 0; + + VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), + checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); + space_map_update(checkpoint_sm); + + VERIFY0(space_map_iterate(checkpoint_sm, + checkpoint_sm_exclude_entry_cb, &cseea)); + space_map_close(checkpoint_sm); + + zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size; +} + +static void +zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) +{ + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id); + zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb); + } +} + +static void +load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) +{ + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + + ASSERT3U(i, ==, vd->vdev_id); + + if (vd->vdev_ops == &vdev_indirect_ops) + continue; + + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + (void) fprintf(stderr, + "\rloading concrete vdev %llu, " + "metaslab %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)msp->ms_id, + (longlong_t)vd->vdev_ms_count); + + mutex_enter(&msp->ms_lock); + metaslab_unload(msp); + + /* + * We don't want to spend the CPU manipulating the + * size-ordered tree, so clear the range_tree ops. + */ + msp->ms_allocatable->rt_ops = NULL; + + if (msp->ms_sm != NULL) { + VERIFY0(space_map_load(msp->ms_sm, + msp->ms_allocatable, maptype)); + } + if (!msp->ms_loaded) + msp->ms_loaded = B_TRUE; + mutex_exit(&msp->ms_lock); + } + } +} + +/* + * vm_idxp is an in-out parameter which (for indirect vdevs) is the + * index in vim_entries that has the first entry in this metaslab. + * On return, it will be set to the first entry after this metaslab. + */ +static void +load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, + uint64_t *vim_idxp) +{ + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + mutex_enter(&msp->ms_lock); + metaslab_unload(msp); + + /* + * We don't want to spend the CPU manipulating the + * size-ordered tree, so clear the range_tree ops. + */ + msp->ms_allocatable->rt_ops = NULL; + + for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); + (*vim_idxp)++) { + vdev_indirect_mapping_entry_phys_t *vimep = + &vim->vim_entries[*vim_idxp]; + uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); + uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); + ASSERT3U(ent_offset, >=, msp->ms_start); + if (ent_offset >= msp->ms_start + msp->ms_size) + break; + + /* + * Mappings do not cross metaslab boundaries, + * because we create them by walking the metaslabs. + */ + ASSERT3U(ent_offset + ent_len, <=, + msp->ms_start + msp->ms_size); + range_tree_add(msp->ms_allocatable, ent_offset, ent_len); + } + + if (!msp->ms_loaded) + msp->ms_loaded = B_TRUE; + mutex_exit(&msp->ms_lock); +} + +static void +zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) +{ + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + ASSERT3U(c, ==, vd->vdev_id); + + if (vd->vdev_ops != &vdev_indirect_ops) + continue; + + /* + * Note: we don't check for mapping leaks on + * removing vdevs because their ms_allocatable's + * are used to look for leaks in allocated space. + */ + zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd); + + /* + * Normally, indirect vdevs don't have any + * metaslabs. We want to set them up for + * zio_claim(). + */ + VERIFY0(vdev_metaslab_init(vd, 0)); + + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t vim_idx = 0; + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + + (void) fprintf(stderr, + "\rloading indirect vdev %llu, " + "metaslab %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)vd->vdev_ms[m]->ms_id, + (longlong_t)vd->vdev_ms_count); + + load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], + &vim_idx); + } + ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); + } +} + static void zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) { zcb->zcb_spa = spa; - uint64_t c; if (!dump_opt['L']) { dsl_pool_t *dp = spa->spa_dsl_pool; @@ -3627,7 +3820,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) /* * We are going to be changing the meaning of the metaslab's - * ms_tree. Ensure that the allocator doesn't try to + * ms_allocatable. Ensure that the allocator doesn't try to * use the tree. */ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; @@ -3637,38 +3830,37 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), UMEM_NOFAIL); - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - uint64_t vim_idx = 0; - - ASSERT3U(c, ==, vd->vdev_id); - - /* - * Note: we don't check for mapping leaks on - * removing vdevs because their ms_tree's are - * used to look for leaks in allocated space. - */ - if (vd->vdev_ops == &vdev_indirect_ops) { - zcb->zcb_vd_obsolete_counts[c] = - zdb_load_obsolete_counts(vd); + /* + * For leak detection, we overload the ms_allocatable trees + * to contain allocated segments instead of free segments. + * As a result, we can't use the normal metaslab_load/unload + * interfaces. + */ + zdb_leak_init_prepare_indirect_vdevs(spa, zcb); + load_concrete_ms_allocatable_trees(spa, SM_ALLOC); - /* - * Normally, indirect vdevs don't have any - * metaslabs. We want to set them up for - * zio_claim(). - */ - VERIFY0(vdev_metaslab_init(vd, 0)); - } + /* + * On load_concrete_ms_allocatable_trees() we loaded all the + * allocated entries from the ms_sm to the ms_allocatable for + * each metaslab. If the pool has a checkpoint or is in the + * middle of discarding a checkpoint, some of these blocks + * may have been freed but their ms_sm may not have been + * updated because they are referenced by the checkpoint. In + * order to avoid false-positives during leak-detection, we + * go through the vdev's checkpoint space map and exclude all + * its entries from their relevant ms_allocatable. + * + * We also aggregate the space held by the checkpoint and add + * it to zcb_checkpoint_size. + * + * Note that at this point we are also verifying that all the + * entries on the checkpoint_sm are marked as allocated in + * the ms_sm of their relevant metaslab. + * [see comment in checkpoint_sm_exclude_entry_cb()] + */ + zdb_leak_init_exclude_checkpoint(spa, zcb); - for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { - zdb_leak_init_ms(vd->vdev_ms[m], &vim_idx); - } - if (vd->vdev_ops == &vdev_indirect_ops) { - ASSERT3U(vim_idx, ==, - vdev_indirect_mapping_num_entries( - vd->vdev_indirect_mapping)); - } - } + /* for cleaner progress output */ (void) fprintf(stderr, "\n"); if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { @@ -3677,12 +3869,16 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, increment_indirect_mapping_cb, zcb, NULL); } + } else { + /* + * If leak tracing is disabled, we still need to consider + * any checkpointed space in our space verification. + */ + zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa); } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - zdb_ddt_leak_init(spa, zcb); - spa_config_exit(spa, SCL_CONFIG, FTAG); } @@ -3709,7 +3905,7 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) for (uint64_t inner_offset = 0; inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); inner_offset += 1 << vd->vdev_ashift) { - if (range_tree_contains(msp->ms_tree, + if (range_tree_contains(msp->ms_allocatable, offset + inner_offset, 1 << vd->vdev_ashift)) { obsolete_bytes += 1 << vd->vdev_ashift; } @@ -3775,23 +3971,23 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) ASSERT3P(mg, ==, msp->ms_group); /* - * The ms_tree has been overloaded to - * contain allocated segments. Now that we - * finished traversing all blocks, any - * block that remains in the ms_tree + * ms_allocatable has been overloaded + * to contain allocated segments. Now that + * we finished traversing all blocks, any + * block that remains in the ms_allocatable * represents an allocated block that we * did not claim during the traversal. * Claimed blocks would have been removed - * from the ms_tree. For indirect vdevs, - * space remaining in the tree represents - * parts of the mapping that are not - * referenced, which is not a bug. + * from the ms_allocatable. For indirect + * vdevs, space remaining in the tree + * represents parts of the mapping that are + * not referenced, which is not a bug. */ if (vd->vdev_ops == &vdev_indirect_ops) { - range_tree_vacate(msp->ms_tree, + range_tree_vacate(msp->ms_allocatable, NULL, NULL); } else { - range_tree_vacate(msp->ms_tree, + range_tree_vacate(msp->ms_allocatable, zdb_leak, vd); } @@ -3923,7 +4119,7 @@ dump_block_stats(spa_t *spa) total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)); total_found = tzb->zb_asize - zcb.zcb_dedup_asize + - zcb.zcb_removing_size; + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; if (total_found == total_alloc) { if (!dump_opt['L']) @@ -4332,6 +4528,390 @@ verify_device_removal_feature_counts(spa_t *spa) return (ret); } +#define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" +/* + * Import the checkpointed state of the pool specified by the target + * parameter as readonly. The function also accepts a pool config + * as an optional parameter, else it attempts to infer the config by + * the name of the target pool. + * + * Note that the checkpointed state's pool name will be the name of + * the original pool with the above suffix appened to it. In addition, + * if the target is not a pool name (e.g. a path to a dataset) then + * the new_path parameter is populated with the updated path to + * reflect the fact that we are looking into the checkpointed state. + * + * The function returns a newly-allocated copy of the name of the + * pool containing the checkpointed state. When this copy is no + * longer needed it should be freed with free(3C). Same thing + * applies to the new_path parameter if allocated. + */ +static char * +import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) +{ + int error = 0; + char *poolname, *bogus_name = NULL; + + /* If the target is not a pool, the extract the pool name */ + char *path_start = strchr(target, '/'); + if (path_start != NULL) { + size_t poolname_len = path_start - target; + poolname = strndup(target, poolname_len); + } else { + poolname = target; + } + + if (cfg == NULL) { + error = spa_get_stats(poolname, &cfg, NULL, 0); + if (error != 0) { + fatal("Tried to read config of pool \"%s\" but " + "spa_get_stats() failed with error %d\n", + poolname, error); + } + } + + if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) + return (NULL); + fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name); + + error = spa_import(bogus_name, cfg, NULL, + ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT); + if (error != 0) { + fatal("Tried to import pool \"%s\" but spa_import() failed " + "with error %d\n", bogus_name, error); + } + + if (new_path != NULL && path_start != NULL) { + if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) { + if (path_start != NULL) + free(poolname); + return (NULL); + } + } + + if (target != poolname) + free(poolname); + + return (bogus_name); +} + +typedef struct verify_checkpoint_sm_entry_cb_arg { + vdev_t *vcsec_vd; + + /* the following fields are only used for printing progress */ + uint64_t vcsec_entryid; + uint64_t vcsec_num_entries; +} verify_checkpoint_sm_entry_cb_arg_t; + +#define ENTRIES_PER_PROGRESS_UPDATE 10000 + +static int +verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size, + void *arg) +{ + verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; + vdev_t *vd = vcsec->vcsec_vd; + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + uint64_t end = offset + size; + + ASSERT(type == SM_FREE); + + if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { + (void) fprintf(stderr, + "\rverifying vdev %llu, space map entry %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)vcsec->vcsec_entryid, + (longlong_t)vcsec->vcsec_num_entries); + } + vcsec->vcsec_entryid++; + + /* + * See comment in checkpoint_sm_exclude_entry_cb() + */ + VERIFY3U(offset, >=, ms->ms_start); + VERIFY3U(end, <=, ms->ms_start + ms->ms_size); + + /* + * The entries in the vdev_checkpoint_sm should be marked as + * allocated in the checkpointed state of the pool, therefore + * their respective ms_allocateable trees should not contain them. + */ + mutex_enter(&ms->ms_lock); + range_tree_verify(ms->ms_allocatable, offset, size); + mutex_exit(&ms->ms_lock); + + return (0); +} + +/* + * Verify that all segments in the vdev_checkpoint_sm are allocated + * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's + * ms_allocatable). + * + * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of + * each vdev in the current state of the pool to the metaslab space maps + * (ms_sm) of the checkpointed state of the pool. + * + * Note that the function changes the state of the ms_allocatable + * trees of the current spa_t. The entries of these ms_allocatable + * trees are cleared out and then repopulated from with the free + * entries of their respective ms_sm space maps. + */ +static void +verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) +{ + vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; + vdev_t *current_rvd = current->spa_root_vdev; + + load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); + + for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { + vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; + vdev_t *current_vd = current_rvd->vdev_child[c]; + + space_map_t *checkpoint_sm = NULL; + uint64_t checkpoint_sm_obj; + + if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { + /* + * Since we don't allow device removal in a pool + * that has a checkpoint, we expect that all removed + * vdevs were removed from the pool before the + * checkpoint. + */ + ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); + continue; + } + + /* + * If the checkpoint space map doesn't exist, then nothing + * here is checkpointed so there's nothing to verify. + */ + if (current_vd->vdev_top_zap == 0 || + zap_contains(spa_meta_objset(current), + current_vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) + continue; + + VERIFY0(zap_lookup(spa_meta_objset(current), + current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, + sizeof (uint64_t), 1, &checkpoint_sm_obj)); + + VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), + checkpoint_sm_obj, 0, current_vd->vdev_asize, + current_vd->vdev_ashift)); + space_map_update(checkpoint_sm); + + verify_checkpoint_sm_entry_cb_arg_t vcsec; + vcsec.vcsec_vd = ckpoint_vd; + vcsec.vcsec_entryid = 0; + vcsec.vcsec_num_entries = + space_map_length(checkpoint_sm) / sizeof (uint64_t); + VERIFY0(space_map_iterate(checkpoint_sm, + verify_checkpoint_sm_entry_cb, &vcsec)); + dump_spacemap(current->spa_meta_objset, checkpoint_sm); + space_map_close(checkpoint_sm); + } + + /* + * If we've added vdevs since we took the checkpoint, ensure + * that their checkpoint space maps are empty. + */ + if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { + for (uint64_t c = ckpoint_rvd->vdev_children; + c < current_rvd->vdev_children; c++) { + vdev_t *current_vd = current_rvd->vdev_child[c]; + ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL); + } + } + + /* for cleaner progress output */ + (void) fprintf(stderr, "\n"); +} + +/* + * Verifies that all space that's allocated in the checkpoint is + * still allocated in the current version, by checking that everything + * in checkpoint's ms_allocatable (which is actually allocated, not + * allocatable/free) is not present in current's ms_allocatable. + * + * Note that the function changes the state of the ms_allocatable + * trees of both spas when called. The entries of all ms_allocatable + * trees are cleared out and then repopulated from their respective + * ms_sm space maps. In the checkpointed state we load the allocated + * entries, and in the current state we load the free entries. + */ +static void +verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) +{ + vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; + vdev_t *current_rvd = current->spa_root_vdev; + + load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC); + load_concrete_ms_allocatable_trees(current, SM_FREE); + + for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) { + vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i]; + vdev_t *current_vd = current_rvd->vdev_child[i]; + + if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { + /* + * See comment in verify_checkpoint_vdev_spacemaps() + */ + ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); + continue; + } + + for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) { + metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m]; + metaslab_t *current_msp = current_vd->vdev_ms[m]; + + (void) fprintf(stderr, + "\rverifying vdev %llu of %llu, " + "metaslab %llu of %llu ...", + (longlong_t)current_vd->vdev_id, + (longlong_t)current_rvd->vdev_children, + (longlong_t)current_vd->vdev_ms[m]->ms_id, + (longlong_t)current_vd->vdev_ms_count); + + /* + * We walk through the ms_allocatable trees that + * are loaded with the allocated blocks from the + * ms_sm spacemaps of the checkpoint. For each + * one of these ranges we ensure that none of them + * exists in the ms_allocatable trees of the + * current state which are loaded with the ranges + * that are currently free. + * + * This way we ensure that none of the blocks that + * are part of the checkpoint were freed by mistake. + */ + range_tree_walk(ckpoint_msp->ms_allocatable, + (range_tree_func_t *)range_tree_verify, + current_msp->ms_allocatable); + } + } + + /* for cleaner progress output */ + (void) fprintf(stderr, "\n"); +} + +static void +verify_checkpoint_blocks(spa_t *spa) +{ + spa_t *checkpoint_spa; + char *checkpoint_pool; + nvlist_t *config = NULL; + int error = 0; + + /* + * We import the checkpointed state of the pool (under a different + * name) so we can do verification on it against the current state + * of the pool. + */ + checkpoint_pool = import_checkpointed_state(spa->spa_name, config, + NULL); + ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); + + error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); + if (error != 0) { + fatal("Tried to open pool \"%s\" but spa_open() failed with " + "error %d\n", checkpoint_pool, error); + } + + /* + * Ensure that ranges in the checkpoint space maps of each vdev + * are allocated according to the checkpointed state's metaslab + * space maps. + */ + verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); + + /* + * Ensure that allocated ranges in the checkpoint's metaslab + * space maps remain allocated in the metaslab space maps of + * the current state. + */ + verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); + + /* + * Once we are done, we get rid of the checkpointed state. + */ + spa_close(checkpoint_spa, FTAG); + free(checkpoint_pool); +} + +static void +dump_leftover_checkpoint_blocks(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + + space_map_t *checkpoint_sm = NULL; + uint64_t checkpoint_sm_obj; + + if (vd->vdev_top_zap == 0) + continue; + + if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) + continue; + + VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, + sizeof (uint64_t), 1, &checkpoint_sm_obj)); + + VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), + checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); + space_map_update(checkpoint_sm); + dump_spacemap(spa->spa_meta_objset, checkpoint_sm); + space_map_close(checkpoint_sm); + } +} + +static int +verify_checkpoint(spa_t *spa) +{ + uberblock_t checkpoint; + int error; + + if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (0); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), + sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); + + if (error == ENOENT) { + /* + * If the feature is active but the uberblock is missing + * then we must be in the middle of discarding the + * checkpoint. + */ + (void) printf("\nPartially discarded checkpoint " + "state found:\n"); + dump_leftover_checkpoint_blocks(spa); + return (0); + } else if (error != 0) { + (void) printf("lookup error %d when looking for " + "checkpointed uberblock in MOS\n", error); + return (error); + } + dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n"); + + if (checkpoint.ub_checkpoint_txg == 0) { + (void) printf("\nub_checkpoint_txg not set in checkpointed " + "uberblock\n"); + error = 3; + } + + if (error == 0) + verify_checkpoint_blocks(spa); + + return (error); +} + static void dump_zpool(spa_t *spa) { @@ -4435,6 +5015,9 @@ dump_zpool(spa_t *spa) if (dump_opt['h']) dump_history(spa); + if (rc == 0 && !dump_opt['L']) + rc = verify_checkpoint(spa); + if (rc != 0) { dump_debug_buffer(); exit(rc); @@ -4879,6 +5462,7 @@ main(int argc, char **argv) int rewind = ZPOOL_NEVER_REWIND; char *spa_config_path_env; boolean_t target_is_spa = B_TRUE; + nvlist_t *cfg = NULL; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) enable_extended_FILE_stdio(-1, -1); @@ -4895,7 +5479,7 @@ main(int argc, char **argv) spa_config_path = spa_config_path_env; while ((c = getopt(argc, argv, - "AbcCdDeEFGhiI:lLmMo:Op:PqRsSt:uU:vVx:X")) != -1) { + "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) { switch (c) { case 'b': case 'c': @@ -4920,6 +5504,7 @@ main(int argc, char **argv) case 'A': case 'e': case 'F': + case 'k': case 'L': case 'P': case 'q': @@ -5029,7 +5614,7 @@ main(int argc, char **argv) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { - if (dump_all && strchr("AeEFlLOPRSX", c) == NULL) + if (dump_all && strchr("AeEFklLOPRSX", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; @@ -5081,6 +5666,17 @@ main(int argc, char **argv) error = 0; target = argv[0]; + char *checkpoint_pool = NULL; + char *checkpoint_target = NULL; + if (dump_opt['k']) { + checkpoint_pool = import_checkpointed_state(target, cfg, + &checkpoint_target); + + if (checkpoint_target != NULL) + target = checkpoint_target; + + } + if (strpbrk(target, "/@") != NULL) { size_t targetlen; @@ -5097,7 +5693,6 @@ main(int argc, char **argv) if (dump_opt['e']) { importargs_t args = { 0 }; - nvlist_t *cfg = NULL; args.paths = nsearch; args.path = searchdirs; @@ -5121,6 +5716,7 @@ main(int argc, char **argv) (void) printf("\nConfiguration for import:\n"); dump_nvlist(cfg, 8); } + error = spa_import(target_pool, cfg, NULL, flags | ZFS_IMPORT_SKIP_MMP); } @@ -5130,7 +5726,18 @@ main(int argc, char **argv) free(target_pool); if (error == 0) { - if (target_is_spa || dump_opt['R']) { + if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) { + ASSERT(checkpoint_pool != NULL); + ASSERT(checkpoint_target == NULL); + + error = spa_open(checkpoint_pool, &spa, FTAG); + if (error != 0) { + fatal("Tried to open pool \"%s\" but " + "spa_open() failed with error %d\n", + checkpoint_pool, error); + } + + } else if (target_is_spa || dump_opt['R']) { /* * Disable the activity check to allow examination of * active pools. @@ -5216,6 +5823,12 @@ main(int argc, char **argv) zdb_read_block(argv[i], spa); } + if (dump_opt['k']) { + free(checkpoint_pool); + if (!target_is_spa) + free(checkpoint_target); + } + if (os != NULL) close_objset(os, FTAG); else diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index 2db9c9c0c..c12178eff 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ /* @@ -42,6 +42,7 @@ #include <sys/resource.h> #include <sys/zil.h> #include <sys/zil_impl.h> +#include <sys/spa_impl.h> #include <sys/abd.h> #include "zdb.h" @@ -166,7 +167,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, void *arg) if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { (void) printf("%shas blkptr, %s\n", tab_prefix, !BP_IS_HOLE(bp) && - bp->blk_birth >= spa_first_txg(zilog->zl_spa) ? + bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ? "will claim" : "won't claim"); print_log_bp(bp, tab_prefix); @@ -361,7 +362,7 @@ print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) if (claim_txg != 0) claim = "already claimed"; - else if (bp->blk_birth >= spa_first_txg(zilog->zl_spa)) + else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa)) claim = "will claim"; else claim = "won't claim"; @@ -416,6 +417,11 @@ dump_intent_log(zilog_t *zilog) for (i = 0; i < TX_MAX_TYPE; i++) zil_rec_info[i].zri_count = 0; + /* see comment in zil_claim() or zil_check_log_chain() */ + if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && + zh->zh_claim_txg == 0) + return; + if (verbose >= 2) { (void) printf("\n"); (void) zil_parse(zilog, print_log_block, print_log_record, NULL, |