author     Sara Hartse <[email protected]>       2019-07-26 10:54:14 -0700
committer  Brian Behlendorf <[email protected]>  2019-07-26 10:54:14 -0700
commit     37f03da8ba6e1ab074b503e1dd63bfa7199d0537
tree       987b03643c33cd43b246a20aea28b8750f7b4ee6
parent     d274ac54609894d00a49c0a0da89abd3a7f3998d
Fast Clone Deletion
Deleting a clone requires finding the blocks that are clone-only, not
shared with the snapshot. Previously this was done by traversing the
entire block tree, which incurs a large performance penalty for
sparsely written clones.
This new method keeps track of clone blocks as they are modified in a
"Livelist" so that, when it's time to delete, the clone-specific
blocks are already at hand.
We see performance improvements because deletion work is now
proportional to the number of clone-modified blocks, not the size
of the original dataset.
Reviewed-by: Sean Eric Fagan <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Serapheim Dimitropoulos <[email protected]>
Signed-off-by: Sara Hartse <[email protected]>
Closes #8416
38 files changed, 2581 insertions, 203 deletions
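Before the diff itself, a minimal standalone sketch of the deletion idea the commit message and the "Livelist Overview" comment (added to dsl_deadlist.c below) describe: each clone-modified block pointer is recorded as an ALLOC or a FREE entry, the list is walked newest-to-oldest, FREE entries cancel their matching ALLOC, and the ALLOCs that remain are the clone-only blocks that actually need freeing. The `ll_entry_t` type and `free_block()` helper are hypothetical stand-ins for illustration only; they are not ZFS APIs, and this is not the actual `dsl_process_sub_livelist()` implementation.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct ll_entry {
	uint64_t e_blkid;	/* stand-in for a block pointer */
	bool	 e_freed;	/* true = FREE record, false = ALLOC record */
} ll_entry_t;

static void
free_block(uint64_t blkid)	/* hypothetical: actually free the block */
{
	printf("freeing block %llu\n", (unsigned long long)blkid);
}

/*
 * Walk the livelist backwards (newest first).  A FREE entry is remembered
 * as pending and cancels the matching ALLOC seen later in the walk; ALLOCs
 * with no pending FREE are clone-only blocks and get freed.
 */
static void
livelist_delete(const ll_entry_t *ll, size_t n)
{
	/* toy pending-FREE set; the real code tracks this per sub-livelist */
	uint64_t pending[128];
	size_t npending = 0;

	for (size_t i = n; i-- > 0; ) {
		if (ll[i].e_freed) {
			pending[npending++] = ll[i].e_blkid;
			continue;
		}
		bool cancelled = false;
		for (size_t j = 0; j < npending; j++) {
			if (pending[j] == ll[i].e_blkid) {
				pending[j] = pending[--npending];
				cancelled = true;
				break;
			}
		}
		if (!cancelled)
			free_block(ll[i].e_blkid);	/* clone-only block */
	}
}

int
main(void)
{
	/* block 7 was allocated and later freed by the clone: nothing to do */
	ll_entry_t ll[] = {
		{ .e_blkid = 3, .e_freed = false },
		{ .e_blkid = 7, .e_freed = false },
		{ .e_blkid = 7, .e_freed = true },
		{ .e_blkid = 9, .e_freed = false },
	};
	livelist_delete(ll, sizeof (ll) / sizeof (ll[0]));
	return (0);
}
```

Because only the unmatched ALLOCs are touched, the work is proportional to the number of clone-modified blocks rather than to the size of the block tree, which is the performance claim made above.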
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 7e7e26448..1417ab5f2 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -115,7 +115,8 @@ uint64_t max_inflight = 1000; static int leaked_objects = 0; static range_tree_t *mos_refd_objs; -static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *); +static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, + boolean_t); static void mos_obj_refd(uint64_t); static void mos_obj_refd_multiple(uint64_t); @@ -552,12 +553,16 @@ dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) (void) printf("\t\tcomp = %s\n", comp); (void) printf("\t\tuncomp = %s\n", uncomp); } - if (size >= sizeof (*bpop)) { + if (size >= BPOBJ_SIZE_V2) { (void) printf("\t\tsubobjs = %llu\n", (u_longlong_t)bpop->bpo_subobjs); (void) printf("\t\tnum_subobjs = %llu\n", (u_longlong_t)bpop->bpo_num_subobjs); } + if (size >= sizeof (*bpop)) { + (void) printf("\t\tnum_freed = %llu\n", + (u_longlong_t)bpop->bpo_num_freed); + } if (dump_opt['d'] < 5) return; @@ -572,7 +577,8 @@ dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) (void) printf("got error %u from dmu_read\n", err); break; } - snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp); + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp, + BP_GET_FREE(&bp)); (void) printf("\t%s\n", blkbuf); } } @@ -1508,7 +1514,8 @@ blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, } static void -snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) +snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, + boolean_t bp_freed) { const dva_t *dva = bp->blk_dva; int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; @@ -1516,6 +1523,10 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) if (dump_opt['b'] >= 6) { snprintf_blkptr(blkbuf, buflen, bp); + if (bp_freed) { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), " %s", "FREE"); + } return; } @@ -1553,6 +1564,9 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) (u_longlong_t)BP_GET_FILL(bp), (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); + if (bp_freed) + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), " %s", "FREE"); } } @@ -1580,7 +1594,7 @@ print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb, } } - snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE); (void) printf("%s\n", blkbuf); } @@ -1815,12 +1829,12 @@ dump_bptree(objset_t *os, uint64_t obj, const char *name) /* ARGSUSED */ static int -dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; ASSERT(bp->blk_birth != 0); - snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed); (void) printf("\t%s\n", blkbuf); return (0); } @@ -1845,14 +1859,28 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); - (void) printf(" %*s: object %llu, %llu local blkptrs, " - "%llu subobjs in object, %llu, %s (%s/%s comp)\n", - indent * 8, name, - (u_longlong_t)bpo->bpo_object, - (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, - (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, - (u_longlong_t)bpo->bpo_phys->bpo_subobjs, - bytes, 
comp, uncomp); + if (bpo->bpo_havefreed) { + (void) printf(" %*s: object %llu, %llu local " + "blkptrs, %llu freed, %llu subobjs in object %llu, " + "%s (%s/%s comp)\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_freed, + (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, + (u_longlong_t)bpo->bpo_phys->bpo_subobjs, + bytes, comp, uncomp); + } else { + (void) printf(" %*s: object %llu, %llu local " + "blkptrs, %llu subobjs in object %llu, " + "%s (%s/%s comp)\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, + (u_longlong_t)bpo->bpo_phys->bpo_subobjs, + bytes, comp, uncomp); + } for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; @@ -1872,11 +1900,22 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) bpobj_close(&subbpo); } } else { - (void) printf(" %*s: object %llu, %llu blkptrs, %s\n", - indent * 8, name, - (u_longlong_t)bpo->bpo_object, - (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, - bytes); + if (bpo->bpo_havefreed) { + (void) printf(" %*s: object %llu, %llu blkptrs, " + "%llu freed, %s\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_freed, + bytes); + } else { + (void) printf(" %*s: object %llu, %llu blkptrs, " + "%s\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + bytes); + } } if (dump_opt['d'] < 5) @@ -2038,36 +2077,59 @@ bpobj_count_refd(bpobj_t *bpo) } } +static int +dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle) +{ + spa_t *spa = arg; + uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; + if (dle->dle_bpobj.bpo_object != empty_bpobj) + bpobj_count_refd(&dle->dle_bpobj); + return (0); +} + +static int +dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle) +{ + ASSERT(arg == NULL); + if (dump_opt['d'] >= 5) { + char buf[128]; + (void) snprintf(buf, sizeof (buf), + "mintxg %llu -> obj %llu", + (longlong_t)dle->dle_mintxg, + (longlong_t)dle->dle_bpobj.bpo_object); + + dump_full_bpobj(&dle->dle_bpobj, buf, 0); + } else { + (void) printf("mintxg %llu -> obj %llu\n", + (longlong_t)dle->dle_mintxg, + (longlong_t)dle->dle_bpobj.bpo_object); + } + return (0); +} + static void -dump_deadlist(dsl_deadlist_t *dl) +dump_blkptr_list(dsl_deadlist_t *dl, char *name) { - dsl_deadlist_entry_t *dle; - uint64_t unused; char bytes[32]; char comp[32]; char uncomp[32]; - uint64_t empty_bpobj = - dmu_objset_spa(dl->dl_os)->spa_dsl_pool->dp_empty_bpobj; - - /* force the tree to be loaded */ - dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused); + char entries[32]; + spa_t *spa = dmu_objset_spa(dl->dl_os); + uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; if (dl->dl_oldfmt) { if (dl->dl_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dl->dl_bpobj); } else { mos_obj_refd(dl->dl_object); - for (dle = avl_first(&dl->dl_tree); dle; - dle = AVL_NEXT(&dl->dl_tree, dle)) { - if (dle->dle_bpobj.bpo_object != empty_bpobj) - bpobj_count_refd(&dle->dle_bpobj); - } + dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa); } /* make sure nicenum has enough space */ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (entries) >= NN_NUMBUF_SZ); if (dump_opt['d'] < 3) return; @@ 
-2080,30 +2142,60 @@ dump_deadlist(dsl_deadlist_t *dl) zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); - (void) printf("\n Deadlist: %s (%s/%s comp)\n", - bytes, comp, uncomp); + zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries)); + (void) printf("\n %s: %s (%s/%s comp), %s entries\n", + name, bytes, comp, uncomp, entries); if (dump_opt['d'] < 4) return; (void) printf("\n"); - for (dle = avl_first(&dl->dl_tree); dle; - dle = AVL_NEXT(&dl->dl_tree, dle)) { - if (dump_opt['d'] >= 5) { - char buf[128]; - (void) snprintf(buf, sizeof (buf), - "mintxg %llu -> obj %llu", - (longlong_t)dle->dle_mintxg, - (longlong_t)dle->dle_bpobj.bpo_object); + dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL); +} - dump_full_bpobj(&dle->dle_bpobj, buf, 0); - } else { - (void) printf("mintxg %llu -> obj %llu\n", - (longlong_t)dle->dle_mintxg, - (longlong_t)dle->dle_bpobj.bpo_object); - } +static int +verify_dd_livelist(objset_t *os) +{ + uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp; + dsl_pool_t *dp = spa_get_dsl(os->os_spa); + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; + + ASSERT(!dmu_objset_is_snapshot(os)); + if (!dsl_deadlist_is_open(&dd->dd_livelist)) + return (0); + dsl_pool_config_enter(dp, FTAG); + dsl_deadlist_space(&dd->dd_livelist, &ll_used, + &ll_comp, &ll_uncomp); + + dsl_dataset_t *origin_ds; + ASSERT(dsl_pool_config_held(dp)); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds)); + VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset, + &used, &comp, &uncomp)); + dsl_dataset_rele(origin_ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + /* + * It's possible that the dataset's uncomp space is larger than the + * livelist's because livelists do not track embedded block pointers + */ + if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) { + char nice_used[32], nice_comp[32], nice_uncomp[32]; + (void) printf("Discrepancy in space accounting:\n"); + zdb_nicenum(used, nice_used, sizeof (nice_used)); + zdb_nicenum(comp, nice_comp, sizeof (nice_comp)); + zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp)); + (void) printf("dir: used %s, comp %s, uncomp %s\n", + nice_used, nice_comp, nice_uncomp); + zdb_nicenum(ll_used, nice_used, sizeof (nice_used)); + zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp)); + zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp)); + (void) printf("livelist: used %s, comp %s, uncomp %s\n", + nice_used, nice_comp, nice_uncomp); + return (1); } + return (0); } static avl_tree_t idx_tree; @@ -2643,7 +2735,7 @@ static const char *objset_types[DMU_OST_NUMTYPES] = { "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; static void -dump_dir(objset_t *os) +dump_objset(objset_t *os) { dmu_objset_stats_t dds; uint64_t object, object_count; @@ -2716,11 +2808,17 @@ dump_dir(objset_t *os) if (dmu_objset_ds(os) != NULL) { dsl_dataset_t *ds = dmu_objset_ds(os); - dump_deadlist(&ds->ds_deadlist); + dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + !dmu_objset_is_snapshot(os)) { + dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist"); + if (verify_dd_livelist(os) != 0) + fatal("livelist is incorrect"); + } if (dsl_dataset_remap_deadlist_exists(ds)) { (void) printf("ds_remap_deadlist:\n"); - dump_deadlist(&ds->ds_remap_deadlist); + dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist"); } count_ds_mos_objects(ds); } @@ -3470,7 
+3568,7 @@ static uint64_t remap_deadlist_count = 0; /*ARGSUSED*/ static int -dump_one_dir(const char *dsname, void *arg) +dump_one_objset(const char *dsname, void *arg) { int error; objset_t *os; @@ -3502,7 +3600,12 @@ dump_one_dir(const char *dsname, void *arg) global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; } - dump_dir(os); + if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) && + !dmu_objset_is_snapshot(os)) { + global_feature_count[SPA_FEATURE_LIVELIST]++; + } + + dump_objset(os); close_objset(os, FTAG); fuid_table_destroy(); return (0); @@ -3993,13 +4096,15 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) /* ARGSUSED */ static int -increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { zdb_cb_t *zcb = arg; spa_t *spa = zcb->zcb_spa; vdev_t *vd; const dva_t *dva = &bp->blk_dva[0]; + ASSERT(!bp_freed); ASSERT(!dump_opt['L']); ASSERT3U(BP_GET_NDVAS(bp), ==, 1); @@ -4617,6 +4722,101 @@ count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) return (0); } +/* + * Iterate over livelists which have been destroyed by the user but + * are still present in the MOS, waiting to be freed + */ +typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); + +static void +iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg) +{ + objset_t *mos = spa->spa_meta_objset; + uint64_t zap_obj; + int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); + if (err == ENOENT) + return; + ASSERT0(err); + + zap_cursor_t zc; + zap_attribute_t attr; + dsl_deadlist_t ll; + /* NULL out os prior to dsl_deadlist_open in case it's garbage */ + ll.dl_os = NULL; + for (zap_cursor_init(&zc, mos, zap_obj); + zap_cursor_retrieve(&zc, &attr) == 0; + (void) zap_cursor_advance(&zc)) { + dsl_deadlist_open(&ll, mos, attr.za_first_integer); + func(&ll, arg); + dsl_deadlist_close(&ll); + } + zap_cursor_fini(&zc); +} + +static int +bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(!bp_freed); + return (count_block_cb(arg, bp, tx)); +} + +static int +livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle) +{ + zdb_cb_t *zbc = args; + bplist_t blks; + bplist_create(&blks); + /* determine which blocks have been alloc'd but not freed */ + VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL)); + /* count those blocks */ + (void) bplist_iterate(&blks, count_block_cb, zbc, NULL); + bplist_destroy(&blks); + return (0); +} + +static void +livelist_count_blocks(dsl_deadlist_t *ll, void *arg) +{ + dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg); +} + +/* + * Count the blocks in the livelists that have been destroyed by the user + * but haven't yet been freed. + */ +static void +deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc) +{ + iterate_deleted_livelists(spa, livelist_count_blocks, zbc); +} + +static void +dump_livelist_cb(dsl_deadlist_t *ll, void *arg) +{ + ASSERT3P(arg, ==, NULL); + global_feature_count[SPA_FEATURE_LIVELIST]++; + dump_blkptr_list(ll, "Deleted Livelist"); +} + +/* + * Print out, register object references to, and increment feature counts for + * livelists that have been destroyed by the user but haven't yet been freed. 
+ */ +static void +deleted_livelists_dump_mos(spa_t *spa) +{ + uint64_t zap_obj; + objset_t *mos = spa->spa_meta_objset; + int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); + if (err == ENOENT) + return; + mos_obj_refd(zap_obj); + iterate_deleted_livelists(spa, dump_livelist_cb, NULL); +} + static int dump_block_stats(spa_t *spa) { @@ -4656,11 +4856,11 @@ dump_block_stats(spa_t *spa) * If there's a deferred-free bplist, process that first. */ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, - count_block_cb, &zcb, NULL); + bpobj_count_block_cb, &zcb, NULL); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, - count_block_cb, &zcb, NULL); + bpobj_count_block_cb, &zcb, NULL); } zdb_claim_removing(spa, &zcb); @@ -4671,6 +4871,8 @@ dump_block_stats(spa_t *spa) &zcb, NULL)); } + deleted_livelists_count_blocks(spa, &zcb); + if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; @@ -5706,6 +5908,7 @@ dump_mos_leaks(spa_t *spa) mos_obj_refd(vim->vim_phys->vimp_counts_object); vdev_indirect_mapping_close(vim); } + deleted_livelists_dump_mos(spa); if (dp->dp_origin_snap != NULL) { dsl_dataset_t *ds; @@ -5715,12 +5918,12 @@ dump_mos_leaks(spa_t *spa) dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, FTAG, &ds)); count_ds_mos_objects(ds); - dump_deadlist(&ds->ds_deadlist); + dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); dsl_dataset_rele(ds, FTAG); dsl_pool_config_exit(dp, FTAG); count_ds_mos_objects(dp->dp_origin_snap); - dump_deadlist(&dp->dp_origin_snap->ds_deadlist); + dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist"); } count_dir_mos_objects(dp->dp_mos_dir); if (dp->dp_free_dir != NULL) @@ -5885,7 +6088,7 @@ dump_zpool(spa_t *spa) if (dump_opt['d'] || dump_opt['i']) { spa_feature_t f; mos_refd_objs = range_tree_create(NULL, NULL); - dump_dir(dp->dp_meta_objset); + dump_objset(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { dsl_pool_t *dp = spa->spa_dsl_pool; @@ -5915,8 +6118,9 @@ dump_zpool(spa_t *spa) global_feature_count[f] = UINT64_MAX; global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; + global_feature_count[SPA_FEATURE_LIVELIST] = 0; - (void) dmu_objset_find(spa_name(spa), dump_one_dir, + (void) dmu_objset_find(spa_name(spa), dump_one_objset, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); if (rc == 0 && !dump_opt['L']) @@ -6777,9 +6981,9 @@ main(int argc, char **argv) } } if (os != NULL) { - dump_dir(os); + dump_objset(os); } else if (zopt_objects > 0 && !dump_opt['m']) { - dump_dir(spa->spa_meta_objset); + dump_objset(spa->spa_meta_objset); } else { dump_zpool(spa); } diff --git a/include/sys/bplist.h b/include/sys/bplist.h index 471be9047..f8deaf843 100644 --- a/include/sys/bplist.h +++ b/include/sys/bplist.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018 by Delphix. All rights reserved. */ #ifndef _SYS_BPLIST_H @@ -49,6 +50,7 @@ void bplist_destroy(bplist_t *bpl); void bplist_append(bplist_t *bpl, const blkptr_t *bp); void bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx); +void bplist_clear(bplist_t *bpl); #ifdef __cplusplus } diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h index d425e239f..16e403526 100644 --- a/include/sys/bpobj.h +++ b/include/sys/bpobj.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2015, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_BPOBJ_H @@ -31,6 +31,7 @@ #include <sys/txg.h> #include <sys/zio.h> #include <sys/zfs_context.h> +#include <sys/bplist.h> #ifdef __cplusplus extern "C" { @@ -48,10 +49,12 @@ typedef struct bpobj_phys { uint64_t bpo_uncomp; uint64_t bpo_subobjs; uint64_t bpo_num_subobjs; + uint64_t bpo_num_freed; } bpobj_phys_t; #define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t)) #define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t)) +#define BPOBJ_SIZE_V2 (6 * sizeof (uint64_t)) typedef struct bpobj { kmutex_t bpo_lock; @@ -60,12 +63,14 @@ typedef struct bpobj { int bpo_epb; uint8_t bpo_havecomp; uint8_t bpo_havesubobj; + uint8_t bpo_havefreed; bpobj_phys_t *bpo_phys; dmu_buf_t *bpo_dbuf; dmu_buf_t *bpo_cached_dbuf; } bpobj_t; -typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); +typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx); uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx); @@ -77,10 +82,13 @@ void bpobj_close(bpobj_t *bpo); boolean_t bpobj_is_open(const bpobj_t *bpo); int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); -int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *); +int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, uint64_t *); +int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, + void *arg, int64_t start); void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); -void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx); +void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx); int bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); @@ -88,6 +96,9 @@ int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); boolean_t bpobj_is_empty(bpobj_t *bpo); +int bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx); + #ifdef __cplusplus } #endif diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 65da78eb5..62de1eaf5 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -383,6 +383,7 @@ typedef struct dmu_buf { #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" #define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap" +#define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones" /* * Allocate an object from this objset. 
The range of object numbers @@ -1003,6 +1004,7 @@ extern uint64_t dmu_objset_id(objset_t *os); extern uint64_t dmu_objset_dnodesize(objset_t *os); extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); +extern int dmu_objset_blksize(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val); diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index c0650bcde..9b6614e98 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -126,7 +126,7 @@ struct objset { zfs_cache_type_t os_secondary_cache; zfs_sync_type_t os_sync; zfs_redundant_metadata_type_t os_redundant_metadata; - int os_recordsize; + uint64_t os_recordsize; /* * The next four values are used as a cache of whatever's on disk, and * are initialized the first time these properties are queried. Before diff --git a/include/sys/dsl_deadlist.h b/include/sys/dsl_deadlist.h index 08f38233d..bb8248a66 100644 --- a/include/sys/dsl_deadlist.h +++ b/include/sys/dsl_deadlist.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright (c) 2018, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_DSL_DEADLIST_H @@ -28,12 +28,14 @@ #include <sys/bpobj.h> #include <sys/zfs_context.h> +#include <sys/zthr.h> #ifdef __cplusplus extern "C" { #endif struct dmu_buf; +struct dsl_pool; struct dsl_dataset; typedef struct dsl_deadlist_phys { @@ -63,13 +65,34 @@ typedef struct dsl_deadlist_entry { bpobj_t dle_bpobj; } dsl_deadlist_entry_t; +typedef struct livelist_condense_entry { + struct dsl_dataset *ds; + dsl_deadlist_entry_t *first; + dsl_deadlist_entry_t *next; + boolean_t syncing; + boolean_t cancelled; +} livelist_condense_entry_t; + +extern unsigned long zfs_livelist_max_entries; +extern int zfs_livelist_min_percent_shared; + +typedef int deadlist_iter_t(void *args, dsl_deadlist_entry_t *dle); + void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object); void dsl_deadlist_close(dsl_deadlist_t *dl); +void dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *arg); uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx); void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx); -void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx); +void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, + boolean_t free, dmu_tx_t *tx); +int dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); +int dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); +void dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, +dmu_tx_t *tx); +dsl_deadlist_entry_t *dsl_deadlist_first(dsl_deadlist_t *dl); +dsl_deadlist_entry_t *dsl_deadlist_last(dsl_deadlist_t *dl); uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, uint64_t mrs_obj, dmu_tx_t *tx); void dsl_deadlist_space(dsl_deadlist_t *dl, @@ -81,6 +104,10 @@ void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx); void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dmu_tx_t *tx); boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl); +int 
dsl_process_sub_livelist(bpobj_t *bpobj, struct bplist *to_free, + zthr_t *t, uint64_t *size); +void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, + dmu_tx_t *tx); #ifdef __cplusplus } diff --git a/include/sys/dsl_destroy.h b/include/sys/dsl_destroy.h index c4dbea26b..208d75bac 100644 --- a/include/sys/dsl_destroy.h +++ b/include/sys/dsl_destroy.h @@ -33,6 +33,7 @@ extern "C" { struct nvlist; struct dsl_dataset; +struct dsl_pool; struct dmu_tx; int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t, diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index 08d1fcb37..bb6921027 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -29,18 +29,20 @@ #define _SYS_DSL_DIR_H #include <sys/dmu.h> +#include <sys/dsl_deadlist.h> #include <sys/dsl_pool.h> #include <sys/dsl_synctask.h> #include <sys/refcount.h> #include <sys/zfs_context.h> #include <sys/dsl_crypt.h> +#include <sys/bplist.h> #ifdef __cplusplus extern "C" { #endif struct dsl_dataset; - +struct zthr; /* * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object. * They should be of the format <reverse-dns>:<field>. @@ -49,6 +51,7 @@ struct dsl_dataset; #define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count" #define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" #define DD_FIELD_CRYPTO_KEY_OBJ "com.datto:crypto_key_obj" +#define DD_FIELD_LIVELIST "com.delphix:livelist" typedef enum dd_used { DD_USED_HEAD, @@ -114,6 +117,10 @@ struct dsl_dir { /* amount of space we expect to write; == amount of dirty data */ int64_t dd_space_towrite[TXG_SIZE]; + dsl_deadlist_t dd_livelist; + bplist_t dd_pending_frees; + bplist_t dd_pending_allocs; + /* protected by dd_lock; keep at end of struct for better locality */ char dd_myname[ZFS_MAX_DATASET_NAME_LEN]; }; @@ -182,6 +189,9 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx); void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx); boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); +void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj); +void dsl_dir_livelist_close(dsl_dir_t *dd); +void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 63ba3509a..172ecdc46 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
*/ @@ -54,6 +54,7 @@ struct dsl_pool; struct dmu_tx; struct dsl_scan; struct dsl_crypto_params; +struct dsl_deadlist; extern unsigned long zfs_dirty_data_max; extern unsigned long zfs_dirty_data_max_max; diff --git a/include/sys/spa.h b/include/sys/spa.h index 50ca15be5..e64313783 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -63,6 +63,8 @@ typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; typedef struct zbookmark_phys zbookmark_phys_t; +struct bpobj; +struct bplist; struct dsl_pool; struct dsl_dataset; struct dsl_crypto_params; @@ -532,6 +534,9 @@ _NOTE(CONSTCOND) } while (0) #define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) #define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) +#define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1) +#define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x) + #define BP_PHYSICAL_BIRTH(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) @@ -654,6 +659,7 @@ _NOTE(CONSTCOND) } while (0) * 'func' is either snprintf() or mdb_snprintf(). * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. */ + #define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ { \ static const char *copyname[] = \ @@ -804,6 +810,8 @@ extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); extern void spa_scan_stat_init(spa_t *spa); extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); +extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); +extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 @@ -1131,6 +1139,7 @@ extern uint64_t spa_total_metaslabs(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa); extern unsigned long spa_get_hostid(void); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); +extern boolean_t spa_livelist_delete_check(spa_t *spa); extern int spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 929144017..ebe14dae4 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -49,6 +49,7 @@ #include <sys/dsl_crypt.h> #include <sys/zfeature.h> #include <sys/zthr.h> +#include <sys/dsl_deadlist.h> #include <zfeature_common.h> #ifdef __cplusplus @@ -317,6 +318,11 @@ struct spa { list_t spa_log_summary; uint64_t spa_log_flushall_txg; + zthr_t *spa_livelist_delete_zthr; /* deleting livelists */ + zthr_t *spa_livelist_condense_zthr; /* condensing livelists */ + uint64_t spa_livelists_to_delete; /* set of livelists to free */ + livelist_condense_entry_t spa_to_condense; /* next to condense */ + char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ int spa_last_open_failed; /* error if last open failed */ diff --git a/include/sys/zthr.h b/include/sys/zthr.h index 33c218ec4..0a05f5225 100644 --- a/include/sys/zthr.h +++ b/include/sys/zthr.h @@ -33,7 +33,9 @@ extern void zthr_destroy(zthr_t *t); extern void zthr_wakeup(zthr_t *t); extern void zthr_cancel(zthr_t *t); extern void zthr_resume(zthr_t *t); +extern void zthr_wait_cycle_done(zthr_t *t); extern boolean_t zthr_iscancelled(zthr_t *t); +extern boolean_t zthr_has_waiters(zthr_t *t); #endif /* _SYS_ZTHR_H */ diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 4012b71d6..4f7822973 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ 
-71,6 +71,7 @@ typedef enum spa_feature { SPA_FEATURE_REDACTED_DATASETS, SPA_FEATURE_BOOKMARK_WRITTEN, SPA_FEATURE_LOG_SPACEMAP, + SPA_FEATURE_LIVELIST, SPA_FEATURES } spa_feature_t; diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 62245f6a0..3b88d9748 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1912,6 +1912,98 @@ Default value: \fB16,045,690,984,833,335,022\fR (0xdeadbeefdeadbeee). .sp .ne 2 .na +\fBzfs_livelist_max_entries\fR (ulong) +.ad +.RS 12n +The threshold size (in block pointers) at which we create a new sub-livelist. +Larger sublists are more costly from a memory perspective but the fewer +sublists there are, the lower the cost of insertion. +.sp +Default value: \fB500,000\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_livelist_min_percent_shared\fR (int) +.ad +.RS 12n +If the amount of shared space between a snapshot and its clone drops below +this threshold, the clone turns off the livelist and reverts to the old deletion +method. This is in place because once a clone has been overwritten enough +livelists no long give us a benefit. +.sp +Default value: \fB75\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_livelist_condense_new_alloc\fR (int) +.ad +.RS 12n +Incremented each time an extra ALLOC blkptr is added to a livelist entry while +it is being condensed. +This option is used by the test suite to track race conditions. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_livelist_condense_sync_cancel\fR (int) +.ad +.RS 12n +Incremented each time livelist condensing is canceled while in +spa_livelist_condense_sync. +This option is used by the test suite to track race conditions. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_livelist_condense_sync_pause\fR (int) +.ad +.RS 12n +When set, the livelist condense process pauses indefinitely before +executing the synctask - spa_livelist_condense_sync. +This option is used by the test suite to trigger race conditions. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_livelist_condense_zthr_cancel\fR (int) +.ad +.RS 12n +Incremented each time livelist condensing is canceled while in +spa_livelist_condense_cb. +This option is used by the test suite to track race conditions. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_livelist_condense_zthr_pause\fR (int) +.ad +.RS 12n +When set, the livelist condense process pauses indefinitely before +executing the open context condensing work in spa_livelist_condense_cb. +This option is used by the test suite to trigger race conditions. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na \fBzfs_lua_max_instrlimit\fR (ulong) .ad .RS 12n diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index 444a996e1..00bad9f74 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -550,6 +550,26 @@ improving performance by avoiding the use of spill blocks. .sp .ne 2 .na +\fB\fBlivelist\fR\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:livelist +READ\-ONLY COMPATIBLE yes +DEPENDENCIES none +.TE +This feature allows clones to be deleted faster than the traditional method +when a large number of random/sparse writes have been made to the clone. +All blocks allocated and freed after a clone is created are tracked by the +the clone's livelist which is referenced during the deletion of the clone. +The feature is activated when a clone is created and remains active until all +clones have been destroyed. 
+.RE + +.sp +.ne 2 +.na \fBlz4_compress\fR .ad .RS 4n @@ -882,7 +902,6 @@ This feature becomes \fBactive\fR when the \fBzpool checkpoint\fR subcommand is used to checkpoint the pool. The feature will only return back to being \fBenabled\fR when the pool is rewound or the checkpoint has been discarded. -.RE .SH "SEE ALSO" zpool(8) diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index fa9d678a7..8e1aef5da 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -349,6 +349,18 @@ zpool_feature_init(void) ZFEATURE_TYPE_BOOLEAN, NULL); { + static const spa_feature_t livelist_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LIVELIST, + "com.delphix:livelist", "livelist", + "Improved clone deletion performance.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, + livelist_deps); + } + + { static const spa_feature_t log_spacemap_deps[] = { SPA_FEATURE_SPACEMAP_V2, SPA_FEATURE_NONE diff --git a/module/zfs/bplist.c b/module/zfs/bplist.c index c81151e08..47ea364ef 100644 --- a/module/zfs/bplist.c +++ b/module/zfs/bplist.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include <sys/bplist.h> @@ -75,3 +75,17 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) } mutex_exit(&bpl->bpl_lock); } + +void +bplist_clear(bplist_t *bpl) +{ + bplist_entry_t *bpe; + + mutex_enter(&bpl->bpl_lock); + while ((bpe = list_head(&bpl->bpl_list))) { + bplist_iterate_last_removed = bpe; + list_remove(&bpl->bpl_list, bpe); + kmem_free(bpe, sizeof (*bpe)); + } + mutex_exit(&bpl->bpl_lock); +} diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 633801956..561d0cf8a 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2017 Datto Inc. */ @@ -83,6 +83,9 @@ bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) size = BPOBJ_SIZE_V0; else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) size = BPOBJ_SIZE_V1; + else if (!spa_feature_is_active(dmu_objset_spa(os), + SPA_FEATURE_LIVELIST)) + size = BPOBJ_SIZE_V2; else size = sizeof (bpobj_phys_t); @@ -171,6 +174,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); + bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2); bpo->bpo_phys = bpo->bpo_dbuf->db_data; return (0); } @@ -245,8 +249,8 @@ bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index) * Update bpobj and all of its parents with new space accounting. 
*/ static void -propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed, - uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx) +propagate_space_reduction(bpobj_info_t *bpi, int64_t freed, + int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx) { for (; bpi != NULL; bpi = bpi->bpi_parent) { @@ -263,22 +267,22 @@ propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed, static int bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, - dmu_tx_t *tx, boolean_t free) + int64_t start, dmu_tx_t *tx, boolean_t free) { int err = 0; - uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0; + int64_t freed = 0, comp_freed = 0, uncomp_freed = 0; dmu_buf_t *dbuf = NULL; bpobj_t *bpo = bpi->bpi_bpo; - for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { + for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { uint64_t offset = i * sizeof (blkptr_t); uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); if (dbuf == NULL || dbuf->db_offset > offset) { if (dbuf) dmu_buf_rele(dbuf, FTAG); - err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, - FTAG, &dbuf, 0); + err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, + offset, FTAG, &dbuf, 0); if (err) break; } @@ -288,18 +292,26 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, blkptr_t *bparray = dbuf->db_data; blkptr_t *bp = &bparray[blkoff]; - err = func(arg, bp, tx); + + boolean_t bp_freed = BP_GET_FREE(bp); + err = func(arg, bp, bp_freed, tx); if (err) break; if (free) { + int sign = bp_freed ? -1 : +1; spa_t *spa = dmu_objset_spa(bpo->bpo_os); - freed += bp_get_dsize_sync(spa, bp); - comp_freed += BP_GET_PSIZE(bp); - uncomp_freed += BP_GET_UCSIZE(bp); + freed += sign * bp_get_dsize_sync(spa, bp); + comp_freed += sign * BP_GET_PSIZE(bp); + uncomp_freed += sign * BP_GET_UCSIZE(bp); ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx)); bpo->bpo_phys->bpo_num_blkptrs--; ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); + if (bp_freed) { + ASSERT(bpo->bpo_havefreed); + bpo->bpo_phys->bpo_num_freed--; + ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0); + } } } if (free) { @@ -328,7 +340,7 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, */ static int bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, - dmu_tx_t *tx, boolean_t free) + dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size) { list_t stack; bpobj_info_t *bpi; @@ -341,6 +353,10 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, list_create(&stack, sizeof (bpobj_info_t), offsetof(bpobj_info_t, bpi_node)); mutex_enter(&initial_bpo->bpo_lock); + + if (bpobj_size != NULL) + *bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs; + list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0)); while ((bpi = list_head(&stack)) != NULL) { @@ -354,7 +370,8 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, dmu_buf_will_dirty(bpo->bpo_dbuf, tx); if (bpi->bpi_visited == B_FALSE) { - err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free); + err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx, + free); bpi->bpi_visited = B_TRUE; if (err != 0) break; @@ -433,6 +450,7 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, * We have unprocessed subobjs. Process the next one. */ ASSERT(bpo->bpo_havecomp); + ASSERT3P(bpobj_size, ==, NULL); /* Add the last subobj to stack. 
*/ int64_t i = bpi->bpi_unprocessed_subobjs - 1; @@ -489,16 +507,45 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) { - return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); + return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL)); } /* * Iterate the entries. If func returns nonzero, iteration will stop. + * + * If there are no subobjs: + * + * *bpobj_size can be used to return the number of block pointers in the + * bpobj. Note that this may be different from the number of block pointers + * that are iterated over, if iteration is terminated early (e.g. by the func + * returning nonzero). + * + * If there are concurrent (or subsequent) modifications to the bpobj then the + * returned *bpobj_size can be passed as "start" to + * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries. */ int -bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, + uint64_t *bpobj_size) { - return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); + return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size)); +} + +/* + * Iterate over the blkptrs in the bpobj beginning at index start. If func + * returns nonzero, iteration will stop. This is a livelist specific function + * since it assumes that there are no subobjs present. + */ +int +livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, + int64_t start) +{ + if (bpo->bpo_havesubobj) + VERIFY0(bpo->bpo_phys->bpo_subobjs); + bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0); + int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE); + kmem_free(bpi, sizeof (bpobj_info_t)); + return (err); } /* @@ -724,7 +771,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) } void -bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) +bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { blkptr_t stored_bp = *bp; uint64_t offset; @@ -755,8 +803,8 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); } - /* We never need the fill count. */ stored_bp.blk_fill = 0; + BP_SET_FREE(&stored_bp, bp_freed); mutex_enter(&bpo->bpo_lock); @@ -779,11 +827,16 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) dmu_buf_will_dirty(bpo->bpo_dbuf, tx); bpo->bpo_phys->bpo_num_blkptrs++; - bpo->bpo_phys->bpo_bytes += + int sign = bp_freed ? -1 : +1; + bpo->bpo_phys->bpo_bytes += sign * bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); if (bpo->bpo_havecomp) { - bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); - bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); + bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp); + } + if (bp_freed) { + ASSERT(bpo->bpo_havefreed); + bpo->bpo_phys->bpo_num_freed++; } mutex_exit(&bpo->bpo_lock); } @@ -799,7 +852,7 @@ struct space_range_arg { /* ARGSUSED */ static int -space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { struct space_range_arg *sra = arg; @@ -863,3 +916,18 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, *uncompp = sra.uncomp; return (err); } + +/* + * A bpobj_itor_t to append blkptrs to a bplist. 
Note that while blkptrs in a + * bpobj are designated as free or allocated that information is not preserved + * in bplists. + */ +/* ARGSUSED */ +int +bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + bplist_t *bpl = arg; + bplist_append(bpl, bp); + return (0); +} diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 4d347b6f4..0518205f9 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -3286,6 +3286,13 @@ dbuf_hold_impl_arg(struct dbuf_hold_arg *dh) *(dh->dh_dbp) = NULL; + /* If the pool has been created, verify the tx_sync_lock is not held */ + spa_t *spa = dh->dh_dn->dn_objset->os_spa; + dsl_pool_t *dp = spa->spa_dsl_pool; + if (dp != NULL) { + ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock)); + } + /* dbuf_find() returns with db_mtx held */ dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object, dh->dh_level, dh->dh_blkid); @@ -4480,6 +4487,29 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { /* + * If the blkptr being remapped is tracked by a livelist, + * then we need to make sure the livelist reflects the update. + * First, cancel out the old blkptr by appending a 'FREE' + * entry. Next, add an 'ALLOC' to track the new version. This + * way we avoid trying to free an inaccurate blkptr at delete. + * Note that embedded blkptrs are not tracked in livelists. + */ + if (dn->dn_objset != spa_meta_objset(spa)) { + dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + bp->blk_birth > ds->ds_dir->dd_origin_txg) { + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(dsl_dir_is_clone(ds->ds_dir)); + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LIVELIST)); + bplist_append(&ds->ds_dir->dd_pending_frees, + bp); + bplist_append(&ds->ds_dir->dd_pending_allocs, + &bp_copy); + } + } + + /* * The db_rwlock prevents dbuf_read_impl() from * dereferencing the BP while we are changing it. To * avoid lock contention, only grab it when we are actually diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 0cd458ef4..848a8508c 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -122,13 +122,12 @@ parent_delta(dsl_dataset_t *ds, int64_t delta) void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { - int used, compressed, uncompressed; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + int used = bp_get_dsize_sync(spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; - used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); - compressed = BP_GET_PSIZE(bp); - uncompressed = BP_GET_UCSIZE(bp); - dprintf_bp(bp, "ds=%p", ds); ASSERT(dmu_tx_is_syncing(tx)); @@ -164,6 +163,19 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) ds->ds_feature_activation[f] = (void *)B_TRUE; } + /* + * Track block for livelist, but ignore embedded blocks because + * they do not need to be freed. 
+ */ + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + bp->blk_birth > ds->ds_dir->dd_origin_txg && + !(BP_IS_EMBEDDED(bp))) { + ASSERT(dsl_dir_is_clone(ds->ds_dir)); + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LIVELIST)); + bplist_append(&ds->ds_dir->dd_pending_allocs, bp); + } + mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, compressed, uncompressed, tx); @@ -207,8 +219,8 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, DVA_SET_VDEV(dva, vdev); DVA_SET_OFFSET(dva, offset); DVA_SET_ASIZE(dva, size); - - dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx); + dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE, + tx); } } @@ -239,6 +251,19 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, ASSERT(!ds->ds_is_snapshot); dmu_buf_will_dirty(ds->ds_dbuf, tx); + /* + * Track block for livelist, but ignore embedded blocks because + * they do not need to be freed. + */ + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + bp->blk_birth > ds->ds_dir->dd_origin_txg && + !(BP_IS_EMBEDDED(bp))) { + ASSERT(dsl_dir_is_clone(ds->ds_dir)); + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LIVELIST)); + bplist_append(&ds->ds_dir->dd_pending_frees, bp); + } + if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; @@ -267,7 +292,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, */ bplist_append(&ds->ds_pending_deadlist, bp); } else { - dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); + dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx); } ASSERT3U(ds->ds_prev->ds_object, ==, dsl_dataset_phys(ds)->ds_prev_snap_obj); @@ -1241,6 +1266,14 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, ASSERT(dmu_tx_is_syncing(tx)); ASSERT(lastname[0] != '@'); + /* + * Filesystems will eventually have their origin set to dp_origin_snap, + * but that's taken care of in dsl_dataset_create_sync_dd. When + * creating a filesystem, this function is called with origin equal to + * NULL. + */ + if (origin != NULL) + ASSERT3P(origin, !=, dp->dp_origin_snap); ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); @@ -1251,6 +1284,20 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_deleg_set_create_perms(dd, tx, cr); /* + * If we are creating a clone and the livelist feature is enabled, + * add the entry DD_FIELD_LIVELIST to ZAP. + */ + if (origin != NULL && + spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) { + objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_dir_zapify(dd, tx); + uint64_t obj = dsl_deadlist_alloc(mos, tx); + VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST, + sizeof (uint64_t), 1, &obj, tx)); + spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx); + } + + /* * Since we're creating a new node we know it's a leaf, so we can * initialize the counts if the limit feature is active. 
*/ @@ -2036,12 +2083,149 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) } } +/* + * Check if the percentage of blocks shared between the clone and the + * snapshot (as opposed to those that are clone only) is below a certain + * threshold + */ +boolean_t +dsl_livelist_should_disable(dsl_dataset_t *ds) +{ + uint64_t used, referenced; + int percent_shared; + + used = dsl_dir_get_usedds(ds->ds_dir); + referenced = dsl_get_referenced(ds); + ASSERT3U(referenced, >=, 0); + ASSERT3U(used, >=, 0); + if (referenced == 0) + return (B_FALSE); + percent_shared = (100 * (referenced - used)) / referenced; + if (percent_shared <= zfs_livelist_min_percent_shared) + return (B_TRUE); + return (B_FALSE); +} + +/* + * Check if it is possible to combine two livelist entries into one. + * This is the case if the combined number of 'live' blkptrs (ALLOCs that + * don't have a matching FREE) is under the maximum sublist size. + * We check this by subtracting twice the total number of frees from the total + * number of blkptrs. FREEs are counted twice because each FREE blkptr + * will cancel out an ALLOC blkptr when the livelist is processed. + */ +static boolean_t +dsl_livelist_should_condense(dsl_deadlist_entry_t *first, + dsl_deadlist_entry_t *next) +{ + uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed + + next->dle_bpobj.bpo_phys->bpo_num_freed; + uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs + + next->dle_bpobj.bpo_phys->bpo_num_blkptrs; + if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries) + return (B_TRUE); + return (B_FALSE); +} + +typedef struct try_condense_arg { + spa_t *spa; + dsl_dataset_t *ds; +} try_condense_arg_t; + +/* + * Iterate over the livelist entries, searching for a pair to condense. + * A nonzero return value means stop, 0 means keep looking. + */ static int -deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first) { - dsl_deadlist_t *dl = arg; - dsl_deadlist_insert(dl, bp, tx); - return (0); + try_condense_arg_t *tca = arg; + spa_t *spa = tca->spa; + dsl_dataset_t *ds = tca->ds; + dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; + dsl_deadlist_entry_t *next; + + /* The condense thread has not yet been created at import */ + if (spa->spa_livelist_condense_zthr == NULL) + return (1); + + /* A condense is already in progress */ + if (spa->spa_to_condense.ds != NULL) + return (1); + + next = AVL_NEXT(&ll->dl_tree, &first->dle_node); + /* The livelist has only one entry - don't condense it */ + if (next == NULL) + return (1); + + /* Next is the newest entry - don't condense it */ + if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL) + return (1); + + /* This pair is not ready to condense but keep looking */ + if (!dsl_livelist_should_condense(first, next)) + return (0); + + /* + * Add a ref to prevent the dataset from being evicted while + * the condense zthr or synctask are running. 
Ref will be + * released at the end of the condense synctask + */ + dmu_buf_add_ref(ds->ds_dbuf, spa); + + spa->spa_to_condense.ds = ds; + spa->spa_to_condense.first = first; + spa->spa_to_condense.next = next; + spa->spa_to_condense.syncing = B_FALSE; + spa->spa_to_condense.cancelled = B_FALSE; + + zthr_wakeup(spa->spa_livelist_condense_zthr); + return (1); +} + +static void +dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_dir_t *dd = ds->ds_dir; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist); + + /* Check if we need to add a new sub-livelist */ + if (last == NULL) { + /* The livelist is empty */ + dsl_deadlist_add_key(&dd->dd_livelist, + tx->tx_txg - 1, tx); + } else if (spa_sync_pass(spa) == 1) { + /* + * Check if the newest entry is full. If it is, make a new one. + * We only do this once per sync because we could overfill a + * sublist in one sync pass and don't want to add another entry + * for a txg that is already represented. This ensures that + * blkptrs born in the same txg are stored in the same sublist. + */ + bpobj_t bpobj = last->dle_bpobj; + uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs; + uint64_t free = bpobj.bpo_phys->bpo_num_freed; + uint64_t alloc = all - free; + if (alloc > zfs_livelist_max_entries) { + dsl_deadlist_add_key(&dd->dd_livelist, + tx->tx_txg - 1, tx); + } + } + + /* Insert each entry into the on-disk livelist */ + bplist_iterate(&dd->dd_pending_allocs, + dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx); + bplist_iterate(&dd->dd_pending_frees, + dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx); + + /* Attempt to condense every pair of adjacent entries */ + try_condense_arg_t arg = { + .spa = spa, + .ds = ds + }; + dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense, + &arg); } void @@ -2050,7 +2234,14 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) objset_t *os = ds->ds_objset; bplist_iterate(&ds->ds_pending_deadlist, - deadlist_enqueue_cb, &ds->ds_deadlist, tx); + dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx); + + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) { + dsl_flush_pending_livelist(ds, tx); + if (dsl_livelist_should_disable(ds)) { + dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE); + } + } dsl_bookmark_sync_done(ds, tx); @@ -3335,6 +3526,8 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) uint64_t oldnext_obj; int64_t delta; + ASSERT(nvlist_empty(ddpa->err_ds)); + VERIFY0(promote_hold(ddpa, dp, FTAG)); hds = ddpa->ddpa_clone; @@ -3519,6 +3712,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique; + /* + * Since livelists are specific to a clone's origin txg, they + * are no longer accurate. Destroy the livelist from the clone being + * promoted. If the origin dataset is a clone, destroy its livelist + * as well. + */ + dsl_dir_remove_livelist(dd, tx, B_TRUE); + dsl_dir_remove_livelist(origin_ds->ds_dir, tx, B_TRUE); + /* log history record */ spa_history_log_internal_ds(hds, "promote", tx, ""); @@ -3990,6 +4192,14 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_scan_ds_clone_swapped(origin_head, clone, tx); + /* + * Destroy any livelists associated with the clone or the origin, + * since after the swap the corresponding livelists are no longer + * valid. 
+ */ + dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE); + dsl_dir_remove_livelist(origin_head->ds_dir, tx, B_TRUE); + spa_history_log_internal_ds(clone, "clone swap", tx, "parent=%s", origin_head->ds_dir->dd_myname); } diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 9e3a3331b..25878f0ea 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -20,16 +20,16 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ -#include <sys/dsl_dataset.h> #include <sys/dmu.h> #include <sys/refcount.h> #include <sys/zap.h> #include <sys/zfs_context.h> #include <sys/dsl_pool.h> +#include <sys/dsl_dataset.h> /* * Deadlist concurrency: @@ -51,6 +51,68 @@ * provides its own locking, and dl_oldfmt is immutable. */ +/* + * Livelist Overview + * ================ + * + * Livelists use the same 'deadlist_t' struct as deadlists and are also used + * to track blkptrs over the lifetime of a dataset. Livelists however, belong + * to clones and track the blkptrs that are clone-specific (were born after + * the clone's creation). The exception is embedded block pointers which are + * not included in livelists because they do not need to be freed. + * + * When it comes time to delete the clone, the livelist provides a quick + * reference as to what needs to be freed. For this reason, livelists also track + * when clone-specific blkptrs are freed before deletion to prevent double + * frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the + * deletion algorithm iterates backwards over the livelist, matching + * FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists + * are also updated in the case when blkptrs are remapped: the old version + * of the blkptr is cancelled out with a FREE and the new version is tracked + * with an ALLOC. + * + * To bound the amount of memory required for deletion, livelists over a + * certain size are spread over multiple entries. Entries are grouped by + * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will + * be in the same entry. This allows us to delete livelists incrementally + * over multiple syncs, one entry at a time. + * + * During the lifetime of the clone, livelists can get extremely large. + * Their size is managed by periodic condensing (preemptively cancelling out + * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when + * the shared space between the clone and its origin is so small that it + * doesn't make sense to use livelists anymore. + */ + +/* + * The threshold sublist size at which we create a new sub-livelist for the + * next txg. However, since blkptrs of the same transaction group must be in + * the same sub-list, the actual sublist size may exceed this. When picking the + * size we had to balance the fact that larger sublists mean fewer sublists + * (decreasing the cost of insertion) against the consideration that sublists + * will be loaded into memory and shouldn't take up an inordinate amount of + * space. We settled on ~500000 entries, corresponding to roughly 128M. + */ +unsigned long zfs_livelist_max_entries = 500000; + +/* + * We can approximate how much of a performance gain a livelist will give us + * based on the percentage of blocks shared between the clone and its origin. 
+ * 0 percent shared means that the clone has completely diverged and that the + * old method is maximally effective: every read from the block tree will + * result in lots of frees. Livelists give us gains when they track blocks + * scattered across the tree, when one read in the old method might only + * result in a few frees. Once the clone has been overwritten enough, + * writes are no longer sparse and we'll no longer get much of a benefit from + * tracking them with a livelist. We chose a lower limit of 75 percent shared + * (25 percent overwritten). This means that 1/4 of all block pointers will be + * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists + * to make deletion 4x faster. Once the amount of shared space drops below this + * threshold, the clone will revert to the old deletion method. + */ +int zfs_livelist_min_percent_shared = 75; + + static int dsl_deadlist_compare(const void *arg1, const void *arg2) { @@ -89,6 +151,23 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) } void +dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args) +{ + dsl_deadlist_entry_t *dle; + + ASSERT(dsl_deadlist_is_open(dl)); + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + mutex_exit(&dl->dl_lock); + for (dle = avl_first(&dl->dl_tree); dle != NULL; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + if (func(args, dle) != 0) + break; + } +} + +void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) { dmu_object_info_t doi; @@ -188,7 +267,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) static void dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, - const blkptr_t *bp, dmu_tx_t *tx) + const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(MUTEX_HELD(&dl->dl_lock)); if (dle->dle_bpobj.bpo_object == @@ -200,7 +279,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, dle->dle_mintxg, obj, tx)); } - bpobj_enqueue(&dle->dle_bpobj, bp, tx); + bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx); } static void @@ -221,14 +300,15 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, } void -dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) +dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; avl_index_t where; if (dl->dl_oldfmt) { - bpobj_enqueue(&dl->dl_bpobj, bp, tx); + bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx); return; } @@ -236,10 +316,12 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) dsl_deadlist_load_tree(dl); dmu_buf_will_dirty(dl->dl_dbuf, tx); + + int sign = bp_freed ? 
-1 : +1; dl->dl_phys->dl_used += - bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); - dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); - dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); + sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); + dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); + dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); dle_tofind.dle_mintxg = bp->blk_birth; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); @@ -255,10 +337,26 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) } ASSERT3P(dle, !=, NULL); - dle_enqueue(dl, dle, bp, tx); + dle_enqueue(dl, dle, bp, bp_freed, tx); mutex_exit(&dl->dl_lock); } +int +dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, B_FALSE, tx); + return (0); +} + +int +dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, B_TRUE, tx); + return (0); +} + /* * Insert new key in deadlist, which must be > all current entries. * mintxg is not inclusive. @@ -317,6 +415,108 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) } /* + * Remove a deadlist entry and all of its contents by removing the entry from + * the deadlist's avl tree, freeing the entry's bpobj and adjusting the + * deadlist's space accounting accordingly. + */ +void +dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) +{ + uint64_t used, comp, uncomp; + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + objset_t *os = dl->dl_os; + + if (dl->dl_oldfmt) + return; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); + VERIFY3P(dle, !=, NULL); + + avl_remove(&dl->dl_tree, dle); + VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx)); + VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); + dl->dl_phys->dl_used -= used; + dl->dl_phys->dl_comp -= comp; + dl->dl_phys->dl_uncomp -= uncomp; + if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) { + bpobj_decr_empty(os, tx); + } else { + bpobj_free(os, dle->dle_bpobj.bpo_object, tx); + } + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + mutex_exit(&dl->dl_lock); +} + +/* + * Clear out the contents of a deadlist_entry by freeing its bpobj, + * replacing it with an empty bpobj and adjusting the deadlist's + * space accounting + */ +void +dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, + dmu_tx_t *tx) +{ + uint64_t new_obj, used, comp, uncomp; + objset_t *os = dl->dl_os; + + mutex_enter(&dl->dl_lock); + VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx)); + VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); + dl->dl_phys->dl_used -= used; + dl->dl_phys->dl_comp -= comp; + dl->dl_phys->dl_uncomp -= uncomp; + if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) + bpobj_decr_empty(os, tx); + else + bpobj_free(os, dle->dle_bpobj.bpo_object, tx); + bpobj_close(&dle->dle_bpobj); + new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj)); + VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg, + new_obj, tx)); + ASSERT(bpobj_is_empty(&dle->dle_bpobj)); + mutex_exit(&dl->dl_lock); +} + +/* + * Return the first entry in deadlist's avl tree + */ +dsl_deadlist_entry_t * +dsl_deadlist_first(dsl_deadlist_t *dl) +{ + dsl_deadlist_entry_t *dle; + + 
mutex_enter(&dl->dl_lock);
+	dsl_deadlist_load_tree(dl);
+	dle = avl_first(&dl->dl_tree);
+	mutex_exit(&dl->dl_lock);
+
+	return (dle);
+}
+
+/*
+ * Return the last entry in deadlist's avl tree
+ */
+dsl_deadlist_entry_t *
+dsl_deadlist_last(dsl_deadlist_t *dl)
+{
+	dsl_deadlist_entry_t *dle;
+
+	mutex_enter(&dl->dl_lock);
+	dsl_deadlist_load_tree(dl);
+	dle = avl_last(&dl->dl_tree);
+	mutex_exit(&dl->dl_lock);
+
+	return (dle);
+}
+
+/*
 * Walk ds's snapshots to regenerate ZAP & AVL.
 */
 static void
@@ -478,10 +678,11 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
 }
 
 static int
-dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
 {
 	dsl_deadlist_t *dl = arg;
-	dsl_deadlist_insert(dl, bp, tx);
+	dsl_deadlist_insert(dl, bp, bp_freed, tx);
 	return (0);
 }
 
@@ -572,3 +773,109 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
 	}
 	mutex_exit(&dl->dl_lock);
 }
+
+typedef struct livelist_entry {
+	const blkptr_t *le_bp;
+	avl_node_t le_node;
+} livelist_entry_t;
+
+static int
+livelist_compare(const void *larg, const void *rarg)
+{
+	const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp;
+	const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp;
+
+	/* Sort them according to dva[0] */
+	uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
+	uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
+
+	if (l_dva0_vdev != r_dva0_vdev)
+		return (AVL_CMP(l_dva0_vdev, r_dva0_vdev));
+
+	/* if vdevs are equal, sort by offsets. */
+	uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
+	uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
+	if (l_dva0_offset == r_dva0_offset)
+		ASSERT3U(l->blk_birth, ==, r->blk_birth);
+	return (AVL_CMP(l_dva0_offset, r_dva0_offset));
+}
+
+struct livelist_iter_arg {
+	avl_tree_t *avl;
+	bplist_t *to_free;
+	zthr_t *t;
+};
+
+/*
+ * Expects an AVL tree which is incrementally filled with FREE blkptrs
+ * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
+ * corresponding FREE are stored in the supplied bplist.
+ */
+static int
+dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
+{
+	struct livelist_iter_arg *lia = arg;
+	avl_tree_t *avl = lia->avl;
+	bplist_t *to_free = lia->to_free;
+	zthr_t *t = lia->t;
+	ASSERT(tx == NULL);
+
+	if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
+		return (SET_ERROR(EINTR));
+	if (bp_freed) {
+		livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t),
+		    KM_SLEEP);
+		blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+		*temp_bp = *bp;
+		node->le_bp = temp_bp;
+		avl_add(avl, node);
+	} else {
+		livelist_entry_t node;
+		node.le_bp = bp;
+		livelist_entry_t *found = avl_find(avl, &node, NULL);
+		if (found != NULL) {
+			avl_remove(avl, found);
+			kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t));
+			kmem_free(found, sizeof (livelist_entry_t));
+		} else {
+			bplist_append(to_free, bp);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Accepts a bpobj and a bplist.
Will insert into the bplist the blkptrs + * which have an ALLOC entry but no matching FREE + */ +int +dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, + uint64_t *size) +{ + avl_tree_t avl; + avl_create(&avl, livelist_compare, sizeof (livelist_entry_t), + offsetof(livelist_entry_t, le_node)); + + /* process the sublist */ + struct livelist_iter_arg arg = { + .avl = &avl, + .to_free = to_free, + .t = t + }; + int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size); + + avl_destroy(&avl); + return (err); +} + +#if defined(_KERNEL) +/* CSTYLED */ +module_param(zfs_livelist_max_entries, ulong, 0644); +MODULE_PARM_DESC(zfs_livelist_max_entries, + "Size to start the next sub-livelist in a livelist"); + +module_param(zfs_livelist_min_percent_shared, int, 0644); +MODULE_PARM_DESC(zfs_livelist_min_percent_shared, + "Threshold at which livelist is disabled"); +#endif diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 2f98e87ed..5c483c5dd 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -45,6 +45,9 @@ #include <sys/dmu_impl.h> #include <sys/zvol.h> #include <sys/zcp.h> +#include <sys/dsl_deadlist.h> +#include <sys/zthr.h> +#include <sys/spa_impl.h> int dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) @@ -120,7 +123,7 @@ struct process_old_arg { }; static int -process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { struct process_old_arg *poa = arg; dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; @@ -128,7 +131,7 @@ process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) ASSERT(!BP_IS_HOLE(bp)); if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { - dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); + dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx); if (poa->ds_prev && !poa->after_branch_point && bp->blk_birth > dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { @@ -852,6 +855,127 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) dmu_object_free_zapified(mos, ddobj, tx); } +static void +dsl_clone_destroy_assert(dsl_dir_t *dd) +{ + uint64_t used, comp, uncomp; + + ASSERT(dsl_dir_is_clone(dd)); + dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp); + + ASSERT3U(dsl_dir_phys(dd)->dd_used_bytes, ==, used); + ASSERT3U(dsl_dir_phys(dd)->dd_compressed_bytes, ==, comp); + /* + * Greater than because we do not track embedded block pointers in + * the livelist + */ + ASSERT3U(dsl_dir_phys(dd)->dd_uncompressed_bytes, >=, uncomp); + + ASSERT(list_is_empty(&dd->dd_pending_allocs.bpl_list)); + ASSERT(list_is_empty(&dd->dd_pending_frees.bpl_list)); +} + +/* + * Start the delete process for a clone. Free its zil, verify the space usage + * and queue the blkptrs for deletion by adding the livelist to the pool-wide + * delete queue. 
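+ * The pool-wide delete queue is the DMU_POOL_DELETED_CLONES ZAP created
+ * below; the livelist delete zthr (spa_livelist_delete_cb) works through
+ * that ZAP one sub-livelist at a time.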
+ */ +static void +dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t zap_obj, to_delete, used, comp, uncomp; + objset_t *os; + dsl_dir_t *dd = ds->ds_dir; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + VERIFY0(dmu_objset_from_ds(ds, &os)); + + /* Check that the clone is in a correct state to be deleted */ + dsl_clone_destroy_assert(dd); + + /* Destroy the zil */ + zil_destroy_sync(dmu_objset_zil(os), tx); + + VERIFY0(zap_lookup(mos, dd->dd_object, + DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &to_delete)); + /* Initialize deleted_clones entry to track livelists to cleanup */ + int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); + if (error == ENOENT) { + zap_obj = zap_create(mos, DMU_OTN_ZAP_METADATA, + DMU_OT_NONE, 0, tx); + VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, + &(zap_obj), tx)); + spa->spa_livelists_to_delete = zap_obj; + } else if (error != 0) { + zfs_panic_recover("zfs: error %d was returned while looking " + "up DMU_POOL_DELETED_CLONES in the zap"); + return; + } + VERIFY0(zap_add_int(mos, zap_obj, to_delete, tx)); + + /* Clone is no longer using space, now tracked by dp_free_dir */ + dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp); + dsl_dir_diduse_space(dd, DD_USED_HEAD, + -used, -comp, -dsl_dir_phys(dd)->dd_uncompressed_bytes, + tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); + dsl_dir_remove_livelist(dd, tx, B_FALSE); + zthr_wakeup(spa->spa_livelist_delete_zthr); +} + +/* + * Move the bptree into the pool's list of trees to clean up, update space + * accounting information and destroy the zil. + */ +void +dsl_async_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t used, comp, uncomp; + objset_t *os; + + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + + zil_destroy_sync(dmu_objset_zil(os), tx); + + if (!spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_ASYNC_DESTROY)) { + dsl_scan_t *scn = dp->dp_scan; + spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY, + tx); + dp->dp_bptree_obj = bptree_alloc(mos, tx); + VERIFY0(zap_add(mos, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj, tx)); + ASSERT(!scn->scn_async_destroying); + scn->scn_async_destroying = B_TRUE; + } + + used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; + comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; + uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; + + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + dsl_dataset_phys(ds)->ds_unique_bytes == used); + + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + bptree_add(mos, dp->dp_bptree_obj, + &dsl_dataset_phys(ds)->ds_bp, + dsl_dataset_phys(ds)->ds_prev_snap_txg, + used, comp, uncomp, tx); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + -used, -comp, -uncomp, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); +} + void dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) { @@ -911,7 +1035,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) } /* - * Destroy the deadlist. Unless it's a clone, the + * Destroy the deadlist. Unless it's a clone, the * deadlist should be empty since the dataset has no snapshots. 
* (If it's a clone, it's safe to ignore the deadlist contents * since they are still referenced by the origin snapshot.) @@ -924,51 +1048,18 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) if (dsl_dataset_remap_deadlist_exists(ds)) dsl_dataset_destroy_remap_deadlist(ds, tx); - objset_t *os; - VERIFY0(dmu_objset_from_ds(ds, &os)); - - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { - old_synchronous_dataset_destroy(ds, tx); + /* + * Each destroy is responsible for both destroying (enqueuing + * to be destroyed) the blkptrs comprising the dataset as well as + * those belonging to the zil. + */ + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) { + dsl_async_clone_destroy(ds, tx); + } else if (spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_ASYNC_DESTROY)) { + dsl_async_dataset_destroy(ds, tx); } else { - /* - * Move the bptree into the pool's list of trees to - * clean up and update space accounting information. - */ - uint64_t used, comp, uncomp; - - zil_destroy_sync(dmu_objset_zil(os), tx); - - if (!spa_feature_is_active(dp->dp_spa, - SPA_FEATURE_ASYNC_DESTROY)) { - dsl_scan_t *scn = dp->dp_scan; - spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY, - tx); - dp->dp_bptree_obj = bptree_alloc(mos, tx); - VERIFY0(zap_add(mos, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, - &dp->dp_bptree_obj, tx)); - ASSERT(!scn->scn_async_destroying); - scn->scn_async_destroying = B_TRUE; - } - - used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; - comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; - uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; - - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - dsl_dataset_phys(ds)->ds_unique_bytes == used); - - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - bptree_add(mos, dp->dp_bptree_obj, - &dsl_dataset_phys(ds)->ds_bp, - dsl_dataset_phys(ds)->ds_prev_snap_txg, - used, comp, uncomp, tx); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, - -used, -comp, -uncomp, tx); - dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, - used, comp, uncomp, tx); + old_synchronous_dataset_destroy(ds, tx); } if (ds->ds_prev != NULL) { diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 741ca232e..7b3c892c0 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2014 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
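The dsl_dir.c hunks below wire livelists into dataset directories (open and close on hold and evict, and removal when a clone is promoted, swapped, diverged, or destroyed). For orientation, here is a small standalone C sketch of the arithmetic behind the two heuristics added above; the sizes are made up, and the literals 500000 and 75 stand in for the default values of zfs_livelist_max_entries and zfs_livelist_min_percent_shared:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/*
	 * dsl_livelist_should_condense(): FREEs are counted twice because
	 * each FREE cancels out one ALLOC when the livelist is processed.
	 */
	uint64_t num_blkptrs = 700000;	/* ALLOC + FREE records in the pair */
	uint64_t num_freed = 150000;	/* FREE records in the pair */
	uint64_t live = num_blkptrs - 2 * num_freed;	/* 400000 live blkptrs */
	printf("condense pair: %s\n",
	    live < 500000 ? "yes, fits under the sublist limit" : "no");

	/*
	 * dsl_livelist_should_disable(): how much of the clone's referenced
	 * space is still shared with the origin snapshot.
	 */
	uint64_t referenced = 100ULL << 30;	/* 100 GiB reachable from the clone */
	uint64_t used = 30ULL << 30;		/* 30 GiB unique to the clone */
	int percent_shared = (int)((100 * (referenced - used)) / referenced);
	printf("percent_shared = %d -> %s\n", percent_shared,
	    percent_shared <= 75 ? "disable the livelist" : "keep the livelist");
	return (0);
}

With these made-up numbers the pair condenses (400000 is under the 500000 sublist limit) and, separately, a clone that has overwritten 30 of its 100 GiB is only 70 percent shared, so its livelist would be torn down by dsl_dir_remove_livelist().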
@@ -48,6 +48,7 @@ #include <sys/policy.h> #include <sys/zfs_znode.h> #include <sys/zvol.h> +#include <sys/zthr.h> #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -155,6 +156,9 @@ dsl_dir_evict_async(void *dbu) spa_async_close(dd->dd_pool->dp_spa, dd); + if (dsl_deadlist_is_open(&dd->dd_livelist)) + dsl_dir_livelist_close(dd); + dsl_prop_fini(dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); @@ -255,6 +259,16 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_origin_txg = origin_phys->ds_creation_txg; dmu_buf_rele(origin_bonus, FTAG); + if (dsl_dir_is_zapified(dd)) { + uint64_t obj; + err = zap_lookup(dp->dp_meta_objset, + dd->dd_object, DD_FIELD_LIVELIST, + sizeof (uint64_t), 1, &obj); + if (err == 0) + dsl_dir_livelist_open(dd, obj); + else if (err != ENOENT) + goto errout; + } } dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, @@ -263,6 +277,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, if (winner != NULL) { if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); + if (dsl_deadlist_is_open(&dd->dd_livelist)) + dsl_dir_livelist_close(dd); dsl_prop_fini(dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); @@ -291,6 +307,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, errout: if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); + if (dsl_deadlist_is_open(&dd->dd_livelist)) + dsl_dir_livelist_close(dd); dsl_prop_fini(dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); @@ -2178,6 +2196,90 @@ dsl_dir_is_zapified(dsl_dir_t *dd) return (doi.doi_type == DMU_OTN_ZAP_METADATA); } +void +dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa, + SPA_FEATURE_LIVELIST)); + dsl_deadlist_open(&dd->dd_livelist, mos, obj); + bplist_create(&dd->dd_pending_allocs); + bplist_create(&dd->dd_pending_frees); +} + +void +dsl_dir_livelist_close(dsl_dir_t *dd) +{ + dsl_deadlist_close(&dd->dd_livelist); + bplist_destroy(&dd->dd_pending_allocs); + bplist_destroy(&dd->dd_pending_frees); +} + +void +dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total) +{ + uint64_t obj; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + livelist_condense_entry_t to_condense = spa->spa_to_condense; + + if (!dsl_deadlist_is_open(&dd->dd_livelist)) + return; + + /* + * If the livelist being removed is set to be condensed, stop the + * condense zthr and indicate the cancellation in the spa_to_condense + * struct in case the condense no-wait synctask has already started + */ + zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; + if (ll_condense_thread != NULL && + (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) { + /* + * We use zthr_wait_cycle_done instead of zthr_cancel + * because we don't want to destroy the zthr, just have + * it skip its current task. + */ + spa->spa_to_condense.cancelled = B_TRUE; + zthr_wait_cycle_done(ll_condense_thread); + /* + * If we've returned from zthr_wait_cycle_done without + * clearing the to_condense data structure it's either + * because the no-wait synctask has started (which is + * indicated by 'syncing' field of to_condense) and we + * can expect it to clear to_condense on its own. + * Otherwise, we returned before the zthr ran. The + * checkfunc will now fail as cancelled == B_TRUE so we + * can safely NULL out ds, allowing a different dir's + * livelist to be condensed. 
+ * + * We can be sure that the to_condense struct will not + * be repopulated at this stage because both this + * function and dsl_livelist_try_condense execute in + * syncing context. + */ + if ((spa->spa_to_condense.ds != NULL) && + !spa->spa_to_condense.syncing) { + dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, + spa); + spa->spa_to_condense.ds = NULL; + } + } + + dsl_dir_livelist_close(dd); + int err = zap_lookup(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj); + if (err == 0) { + VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_LIVELIST, tx)); + if (total) { + dsl_deadlist_free(dp->dp_meta_objset, obj, tx); + spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); + } + } else { + ASSERT3U(err, !=, ENOENT); + } +} + #if defined(_KERNEL) EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 49e527912..c342f0c51 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -721,7 +721,8 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * Now that the datasets have been completely synced, we can * clean up our in-memory structures accumulated while syncing: * - * - move dead blocks from the pending deadlist to the on-disk deadlist + * - move dead blocks from the pending deadlist and livelists + * to the on-disk versions * - release hold from dsl_dataset_dirty() * - release key mapping hold from dsl_dataset_dirty() */ diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f25a559a9..d6956f560 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -3103,8 +3103,18 @@ dsl_scan_update_stats(dsl_scan_t *scn) } static int -dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { + ASSERT(!bp_freed); + return (dsl_scan_free_block_cb(arg, bp, tx)); +} + +static int +dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(!bp_freed); dsl_scan_t *scn = arg; const dva_t *dva = &bp->blk_dva[0]; @@ -3123,6 +3133,7 @@ dsl_scan_active(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t used = 0, comp, uncomp; + boolean_t clones_left; if (spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); @@ -3136,7 +3147,8 @@ dsl_scan_active(dsl_scan_t *scn) (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); } - return (used != 0); + clones_left = spa_livelist_delete_check(spa); + return ((used != 0) || (clones_left)); } static boolean_t @@ -3233,7 +3245,7 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, - dsl_scan_free_block_cb, scn, tx); + bpobj_dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); scn->scn_zio_root = NULL; @@ -3330,7 +3342,8 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); } - if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) { + if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && + !spa_livelist_delete_check(spa)) { /* finished; verify that space accounting went to zero */ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 6af162edb..da221fb2e 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -233,6 
+233,27 @@ uint64_t zfs_max_missing_tvds_scan = 0; boolean_t zfs_pause_spa_sync = B_FALSE; /* + * Variables to indicate the livelist condense zthr func should wait at certain + * points for the livelist to be removed - used to test condense/destroy races + */ +int zfs_livelist_condense_zthr_pause = 0; +int zfs_livelist_condense_sync_pause = 0; + +/* + * Variables to track whether or not condense cancellation has been + * triggered in testing. + */ +int zfs_livelist_condense_sync_cancel = 0; +int zfs_livelist_condense_zthr_cancel = 0; + +/* + * Variable to track whether or not extra ALLOC blkptrs were added to a + * livelist entry while it was being condensed (caused by the way we track + * remapped blkptrs in dbuf_remap_impl) + */ +int zfs_livelist_condense_new_alloc = 0; + +/* * ========================================================================== * SPA properties routines * ========================================================================== @@ -1481,6 +1502,27 @@ spa_unload_log_sm_metadata(spa_t *spa) spa->spa_unflushed_stats.sus_blocklimit = 0; } +static void +spa_destroy_aux_threads(spa_t *spa) +{ + if (spa->spa_condense_zthr != NULL) { + zthr_destroy(spa->spa_condense_zthr); + spa->spa_condense_zthr = NULL; + } + if (spa->spa_checkpoint_discard_zthr != NULL) { + zthr_destroy(spa->spa_checkpoint_discard_zthr); + spa->spa_checkpoint_discard_zthr = NULL; + } + if (spa->spa_livelist_delete_zthr != NULL) { + zthr_destroy(spa->spa_livelist_delete_zthr); + spa->spa_livelist_delete_zthr = NULL; + } + if (spa->spa_livelist_condense_zthr != NULL) { + zthr_destroy(spa->spa_livelist_condense_zthr); + spa->spa_livelist_condense_zthr = NULL; + } +} + /* * Opposite of spa_load(). */ @@ -1552,15 +1594,7 @@ spa_unload(spa_t *spa) spa->spa_vdev_removal = NULL; } - if (spa->spa_condense_zthr != NULL) { - zthr_destroy(spa->spa_condense_zthr); - spa->spa_condense_zthr = NULL; - } - - if (spa->spa_checkpoint_discard_zthr != NULL) { - zthr_destroy(spa->spa_checkpoint_discard_zthr); - spa->spa_checkpoint_discard_zthr = NULL; - } + spa_destroy_aux_threads(spa); spa_condense_fini(spa); @@ -2335,6 +2369,376 @@ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) return (SET_ERROR(err)); } +boolean_t +spa_livelist_delete_check(spa_t *spa) +{ + return (spa->spa_livelists_to_delete != 0); +} + +/* ARGSUSED */ +static boolean_t +spa_livelist_delete_cb_check(void *arg, zthr_t *z) +{ + spa_t *spa = arg; + return (spa_livelist_delete_check(spa)); +} + +static int +delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + spa_t *spa = arg; + zio_free(spa, tx->tx_txg, bp); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, + -bp_get_dsize_sync(spa, bp), + -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); + return (0); +} + +static int +dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) +{ + int err; + zap_cursor_t zc; + zap_attribute_t za; + zap_cursor_init(&zc, os, zap_obj); + err = zap_cursor_retrieve(&zc, &za); + zap_cursor_fini(&zc); + if (err == 0) + *llp = za.za_first_integer; + return (err); +} + +/* + * Components of livelist deletion that must be performed in syncing + * context: freeing block pointers and updating the pool-wide data + * structures to indicate how much work is left to do + */ +typedef struct sublist_delete_arg { + spa_t *spa; + dsl_deadlist_t *ll; + uint64_t key; + bplist_t *to_free; +} sublist_delete_arg_t; + +static void +sublist_delete_sync(void *arg, dmu_tx_t *tx) +{ + sublist_delete_arg_t *sda = arg; + spa_t *spa = sda->spa; + dsl_deadlist_t 
*ll = sda->ll; + uint64_t key = sda->key; + bplist_t *to_free = sda->to_free; + + bplist_iterate(to_free, delete_blkptr_cb, spa, tx); + dsl_deadlist_remove_entry(ll, key, tx); +} + +typedef struct livelist_delete_arg { + spa_t *spa; + uint64_t ll_obj; + uint64_t zap_obj; +} livelist_delete_arg_t; + +static void +livelist_delete_sync(void *arg, dmu_tx_t *tx) +{ + livelist_delete_arg_t *lda = arg; + spa_t *spa = lda->spa; + uint64_t ll_obj = lda->ll_obj; + uint64_t zap_obj = lda->zap_obj; + objset_t *mos = spa->spa_meta_objset; + uint64_t count; + + /* free the livelist and decrement the feature count */ + VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); + dsl_deadlist_free(mos, ll_obj, tx); + spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); + VERIFY0(zap_count(mos, zap_obj, &count)); + if (count == 0) { + /* no more livelists to delete */ + VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, tx)); + VERIFY0(zap_destroy(mos, zap_obj, tx)); + spa->spa_livelists_to_delete = 0; + } +} + +/* + * Load in the value for the livelist to be removed and open it. Then, + * load its first sublist and determine which block pointers should actually + * be freed. Then, call a synctask which performs the actual frees and updates + * the pool-wide livelist data. + */ +/* ARGSUSED */ +void +spa_livelist_delete_cb(void *arg, zthr_t *z) +{ + spa_t *spa = arg; + uint64_t ll_obj = 0, count; + objset_t *mos = spa->spa_meta_objset; + uint64_t zap_obj = spa->spa_livelists_to_delete; + /* + * Determine the next livelist to delete. This function should only + * be called if there is at least one deleted clone. + */ + VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); + VERIFY0(zap_count(mos, ll_obj, &count)); + if (count > 0) { + dsl_deadlist_t ll = { 0 }; + dsl_deadlist_entry_t *dle; + bplist_t to_free; + dsl_deadlist_open(&ll, mos, ll_obj); + dle = dsl_deadlist_first(&ll); + ASSERT3P(dle, !=, NULL); + bplist_create(&to_free); + int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, + z, NULL); + if (err == 0) { + sublist_delete_arg_t sync_arg = { + .spa = spa, + .ll = &ll, + .key = dle->dle_mintxg, + .to_free = &to_free + }; + zfs_dbgmsg("deleting sublist (id %llu) from" + " livelist %llu, %d remaining", + dle->dle_bpobj.bpo_object, ll_obj, count - 1); + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + sublist_delete_sync, &sync_arg, 0, + ZFS_SPACE_CHECK_DESTROY)); + } else { + ASSERT(err == EINTR); + } + bplist_clear(&to_free); + bplist_destroy(&to_free); + dsl_deadlist_close(&ll); + } else { + livelist_delete_arg_t sync_arg = { + .spa = spa, + .ll_obj = ll_obj, + .zap_obj = zap_obj + }; + zfs_dbgmsg("deletion of livelist %llu completed", ll_obj); + VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, + &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); + } +} + +void +spa_start_livelist_destroy_thread(spa_t *spa) +{ + ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); + spa->spa_livelist_delete_zthr = zthr_create( + spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa); +} + +typedef struct livelist_new_arg { + bplist_t *allocs; + bplist_t *frees; +} livelist_new_arg_t; + +static int +livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(tx == NULL); + livelist_new_arg_t *lna = arg; + if (bp_freed) { + bplist_append(lna->frees, bp); + } else { + bplist_append(lna->allocs, bp); + zfs_livelist_condense_new_alloc++; + } + return (0); +} + +typedef struct livelist_condense_arg { + spa_t *spa; + bplist_t to_keep; + uint64_t 
first_size; + uint64_t next_size; +} livelist_condense_arg_t; + +static void +spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) +{ + livelist_condense_arg_t *lca = arg; + spa_t *spa = lca->spa; + bplist_t new_frees; + dsl_dataset_t *ds = spa->spa_to_condense.ds; + + /* Have we been cancelled? */ + if (spa->spa_to_condense.cancelled) { + zfs_livelist_condense_sync_cancel++; + goto out; + } + + dsl_deadlist_entry_t *first = spa->spa_to_condense.first; + dsl_deadlist_entry_t *next = spa->spa_to_condense.next; + dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; + + /* + * It's possible that the livelist was changed while the zthr was + * running. Therefore, we need to check for new blkptrs in the two + * entries being condensed and continue to track them in the livelist. + * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), + * it's possible that the newly added blkptrs are FREEs or ALLOCs so + * we need to sort them into two different bplists. + */ + uint64_t first_obj = first->dle_bpobj.bpo_object; + uint64_t next_obj = next->dle_bpobj.bpo_object; + uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; + uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; + + bplist_create(&new_frees); + livelist_new_arg_t new_bps = { + .allocs = &lca->to_keep, + .frees = &new_frees, + }; + + if (cur_first_size > lca->first_size) { + VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, + livelist_track_new_cb, &new_bps, lca->first_size)); + } + if (cur_next_size > lca->next_size) { + VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, + livelist_track_new_cb, &new_bps, lca->next_size)); + } + + dsl_deadlist_clear_entry(first, ll, tx); + ASSERT(bpobj_is_empty(&first->dle_bpobj)); + dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); + + bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); + bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); + bplist_destroy(&new_frees); + + char dsname[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(ds, dsname); + zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " + "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " + "(%llu blkptrs)", tx->tx_txg, dsname, ds->ds_object, first_obj, + cur_first_size, next_obj, cur_next_size, + first->dle_bpobj.bpo_object, + first->dle_bpobj.bpo_phys->bpo_num_blkptrs); +out: + dmu_buf_rele(ds->ds_dbuf, spa); + spa->spa_to_condense.ds = NULL; + bplist_clear(&lca->to_keep); + bplist_destroy(&lca->to_keep); + kmem_free(lca, sizeof (livelist_condense_arg_t)); + spa->spa_to_condense.syncing = B_FALSE; +} + +void +spa_livelist_condense_cb(void *arg, zthr_t *t) +{ + while (zfs_livelist_condense_zthr_pause && + !(zthr_has_waiters(t) || zthr_iscancelled(t))) + delay(1); + + spa_t *spa = arg; + dsl_deadlist_entry_t *first = spa->spa_to_condense.first; + dsl_deadlist_entry_t *next = spa->spa_to_condense.next; + uint64_t first_size, next_size; + + livelist_condense_arg_t *lca = + kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); + bplist_create(&lca->to_keep); + + /* + * Process the livelists (matching FREEs and ALLOCs) in open context + * so we have minimal work in syncing context to condense. + * + * We save bpobj sizes (first_size and next_size) to use later in + * syncing context to determine if entries were added to these sublists + * while in open context. 
This is possible because the clone is still
+ * active and open for normal writes and we want to make sure the new,
+ * unprocessed blockpointers are inserted into the livelist normally.
+ *
+ * Note that dsl_process_sub_livelist() both stores the size (number of
+ * blockpointers) and iterates over them while the bpobj's lock is held, so
+ * the sizes returned to us are consistent with what was actually
+ * processed.
+ */
+	int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
+	    &first_size);
+	if (err == 0)
+		err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
+		    t, &next_size);
+
+	if (err == 0) {
+		while (zfs_livelist_condense_sync_pause &&
+		    !(zthr_has_waiters(t) || zthr_iscancelled(t)))
+			delay(1);
+
+		dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+		dmu_tx_mark_netfree(tx);
+		dmu_tx_hold_space(tx, 1);
+		err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE);
+		if (err == 0) {
+			/*
+			 * Prevent the condense zthr restarting before
+			 * the synctask completes.
+			 */
+			spa->spa_to_condense.syncing = B_TRUE;
+			lca->spa = spa;
+			lca->first_size = first_size;
+			lca->next_size = next_size;
+			dsl_sync_task_nowait(spa_get_dsl(spa),
+			    spa_livelist_condense_sync, lca, 0,
+			    ZFS_SPACE_CHECK_NONE, tx);
+			dmu_tx_commit(tx);
+			return;
+		}
+	}
+	/*
+	 * Condensing cannot continue: either it was externally stopped or
+	 * we were unable to assign to a tx because the pool has run out of
+	 * space. In the second case, we'll just end up trying to condense
+	 * again in a later txg.
+	 */
+	ASSERT(err != 0);
+	bplist_clear(&lca->to_keep);
+	bplist_destroy(&lca->to_keep);
+	kmem_free(lca, sizeof (livelist_condense_arg_t));
+	dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
+	spa->spa_to_condense.ds = NULL;
+	if (err == EINTR)
+		zfs_livelist_condense_zthr_cancel++;
+}
+
+/* ARGSUSED */
+/*
+ * Check that there is something to condense but that a condense is not
+ * already in progress and that condensing has not been cancelled.
+ */
+static boolean_t
+spa_livelist_condense_cb_check(void *arg, zthr_t *z)
+{
+	spa_t *spa = arg;
+	if ((spa->spa_to_condense.ds != NULL) &&
+	    (spa->spa_to_condense.syncing == B_FALSE) &&
+	    (spa->spa_to_condense.cancelled == B_FALSE)) {
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+void
+spa_start_livelist_condensing_thread(spa_t *spa)
+{
+	spa->spa_to_condense.ds = NULL;
+	spa->spa_to_condense.first = NULL;
+	spa->spa_to_condense.next = NULL;
+	spa->spa_to_condense.syncing = B_FALSE;
+	spa->spa_to_condense.cancelled = B_FALSE;
+
+	ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL);
+	spa->spa_livelist_condense_zthr = zthr_create(
+	    spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa);
+}
+
 static void
 spa_spawn_aux_threads(spa_t *spa)
 {
@@ -2343,6 +2747,8 @@ spa_spawn_aux_threads(spa_t *spa)
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	spa_start_indirect_condensing_thread(spa);
+	spa_start_livelist_destroy_thread(spa);
+	spa_start_livelist_condensing_thread(spa);
 
 	ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
 	spa->spa_checkpoint_discard_zthr =
@@ -3604,6 +4010,15 @@ spa_ld_get_props(spa_t *spa)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
+	 * Load the livelist deletion field. If a livelist is queued for
+	 * deletion, indicate that in the spa
+	 */
+	error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
+	    &spa->spa_livelists_to_delete, B_FALSE);
+	if (error != 0 && error != ENOENT)
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+	/*
 	 * Load the history object.
If we have an older pool, this * will not be present. */ @@ -7571,6 +7986,14 @@ spa_async_suspend(spa_t *spa) zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); + + zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; + if (ll_delete_thread != NULL) + zthr_cancel(ll_delete_thread); + + zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; + if (ll_condense_thread != NULL) + zthr_cancel(ll_condense_thread); } void @@ -7589,6 +8012,14 @@ spa_async_resume(spa_t *spa) zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); + + zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; + if (ll_delete_thread != NULL) + zthr_resume(ll_delete_thread); + + zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; + if (ll_condense_thread != NULL) + zthr_resume(ll_condense_thread); } static boolean_t @@ -7639,14 +8070,28 @@ spa_async_request(spa_t *spa, int task) * ========================================================================== */ + static int -bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { bpobj_t *bpo = arg; - bpobj_enqueue(bpo, bp, tx); + bpobj_enqueue(bpo, bp, bp_freed, tx); return (0); } +int +bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); +} + +int +bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); +} + static int spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { @@ -7657,6 +8102,14 @@ spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) return (0); } +static int +bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(!bp_freed); + return (spa_free_sync_cb(arg, bp, tx)); +} + /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing frees. @@ -7693,7 +8146,7 @@ spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) */ zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, - spa_free_sync_cb, zio, tx), ==, 0); + bpobj_spa_free_sync_cb, zio, tx), ==, 0); VERIFY0(zio_wait(zio)); } @@ -8296,7 +8749,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) * we sync the deferred frees later in pass 1. 
*/ ASSERT3U(pass, >, 1); - bplist_iterate(free_bpl, bpobj_enqueue_cb, + bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, &spa->spa_deferred_bpobj, tx); } @@ -8884,4 +9337,24 @@ MODULE_PARM_DESC(zfs_max_missing_tvds, " (in read-only mode)"); /* END CSTYLED */ +module_param(zfs_livelist_condense_zthr_pause, int, 0644); +MODULE_PARM_DESC(zfs_livelist_condense_zthr_pause, + "Set the livelist condense zthr to pause"); +module_param(zfs_livelist_condense_sync_pause, int, 0644); +MODULE_PARM_DESC(zfs_livelist_condense_sync_pause, + "Set the livelist condense synctask to pause"); + +module_param(zfs_livelist_condense_sync_cancel, int, 0644); +MODULE_PARM_DESC(zfs_livelist_condense_sync_cancel, + "Whether livelist condensing was canceled in the synctask"); +module_param(zfs_livelist_condense_zthr_cancel, int, 0644); +MODULE_PARM_DESC(zfs_livelist_condense_zthr_cancel, + "Whether livelist condensing was canceled in the zthr function"); + +/* BEGIN CSTYLED */ +module_param(zfs_livelist_condense_new_alloc, int, 0644); +MODULE_PARM_DESC(zfs_livelist_condense_new_alloc, + "Whether extra ALLOC blkptrs were added to a livelist entry while it" + " was being condensed"); +/* END CSTYLED */ #endif diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index b590a1d57..68c6b544e 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. */ @@ -413,7 +413,6 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) /* spa_history_log_sync will free nvl */ return (err); - } /* diff --git a/module/zfs/zthr.c b/module/zfs/zthr.c index 532e8ce0f..53c0a0b3d 100644 --- a/module/zfs/zthr.c +++ b/module/zfs/zthr.c @@ -207,12 +207,15 @@ struct zthr { /* flag set to true if we are canceling the zthr */ boolean_t zthr_cancel; + /* flag set to true if we are waiting for the zthr to finish */ + boolean_t zthr_haswaiters; + kcondvar_t zthr_wait_cv; /* * maximum amount of time that the zthr is spent sleeping; * if this is 0, the thread doesn't wake up until it gets * signaled. */ - hrtime_t zthr_wait_time; + hrtime_t zthr_sleep_timeout; /* consumer-provided callbacks & data */ zthr_checkfunc_t *zthr_checkfunc; @@ -239,14 +242,18 @@ zthr_procedure(void *arg) * order to prevent this process from incorrectly * contributing to the system load average when idle. 
*/
-		if (t->zthr_wait_time == 0) {
+		if (t->zthr_sleep_timeout == 0) {
 			cv_wait_sig(&t->zthr_cv, &t->zthr_state_lock);
 		} else {
 			(void) cv_timedwait_sig_hires(&t->zthr_cv,
-			    &t->zthr_state_lock, t->zthr_wait_time,
+			    &t->zthr_state_lock, t->zthr_sleep_timeout,
 			    MSEC2NSEC(1), 0);
 		}
 	}
+	if (t->zthr_haswaiters) {
+		t->zthr_haswaiters = B_FALSE;
+		cv_broadcast(&t->zthr_wait_cv);
+	}
 }
 
 /*
@@ -280,12 +287,13 @@ zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func,
 	mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&t->zthr_request_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&t->zthr_wait_cv, NULL, CV_DEFAULT, NULL);
 
 	mutex_enter(&t->zthr_state_lock);
 	t->zthr_checkfunc = checkfunc;
 	t->zthr_func = func;
 	t->zthr_arg = arg;
-	t->zthr_wait_time = max_sleep;
+	t->zthr_sleep_timeout = max_sleep;
 
 	t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
 	    0, &p0, TS_RUN, minclsyspri);
@@ -303,6 +311,7 @@ zthr_destroy(zthr_t *t)
 	mutex_destroy(&t->zthr_request_lock);
 	mutex_destroy(&t->zthr_state_lock);
 	cv_destroy(&t->zthr_cv);
+	cv_destroy(&t->zthr_wait_cv);
 	kmem_free(t, sizeof (*t));
 }
 
@@ -355,9 +364,8 @@ zthr_cancel(zthr_t *t)
 	 *
 	 * [1] The thread has already been cancelled, therefore
 	 *     there is nothing for us to do.
-	 * [2] The thread is sleeping, so we broadcast the CV first
-	 *     to wake it up and then we set the flag and we are
-	 *     waiting for it to exit.
+	 * [2] The thread is sleeping so we set the flag, broadcast
+	 *     the CV and wait for it to exit.
 	 * [3] The thread is doing work, in which case we just set
 	 *     the flag and wait for it to finish.
 	 * [4] The thread was just created/resumed, in which case
@@ -397,6 +405,7 @@ zthr_resume(zthr_t *t)
 	ASSERT3P(&t->zthr_checkfunc, !=, NULL);
 	ASSERT3P(&t->zthr_func, !=, NULL);
 	ASSERT(!t->zthr_cancel);
+	ASSERT(!t->zthr_haswaiters);
 
 	/*
 	 * There are 4 states that we find the zthr in at this point
@@ -451,3 +460,74 @@ zthr_iscancelled(zthr_t *t)
 	mutex_exit(&t->zthr_state_lock);
 	return (cancelled);
 }
+
+/*
+ * Wait for the zthr to finish its current function. Similar to
+ * zthr_iscancelled, you can use zthr_has_waiters to have the zthr_func end
+ * early. Unlike zthr_cancel, the thread is not destroyed. If the zthr was
+ * sleeping or cancelled, return immediately.
+ */
+void
+zthr_wait_cycle_done(zthr_t *t)
+{
+	mutex_enter(&t->zthr_state_lock);
+
+	/*
+	 * Since we are holding the zthr_state_lock at this point
+	 * we can find the zthr in one of the following 5 states:
+	 *
+	 * [1] The thread has already been cancelled, therefore
+	 *     there is nothing for us to do.
+	 * [2] The thread is sleeping so we set the flag, broadcast
+	 *     the CV and wait for it to exit.
+	 * [3] The thread is doing work, in which case we just set
+	 *     the flag and wait for it to finish.
+	 * [4] The thread was just created/resumed, in which case
+	 *     the behavior is similar to [3].
+	 * [5] The thread is in the middle of being cancelled, which is
+	 *     similar to [3]. We'll wait for the cancel, which is
+	 *     waiting for the zthr func.
+	 *
+	 * Since requests are serialized, by the time that we get
+	 * control back we expect that the zthr has completed its
+	 * zthr_func.
+ */ + if (t->zthr_thread != NULL) { + t->zthr_haswaiters = B_TRUE; + + /* broadcast in case the zthr is sleeping */ + cv_broadcast(&t->zthr_cv); + + while ((t->zthr_haswaiters) && (t->zthr_thread != NULL)) + cv_wait(&t->zthr_wait_cv, &t->zthr_state_lock); + + ASSERT(!t->zthr_haswaiters); + } + + mutex_exit(&t->zthr_state_lock); +} + +/* + * This function is intended to be used by the zthr itself + * to check if another thread is waiting on it to finish + * + * returns TRUE if we have been asked to finish. + * + * returns FALSE otherwise. + */ +boolean_t +zthr_has_waiters(zthr_t *t) +{ + ASSERT3P(t->zthr_thread, ==, curthread); + + mutex_enter(&t->zthr_state_lock); + + /* + * Similarly to zthr_iscancelled(), we only grab the + * zthr_state_lock so that the zthr itself can use this + * to check for the request. + */ + boolean_t has_waiters = t->zthr_haswaiters; + mutex_exit(&t->zthr_state_lock); + return (has_waiters); +} diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index f1337cbc7..e9db66130 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -147,12 +147,15 @@ tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', tags = ['functional', 'cli_root', 'zfs_create'] [tests/functional/cli_root/zfs_destroy] -tests = ['zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', +tests = ['zfs_clone_livelist_condense_and_disable', + 'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos', + 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', 'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg', 'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', 'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', 'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos', - 'zfs_destroy_016_pos'] + 'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist', + 'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense'] tags = ['functional', 'cli_root', 'zfs_destroy'] [tests/functional/cli_root/zfs_diff] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 7078eb30b..5d9e3b361 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -22,7 +22,7 @@ # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# Copyright (c) 2012, 2017 by Delphix. All rights reserved. +# Copyright (c) 2012, 2018 by Delphix. All rights reserved. # Copyright (c) 2017 by Tim Chase. All rights reserved. # Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. # Copyright (c) 2017 Lawrence Livermore National Security, LLC. 
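The test additions below exercise these cancellation paths from user space by pausing the condense zthr and synctask with the new tunables. Inside the kernel, the cooperative-interruption pattern that zthr_has_waiters() enables looks roughly like the sketch here; example_zthr_cb, example_queue_t, work_remains() and process_one_item() are hypothetical stand-ins for the real livelist processing, while the zthr calls are the ones added above:

static void
example_zthr_cb(void *arg, zthr_t *t)
{
	example_queue_t *q = arg;	/* hypothetical per-pool work queue */

	while (work_remains(q)) {
		/*
		 * Return early if the zthr is being cancelled or if some
		 * thread is blocked in zthr_wait_cycle_done(); whatever is
		 * left over is picked up on a later cycle, the same way
		 * dsl_livelist_iterate() bails out with EINTR.
		 */
		if (zthr_iscancelled(t) || zthr_has_waiters(t))
			return;
		process_one_item(q);	/* hypothetical unit of work */
	}
}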
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/Makefile.am index 183578df5..c012b35d0 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/Makefile.am @@ -2,6 +2,8 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zfs_destro dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ + zfs_clone_livelist_condense_and_disable.ksh \ + zfs_clone_livelist_condense_races.ksh \ zfs_destroy_001_pos.ksh \ zfs_destroy_002_pos.ksh \ zfs_destroy_003_pos.ksh \ @@ -17,7 +19,10 @@ dist_pkgdata_SCRIPTS = \ zfs_destroy_013_neg.ksh \ zfs_destroy_014_pos.ksh \ zfs_destroy_015_pos.ksh \ - zfs_destroy_016_pos.ksh + zfs_destroy_016_pos.ksh \ + zfs_destroy_clone_livelist.ksh \ + zfs_destroy_dev_removal.ksh \ + zfs_destroy_dev_removal_condense.ksh dist_pkgdata_DATA = \ zfs_destroy_common.kshlib \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh new file mode 100755 index 000000000..b9ac87238 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh @@ -0,0 +1,125 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +# DESCRIPTION +# Verify zfs destroy test for clones with the livelist feature +# enabled. + +# STRATEGY +# 1. Clone where livelist is condensed +# - create clone, write several files, delete those files +# - check that the number of livelist entries decreases +# after the delete +# 2. Clone where livelist is deactivated +# - create clone, write files. Delete those files and the +# file in the filesystem when the snapshot was created +# so the clone and snapshot no longer share data +# - check that the livelist is destroyed + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib + +function cleanup +{ + log_must zfs destroy -Rf $TESTPOOL/$TESTFS1 + # reset the livelist sublist size to the original value + set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX + # reset the minimum percent shared to 75 + set_tunable32 zfs_livelist_min_percent_shared $ORIGINAL_MIN +} + +function check_ll_len +{ + string="$(zdb -vvvvv $TESTPOOL | grep "Livelist")" + substring="$1" + msg=$2 + if test "${string#*$substring}" != "$string"; then + return 0 # $substring is in $string + else + log_note $string + log_fail "$msg" # $substring is not in $string + fi +} + +function test_condense +{ + # set the max livelist entries to a small value to more easily + # trigger a condense + set_tunable64 zfs_livelist_max_entries 0x14 + # set a small percent shared threshold so the livelist is not disabled + set_tunable32 zfs_livelist_min_percent_shared 0xa + clone_dataset $TESTFS1 snap $TESTCLONE + + # sync between each write to make sure a new entry is created + for i in {0..4}; do + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/testfile$i + log_must zpool sync $TESTPOOL + done + + check_ll_len "5 entries" "Unexpected livelist size" + + # sync between each write to allow for a condense of the previous entry + for i in {0..4}; do + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/testfile$i + log_must zpool sync $TESTPOOL + done + + check_ll_len "6 entries" "Condense did not occur" + + log_must zfs destroy $TESTPOOL/$TESTCLONE + check_livelist_gone +} + +function test_deactivated +{ + # Threshold set to 50 percent + set_tunable32 zfs_livelist_min_percent_shared 0x32 + clone_dataset $TESTFS1 snap $TESTCLONE + + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE0 + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE1 + log_must zpool sync $TESTPOOL + # snapshot and clone share 'atestfile', 33 percent + check_livelist_gone + log_must zfs destroy -R $TESTPOOL/$TESTCLONE + + # Threshold set to 20 percent + set_tunable32 zfs_livelist_min_percent_shared 0x14 + clone_dataset $TESTFS1 snap $TESTCLONE + + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE0 + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE1 + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE2 + log_must zpool sync $TESTPOOL + # snapshot and clone share 'atestfile', 25 percent + check_livelist_exists $TESTCLONE + log_must rm /$TESTPOOL/$TESTCLONE/atestfile + # snapshot and clone share no files + check_livelist_gone + log_must zfs destroy -R $TESTPOOL/$TESTCLONE +} + +ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries) +ORIGINAL_MIN=$(get_tunable zfs_livelist_min_percent_shared) + +log_onexit cleanup +log_must zfs create $TESTPOOL/$TESTFS1 +log_must mkfile 5m /$TESTPOOL/$TESTFS1/atestfile +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap +test_condense +test_deactivated + +log_pass "Clone's livelist condenses and disables as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh new file mode 100755 index 000000000..037983ba7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh @@ -0,0 +1,116 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +# DESCRIPTION +# Test race conditions for livelist condensing + +# STRATEGY +# These tests exercise code paths that deal with a livelist being +# simultaneously condensed and deactivated (deleted, exported or disabled). +# If a variable is set, the zthr will pause until it is cancelled or waited +# and then a counter variable keeps track of whether or not the code path is +# reached. + +# 1. Deletion race: repeatedly overwrite the same file to trigger condense +# and then delete the clone. +# 2. Disable race: Overwrite enough files to trigger condenses and disabling of +# the livelist. +# 3. Export race: repeatedly overwrite the same file to trigger condense and +# then export the pool. + +. $STF_SUITE/include/libtest.shlib + +function cleanup +{ + log_must zfs destroy -Rf $TESTPOOL/$TESTFS1 + # reset the livelist sublist size to the original value + set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX + # reset the condense tests to 0 + set_tunable32 zfs_livelist_condense_zthr_pause 0 + set_tunable32 zfs_livelist_condense_sync_pause 0 +} + +function delete_race +{ + set_tunable32 "$1" 0 + log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE + for i in {1..5}; do + log_must zpool sync $TESTPOOL + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out + done + log_must zfs destroy $TESTPOOL/$TESTCLONE + log_must zpool sync $TESTPOOL + [[ "1" == "$(get_tunable "$1")" ]] || \ + log_fail "delete/condense race test failed" +} + +function export_race +{ + set_tunable32 "$1" 0 + log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE + for i in {1..5}; do + log_must zpool sync $TESTPOOL + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out + done + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + [[ "1" == "$(get_tunable "$1")" ]] || \ + log_fail "export/condense race test failed" + log_must zfs destroy $TESTPOOL/$TESTCLONE +} + +function disable_race +{ + set_tunable32 "$1" 0 + log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE + for i in {1..5}; do + log_must zpool sync $TESTPOOL + log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out + done + # overwrite the file shared with the origin to trigger disable + log_must mkfile 100m /$TESTPOOL/$TESTCLONE/atestfile + log_must zpool sync $TESTPOOL + [[ "1" == "$(get_tunable "$1")" ]] || \ + log_fail "disable/condense race test failed" + log_must zfs destroy $TESTPOOL/$TESTCLONE +} + +ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries) + +log_onexit cleanup + +log_must zfs create $TESTPOOL/$TESTFS1 +log_must mkfile 100m /$TESTPOOL/$TESTFS1/atestfile +log_must zpool sync $TESTPOOL +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap + +# Reduce livelist size to trigger condense more easily +set_tunable64 zfs_livelist_max_entries 0x14 + +# Test cancellation path in the zthr +set_tunable32 zfs_livelist_condense_zthr_pause 1 +set_tunable32 zfs_livelist_condense_sync_pause 0 +disable_race "zfs_livelist_condense_zthr_cancel" +delete_race "zfs_livelist_condense_zthr_cancel" +export_race "zfs_livelist_condense_zthr_cancel" + +# Test cancellation path in the synctask +set_tunable32 zfs_livelist_condense_zthr_pause 0 +set_tunable32 zfs_livelist_condense_sync_pause 1 +disable_race "zfs_livelist_condense_sync_cancel" +delete_race "zfs_livelist_condense_sync_cancel" + +log_pass "Clone livelist condense 
race conditions passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh new file mode 100755 index 000000000..1dd011519 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh @@ -0,0 +1,140 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +# DESCRIPTION +# Verify zfs destroy test for clones with the livelist feature +# enabled. + +# STRATEGY +# 1. One clone with an empty livelist +# - create the clone, check that livelist exists +# - delete the clone, check that livelist is eventually +# destroyed +# 2. One clone with populated livelist +# - create the clone, check that livelist exists +# - write multiple files to the clone +# - delete the clone, check that livelist is eventually +# destroyed +# 3. Multiple clones with empty livelists +# - same as 1. but with multiple clones +# 4. Multiple clones with populated livelists +# - same as 2. but with multiple clones + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib + +function cleanup +{ + datasetexists $TESTPOOL/$TESTFS1 && zfs destroy -R $TESTPOOL/$TESTFS1 + # reset the livelist sublist size to its original value + set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX +} + +function clone_write_file +{ + log_must mkfile 1m /$TESTPOOL/$1/$2 + log_must zpool sync $TESTPOOL +} + +function test_one_empty +{ + clone_dataset $TESTFS1 snap $TESTCLONE + + log_must zfs destroy $TESTPOOL/$TESTCLONE + check_livelist_gone +} + +function test_one +{ + clone_dataset $TESTFS1 snap $TESTCLONE + + clone_write_file $TESTCLONE $TESTFILE0 + clone_write_file $TESTCLONE $TESTFILE1 + clone_write_file $TESTCLONE $TESTFILE2 + log_must rm /$TESTPOOL/$TESTCLONE/$TESTFILE0 + log_must rm /$TESTPOOL/$TESTCLONE/$TESTFILE2 + check_livelist_exists $TESTCLONE + + log_must zfs destroy $TESTPOOL/$TESTCLONE + check_livelist_gone +} + +function test_multiple_empty +{ + clone_dataset $TESTFS1 snap $TESTCLONE + clone_dataset $TESTFS1 snap $TESTCLONE1 + clone_dataset $TESTFS1 snap $TESTCLONE2 + + log_must zfs destroy $TESTPOOL/$TESTCLONE + log_must zfs destroy $TESTPOOL/$TESTCLONE1 + log_must zfs destroy $TESTPOOL/$TESTCLONE2 + check_livelist_gone +} + +function test_multiple +{ + clone_dataset $TESTFS1 snap $TESTCLONE + clone_dataset $TESTFS1 snap $TESTCLONE1 + clone_dataset $TESTFS1 snap $TESTCLONE2 + + clone_write_file $TESTCLONE $TESTFILE0 + + clone_write_file $TESTCLONE1 $TESTFILE0 + clone_write_file $TESTCLONE1 $TESTFILE1 + clone_write_file $TESTCLONE1 $TESTFILE2 + + clone_write_file $TESTCLONE2 $TESTFILE0 + log_must rm /$TESTPOOL/$TESTCLONE2/$TESTFILE0 + clone_write_file $TESTCLONE2 $TESTFILE1 + log_must rm /$TESTPOOL/$TESTCLONE2/$TESTFILE1 + + check_livelist_exists $TESTCLONE + check_livelist_exists $TESTCLONE1 + check_livelist_exists $TESTCLONE2 + + log_must zfs destroy $TESTPOOL/$TESTCLONE + log_must zfs destroy $TESTPOOL/$TESTCLONE1 + log_must zfs destroy $TESTPOOL/$TESTCLONE2 + 
check_livelist_gone +} + +function test_promote +{ + clone_dataset $TESTFS1 snap $TESTCLONE + + log_must zfs promote $TESTPOOL/$TESTCLONE + check_livelist_gone + log_must zfs destroy -R $TESTPOOL/$TESTCLONE +} + +ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries) + +log_onexit cleanup +log_must zfs create $TESTPOOL/$TESTFS1 +log_must mkfile 20m /$TESTPOOL/$TESTFS1/atestfile +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap + +# set a small livelist entry size to more easily test multiple entry livelists +set_tunable64 zfs_livelist_max_entries 0x14 + +test_one_empty +test_one +test_multiple_empty +test_multiple +test_promote + +log_pass "Clones with the livelist feature enabled can be destroyed," \ + "and can also be promoted and destroyed as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib index 0a6f5ed9d..895efaf98 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib @@ -25,7 +25,7 @@ # # -# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2012, 2018 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -146,3 +146,43 @@ function check_dataset done fi } + +# Use zdb to see if a livelist exists for a given clone +# $1 clone name +function check_livelist_exists +{ + zdb -vvvvv $TESTPOOL/$1 | grep "Livelist" || \ + log_fail "zdb could not find Livelist" +} + +# Wait for the deferred destroy livelists to be removed +function wait_for_deferred_destroy +{ + sync + deleted=$(zdb -vvvvv $TESTPOOL | grep "Deleted Livelist") + while [[ "$deleted" != "" ]]; do + deleted=$(zdb -vvvvv $TESTPOOL | grep "Deleted Livelist") + done +} + +# Check that a livelist has been removed, waiting for deferred destroy entries +# to be cleared from zdb. +function check_livelist_gone +{ + wait_for_deferred_destroy + zdb -vvvvv $TESTPOOL | grep "Livelist" && \ + log_fail "zdb found Livelist after the clone was deleted." +} + +# Create a clone in the testpool from the given filesystem and snapshot. Verify +# that the clone was created and that it includes a livelist +# $1 fs name +# $2 snap name +# $3 clone name +function clone_dataset +{ + log_must zfs clone $TESTPOOL/$1@$2 $TESTPOOL/$3 + datasetexists $TESTPOOL/$3 || \ + log_fail "zfs clone $TESTPOOL/$3 failed." + check_livelist_exists $3 +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh new file mode 100755 index 000000000..0425322ae --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh @@ -0,0 +1,68 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +# DESCRIPTION +# Verify that livelists tracking remapped blocks can be +# properly destroyed. + +# STRATEGY +# 1. Create a pool with disk1 and create a filesystem, snapshot +# and clone. Write several files to the clone. +# 2. 
Add disk2 to the pool and then remove disk1, triggering a +# remap of the blkptrs tracked in the livelist. +# 3. Delete the clone. + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/removal/removal.kshlib + +function cleanup +{ + poolexists $TESTPOOL2 && zpool destroy $TESTPOOL2 + [[ -f $VIRTUAL_DISK1 ]] && log_must rm $VIRTUAL_DISK1 + [[ -f $VIRTUAL_DISK2 ]] && log_must rm $VIRTUAL_DISK2 +} + +log_onexit cleanup + +VIRTUAL_DISK1=/var/tmp/disk1 +VIRTUAL_DISK2=/var/tmp/disk2 +log_must mkfile $(($MINVDEVSIZE * 8)) $VIRTUAL_DISK1 +log_must mkfile $(($MINVDEVSIZE * 16)) $VIRTUAL_DISK2 + +log_must zpool create $TESTPOOL2 $VIRTUAL_DISK1 +log_must poolexists $TESTPOOL2 + +log_must zfs create $TESTPOOL2/$TESTFS +log_must mkfile 25m /$TESTPOOL2/$TESTFS/atestfile +log_must zfs snapshot $TESTPOOL2/$TESTFS@snap + +log_must zfs clone $TESTPOOL2/$TESTFS@snap $TESTPOOL2/$TESTCLONE + +log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE0 +log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE1 +log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE2 + +log_must zpool add $TESTPOOL2 $VIRTUAL_DISK2 +log_must zpool remove $TESTPOOL2 $VIRTUAL_DISK1 +wait_for_removal $TESTPOOL2 + +log_must rm /$TESTPOOL2/$TESTCLONE/$TESTFILE0 +log_must rm /$TESTPOOL2/$TESTCLONE/$TESTFILE1 + +log_must zfs destroy $TESTPOOL2/$TESTCLONE + +log_pass "Clone with the livelist feature and remapped blocks" \ + "can be destroyed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh new file mode 100755 index 000000000..da5f314ef --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh @@ -0,0 +1,93 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +# DESCRIPTION +# Verify that livelists tracking remapped blocks can be +# properly condensed. + +# STRATEGY +# 1. Create a pool with disk1 and create a filesystem, snapshot +# and clone. Create two files for the first livelist entry and +# pause condensing. +# 2. Add disk2 to the pool and then remove disk1, triggering a +# remap of the blkptrs tracked in the livelist. +# 3. Overwrite the first file several times to trigger a condense, +# overwrite the second file once and resume condensing, now with +# extra blkptrs added during the remap +# 4. Check that the test added new ALLOC blkptrs mid-condense using +# a variable set in that code path + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/removal/removal.kshlib + +function cleanup +{ + poolexists $TESTPOOL2 && zpool destroy $TESTPOOL2 + # reset livelist max size + set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX + [[ -f $VIRTUAL_DISK1 ]] && log_must rm $VIRTUAL_DISK1 + [[ -f $VIRTUAL_DISK2 ]] && log_must rm $VIRTUAL_DISK2 +} + +log_onexit cleanup + +ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries) +set_tunable64 zfs_livelist_max_entries 0x14 + +VIRTUAL_DISK1=/var/tmp/disk1 +VIRTUAL_DISK2=/var/tmp/disk2 +log_must mkfile $(($MINVDEVSIZE * 8)) $VIRTUAL_DISK1 +log_must mkfile $(($MINVDEVSIZE * 16)) $VIRTUAL_DISK2 + +log_must zpool create $TESTPOOL2 $VIRTUAL_DISK1 +log_must poolexists $TESTPOOL2 + +log_must zfs create $TESTPOOL2/$TESTFS +log_must mkfile 100m /$TESTPOOL2/$TESTFS/atestfile +log_must zfs snapshot $TESTPOOL2/$TESTFS@snap + +log_must zfs clone $TESTPOOL2/$TESTFS@snap $TESTPOOL2/$TESTCLONE + +# Create initial files and pause condense zthr on next execution +log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A +log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/B +log_must zpool sync $TESTPOOL2 +set_tunable32 zfs_livelist_condense_sync_pause 1 + +# Add a new dev and remove the old one +log_must zpool add $TESTPOOL2 $VIRTUAL_DISK2 +log_must zpool remove $TESTPOOL2 $VIRTUAL_DISK1 +wait_for_removal $TESTPOOL2 + +set_tunable32 zfs_livelist_condense_new_alloc 0 +# Trigger a condense +log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A +log_must zpool sync $TESTPOOL2 +log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A +log_must zpool sync $TESTPOOL2 +# Write remapped blkptrs which will modify the livelist mid-condense +log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/B + +# Resume condense zthr +set_tunable32 zfs_livelist_condense_sync_pause 0 +log_must zpool sync $TESTPOOL2 +# Check that we've added new ALLOC blkptrs during the condense +[[ "0" < "$(get_tunable zfs_livelist_condense_new_alloc)" ]] || \ + log_fail "removal/condense test failed" + +log_must zfs destroy $TESTPOOL2/$TESTCLONE +log_pass "Clone with the livelist feature and remapped blocks" \ + "can be condensed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 717ee9cb2..9dea1e2cd 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -93,5 +93,6 @@ if is_linux; then "feature@allocation_classes" "feature@resilver_defer" "feature@bookmark_v2" + "feature@livelist" ) fi
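The livelist checks that these tests automate can also be reproduced by hand. The walkthrough below is a minimal sketch, not taken from the commit: the pool name "tank" and the dataset and file names are placeholders, and it only reuses commands that already appear in the tests above (zfs create/snapshot/clone, mkfile, zpool sync, zdb -vvvvv, zfs destroy).

#!/bin/ksh -p
# Create a filesystem, snapshot it, and clone it; the clone gets a livelist.
zfs create tank/fs
mkfile 5m /tank/fs/atestfile
zfs snapshot tank/fs@snap
zfs clone tank/fs@snap tank/clone

# Overwrite data in the clone so clone-only blkptrs accumulate in the livelist.
mkfile 5m /tank/clone/testfile0
zpool sync tank

# While the clone exists, zdb reports a "Livelist" section for it.
zdb -vvvvv tank/clone | grep "Livelist"

# After the clone is destroyed and any deferred destroy completes, the
# pool-wide zdb output no longer contains a "Livelist" entry.
zfs destroy tank/clone
zpool sync tank
zdb -vvvvv tank | grep "Livelist"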