author | Sara Hartse <[email protected]> | 2019-07-26 10:54:14 -0700
committer | Brian Behlendorf <[email protected]> | 2019-07-26 10:54:14 -0700
commit | 37f03da8ba6e1ab074b503e1dd63bfa7199d0537 (patch)
tree | 987b03643c33cd43b246a20aea28b8750f7b4ee6 /module/zfs
parent | d274ac54609894d00a49c0a0da89abd3a7f3998d (diff)
Fast Clone Deletion
Deleting a clone requires finding blocks that are clone-only, not shared
with the snapshot. This was done by traversing the entire block tree,
which results in a large performance penalty for sparsely
written clones.
This new method keeps track of clone blocks when they are
modified in a "Livelist" so that, when it's time to delete,
the clone-specific blocks are already at hand.
We see performance improvements because deletion work is now
proportional to the number of clone-modified blocks, not the size
of the original dataset.
Reviewed-by: Sean Eric Fagan <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Serapheim Dimitropoulos <[email protected]>
Signed-off-by: Sara Hartse <[email protected]>
Closes #8416
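
The FREE/ALLOC matching described above is implemented in dsl_process_sub_livelist()/dsl_livelist_iterate() (dsl_deadlist.c, in the diff below) using an AVL tree keyed on the block pointer's first DVA. The following is a minimal, self-contained sketch of the idea only; the types, the fixed-size scratch array, and the linear search are invented for illustration and are not part of the commit:

```c
/*
 * Sketch only, not the ZFS implementation: each clone-modified block is
 * logged as an ALLOC or a FREE; at deletion time, FREEs cancel their
 * matching ALLOCs and only the surviving ALLOCs still need to be freed.
 */
#include <stddef.h>
#include <stdint.h>

typedef struct {
	uint64_t lle_dva;	/* stand-in for the block's first DVA */
	int	 lle_freed;	/* 0 = ALLOC entry, 1 = FREE entry */
} ll_entry_t;

/*
 * Walk the log newest-first.  A FREE is remembered until its matching
 * ALLOC is seen and cancelled; ALLOCs with no matching FREE are returned
 * in 'out' as the blocks the clone still owns.  Assumes at most 128
 * outstanding FREEs purely to keep the sketch short.
 */
static size_t
ll_surviving_allocs(const ll_entry_t *log, size_t n, uint64_t *out)
{
	uint64_t pending_frees[128];
	size_t nfrees = 0, nout = 0;

	for (size_t i = n; i-- > 0; ) {
		if (log[i].lle_freed) {
			pending_frees[nfrees++] = log[i].lle_dva;
			continue;
		}
		int matched = 0;
		for (size_t j = 0; j < nfrees; j++) {
			if (pending_frees[j] == log[i].lle_dva) {
				/* swap-remove the cancelled FREE */
				pending_frees[j] = pending_frees[--nfrees];
				matched = 1;
				break;
			}
		}
		if (!matched)
			out[nout++] = log[i].lle_dva;	/* still allocated */
	}
	return (nout);
}
```

In the commit itself, each sub-livelist is iterated newest-first with the bpobj lock held, and the surviving ALLOCs are appended to a bplist that the deletion synctask then frees (see spa_livelist_delete_cb() in spa.c below).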
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/bplist.c | 16
-rw-r--r-- | module/zfs/bpobj.c | 114
-rw-r--r-- | module/zfs/dbuf.c | 30
-rw-r--r-- | module/zfs/dsl_dataset.c | 236
-rw-r--r-- | module/zfs/dsl_deadlist.c | 331
-rw-r--r-- | module/zfs/dsl_destroy.c | 185
-rw-r--r-- | module/zfs/dsl_dir.c | 104
-rw-r--r-- | module/zfs/dsl_pool.c | 3
-rw-r--r-- | module/zfs/dsl_scan.c | 21
-rw-r--r-- | module/zfs/spa.c | 499
-rw-r--r-- | module/zfs/spa_history.c | 3
-rw-r--r-- | module/zfs/zthr.c | 94
12 files changed, 1512 insertions, 124 deletions
diff --git a/module/zfs/bplist.c b/module/zfs/bplist.c index c81151e08..47ea364ef 100644 --- a/module/zfs/bplist.c +++ b/module/zfs/bplist.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include <sys/bplist.h> @@ -75,3 +75,17 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) } mutex_exit(&bpl->bpl_lock); } + +void +bplist_clear(bplist_t *bpl) +{ + bplist_entry_t *bpe; + + mutex_enter(&bpl->bpl_lock); + while ((bpe = list_head(&bpl->bpl_list))) { + bplist_iterate_last_removed = bpe; + list_remove(&bpl->bpl_list, bpe); + kmem_free(bpe, sizeof (*bpe)); + } + mutex_exit(&bpl->bpl_lock); +} diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 633801956..561d0cf8a 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2017 Datto Inc. */ @@ -83,6 +83,9 @@ bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) size = BPOBJ_SIZE_V0; else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) size = BPOBJ_SIZE_V1; + else if (!spa_feature_is_active(dmu_objset_spa(os), + SPA_FEATURE_LIVELIST)) + size = BPOBJ_SIZE_V2; else size = sizeof (bpobj_phys_t); @@ -171,6 +174,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); + bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2); bpo->bpo_phys = bpo->bpo_dbuf->db_data; return (0); } @@ -245,8 +249,8 @@ bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index) * Update bpobj and all of its parents with new space accounting. 
*/ static void -propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed, - uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx) +propagate_space_reduction(bpobj_info_t *bpi, int64_t freed, + int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx) { for (; bpi != NULL; bpi = bpi->bpi_parent) { @@ -263,22 +267,22 @@ propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed, static int bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, - dmu_tx_t *tx, boolean_t free) + int64_t start, dmu_tx_t *tx, boolean_t free) { int err = 0; - uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0; + int64_t freed = 0, comp_freed = 0, uncomp_freed = 0; dmu_buf_t *dbuf = NULL; bpobj_t *bpo = bpi->bpi_bpo; - for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { + for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { uint64_t offset = i * sizeof (blkptr_t); uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); if (dbuf == NULL || dbuf->db_offset > offset) { if (dbuf) dmu_buf_rele(dbuf, FTAG); - err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, - FTAG, &dbuf, 0); + err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, + offset, FTAG, &dbuf, 0); if (err) break; } @@ -288,18 +292,26 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, blkptr_t *bparray = dbuf->db_data; blkptr_t *bp = &bparray[blkoff]; - err = func(arg, bp, tx); + + boolean_t bp_freed = BP_GET_FREE(bp); + err = func(arg, bp, bp_freed, tx); if (err) break; if (free) { + int sign = bp_freed ? -1 : +1; spa_t *spa = dmu_objset_spa(bpo->bpo_os); - freed += bp_get_dsize_sync(spa, bp); - comp_freed += BP_GET_PSIZE(bp); - uncomp_freed += BP_GET_UCSIZE(bp); + freed += sign * bp_get_dsize_sync(spa, bp); + comp_freed += sign * BP_GET_PSIZE(bp); + uncomp_freed += sign * BP_GET_UCSIZE(bp); ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx)); bpo->bpo_phys->bpo_num_blkptrs--; ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); + if (bp_freed) { + ASSERT(bpo->bpo_havefreed); + bpo->bpo_phys->bpo_num_freed--; + ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0); + } } } if (free) { @@ -328,7 +340,7 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, */ static int bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, - dmu_tx_t *tx, boolean_t free) + dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size) { list_t stack; bpobj_info_t *bpi; @@ -341,6 +353,10 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, list_create(&stack, sizeof (bpobj_info_t), offsetof(bpobj_info_t, bpi_node)); mutex_enter(&initial_bpo->bpo_lock); + + if (bpobj_size != NULL) + *bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs; + list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0)); while ((bpi = list_head(&stack)) != NULL) { @@ -354,7 +370,8 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, dmu_buf_will_dirty(bpo->bpo_dbuf, tx); if (bpi->bpi_visited == B_FALSE) { - err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free); + err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx, + free); bpi->bpi_visited = B_TRUE; if (err != 0) break; @@ -433,6 +450,7 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, * We have unprocessed subobjs. Process the next one. */ ASSERT(bpo->bpo_havecomp); + ASSERT3P(bpobj_size, ==, NULL); /* Add the last subobj to stack. 
*/ int64_t i = bpi->bpi_unprocessed_subobjs - 1; @@ -489,16 +507,45 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) { - return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); + return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL)); } /* * Iterate the entries. If func returns nonzero, iteration will stop. + * + * If there are no subobjs: + * + * *bpobj_size can be used to return the number of block pointers in the + * bpobj. Note that this may be different from the number of block pointers + * that are iterated over, if iteration is terminated early (e.g. by the func + * returning nonzero). + * + * If there are concurrent (or subsequent) modifications to the bpobj then the + * returned *bpobj_size can be passed as "start" to + * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries. */ int -bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, + uint64_t *bpobj_size) { - return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); + return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size)); +} + +/* + * Iterate over the blkptrs in the bpobj beginning at index start. If func + * returns nonzero, iteration will stop. This is a livelist specific function + * since it assumes that there are no subobjs present. + */ +int +livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, + int64_t start) +{ + if (bpo->bpo_havesubobj) + VERIFY0(bpo->bpo_phys->bpo_subobjs); + bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0); + int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE); + kmem_free(bpi, sizeof (bpobj_info_t)); + return (err); } /* @@ -724,7 +771,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) } void -bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) +bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { blkptr_t stored_bp = *bp; uint64_t offset; @@ -755,8 +803,8 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); } - /* We never need the fill count. */ stored_bp.blk_fill = 0; + BP_SET_FREE(&stored_bp, bp_freed); mutex_enter(&bpo->bpo_lock); @@ -779,11 +827,16 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) dmu_buf_will_dirty(bpo->bpo_dbuf, tx); bpo->bpo_phys->bpo_num_blkptrs++; - bpo->bpo_phys->bpo_bytes += + int sign = bp_freed ? -1 : +1; + bpo->bpo_phys->bpo_bytes += sign * bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); if (bpo->bpo_havecomp) { - bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); - bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); + bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp); + } + if (bp_freed) { + ASSERT(bpo->bpo_havefreed); + bpo->bpo_phys->bpo_num_freed++; } mutex_exit(&bpo->bpo_lock); } @@ -799,7 +852,7 @@ struct space_range_arg { /* ARGSUSED */ static int -space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { struct space_range_arg *sra = arg; @@ -863,3 +916,18 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, *uncompp = sra.uncomp; return (err); } + +/* + * A bpobj_itor_t to append blkptrs to a bplist. 
Note that while blkptrs in a + * bpobj are designated as free or allocated that information is not preserved + * in bplists. + */ +/* ARGSUSED */ +int +bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + bplist_t *bpl = arg; + bplist_append(bpl, bp); + return (0); +} diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 4d347b6f4..0518205f9 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -3286,6 +3286,13 @@ dbuf_hold_impl_arg(struct dbuf_hold_arg *dh) *(dh->dh_dbp) = NULL; + /* If the pool has been created, verify the tx_sync_lock is not held */ + spa_t *spa = dh->dh_dn->dn_objset->os_spa; + dsl_pool_t *dp = spa->spa_dsl_pool; + if (dp != NULL) { + ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock)); + } + /* dbuf_find() returns with db_mtx held */ dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object, dh->dh_level, dh->dh_blkid); @@ -4480,6 +4487,29 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { /* + * If the blkptr being remapped is tracked by a livelist, + * then we need to make sure the livelist reflects the update. + * First, cancel out the old blkptr by appending a 'FREE' + * entry. Next, add an 'ALLOC' to track the new version. This + * way we avoid trying to free an inaccurate blkptr at delete. + * Note that embedded blkptrs are not tracked in livelists. + */ + if (dn->dn_objset != spa_meta_objset(spa)) { + dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + bp->blk_birth > ds->ds_dir->dd_origin_txg) { + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(dsl_dir_is_clone(ds->ds_dir)); + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LIVELIST)); + bplist_append(&ds->ds_dir->dd_pending_frees, + bp); + bplist_append(&ds->ds_dir->dd_pending_allocs, + &bp_copy); + } + } + + /* * The db_rwlock prevents dbuf_read_impl() from * dereferencing the BP while we are changing it. To * avoid lock contention, only grab it when we are actually diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 0cd458ef4..848a8508c 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -122,13 +122,12 @@ parent_delta(dsl_dataset_t *ds, int64_t delta) void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { - int used, compressed, uncompressed; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + int used = bp_get_dsize_sync(spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; - used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); - compressed = BP_GET_PSIZE(bp); - uncompressed = BP_GET_UCSIZE(bp); - dprintf_bp(bp, "ds=%p", ds); ASSERT(dmu_tx_is_syncing(tx)); @@ -164,6 +163,19 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) ds->ds_feature_activation[f] = (void *)B_TRUE; } + /* + * Track block for livelist, but ignore embedded blocks because + * they do not need to be freed. 
+ */ + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + bp->blk_birth > ds->ds_dir->dd_origin_txg && + !(BP_IS_EMBEDDED(bp))) { + ASSERT(dsl_dir_is_clone(ds->ds_dir)); + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LIVELIST)); + bplist_append(&ds->ds_dir->dd_pending_allocs, bp); + } + mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, compressed, uncompressed, tx); @@ -207,8 +219,8 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, DVA_SET_VDEV(dva, vdev); DVA_SET_OFFSET(dva, offset); DVA_SET_ASIZE(dva, size); - - dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx); + dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE, + tx); } } @@ -239,6 +251,19 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, ASSERT(!ds->ds_is_snapshot); dmu_buf_will_dirty(ds->ds_dbuf, tx); + /* + * Track block for livelist, but ignore embedded blocks because + * they do not need to be freed. + */ + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + bp->blk_birth > ds->ds_dir->dd_origin_txg && + !(BP_IS_EMBEDDED(bp))) { + ASSERT(dsl_dir_is_clone(ds->ds_dir)); + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LIVELIST)); + bplist_append(&ds->ds_dir->dd_pending_frees, bp); + } + if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; @@ -267,7 +292,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, */ bplist_append(&ds->ds_pending_deadlist, bp); } else { - dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); + dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx); } ASSERT3U(ds->ds_prev->ds_object, ==, dsl_dataset_phys(ds)->ds_prev_snap_obj); @@ -1241,6 +1266,14 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, ASSERT(dmu_tx_is_syncing(tx)); ASSERT(lastname[0] != '@'); + /* + * Filesystems will eventually have their origin set to dp_origin_snap, + * but that's taken care of in dsl_dataset_create_sync_dd. When + * creating a filesystem, this function is called with origin equal to + * NULL. + */ + if (origin != NULL) + ASSERT3P(origin, !=, dp->dp_origin_snap); ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); @@ -1251,6 +1284,20 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_deleg_set_create_perms(dd, tx, cr); /* + * If we are creating a clone and the livelist feature is enabled, + * add the entry DD_FIELD_LIVELIST to ZAP. + */ + if (origin != NULL && + spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) { + objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_dir_zapify(dd, tx); + uint64_t obj = dsl_deadlist_alloc(mos, tx); + VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST, + sizeof (uint64_t), 1, &obj, tx)); + spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx); + } + + /* * Since we're creating a new node we know it's a leaf, so we can * initialize the counts if the limit feature is active. 
*/ @@ -2036,12 +2083,149 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) } } +/* + * Check if the percentage of blocks shared between the clone and the + * snapshot (as opposed to those that are clone only) is below a certain + * threshold + */ +boolean_t +dsl_livelist_should_disable(dsl_dataset_t *ds) +{ + uint64_t used, referenced; + int percent_shared; + + used = dsl_dir_get_usedds(ds->ds_dir); + referenced = dsl_get_referenced(ds); + ASSERT3U(referenced, >=, 0); + ASSERT3U(used, >=, 0); + if (referenced == 0) + return (B_FALSE); + percent_shared = (100 * (referenced - used)) / referenced; + if (percent_shared <= zfs_livelist_min_percent_shared) + return (B_TRUE); + return (B_FALSE); +} + +/* + * Check if it is possible to combine two livelist entries into one. + * This is the case if the combined number of 'live' blkptrs (ALLOCs that + * don't have a matching FREE) is under the maximum sublist size. + * We check this by subtracting twice the total number of frees from the total + * number of blkptrs. FREEs are counted twice because each FREE blkptr + * will cancel out an ALLOC blkptr when the livelist is processed. + */ +static boolean_t +dsl_livelist_should_condense(dsl_deadlist_entry_t *first, + dsl_deadlist_entry_t *next) +{ + uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed + + next->dle_bpobj.bpo_phys->bpo_num_freed; + uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs + + next->dle_bpobj.bpo_phys->bpo_num_blkptrs; + if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries) + return (B_TRUE); + return (B_FALSE); +} + +typedef struct try_condense_arg { + spa_t *spa; + dsl_dataset_t *ds; +} try_condense_arg_t; + +/* + * Iterate over the livelist entries, searching for a pair to condense. + * A nonzero return value means stop, 0 means keep looking. + */ static int -deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first) { - dsl_deadlist_t *dl = arg; - dsl_deadlist_insert(dl, bp, tx); - return (0); + try_condense_arg_t *tca = arg; + spa_t *spa = tca->spa; + dsl_dataset_t *ds = tca->ds; + dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; + dsl_deadlist_entry_t *next; + + /* The condense thread has not yet been created at import */ + if (spa->spa_livelist_condense_zthr == NULL) + return (1); + + /* A condense is already in progress */ + if (spa->spa_to_condense.ds != NULL) + return (1); + + next = AVL_NEXT(&ll->dl_tree, &first->dle_node); + /* The livelist has only one entry - don't condense it */ + if (next == NULL) + return (1); + + /* Next is the newest entry - don't condense it */ + if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL) + return (1); + + /* This pair is not ready to condense but keep looking */ + if (!dsl_livelist_should_condense(first, next)) + return (0); + + /* + * Add a ref to prevent the dataset from being evicted while + * the condense zthr or synctask are running. 
Ref will be + * released at the end of the condense synctask + */ + dmu_buf_add_ref(ds->ds_dbuf, spa); + + spa->spa_to_condense.ds = ds; + spa->spa_to_condense.first = first; + spa->spa_to_condense.next = next; + spa->spa_to_condense.syncing = B_FALSE; + spa->spa_to_condense.cancelled = B_FALSE; + + zthr_wakeup(spa->spa_livelist_condense_zthr); + return (1); +} + +static void +dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_dir_t *dd = ds->ds_dir; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist); + + /* Check if we need to add a new sub-livelist */ + if (last == NULL) { + /* The livelist is empty */ + dsl_deadlist_add_key(&dd->dd_livelist, + tx->tx_txg - 1, tx); + } else if (spa_sync_pass(spa) == 1) { + /* + * Check if the newest entry is full. If it is, make a new one. + * We only do this once per sync because we could overfill a + * sublist in one sync pass and don't want to add another entry + * for a txg that is already represented. This ensures that + * blkptrs born in the same txg are stored in the same sublist. + */ + bpobj_t bpobj = last->dle_bpobj; + uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs; + uint64_t free = bpobj.bpo_phys->bpo_num_freed; + uint64_t alloc = all - free; + if (alloc > zfs_livelist_max_entries) { + dsl_deadlist_add_key(&dd->dd_livelist, + tx->tx_txg - 1, tx); + } + } + + /* Insert each entry into the on-disk livelist */ + bplist_iterate(&dd->dd_pending_allocs, + dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx); + bplist_iterate(&dd->dd_pending_frees, + dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx); + + /* Attempt to condense every pair of adjacent entries */ + try_condense_arg_t arg = { + .spa = spa, + .ds = ds + }; + dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense, + &arg); } void @@ -2050,7 +2234,14 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) objset_t *os = ds->ds_objset; bplist_iterate(&ds->ds_pending_deadlist, - deadlist_enqueue_cb, &ds->ds_deadlist, tx); + dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx); + + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) { + dsl_flush_pending_livelist(ds, tx); + if (dsl_livelist_should_disable(ds)) { + dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE); + } + } dsl_bookmark_sync_done(ds, tx); @@ -3335,6 +3526,8 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) uint64_t oldnext_obj; int64_t delta; + ASSERT(nvlist_empty(ddpa->err_ds)); + VERIFY0(promote_hold(ddpa, dp, FTAG)); hds = ddpa->ddpa_clone; @@ -3519,6 +3712,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique; + /* + * Since livelists are specific to a clone's origin txg, they + * are no longer accurate. Destroy the livelist from the clone being + * promoted. If the origin dataset is a clone, destroy its livelist + * as well. + */ + dsl_dir_remove_livelist(dd, tx, B_TRUE); + dsl_dir_remove_livelist(origin_ds->ds_dir, tx, B_TRUE); + /* log history record */ spa_history_log_internal_ds(hds, "promote", tx, ""); @@ -3990,6 +4192,14 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_scan_ds_clone_swapped(origin_head, clone, tx); + /* + * Destroy any livelists associated with the clone or the origin, + * since after the swap the corresponding livelists are no longer + * valid. 
+ */ + dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE); + dsl_dir_remove_livelist(origin_head->ds_dir, tx, B_TRUE); + spa_history_log_internal_ds(clone, "clone swap", tx, "parent=%s", origin_head->ds_dir->dd_myname); } diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 9e3a3331b..25878f0ea 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -20,16 +20,16 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ -#include <sys/dsl_dataset.h> #include <sys/dmu.h> #include <sys/refcount.h> #include <sys/zap.h> #include <sys/zfs_context.h> #include <sys/dsl_pool.h> +#include <sys/dsl_dataset.h> /* * Deadlist concurrency: @@ -51,6 +51,68 @@ * provides its own locking, and dl_oldfmt is immutable. */ +/* + * Livelist Overview + * ================ + * + * Livelists use the same 'deadlist_t' struct as deadlists and are also used + * to track blkptrs over the lifetime of a dataset. Livelists however, belong + * to clones and track the blkptrs that are clone-specific (were born after + * the clone's creation). The exception is embedded block pointers which are + * not included in livelists because they do not need to be freed. + * + * When it comes time to delete the clone, the livelist provides a quick + * reference as to what needs to be freed. For this reason, livelists also track + * when clone-specific blkptrs are freed before deletion to prevent double + * frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the + * deletion algorithm iterates backwards over the livelist, matching + * FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists + * are also updated in the case when blkptrs are remapped: the old version + * of the blkptr is cancelled out with a FREE and the new version is tracked + * with an ALLOC. + * + * To bound the amount of memory required for deletion, livelists over a + * certain size are spread over multiple entries. Entries are grouped by + * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will + * be in the same entry. This allows us to delete livelists incrementally + * over multiple syncs, one entry at a time. + * + * During the lifetime of the clone, livelists can get extremely large. + * Their size is managed by periodic condensing (preemptively cancelling out + * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when + * the shared space between the clone and its origin is so small that it + * doesn't make sense to use livelists anymore. + */ + +/* + * The threshold sublist size at which we create a new sub-livelist for the + * next txg. However, since blkptrs of the same transaction group must be in + * the same sub-list, the actual sublist size may exceed this. When picking the + * size we had to balance the fact that larger sublists mean fewer sublists + * (decreasing the cost of insertion) against the consideration that sublists + * will be loaded into memory and shouldn't take up an inordinate amount of + * space. We settled on ~500000 entries, corresponding to roughly 128M. + */ +unsigned long zfs_livelist_max_entries = 500000; + +/* + * We can approximate how much of a performance gain a livelist will give us + * based on the percentage of blocks shared between the clone and its origin. 
+ * 0 percent shared means that the clone has completely diverged and that the + * old method is maximally effective: every read from the block tree will + * result in lots of frees. Livelists give us gains when they track blocks + * scattered across the tree, when one read in the old method might only + * result in a few frees. Once the clone has been overwritten enough, + * writes are no longer sparse and we'll no longer get much of a benefit from + * tracking them with a livelist. We chose a lower limit of 75 percent shared + * (25 percent overwritten). This means that 1/4 of all block pointers will be + * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists + * to make deletion 4x faster. Once the amount of shared space drops below this + * threshold, the clone will revert to the old deletion method. + */ +int zfs_livelist_min_percent_shared = 75; + + static int dsl_deadlist_compare(const void *arg1, const void *arg2) { @@ -89,6 +151,23 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) } void +dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args) +{ + dsl_deadlist_entry_t *dle; + + ASSERT(dsl_deadlist_is_open(dl)); + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + mutex_exit(&dl->dl_lock); + for (dle = avl_first(&dl->dl_tree); dle != NULL; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + if (func(args, dle) != 0) + break; + } +} + +void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) { dmu_object_info_t doi; @@ -188,7 +267,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) static void dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, - const blkptr_t *bp, dmu_tx_t *tx) + const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(MUTEX_HELD(&dl->dl_lock)); if (dle->dle_bpobj.bpo_object == @@ -200,7 +279,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, dle->dle_mintxg, obj, tx)); } - bpobj_enqueue(&dle->dle_bpobj, bp, tx); + bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx); } static void @@ -221,14 +300,15 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, } void -dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) +dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; avl_index_t where; if (dl->dl_oldfmt) { - bpobj_enqueue(&dl->dl_bpobj, bp, tx); + bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx); return; } @@ -236,10 +316,12 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) dsl_deadlist_load_tree(dl); dmu_buf_will_dirty(dl->dl_dbuf, tx); + + int sign = bp_freed ? 
-1 : +1; dl->dl_phys->dl_used += - bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); - dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); - dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); + sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); + dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); + dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); dle_tofind.dle_mintxg = bp->blk_birth; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); @@ -255,10 +337,26 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) } ASSERT3P(dle, !=, NULL); - dle_enqueue(dl, dle, bp, tx); + dle_enqueue(dl, dle, bp, bp_freed, tx); mutex_exit(&dl->dl_lock); } +int +dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, B_FALSE, tx); + return (0); +} + +int +dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, B_TRUE, tx); + return (0); +} + /* * Insert new key in deadlist, which must be > all current entries. * mintxg is not inclusive. @@ -317,6 +415,108 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) } /* + * Remove a deadlist entry and all of its contents by removing the entry from + * the deadlist's avl tree, freeing the entry's bpobj and adjusting the + * deadlist's space accounting accordingly. + */ +void +dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) +{ + uint64_t used, comp, uncomp; + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + objset_t *os = dl->dl_os; + + if (dl->dl_oldfmt) + return; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); + VERIFY3P(dle, !=, NULL); + + avl_remove(&dl->dl_tree, dle); + VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx)); + VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); + dl->dl_phys->dl_used -= used; + dl->dl_phys->dl_comp -= comp; + dl->dl_phys->dl_uncomp -= uncomp; + if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) { + bpobj_decr_empty(os, tx); + } else { + bpobj_free(os, dle->dle_bpobj.bpo_object, tx); + } + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + mutex_exit(&dl->dl_lock); +} + +/* + * Clear out the contents of a deadlist_entry by freeing its bpobj, + * replacing it with an empty bpobj and adjusting the deadlist's + * space accounting + */ +void +dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, + dmu_tx_t *tx) +{ + uint64_t new_obj, used, comp, uncomp; + objset_t *os = dl->dl_os; + + mutex_enter(&dl->dl_lock); + VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx)); + VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); + dl->dl_phys->dl_used -= used; + dl->dl_phys->dl_comp -= comp; + dl->dl_phys->dl_uncomp -= uncomp; + if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) + bpobj_decr_empty(os, tx); + else + bpobj_free(os, dle->dle_bpobj.bpo_object, tx); + bpobj_close(&dle->dle_bpobj); + new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj)); + VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg, + new_obj, tx)); + ASSERT(bpobj_is_empty(&dle->dle_bpobj)); + mutex_exit(&dl->dl_lock); +} + +/* + * Return the first entry in deadlist's avl tree + */ +dsl_deadlist_entry_t * +dsl_deadlist_first(dsl_deadlist_t *dl) +{ + dsl_deadlist_entry_t *dle; + + 
mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + dle = avl_first(&dl->dl_tree); + mutex_exit(&dl->dl_lock); + + return (dle); +} + +/* + * Return the last entry in deadlist's avl tree + */ +dsl_deadlist_entry_t * +dsl_deadlist_last(dsl_deadlist_t *dl) +{ + dsl_deadlist_entry_t *dle; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + dle = avl_last(&dl->dl_tree); + mutex_exit(&dl->dl_lock); + + return (dle); +} + +/* * Walk ds's snapshots to regenerate generate ZAP & AVL. */ static void @@ -478,10 +678,11 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, } static int -dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { dsl_deadlist_t *dl = arg; - dsl_deadlist_insert(dl, bp, tx); + dsl_deadlist_insert(dl, bp, bp_freed, tx); return (0); } @@ -572,3 +773,109 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, } mutex_exit(&dl->dl_lock); } + +typedef struct livelist_entry { + const blkptr_t *le_bp; + avl_node_t le_node; +} livelist_entry_t; + +static int +livelist_compare(const void *larg, const void *rarg) +{ + const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp; + const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp; + + /* Sort them according to dva[0] */ + uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); + uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); + + if (l_dva0_vdev != r_dva0_vdev) + return (AVL_CMP(l_dva0_vdev, r_dva0_vdev)); + + /* if vdevs are equal, sort by offsets. */ + uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); + uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); + if (l_dva0_offset == r_dva0_offset) + ASSERT3U(l->blk_birth, ==, r->blk_birth); + return (AVL_CMP(l_dva0_offset, r_dva0_offset)); +} + +struct livelist_iter_arg { + avl_tree_t *avl; + bplist_t *to_free; + zthr_t *t; +}; + +/* + * Expects an AVL tree which is incrementally filled will FREE blkptrs + * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a + * corresponding FREE are stored in the supplied bplist. + */ +static int +dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + struct livelist_iter_arg *lia = arg; + avl_tree_t *avl = lia->avl; + bplist_t *to_free = lia->to_free; + zthr_t *t = lia->t; + ASSERT(tx == NULL); + + if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t))) + return (SET_ERROR(EINTR)); + if (bp_freed) { + livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t), + KM_SLEEP); + blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); + *temp_bp = *bp; + node->le_bp = temp_bp; + avl_add(avl, node); + } else { + livelist_entry_t node; + node.le_bp = bp; + livelist_entry_t *found = avl_find(avl, &node, NULL); + if (found != NULL) { + avl_remove(avl, found); + kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t)); + kmem_free(found, sizeof (livelist_entry_t)); + } else { + bplist_append(to_free, bp); + } + } + return (0); +} + +/* + * Accepts a bpobj and a bplist. 
Will insert into the bplist the blkptrs + * which have an ALLOC entry but no matching FREE + */ +int +dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, + uint64_t *size) +{ + avl_tree_t avl; + avl_create(&avl, livelist_compare, sizeof (livelist_entry_t), + offsetof(livelist_entry_t, le_node)); + + /* process the sublist */ + struct livelist_iter_arg arg = { + .avl = &avl, + .to_free = to_free, + .t = t + }; + int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size); + + avl_destroy(&avl); + return (err); +} + +#if defined(_KERNEL) +/* CSTYLED */ +module_param(zfs_livelist_max_entries, ulong, 0644); +MODULE_PARM_DESC(zfs_livelist_max_entries, + "Size to start the next sub-livelist in a livelist"); + +module_param(zfs_livelist_min_percent_shared, int, 0644); +MODULE_PARM_DESC(zfs_livelist_min_percent_shared, + "Threshold at which livelist is disabled"); +#endif diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 2f98e87ed..5c483c5dd 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -45,6 +45,9 @@ #include <sys/dmu_impl.h> #include <sys/zvol.h> #include <sys/zcp.h> +#include <sys/dsl_deadlist.h> +#include <sys/zthr.h> +#include <sys/spa_impl.h> int dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) @@ -120,7 +123,7 @@ struct process_old_arg { }; static int -process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { struct process_old_arg *poa = arg; dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; @@ -128,7 +131,7 @@ process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) ASSERT(!BP_IS_HOLE(bp)); if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { - dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); + dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx); if (poa->ds_prev && !poa->after_branch_point && bp->blk_birth > dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { @@ -852,6 +855,127 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) dmu_object_free_zapified(mos, ddobj, tx); } +static void +dsl_clone_destroy_assert(dsl_dir_t *dd) +{ + uint64_t used, comp, uncomp; + + ASSERT(dsl_dir_is_clone(dd)); + dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp); + + ASSERT3U(dsl_dir_phys(dd)->dd_used_bytes, ==, used); + ASSERT3U(dsl_dir_phys(dd)->dd_compressed_bytes, ==, comp); + /* + * Greater than because we do not track embedded block pointers in + * the livelist + */ + ASSERT3U(dsl_dir_phys(dd)->dd_uncompressed_bytes, >=, uncomp); + + ASSERT(list_is_empty(&dd->dd_pending_allocs.bpl_list)); + ASSERT(list_is_empty(&dd->dd_pending_frees.bpl_list)); +} + +/* + * Start the delete process for a clone. Free its zil, verify the space usage + * and queue the blkptrs for deletion by adding the livelist to the pool-wide + * delete queue. 
+ */ +static void +dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t zap_obj, to_delete, used, comp, uncomp; + objset_t *os; + dsl_dir_t *dd = ds->ds_dir; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + VERIFY0(dmu_objset_from_ds(ds, &os)); + + /* Check that the clone is in a correct state to be deleted */ + dsl_clone_destroy_assert(dd); + + /* Destroy the zil */ + zil_destroy_sync(dmu_objset_zil(os), tx); + + VERIFY0(zap_lookup(mos, dd->dd_object, + DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &to_delete)); + /* Initialize deleted_clones entry to track livelists to cleanup */ + int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); + if (error == ENOENT) { + zap_obj = zap_create(mos, DMU_OTN_ZAP_METADATA, + DMU_OT_NONE, 0, tx); + VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, + &(zap_obj), tx)); + spa->spa_livelists_to_delete = zap_obj; + } else if (error != 0) { + zfs_panic_recover("zfs: error %d was returned while looking " + "up DMU_POOL_DELETED_CLONES in the zap"); + return; + } + VERIFY0(zap_add_int(mos, zap_obj, to_delete, tx)); + + /* Clone is no longer using space, now tracked by dp_free_dir */ + dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp); + dsl_dir_diduse_space(dd, DD_USED_HEAD, + -used, -comp, -dsl_dir_phys(dd)->dd_uncompressed_bytes, + tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); + dsl_dir_remove_livelist(dd, tx, B_FALSE); + zthr_wakeup(spa->spa_livelist_delete_zthr); +} + +/* + * Move the bptree into the pool's list of trees to clean up, update space + * accounting information and destroy the zil. + */ +void +dsl_async_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t used, comp, uncomp; + objset_t *os; + + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + + zil_destroy_sync(dmu_objset_zil(os), tx); + + if (!spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_ASYNC_DESTROY)) { + dsl_scan_t *scn = dp->dp_scan; + spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY, + tx); + dp->dp_bptree_obj = bptree_alloc(mos, tx); + VERIFY0(zap_add(mos, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj, tx)); + ASSERT(!scn->scn_async_destroying); + scn->scn_async_destroying = B_TRUE; + } + + used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; + comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; + uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; + + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + dsl_dataset_phys(ds)->ds_unique_bytes == used); + + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + bptree_add(mos, dp->dp_bptree_obj, + &dsl_dataset_phys(ds)->ds_bp, + dsl_dataset_phys(ds)->ds_prev_snap_txg, + used, comp, uncomp, tx); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + -used, -comp, -uncomp, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); +} + void dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) { @@ -911,7 +1035,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) } /* - * Destroy the deadlist. Unless it's a clone, the + * Destroy the deadlist. Unless it's a clone, the * deadlist should be empty since the dataset has no snapshots. 
* (If it's a clone, it's safe to ignore the deadlist contents * since they are still referenced by the origin snapshot.) @@ -924,51 +1048,18 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) if (dsl_dataset_remap_deadlist_exists(ds)) dsl_dataset_destroy_remap_deadlist(ds, tx); - objset_t *os; - VERIFY0(dmu_objset_from_ds(ds, &os)); - - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { - old_synchronous_dataset_destroy(ds, tx); + /* + * Each destroy is responsible for both destroying (enqueuing + * to be destroyed) the blkptrs comprising the dataset as well as + * those belonging to the zil. + */ + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) { + dsl_async_clone_destroy(ds, tx); + } else if (spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_ASYNC_DESTROY)) { + dsl_async_dataset_destroy(ds, tx); } else { - /* - * Move the bptree into the pool's list of trees to - * clean up and update space accounting information. - */ - uint64_t used, comp, uncomp; - - zil_destroy_sync(dmu_objset_zil(os), tx); - - if (!spa_feature_is_active(dp->dp_spa, - SPA_FEATURE_ASYNC_DESTROY)) { - dsl_scan_t *scn = dp->dp_scan; - spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY, - tx); - dp->dp_bptree_obj = bptree_alloc(mos, tx); - VERIFY0(zap_add(mos, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, - &dp->dp_bptree_obj, tx)); - ASSERT(!scn->scn_async_destroying); - scn->scn_async_destroying = B_TRUE; - } - - used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; - comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; - uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; - - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - dsl_dataset_phys(ds)->ds_unique_bytes == used); - - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - bptree_add(mos, dp->dp_bptree_obj, - &dsl_dataset_phys(ds)->ds_bp, - dsl_dataset_phys(ds)->ds_prev_snap_txg, - used, comp, uncomp, tx); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, - -used, -comp, -uncomp, tx); - dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, - used, comp, uncomp, tx); + old_synchronous_dataset_destroy(ds, tx); } if (ds->ds_prev != NULL) { diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 741ca232e..7b3c892c0 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2014 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
@@ -48,6 +48,7 @@ #include <sys/policy.h> #include <sys/zfs_znode.h> #include <sys/zvol.h> +#include <sys/zthr.h> #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -155,6 +156,9 @@ dsl_dir_evict_async(void *dbu) spa_async_close(dd->dd_pool->dp_spa, dd); + if (dsl_deadlist_is_open(&dd->dd_livelist)) + dsl_dir_livelist_close(dd); + dsl_prop_fini(dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); @@ -255,6 +259,16 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_origin_txg = origin_phys->ds_creation_txg; dmu_buf_rele(origin_bonus, FTAG); + if (dsl_dir_is_zapified(dd)) { + uint64_t obj; + err = zap_lookup(dp->dp_meta_objset, + dd->dd_object, DD_FIELD_LIVELIST, + sizeof (uint64_t), 1, &obj); + if (err == 0) + dsl_dir_livelist_open(dd, obj); + else if (err != ENOENT) + goto errout; + } } dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, @@ -263,6 +277,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, if (winner != NULL) { if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); + if (dsl_deadlist_is_open(&dd->dd_livelist)) + dsl_dir_livelist_close(dd); dsl_prop_fini(dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); @@ -291,6 +307,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, errout: if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); + if (dsl_deadlist_is_open(&dd->dd_livelist)) + dsl_dir_livelist_close(dd); dsl_prop_fini(dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); @@ -2178,6 +2196,90 @@ dsl_dir_is_zapified(dsl_dir_t *dd) return (doi.doi_type == DMU_OTN_ZAP_METADATA); } +void +dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa, + SPA_FEATURE_LIVELIST)); + dsl_deadlist_open(&dd->dd_livelist, mos, obj); + bplist_create(&dd->dd_pending_allocs); + bplist_create(&dd->dd_pending_frees); +} + +void +dsl_dir_livelist_close(dsl_dir_t *dd) +{ + dsl_deadlist_close(&dd->dd_livelist); + bplist_destroy(&dd->dd_pending_allocs); + bplist_destroy(&dd->dd_pending_frees); +} + +void +dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total) +{ + uint64_t obj; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + livelist_condense_entry_t to_condense = spa->spa_to_condense; + + if (!dsl_deadlist_is_open(&dd->dd_livelist)) + return; + + /* + * If the livelist being removed is set to be condensed, stop the + * condense zthr and indicate the cancellation in the spa_to_condense + * struct in case the condense no-wait synctask has already started + */ + zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; + if (ll_condense_thread != NULL && + (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) { + /* + * We use zthr_wait_cycle_done instead of zthr_cancel + * because we don't want to destroy the zthr, just have + * it skip its current task. + */ + spa->spa_to_condense.cancelled = B_TRUE; + zthr_wait_cycle_done(ll_condense_thread); + /* + * If we've returned from zthr_wait_cycle_done without + * clearing the to_condense data structure it's either + * because the no-wait synctask has started (which is + * indicated by 'syncing' field of to_condense) and we + * can expect it to clear to_condense on its own. + * Otherwise, we returned before the zthr ran. The + * checkfunc will now fail as cancelled == B_TRUE so we + * can safely NULL out ds, allowing a different dir's + * livelist to be condensed. 
+ * + * We can be sure that the to_condense struct will not + * be repopulated at this stage because both this + * function and dsl_livelist_try_condense execute in + * syncing context. + */ + if ((spa->spa_to_condense.ds != NULL) && + !spa->spa_to_condense.syncing) { + dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, + spa); + spa->spa_to_condense.ds = NULL; + } + } + + dsl_dir_livelist_close(dd); + int err = zap_lookup(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj); + if (err == 0) { + VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_LIVELIST, tx)); + if (total) { + dsl_deadlist_free(dp->dp_meta_objset, obj, tx); + spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); + } + } else { + ASSERT3U(err, !=, ENOENT); + } +} + #if defined(_KERNEL) EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 49e527912..c342f0c51 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -721,7 +721,8 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * Now that the datasets have been completely synced, we can * clean up our in-memory structures accumulated while syncing: * - * - move dead blocks from the pending deadlist to the on-disk deadlist + * - move dead blocks from the pending deadlist and livelists + * to the on-disk versions * - release hold from dsl_dataset_dirty() * - release key mapping hold from dsl_dataset_dirty() */ diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f25a559a9..d6956f560 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -3103,8 +3103,18 @@ dsl_scan_update_stats(dsl_scan_t *scn) } static int -dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { + ASSERT(!bp_freed); + return (dsl_scan_free_block_cb(arg, bp, tx)); +} + +static int +dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(!bp_freed); dsl_scan_t *scn = arg; const dva_t *dva = &bp->blk_dva[0]; @@ -3123,6 +3133,7 @@ dsl_scan_active(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t used = 0, comp, uncomp; + boolean_t clones_left; if (spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); @@ -3136,7 +3147,8 @@ dsl_scan_active(dsl_scan_t *scn) (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); } - return (used != 0); + clones_left = spa_livelist_delete_check(spa); + return ((used != 0) || (clones_left)); } static boolean_t @@ -3233,7 +3245,7 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, - dsl_scan_free_block_cb, scn, tx); + bpobj_dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); scn->scn_zio_root = NULL; @@ -3330,7 +3342,8 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); } - if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) { + if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && + !spa_livelist_delete_check(spa)) { /* finished; verify that space accounting went to zero */ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 6af162edb..da221fb2e 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -233,6 
+233,27 @@ uint64_t zfs_max_missing_tvds_scan = 0; boolean_t zfs_pause_spa_sync = B_FALSE; /* + * Variables to indicate the livelist condense zthr func should wait at certain + * points for the livelist to be removed - used to test condense/destroy races + */ +int zfs_livelist_condense_zthr_pause = 0; +int zfs_livelist_condense_sync_pause = 0; + +/* + * Variables to track whether or not condense cancellation has been + * triggered in testing. + */ +int zfs_livelist_condense_sync_cancel = 0; +int zfs_livelist_condense_zthr_cancel = 0; + +/* + * Variable to track whether or not extra ALLOC blkptrs were added to a + * livelist entry while it was being condensed (caused by the way we track + * remapped blkptrs in dbuf_remap_impl) + */ +int zfs_livelist_condense_new_alloc = 0; + +/* * ========================================================================== * SPA properties routines * ========================================================================== @@ -1481,6 +1502,27 @@ spa_unload_log_sm_metadata(spa_t *spa) spa->spa_unflushed_stats.sus_blocklimit = 0; } +static void +spa_destroy_aux_threads(spa_t *spa) +{ + if (spa->spa_condense_zthr != NULL) { + zthr_destroy(spa->spa_condense_zthr); + spa->spa_condense_zthr = NULL; + } + if (spa->spa_checkpoint_discard_zthr != NULL) { + zthr_destroy(spa->spa_checkpoint_discard_zthr); + spa->spa_checkpoint_discard_zthr = NULL; + } + if (spa->spa_livelist_delete_zthr != NULL) { + zthr_destroy(spa->spa_livelist_delete_zthr); + spa->spa_livelist_delete_zthr = NULL; + } + if (spa->spa_livelist_condense_zthr != NULL) { + zthr_destroy(spa->spa_livelist_condense_zthr); + spa->spa_livelist_condense_zthr = NULL; + } +} + /* * Opposite of spa_load(). */ @@ -1552,15 +1594,7 @@ spa_unload(spa_t *spa) spa->spa_vdev_removal = NULL; } - if (spa->spa_condense_zthr != NULL) { - zthr_destroy(spa->spa_condense_zthr); - spa->spa_condense_zthr = NULL; - } - - if (spa->spa_checkpoint_discard_zthr != NULL) { - zthr_destroy(spa->spa_checkpoint_discard_zthr); - spa->spa_checkpoint_discard_zthr = NULL; - } + spa_destroy_aux_threads(spa); spa_condense_fini(spa); @@ -2335,6 +2369,376 @@ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) return (SET_ERROR(err)); } +boolean_t +spa_livelist_delete_check(spa_t *spa) +{ + return (spa->spa_livelists_to_delete != 0); +} + +/* ARGSUSED */ +static boolean_t +spa_livelist_delete_cb_check(void *arg, zthr_t *z) +{ + spa_t *spa = arg; + return (spa_livelist_delete_check(spa)); +} + +static int +delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + spa_t *spa = arg; + zio_free(spa, tx->tx_txg, bp); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, + -bp_get_dsize_sync(spa, bp), + -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); + return (0); +} + +static int +dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) +{ + int err; + zap_cursor_t zc; + zap_attribute_t za; + zap_cursor_init(&zc, os, zap_obj); + err = zap_cursor_retrieve(&zc, &za); + zap_cursor_fini(&zc); + if (err == 0) + *llp = za.za_first_integer; + return (err); +} + +/* + * Components of livelist deletion that must be performed in syncing + * context: freeing block pointers and updating the pool-wide data + * structures to indicate how much work is left to do + */ +typedef struct sublist_delete_arg { + spa_t *spa; + dsl_deadlist_t *ll; + uint64_t key; + bplist_t *to_free; +} sublist_delete_arg_t; + +static void +sublist_delete_sync(void *arg, dmu_tx_t *tx) +{ + sublist_delete_arg_t *sda = arg; + spa_t *spa = sda->spa; + dsl_deadlist_t 
*ll = sda->ll; + uint64_t key = sda->key; + bplist_t *to_free = sda->to_free; + + bplist_iterate(to_free, delete_blkptr_cb, spa, tx); + dsl_deadlist_remove_entry(ll, key, tx); +} + +typedef struct livelist_delete_arg { + spa_t *spa; + uint64_t ll_obj; + uint64_t zap_obj; +} livelist_delete_arg_t; + +static void +livelist_delete_sync(void *arg, dmu_tx_t *tx) +{ + livelist_delete_arg_t *lda = arg; + spa_t *spa = lda->spa; + uint64_t ll_obj = lda->ll_obj; + uint64_t zap_obj = lda->zap_obj; + objset_t *mos = spa->spa_meta_objset; + uint64_t count; + + /* free the livelist and decrement the feature count */ + VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); + dsl_deadlist_free(mos, ll_obj, tx); + spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); + VERIFY0(zap_count(mos, zap_obj, &count)); + if (count == 0) { + /* no more livelists to delete */ + VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, tx)); + VERIFY0(zap_destroy(mos, zap_obj, tx)); + spa->spa_livelists_to_delete = 0; + } +} + +/* + * Load in the value for the livelist to be removed and open it. Then, + * load its first sublist and determine which block pointers should actually + * be freed. Then, call a synctask which performs the actual frees and updates + * the pool-wide livelist data. + */ +/* ARGSUSED */ +void +spa_livelist_delete_cb(void *arg, zthr_t *z) +{ + spa_t *spa = arg; + uint64_t ll_obj = 0, count; + objset_t *mos = spa->spa_meta_objset; + uint64_t zap_obj = spa->spa_livelists_to_delete; + /* + * Determine the next livelist to delete. This function should only + * be called if there is at least one deleted clone. + */ + VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); + VERIFY0(zap_count(mos, ll_obj, &count)); + if (count > 0) { + dsl_deadlist_t ll = { 0 }; + dsl_deadlist_entry_t *dle; + bplist_t to_free; + dsl_deadlist_open(&ll, mos, ll_obj); + dle = dsl_deadlist_first(&ll); + ASSERT3P(dle, !=, NULL); + bplist_create(&to_free); + int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, + z, NULL); + if (err == 0) { + sublist_delete_arg_t sync_arg = { + .spa = spa, + .ll = &ll, + .key = dle->dle_mintxg, + .to_free = &to_free + }; + zfs_dbgmsg("deleting sublist (id %llu) from" + " livelist %llu, %d remaining", + dle->dle_bpobj.bpo_object, ll_obj, count - 1); + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + sublist_delete_sync, &sync_arg, 0, + ZFS_SPACE_CHECK_DESTROY)); + } else { + ASSERT(err == EINTR); + } + bplist_clear(&to_free); + bplist_destroy(&to_free); + dsl_deadlist_close(&ll); + } else { + livelist_delete_arg_t sync_arg = { + .spa = spa, + .ll_obj = ll_obj, + .zap_obj = zap_obj + }; + zfs_dbgmsg("deletion of livelist %llu completed", ll_obj); + VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, + &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); + } +} + +void +spa_start_livelist_destroy_thread(spa_t *spa) +{ + ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); + spa->spa_livelist_delete_zthr = zthr_create( + spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa); +} + +typedef struct livelist_new_arg { + bplist_t *allocs; + bplist_t *frees; +} livelist_new_arg_t; + +static int +livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(tx == NULL); + livelist_new_arg_t *lna = arg; + if (bp_freed) { + bplist_append(lna->frees, bp); + } else { + bplist_append(lna->allocs, bp); + zfs_livelist_condense_new_alloc++; + } + return (0); +} + +typedef struct livelist_condense_arg { + spa_t *spa; + bplist_t to_keep; + uint64_t 
first_size; + uint64_t next_size; +} livelist_condense_arg_t; + +static void +spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) +{ + livelist_condense_arg_t *lca = arg; + spa_t *spa = lca->spa; + bplist_t new_frees; + dsl_dataset_t *ds = spa->spa_to_condense.ds; + + /* Have we been cancelled? */ + if (spa->spa_to_condense.cancelled) { + zfs_livelist_condense_sync_cancel++; + goto out; + } + + dsl_deadlist_entry_t *first = spa->spa_to_condense.first; + dsl_deadlist_entry_t *next = spa->spa_to_condense.next; + dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; + + /* + * It's possible that the livelist was changed while the zthr was + * running. Therefore, we need to check for new blkptrs in the two + * entries being condensed and continue to track them in the livelist. + * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), + * it's possible that the newly added blkptrs are FREEs or ALLOCs so + * we need to sort them into two different bplists. + */ + uint64_t first_obj = first->dle_bpobj.bpo_object; + uint64_t next_obj = next->dle_bpobj.bpo_object; + uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; + uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; + + bplist_create(&new_frees); + livelist_new_arg_t new_bps = { + .allocs = &lca->to_keep, + .frees = &new_frees, + }; + + if (cur_first_size > lca->first_size) { + VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, + livelist_track_new_cb, &new_bps, lca->first_size)); + } + if (cur_next_size > lca->next_size) { + VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, + livelist_track_new_cb, &new_bps, lca->next_size)); + } + + dsl_deadlist_clear_entry(first, ll, tx); + ASSERT(bpobj_is_empty(&first->dle_bpobj)); + dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); + + bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); + bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); + bplist_destroy(&new_frees); + + char dsname[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(ds, dsname); + zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " + "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " + "(%llu blkptrs)", tx->tx_txg, dsname, ds->ds_object, first_obj, + cur_first_size, next_obj, cur_next_size, + first->dle_bpobj.bpo_object, + first->dle_bpobj.bpo_phys->bpo_num_blkptrs); +out: + dmu_buf_rele(ds->ds_dbuf, spa); + spa->spa_to_condense.ds = NULL; + bplist_clear(&lca->to_keep); + bplist_destroy(&lca->to_keep); + kmem_free(lca, sizeof (livelist_condense_arg_t)); + spa->spa_to_condense.syncing = B_FALSE; +} + +void +spa_livelist_condense_cb(void *arg, zthr_t *t) +{ + while (zfs_livelist_condense_zthr_pause && + !(zthr_has_waiters(t) || zthr_iscancelled(t))) + delay(1); + + spa_t *spa = arg; + dsl_deadlist_entry_t *first = spa->spa_to_condense.first; + dsl_deadlist_entry_t *next = spa->spa_to_condense.next; + uint64_t first_size, next_size; + + livelist_condense_arg_t *lca = + kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); + bplist_create(&lca->to_keep); + + /* + * Process the livelists (matching FREEs and ALLOCs) in open context + * so we have minimal work in syncing context to condense. + * + * We save bpobj sizes (first_size and next_size) to use later in + * syncing context to determine if entries were added to these sublists + * while in open context. 
This is possible because the clone is still + * active and open for normal writes and we want to make sure the new, + * unprocessed blockpointers are inserted into the livelist normally. + * + * Note that dsl_process_sub_livelist() both stores the size (the number of + * blockpointers) and iterates over them while the bpobj's lock is held, so + * the sizes returned to us are consistent with what was actually + * processed. + */ + int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, + &first_size); + if (err == 0) + err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, + t, &next_size); + + if (err == 0) { + while (zfs_livelist_condense_sync_pause && + !(zthr_has_waiters(t) || zthr_iscancelled(t))) + delay(1); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + dmu_tx_mark_netfree(tx); + dmu_tx_hold_space(tx, 1); + err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); + if (err == 0) { + /* + * Prevent the condense zthr from restarting before + * the synctask completes. + */ + spa->spa_to_condense.syncing = B_TRUE; + lca->spa = spa; + lca->first_size = first_size; + lca->next_size = next_size; + dsl_sync_task_nowait(spa_get_dsl(spa), + spa_livelist_condense_sync, lca, 0, + ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + return; + } + } + /* + * Condensing cannot continue: either it was externally stopped or + * we were unable to assign to a tx because the pool has run out of + * space. In the second case, we'll just end up trying to condense + * again in a later txg. + */ + ASSERT(err != 0); + bplist_clear(&lca->to_keep); + bplist_destroy(&lca->to_keep); + kmem_free(lca, sizeof (livelist_condense_arg_t)); + dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); + spa->spa_to_condense.ds = NULL; + if (err == EINTR) + zfs_livelist_condense_zthr_cancel++; +} + +/* ARGSUSED */ +/* + * Check that there is something to condense but that a condense is not + * already in progress and that condensing has not been cancelled. + */ +static boolean_t +spa_livelist_condense_cb_check(void *arg, zthr_t *z) +{ + spa_t *spa = arg; + if ((spa->spa_to_condense.ds != NULL) && + (spa->spa_to_condense.syncing == B_FALSE) && + (spa->spa_to_condense.cancelled == B_FALSE)) { + return (B_TRUE); + } + return (B_FALSE); +} + +void +spa_start_livelist_condensing_thread(spa_t *spa) +{ + spa->spa_to_condense.ds = NULL; + spa->spa_to_condense.first = NULL; + spa->spa_to_condense.next = NULL; + spa->spa_to_condense.syncing = B_FALSE; + spa->spa_to_condense.cancelled = B_FALSE; + + ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); + spa->spa_livelist_condense_zthr = zthr_create( + spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa); +} + static void spa_spawn_aux_threads(spa_t *spa) { @@ -2343,6 +2747,8 @@ spa_spawn_aux_threads(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_start_indirect_condensing_thread(spa); + spa_start_livelist_destroy_thread(spa); + spa_start_livelist_condensing_thread(spa); ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); spa->spa_checkpoint_discard_zthr = @@ -3604,6 +4010,15 @@ spa_ld_get_props(spa_t *spa) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* + * Load the livelist deletion field. If a livelist is queued for + * deletion, indicate that in the spa. + */ + error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, + &spa->spa_livelists_to_delete, B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* * Load the history object. 
If we have an older pool, this * will not be present. */ @@ -7571,6 +7986,14 @@ spa_async_suspend(spa_t *spa) zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); + + zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; + if (ll_delete_thread != NULL) + zthr_cancel(ll_delete_thread); + + zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; + if (ll_condense_thread != NULL) + zthr_cancel(ll_condense_thread); } void @@ -7589,6 +8012,14 @@ spa_async_resume(spa_t *spa) zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); + + zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; + if (ll_delete_thread != NULL) + zthr_resume(ll_delete_thread); + + zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; + if (ll_condense_thread != NULL) + zthr_resume(ll_condense_thread); } static boolean_t @@ -7639,14 +8070,28 @@ spa_async_request(spa_t *spa, int task) * ========================================================================== */ + static int -bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) { bpobj_t *bpo = arg; - bpobj_enqueue(bpo, bp, tx); + bpobj_enqueue(bpo, bp, bp_freed, tx); return (0); } +int +bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); +} + +int +bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); +} + static int spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { @@ -7657,6 +8102,14 @@ spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) return (0); } +static int +bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(!bp_freed); + return (spa_free_sync_cb(arg, bp, tx)); +} + /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing frees. @@ -7693,7 +8146,7 @@ spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) */ zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, - spa_free_sync_cb, zio, tx), ==, 0); + bpobj_spa_free_sync_cb, zio, tx), ==, 0); VERIFY0(zio_wait(zio)); } @@ -8296,7 +8749,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) * we sync the deferred frees later in pass 1. 
*/ ASSERT3U(pass, >, 1); - bplist_iterate(free_bpl, bpobj_enqueue_cb, + bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, &spa->spa_deferred_bpobj, tx); } @@ -8884,4 +9337,24 @@ MODULE_PARM_DESC(zfs_max_missing_tvds, " (in read-only mode)"); /* END CSTYLED */ +module_param(zfs_livelist_condense_zthr_pause, int, 0644); +MODULE_PARM_DESC(zfs_livelist_condense_zthr_pause, + "Set the livelist condense zthr to pause"); +module_param(zfs_livelist_condense_sync_pause, int, 0644); +MODULE_PARM_DESC(zfs_livelist_condense_sync_pause, + "Set the livelist condense synctask to pause"); + +module_param(zfs_livelist_condense_sync_cancel, int, 0644); +MODULE_PARM_DESC(zfs_livelist_condense_sync_cancel, + "Whether livelist condensing was canceled in the synctask"); +module_param(zfs_livelist_condense_zthr_cancel, int, 0644); +MODULE_PARM_DESC(zfs_livelist_condense_zthr_cancel, + "Whether livelist condensing was canceled in the zthr function"); + +/* BEGIN CSTYLED */ +module_param(zfs_livelist_condense_new_alloc, int, 0644); +MODULE_PARM_DESC(zfs_livelist_condense_new_alloc, + "Whether extra ALLOC blkptrs were added to a livelist entry while it" + " was being condensed"); +/* END CSTYLED */ #endif diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index b590a1d57..68c6b544e 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. */ @@ -413,7 +413,6 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) /* spa_history_log_sync will free nvl */ return (err); - } /* diff --git a/module/zfs/zthr.c b/module/zfs/zthr.c index 532e8ce0f..53c0a0b3d 100644 --- a/module/zfs/zthr.c +++ b/module/zfs/zthr.c @@ -207,12 +207,15 @@ struct zthr { /* flag set to true if we are canceling the zthr */ boolean_t zthr_cancel; + /* flag set to true if we are waiting for the zthr to finish */ + boolean_t zthr_haswaiters; + kcondvar_t zthr_wait_cv; /* * maximum amount of time that the zthr is spent sleeping; * if this is 0, the thread doesn't wake up until it gets * signaled. */ - hrtime_t zthr_wait_time; + hrtime_t zthr_sleep_timeout; /* consumer-provided callbacks & data */ zthr_checkfunc_t *zthr_checkfunc; @@ -239,14 +242,18 @@ zthr_procedure(void *arg) * order to prevent this process from incorrectly * contributing to the system load average when idle. 
*/ - if (t->zthr_wait_time == 0) { + if (t->zthr_sleep_timeout == 0) { cv_wait_sig(&t->zthr_cv, &t->zthr_state_lock); } else { (void) cv_timedwait_sig_hires(&t->zthr_cv, - &t->zthr_state_lock, t->zthr_wait_time, + &t->zthr_state_lock, t->zthr_sleep_timeout, MSEC2NSEC(1), 0); } } + if (t->zthr_haswaiters) { + t->zthr_haswaiters = B_FALSE; + cv_broadcast(&t->zthr_wait_cv); + } } /* @@ -280,12 +287,13 @@ zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func, mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&t->zthr_request_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL); + cv_init(&t->zthr_wait_cv, NULL, CV_DEFAULT, NULL); mutex_enter(&t->zthr_state_lock); t->zthr_checkfunc = checkfunc; t->zthr_func = func; t->zthr_arg = arg; - t->zthr_wait_time = max_sleep; + t->zthr_sleep_timeout = max_sleep; t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri); @@ -303,6 +311,7 @@ zthr_destroy(zthr_t *t) mutex_destroy(&t->zthr_request_lock); mutex_destroy(&t->zthr_state_lock); cv_destroy(&t->zthr_cv); + cv_destroy(&t->zthr_wait_cv); kmem_free(t, sizeof (*t)); } @@ -355,9 +364,8 @@ zthr_cancel(zthr_t *t) * * [1] The thread has already been cancelled, therefore * there is nothing for us to do. - * [2] The thread is sleeping, so we broadcast the CV first - * to wake it up and then we set the flag and we are - * waiting for it to exit. + * [2] The thread is sleeping, so we set the flag, broadcast + * the CV, and wait for it to exit. * [3] The thread is doing work, in which case we just set * the flag and wait for it to finish. * [4] The thread was just created/resumed, in which case @@ -397,6 +405,7 @@ zthr_resume(zthr_t *t) ASSERT3P(&t->zthr_checkfunc, !=, NULL); ASSERT3P(&t->zthr_func, !=, NULL); ASSERT(!t->zthr_cancel); + ASSERT(!t->zthr_haswaiters); /* * There are 4 states that we find the zthr in at this point @@ -451,3 +460,74 @@ zthr_iscancelled(zthr_t *t) mutex_exit(&t->zthr_state_lock); return (cancelled); } + +/* + * Wait for the zthr to finish its current function. Similar to + * zthr_iscancelled, you can use zthr_has_waiters to have the zthr_func end + * early. Unlike zthr_cancel, the thread is not destroyed. If the zthr was + * sleeping or cancelled, return immediately. + */ +void +zthr_wait_cycle_done(zthr_t *t) +{ + mutex_enter(&t->zthr_state_lock); + + /* + * Since we are holding the zthr_state_lock at this point + * we can find the zthr in one of the following 5 states: + * + * [1] The thread has already been cancelled, therefore + * there is nothing for us to do. + * [2] The thread is sleeping, so we set the flag, broadcast + * the CV, and wait for it to exit. + * [3] The thread is doing work, in which case we just set + * the flag and wait for it to finish. + * [4] The thread was just created/resumed, in which case + * the behavior is similar to [3]. + * [5] The thread is in the middle of being cancelled, which is + * similar to [3]. We'll wait for the cancel, which is + * waiting for the zthr func. + * + * Since requests are serialized, by the time that we get + * control back we expect that the zthr has completed its + * zthr_func. 
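+ * + * If the zthr has already been cancelled (zthr_thread is NULL), there is + * no cycle in progress, so we return immediately.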
+ */ + if (t->zthr_thread != NULL) { + t->zthr_haswaiters = B_TRUE; + + /* broadcast in case the zthr is sleeping */ + cv_broadcast(&t->zthr_cv); + + while ((t->zthr_haswaiters) && (t->zthr_thread != NULL)) + cv_wait(&t->zthr_wait_cv, &t->zthr_state_lock); + + ASSERT(!t->zthr_haswaiters); + } + + mutex_exit(&t->zthr_state_lock); +} + +/* + * This function is intended to be used by the zthr itself + * to check if another thread is waiting on it to finish. + * + * Returns TRUE if we have been asked to finish. + * + * Returns FALSE otherwise. + */ +boolean_t +zthr_has_waiters(zthr_t *t) +{ + ASSERT3P(t->zthr_thread, ==, curthread); + + mutex_enter(&t->zthr_state_lock); + + /* + * Similarly to zthr_iscancelled(), we only grab the + * zthr_state_lock so that the zthr itself can use this + * function to check for the request. + */ + boolean_t has_waiters = t->zthr_haswaiters; + mutex_exit(&t->zthr_state_lock); + return (has_waiters); +}
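For context, a minimal consumer of the new waiter interface might look like the sketch below. This is illustrative only and not part of the commit: the my_work_* names and the item queue are hypothetical, and only zthr_create(), zthr_iscancelled(), zthr_has_waiters(), zthr_cancel(), and zthr_wait_cycle_done() come from the zthr API shown above.

#include <sys/zfs_context.h>
#include <sys/zthr.h>

/* Hypothetical consumer state (not part of this change). */
typedef struct my_work {
	kmutex_t	mw_lock;
	list_t		mw_items;	/* pending work items */
} my_work_t;

/* checkfunc: report whether there is anything for the zthr to do. */
static boolean_t
my_work_check(void *arg, zthr_t *t)
{
	my_work_t *mw = arg;

	mutex_enter(&mw->mw_lock);
	boolean_t busy = !list_is_empty(&mw->mw_items);
	mutex_exit(&mw->mw_lock);
	return (busy);
}

/* func: drain the queue, yielding between items when asked to stop. */
static void
my_work_func(void *arg, zthr_t *t)
{
	my_work_t *mw = arg;
	void *item;

	mutex_enter(&mw->mw_lock);
	while ((item = list_remove_head(&mw->mw_items)) != NULL) {
		mutex_exit(&mw->mw_lock);
		/* ... process item ... */

		/*
		 * Stop early if another thread called zthr_cancel() or
		 * zthr_wait_cycle_done() on this zthr; the remaining items
		 * stay queued for the next cycle, similar to how
		 * spa_livelist_condense_cb and dsl_process_sub_livelist
		 * yield above.
		 */
		if (zthr_iscancelled(t) || zthr_has_waiters(t))
			return;
		mutex_enter(&mw->mw_lock);
	}
	mutex_exit(&mw->mw_lock);
}

A controlling thread would create the zthr with zthr_create(my_work_check, my_work_func, &work) and, before tearing the queue down, call zthr_wait_cycle_done() (or zthr_cancel()) so that the current cycle has yielded before the queue is freed.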