author	Sara Hartse <[email protected]>	2019-07-26 10:54:14 -0700
committer	Brian Behlendorf <[email protected]>	2019-07-26 10:54:14 -0700
commit	37f03da8ba6e1ab074b503e1dd63bfa7199d0537 (patch)
tree	987b03643c33cd43b246a20aea28b8750f7b4ee6 /module/zfs/dsl_dataset.c
parent	d274ac54609894d00a49c0a0da89abd3a7f3998d (diff)
Fast Clone Deletion
Deleting a clone requires finding blocks that are clone-only, not shared with the snapshot. This was done by traversing the entire block tree, which results in a large performance penalty for sparsely written clones. The new method keeps track of clone blocks as they are modified in a "Livelist" so that, when it's time to delete, the clone-specific blocks are already at hand. We see performance improvements because deletion work is now proportional to the number of clone-modified blocks, not the size of the original dataset.

Reviewed-by: Sean Eric Fagan <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Serapheim Dimitropoulos <[email protected]>
Signed-off-by: Sara Hartse <[email protected]>
Closes #8416
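To illustrate the idea, here is a minimal user-space C sketch, not the ZFS implementation: block pointers are modeled as plain IDs, ALLOC and FREE records are appended as the clone modifies blocks, and clone deletion walks only those records instead of the full block tree. All names here (livelist_t, livelist_track_alloc, and so on) are hypothetical; in the patch below the same roles are played by dd_pending_allocs, dd_pending_frees, and the on-disk livelist flushed in dsl_flush_pending_livelist().

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define MAX_ENTRIES 1024

typedef struct livelist {
	uint64_t allocs[MAX_ENTRIES];	/* blocks written by the clone */
	uint64_t frees[MAX_ENTRIES];	/* clone-written blocks later freed */
	int nallocs;
	int nfrees;
} livelist_t;

/* Record a clone write (cf. dsl_dataset_block_born appending to dd_pending_allocs). */
static void
livelist_track_alloc(livelist_t *ll, uint64_t blk)
{
	ll->allocs[ll->nallocs++] = blk;
}

/* Record a free of a clone-written block (cf. dsl_dataset_block_kill). */
static void
livelist_track_free(livelist_t *ll, uint64_t blk)
{
	ll->frees[ll->nfrees++] = blk;
}

/*
 * Delete the clone: free every ALLOC that is not cancelled by a FREE.
 * The work is proportional to the number of clone-modified blocks,
 * not to the size of the origin dataset.
 */
static void
livelist_delete_clone(const livelist_t *ll)
{
	for (int i = 0; i < ll->nallocs; i++) {
		bool cancelled = false;
		for (int j = 0; j < ll->nfrees; j++) {
			if (ll->frees[j] == ll->allocs[i]) {
				cancelled = true;
				break;
			}
		}
		if (!cancelled)
			printf("freeing clone-only block %llu\n",
			    (unsigned long long)ll->allocs[i]);
	}
}

int
main(void)
{
	livelist_t ll = { 0 };

	livelist_track_alloc(&ll, 100);	/* clone writes block 100 */
	livelist_track_alloc(&ll, 101);	/* clone writes block 101 */
	livelist_track_free(&ll, 100);	/* block 100 is later overwritten/freed */

	livelist_delete_clone(&ll);	/* frees only block 101 */
	return (0);
}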
Diffstat (limited to 'module/zfs/dsl_dataset.c')
-rw-r--r--	module/zfs/dsl_dataset.c	236
1 file changed, 223 insertions, 13 deletions
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 0cd458ef4..848a8508c 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -122,13 +122,12 @@ parent_delta(dsl_dataset_t *ds, int64_t delta)
void
dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
- int used, compressed, uncompressed;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ int used = bp_get_dsize_sync(spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
int64_t delta;
- used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
- compressed = BP_GET_PSIZE(bp);
- uncompressed = BP_GET_UCSIZE(bp);
-
dprintf_bp(bp, "ds=%p", ds);
ASSERT(dmu_tx_is_syncing(tx));
@@ -164,6 +163,19 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
ds->ds_feature_activation[f] = (void *)B_TRUE;
}
+ /*
+ * Track block for livelist, but ignore embedded blocks because
+ * they do not need to be freed.
+ */
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
+ bp->blk_birth > ds->ds_dir->dd_origin_txg &&
+ !(BP_IS_EMBEDDED(bp))) {
+ ASSERT(dsl_dir_is_clone(ds->ds_dir));
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_LIVELIST));
+ bplist_append(&ds->ds_dir->dd_pending_allocs, bp);
+ }
+
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
compressed, uncompressed, tx);
@@ -207,8 +219,8 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
DVA_SET_VDEV(dva, vdev);
DVA_SET_OFFSET(dva, offset);
DVA_SET_ASIZE(dva, size);
-
- dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx);
+ dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE,
+ tx);
}
}
@@ -239,6 +251,19 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
ASSERT(!ds->ds_is_snapshot);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ /*
+ * Track block for livelist, but ignore embedded blocks because
+ * they do not need to be freed.
+ */
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
+ bp->blk_birth > ds->ds_dir->dd_origin_txg &&
+ !(BP_IS_EMBEDDED(bp))) {
+ ASSERT(dsl_dir_is_clone(ds->ds_dir));
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_LIVELIST));
+ bplist_append(&ds->ds_dir->dd_pending_frees, bp);
+ }
+
if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
int64_t delta;
@@ -267,7 +292,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
*/
bplist_append(&ds->ds_pending_deadlist, bp);
} else {
- dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
+ dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx);
}
ASSERT3U(ds->ds_prev->ds_object, ==,
dsl_dataset_phys(ds)->ds_prev_snap_obj);
@@ -1241,6 +1266,14 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(lastname[0] != '@');
+ /*
+ * Filesystems will eventually have their origin set to dp_origin_snap,
+ * but that's taken care of in dsl_dataset_create_sync_dd. When
+ * creating a filesystem, this function is called with origin equal to
+ * NULL.
+ */
+ if (origin != NULL)
+ ASSERT3P(origin, !=, dp->dp_origin_snap);
ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
@@ -1251,6 +1284,20 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
dsl_deleg_set_create_perms(dd, tx, cr);
/*
+ * If we are creating a clone and the livelist feature is enabled,
+ * add the entry DD_FIELD_LIVELIST to ZAP.
+ */
+ if (origin != NULL &&
+ spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) {
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ dsl_dir_zapify(dd, tx);
+ uint64_t obj = dsl_deadlist_alloc(mos, tx);
+ VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST,
+ sizeof (uint64_t), 1, &obj, tx));
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx);
+ }
+
+ /*
* Since we're creating a new node we know it's a leaf, so we can
* initialize the counts if the limit feature is active.
*/
@@ -2036,12 +2083,149 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
}
}
+/*
+ * Check if the percentage of blocks shared between the clone and the
+ * snapshot (as opposed to those that are clone only) is below a certain
+ * threshold
+ */
+boolean_t
+dsl_livelist_should_disable(dsl_dataset_t *ds)
+{
+ uint64_t used, referenced;
+ int percent_shared;
+
+ used = dsl_dir_get_usedds(ds->ds_dir);
+ referenced = dsl_get_referenced(ds);
+ ASSERT3U(referenced, >=, 0);
+ ASSERT3U(used, >=, 0);
+ if (referenced == 0)
+ return (B_FALSE);
+ percent_shared = (100 * (referenced - used)) / referenced;
+ if (percent_shared <= zfs_livelist_min_percent_shared)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+/*
+ * Check if it is possible to combine two livelist entries into one.
+ * This is the case if the combined number of 'live' blkptrs (ALLOCs that
+ * don't have a matching FREE) is under the maximum sublist size.
+ * We check this by subtracting twice the total number of frees from the total
+ * number of blkptrs. FREEs are counted twice because each FREE blkptr
+ * will cancel out an ALLOC blkptr when the livelist is processed.
+ */
+static boolean_t
+dsl_livelist_should_condense(dsl_deadlist_entry_t *first,
+ dsl_deadlist_entry_t *next)
+{
+ uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed +
+ next->dle_bpobj.bpo_phys->bpo_num_freed;
+ uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs +
+ next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
+ if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+typedef struct try_condense_arg {
+ spa_t *spa;
+ dsl_dataset_t *ds;
+} try_condense_arg_t;
+
+/*
+ * Iterate over the livelist entries, searching for a pair to condense.
+ * A nonzero return value means stop, 0 means keep looking.
+ */
static int
-deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first)
{
- dsl_deadlist_t *dl = arg;
- dsl_deadlist_insert(dl, bp, tx);
- return (0);
+ try_condense_arg_t *tca = arg;
+ spa_t *spa = tca->spa;
+ dsl_dataset_t *ds = tca->ds;
+ dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
+ dsl_deadlist_entry_t *next;
+
+ /* The condense thread has not yet been created at import */
+ if (spa->spa_livelist_condense_zthr == NULL)
+ return (1);
+
+ /* A condense is already in progress */
+ if (spa->spa_to_condense.ds != NULL)
+ return (1);
+
+ next = AVL_NEXT(&ll->dl_tree, &first->dle_node);
+ /* The livelist has only one entry - don't condense it */
+ if (next == NULL)
+ return (1);
+
+ /* Next is the newest entry - don't condense it */
+ if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL)
+ return (1);
+
+ /* This pair is not ready to condense but keep looking */
+ if (!dsl_livelist_should_condense(first, next))
+ return (0);
+
+ /*
+ * Add a ref to prevent the dataset from being evicted while
+ * the condense zthr or synctask are running. Ref will be
+ * released at the end of the condense synctask
+ */
+ dmu_buf_add_ref(ds->ds_dbuf, spa);
+
+ spa->spa_to_condense.ds = ds;
+ spa->spa_to_condense.first = first;
+ spa->spa_to_condense.next = next;
+ spa->spa_to_condense.syncing = B_FALSE;
+ spa->spa_to_condense.cancelled = B_FALSE;
+
+ zthr_wakeup(spa->spa_livelist_condense_zthr);
+ return (1);
+}
+
+static void
+dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+ dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist);
+
+ /* Check if we need to add a new sub-livelist */
+ if (last == NULL) {
+ /* The livelist is empty */
+ dsl_deadlist_add_key(&dd->dd_livelist,
+ tx->tx_txg - 1, tx);
+ } else if (spa_sync_pass(spa) == 1) {
+ /*
+ * Check if the newest entry is full. If it is, make a new one.
+ * We only do this once per sync because we could overfill a
+ * sublist in one sync pass and don't want to add another entry
+ * for a txg that is already represented. This ensures that
+ * blkptrs born in the same txg are stored in the same sublist.
+ */
+ bpobj_t bpobj = last->dle_bpobj;
+ uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs;
+ uint64_t free = bpobj.bpo_phys->bpo_num_freed;
+ uint64_t alloc = all - free;
+ if (alloc > zfs_livelist_max_entries) {
+ dsl_deadlist_add_key(&dd->dd_livelist,
+ tx->tx_txg - 1, tx);
+ }
+ }
+
+ /* Insert each entry into the on-disk livelist */
+ bplist_iterate(&dd->dd_pending_allocs,
+ dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx);
+ bplist_iterate(&dd->dd_pending_frees,
+ dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx);
+
+ /* Attempt to condense every pair of adjacent entries */
+ try_condense_arg_t arg = {
+ .spa = spa,
+ .ds = ds
+ };
+ dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense,
+ &arg);
}
void
@@ -2050,7 +2234,14 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
objset_t *os = ds->ds_objset;
bplist_iterate(&ds->ds_pending_deadlist,
- deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+ dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx);
+
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
+ dsl_flush_pending_livelist(ds, tx);
+ if (dsl_livelist_should_disable(ds)) {
+ dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE);
+ }
+ }
dsl_bookmark_sync_done(ds, tx);
@@ -3335,6 +3526,8 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
uint64_t oldnext_obj;
int64_t delta;
+ ASSERT(nvlist_empty(ddpa->err_ds));
+
VERIFY0(promote_hold(ddpa, dp, FTAG));
hds = ddpa->ddpa_clone;
@@ -3519,6 +3712,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
+ /*
+ * Since livelists are specific to a clone's origin txg, they
+ * are no longer accurate. Destroy the livelist from the clone being
+ * promoted. If the origin dataset is a clone, destroy its livelist
+ * as well.
+ */
+ dsl_dir_remove_livelist(dd, tx, B_TRUE);
+ dsl_dir_remove_livelist(origin_ds->ds_dir, tx, B_TRUE);
+
/* log history record */
spa_history_log_internal_ds(hds, "promote", tx, "");
@@ -3990,6 +4192,14 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
dsl_scan_ds_clone_swapped(origin_head, clone, tx);
+ /*
+ * Destroy any livelists associated with the clone or the origin,
+ * since after the swap the corresponding livelists are no longer
+ * valid.
+ */
+ dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE);
+ dsl_dir_remove_livelist(origin_head->ds_dir, tx, B_TRUE);
+
spa_history_log_internal_ds(clone, "clone swap", tx,
"parent=%s", origin_head->ds_dir->dd_myname);
}