path: root/module/zfs/bpobj.c
author     Sara Hartse <[email protected]>    2019-07-26 10:54:14 -0700
committer  Brian Behlendorf <[email protected]>    2019-07-26 10:54:14 -0700
commit     37f03da8ba6e1ab074b503e1dd63bfa7199d0537 (patch)
tree       987b03643c33cd43b246a20aea28b8750f7b4ee6 /module/zfs/bpobj.c
parent     d274ac54609894d00a49c0a0da89abd3a7f3998d (diff)
Fast Clone Deletion
Deleting a clone requires finding blocks that are clone-only, not shared with the snapshot. This was done by traversing the entire block tree, which results in a large performance penalty for sparsely written clones.

This new method keeps track of clone blocks when they are modified in a "Livelist" so that, when it's time to delete, the clone-specific blocks are already at hand. We see performance improvements because deletion work is now proportional to the number of clone-modified blocks, not the size of the original dataset.

Reviewed-by: Sean Eric Fagan <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Serapheim Dimitropoulos <[email protected]>
Signed-off-by: Sara Hartse <[email protected]>
Closes #8416
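To make the accounting idea concrete before reading the diff, here is a minimal standalone sketch. It is not ZFS code and not part of this commit; the names below (entry_t, space_acct_t, livelist_space_cb) are invented for illustration. It mirrors the pattern the patch introduces: each entry carries a "freed" flag, the iterator callback is told about it, and space accounting is adjusted with the matching sign, much as bpobj_iterate_blkptrs() and bpobj_enqueue() do in the diff below.

/*
 * Standalone, simplified sketch (not ZFS code): entries record either an
 * allocation or a free, and the callback applies the corresponding sign.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

typedef struct {
	uint64_t size;   /* bytes referenced by this entry */
	bool     freed;  /* true if this entry records a free, not an allocation */
} entry_t;

typedef struct {
	int64_t bytes;   /* net bytes still charged to the clone */
} space_acct_t;

/* Callback in the spirit of the new bpobj_itor_t: it is told whether the
 * entry was a free and adjusts the running total with the matching sign. */
static int
livelist_space_cb(void *arg, const entry_t *e, bool freed)
{
	space_acct_t *acct = arg;
	int sign = freed ? -1 : +1;

	acct->bytes += sign * (int64_t)e->size;
	return (0);
}

int
main(void)
{
	/* Two allocations followed by a free of the first block. */
	entry_t entries[] = {
		{ .size = 4096, .freed = false },
		{ .size = 8192, .freed = false },
		{ .size = 4096, .freed = true  },
	};
	space_acct_t acct = { 0 };

	for (size_t i = 0; i < sizeof (entries) / sizeof (entries[0]); i++) {
		if (livelist_space_cb(&acct, &entries[i], entries[i].freed) != 0)
			break;
	}

	/* Net space charged to the clone: 8192 bytes. */
	printf("net bytes: %lld\n", (long long)acct.bytes);
	return (0);
}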
Diffstat (limited to 'module/zfs/bpobj.c')
-rw-r--r--  module/zfs/bpobj.c  114
1 file changed, 91 insertions(+), 23 deletions(-)
diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c
index 633801956..561d0cf8a 100644
--- a/module/zfs/bpobj.c
+++ b/module/zfs/bpobj.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2017 Datto Inc.
*/
@@ -83,6 +83,9 @@ bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
size = BPOBJ_SIZE_V0;
else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
size = BPOBJ_SIZE_V1;
+ else if (!spa_feature_is_active(dmu_objset_spa(os),
+ SPA_FEATURE_LIVELIST))
+ size = BPOBJ_SIZE_V2;
else
size = sizeof (bpobj_phys_t);
@@ -171,6 +174,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
+ bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
bpo->bpo_phys = bpo->bpo_dbuf->db_data;
return (0);
}
@@ -245,8 +249,8 @@ bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
* Update bpobj and all of its parents with new space accounting.
*/
static void
-propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
- uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx)
+propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
+ int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
{
for (; bpi != NULL; bpi = bpi->bpi_parent) {
@@ -263,22 +267,22 @@ propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
static int
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
- dmu_tx_t *tx, boolean_t free)
+ int64_t start, dmu_tx_t *tx, boolean_t free)
{
int err = 0;
- uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
+ int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
dmu_buf_t *dbuf = NULL;
bpobj_t *bpo = bpi->bpi_bpo;
- for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
+ for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
uint64_t offset = i * sizeof (blkptr_t);
uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
if (dbuf == NULL || dbuf->db_offset > offset) {
if (dbuf)
dmu_buf_rele(dbuf, FTAG);
- err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
- FTAG, &dbuf, 0);
+ err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+ offset, FTAG, &dbuf, 0);
if (err)
break;
}
@@ -288,18 +292,26 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
blkptr_t *bparray = dbuf->db_data;
blkptr_t *bp = &bparray[blkoff];
- err = func(arg, bp, tx);
+
+ boolean_t bp_freed = BP_GET_FREE(bp);
+ err = func(arg, bp, bp_freed, tx);
if (err)
break;
if (free) {
+ int sign = bp_freed ? -1 : +1;
spa_t *spa = dmu_objset_spa(bpo->bpo_os);
- freed += bp_get_dsize_sync(spa, bp);
- comp_freed += BP_GET_PSIZE(bp);
- uncomp_freed += BP_GET_UCSIZE(bp);
+ freed += sign * bp_get_dsize_sync(spa, bp);
+ comp_freed += sign * BP_GET_PSIZE(bp);
+ uncomp_freed += sign * BP_GET_UCSIZE(bp);
ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
bpo->bpo_phys->bpo_num_blkptrs--;
ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
+ if (bp_freed) {
+ ASSERT(bpo->bpo_havefreed);
+ bpo->bpo_phys->bpo_num_freed--;
+ ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
+ }
}
}
if (free) {
@@ -328,7 +340,7 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
*/
static int
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
- dmu_tx_t *tx, boolean_t free)
+ dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
{
list_t stack;
bpobj_info_t *bpi;
@@ -341,6 +353,10 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
list_create(&stack, sizeof (bpobj_info_t),
offsetof(bpobj_info_t, bpi_node));
mutex_enter(&initial_bpo->bpo_lock);
+
+ if (bpobj_size != NULL)
+ *bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;
+
list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));
while ((bpi = list_head(&stack)) != NULL) {
@@ -354,7 +370,8 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
if (bpi->bpi_visited == B_FALSE) {
- err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free);
+ err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
+ free);
bpi->bpi_visited = B_TRUE;
if (err != 0)
break;
@@ -433,6 +450,7 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
* We have unprocessed subobjs. Process the next one.
*/
ASSERT(bpo->bpo_havecomp);
+ ASSERT3P(bpobj_size, ==, NULL);
/* Add the last subobj to stack. */
int64_t i = bpi->bpi_unprocessed_subobjs - 1;
@@ -489,16 +507,45 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
- return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
+ return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
}
/*
* Iterate the entries. If func returns nonzero, iteration will stop.
+ *
+ * If there are no subobjs:
+ *
+ * *bpobj_size can be used to return the number of block pointers in the
+ * bpobj. Note that this may be different from the number of block pointers
+ * that are iterated over, if iteration is terminated early (e.g. by the func
+ * returning nonzero).
+ *
+ * If there are concurrent (or subsequent) modifications to the bpobj then the
+ * returned *bpobj_size can be passed as "start" to
+ * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
*/
int
-bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
+ uint64_t *bpobj_size)
{
- return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
+ return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
+}
+
+/*
+ * Iterate over the blkptrs in the bpobj beginning at index start. If func
+ * returns nonzero, iteration will stop. This is a livelist specific function
+ * since it assumes that there are no subobjs present.
+ */
+int
+livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
+ int64_t start)
+{
+ if (bpo->bpo_havesubobj)
+ VERIFY0(bpo->bpo_phys->bpo_subobjs);
+ bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
+ int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
+ kmem_free(bpi, sizeof (bpobj_info_t));
+ return (err);
}
/*
@@ -724,7 +771,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
}
void
-bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
+bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
{
blkptr_t stored_bp = *bp;
uint64_t offset;
@@ -755,8 +803,8 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
}
- /* We never need the fill count. */
stored_bp.blk_fill = 0;
+ BP_SET_FREE(&stored_bp, bp_freed);
mutex_enter(&bpo->bpo_lock);
@@ -779,11 +827,16 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
bpo->bpo_phys->bpo_num_blkptrs++;
- bpo->bpo_phys->bpo_bytes +=
+ int sign = bp_freed ? -1 : +1;
+ bpo->bpo_phys->bpo_bytes += sign *
bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
if (bpo->bpo_havecomp) {
- bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
- bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
+ bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
+ bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
+ }
+ if (bp_freed) {
+ ASSERT(bpo->bpo_havefreed);
+ bpo->bpo_phys->bpo_num_freed++;
}
mutex_exit(&bpo->bpo_lock);
}
@@ -799,7 +852,7 @@ struct space_range_arg {
/* ARGSUSED */
static int
-space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
struct space_range_arg *sra = arg;
@@ -863,3 +916,18 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
*uncompp = sra.uncomp;
return (err);
}
+
+/*
+ * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
+ * bpobj are designated as free or allocated that information is not preserved
+ * in bplists.
+ */
+/* ARGSUSED */
+int
+bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ bplist_t *bpl = arg;
+ bplist_append(bpl, bp);
+ return (0);
+}
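
The comments added in this patch describe how the size reported by bpobj_iterate_nofree() can later be passed to livelist_bpobj_iterate_from_nofree() as a resume point. The following is a hedged usage sketch of that pattern, based only on the signatures introduced above. It assumes an in-module caller with an already-opened, subobj-free bpobj; count_cb and livelist_scan_example are hypothetical names, not functions from this commit, and error handling beyond VERIFY0 is omitted for brevity.

/* Hypothetical bpobj_itor_t callback: tallies allocated vs. freed entries. */
static int
count_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
	uint64_t *counts = arg;		/* counts[0] = allocated, counts[1] = freed */

	(void) bp, (void) tx;
	counts[bp_freed ? 1 : 0]++;
	return (0);
}

static void
livelist_scan_example(bpobj_t *bpo)
{
	uint64_t counts[2] = { 0, 0 };
	uint64_t size;

	/*
	 * First pass: visit every blkptr currently in the bpobj without
	 * freeing anything, and record how many blkptrs it held.
	 */
	VERIFY0(bpobj_iterate_nofree(bpo, count_cb, counts, &size));

	/*
	 * Later, after more entries may have been enqueued, resume from the
	 * previously observed size so only the new entries are visited
	 * (valid only for livelist-style bpobjs with no subobjs).
	 */
	VERIFY0(livelist_bpobj_iterate_from_nofree(bpo, count_cb, counts,
	    (int64_t)size));
}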