aboutsummaryrefslogtreecommitdiffstats
path: root/module/zfs
diff options
context:
space:
mode:
authorPaul Zuchowski <[email protected]>2019-03-06 12:50:55 -0500
committerBrian Behlendorf <[email protected]>2019-03-06 09:50:55 -0800
commita73e8fdb93d24b885f0c38202a34da51013d674a (patch)
treece23108be9aee5f492375ce034e836769654e15b /module/zfs
parent96ebc5a1a4cc57806882e4e9b38c49ba8a5bdfda (diff)
Stack overflow in recursive bpobj_iterate_impl
The function bpobj_iterate_impl overflows the stack when bpobjs are deeply nested. Rewrite the function to eliminate the recursion. Reviewed-by: Serapheim Dimitropoulos <[email protected]> Reviewed-by: Matt Ahrens <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Paul Zuchowski <[email protected]> Closes #7674 Closes #7675 Closes #7908
Diffstat (limited to 'module/zfs')
-rw-r--r--module/zfs/bpobj.c328
-rw-r--r--module/zfs/dbuf.c18
2 files changed, 244 insertions, 102 deletions
diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c
index 94bd1fd8f..633801956 100644
--- a/module/zfs/bpobj.c
+++ b/module/zfs/bpobj.c
@@ -206,29 +206,73 @@ bpobj_is_empty(bpobj_t *bpo)
(!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}
-static int
-bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
- boolean_t free)
+/*
+ * A recursive iteration of the bpobjs would be nice here but we run the risk
+ * of overflowing function stack space. Instead, find each subobj and add it
+ * to the head of our list so it can be scanned for subjobjs. Like a
+ * recursive implementation, the "deepest" subobjs will be freed first.
+ * When a subobj is found to have no additional subojs, free it.
+ */
+typedef struct bpobj_info {
+ bpobj_t *bpi_bpo;
+ /*
+ * This object is a subobj of bpi_parent,
+ * at bpi_index in its subobj array.
+ */
+ struct bpobj_info *bpi_parent;
+ uint64_t bpi_index;
+ /* How many of our subobj's are left to process. */
+ uint64_t bpi_unprocessed_subobjs;
+ /* True after having visited this bpo's directly referenced BPs. */
+ boolean_t bpi_visited;
+ list_node_t bpi_node;
+} bpobj_info_t;
+
+static bpobj_info_t *
+bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
{
- dmu_object_info_t doi;
- int epb;
- int64_t i;
- int err = 0;
- dmu_buf_t *dbuf = NULL;
+ bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
+ bpi->bpi_bpo = bpo;
+ bpi->bpi_parent = parent;
+ bpi->bpi_index = index;
+ if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
+ bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
+ }
+ return (bpi);
+}
- ASSERT(bpobj_is_open(bpo));
- mutex_enter(&bpo->bpo_lock);
+/*
+ * Update bpobj and all of its parents with new space accounting.
+ */
+static void
+propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
+ uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx)
+{
- if (free)
- dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+ for (; bpi != NULL; bpi = bpi->bpi_parent) {
+ bpobj_t *p = bpi->bpi_bpo;
+ ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
+ p->bpo_phys->bpo_bytes -= freed;
+ ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
+ if (p->bpo_havecomp) {
+ p->bpo_phys->bpo_comp -= comp_freed;
+ p->bpo_phys->bpo_uncomp -= uncomp_freed;
+ }
+ }
+}
- for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
- blkptr_t *bparray;
- blkptr_t *bp;
- uint64_t offset, blkoff;
+static int
+bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
+ dmu_tx_t *tx, boolean_t free)
+{
+ int err = 0;
+ uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
+ dmu_buf_t *dbuf = NULL;
+ bpobj_t *bpo = bpi->bpi_bpo;
- offset = i * sizeof (blkptr_t);
- blkoff = P2PHASE(i, bpo->bpo_epb);
+ for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
+ uint64_t offset = i * sizeof (blkptr_t);
+ uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
if (dbuf == NULL || dbuf->db_offset > offset) {
if (dbuf)
@@ -242,119 +286,199 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
ASSERT3U(offset, >=, dbuf->db_offset);
ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
- bparray = dbuf->db_data;
- bp = &bparray[blkoff];
+ blkptr_t *bparray = dbuf->db_data;
+ blkptr_t *bp = &bparray[blkoff];
err = func(arg, bp, tx);
if (err)
break;
+
if (free) {
- bpo->bpo_phys->bpo_bytes -=
- bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
- ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
- if (bpo->bpo_havecomp) {
- bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
- bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
- }
+ spa_t *spa = dmu_objset_spa(bpo->bpo_os);
+ freed += bp_get_dsize_sync(spa, bp);
+ comp_freed += BP_GET_PSIZE(bp);
+ uncomp_freed += BP_GET_UCSIZE(bp);
+ ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
bpo->bpo_phys->bpo_num_blkptrs--;
ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
}
}
+ if (free) {
+ propagate_space_reduction(bpi, freed, comp_freed,
+ uncomp_freed, tx);
+ VERIFY0(dmu_free_range(bpo->bpo_os,
+ bpo->bpo_object,
+ bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
+ DMU_OBJECT_END, tx));
+ }
if (dbuf) {
dmu_buf_rele(dbuf, FTAG);
dbuf = NULL;
}
- if (free) {
- VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
- (i + 1) * sizeof (blkptr_t), DMU_OBJECT_END, tx));
- }
- if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
- goto out;
+ return (err);
+}
- ASSERT(bpo->bpo_havecomp);
- err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
- if (err) {
- mutex_exit(&bpo->bpo_lock);
- return (err);
- }
- ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
- epb = doi.doi_data_block_size / sizeof (uint64_t);
+/*
+ * Given an initial bpo, start by freeing the BPs that are directly referenced
+ * by that bpo. If the bpo has subobjs, read in its last subobj and push the
+ * subobj to our stack. By popping items off our stack, eventually we will
+ * encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if
+ * requested also free the now-empty bpo from disk and decrement
+ * its parent's subobj count. We continue popping each subobj from our stack,
+ * visiting its last subobj until they too have no more subobjs, and so on.
+ */
+static int
+bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
+ dmu_tx_t *tx, boolean_t free)
+{
+ list_t stack;
+ bpobj_info_t *bpi;
+ int err = 0;
- for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
- uint64_t *objarray;
- uint64_t offset, blkoff;
- bpobj_t sublist;
- uint64_t used_before, comp_before, uncomp_before;
- uint64_t used_after, comp_after, uncomp_after;
+ /*
+ * Create a "stack" for us to work with without worrying about
+ * stack overflows. Initialize it with the initial_bpo.
+ */
+ list_create(&stack, sizeof (bpobj_info_t),
+ offsetof(bpobj_info_t, bpi_node));
+ mutex_enter(&initial_bpo->bpo_lock);
+ list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));
- offset = i * sizeof (uint64_t);
- blkoff = P2PHASE(i, epb);
+ while ((bpi = list_head(&stack)) != NULL) {
+ bpobj_t *bpo = bpi->bpi_bpo;
- if (dbuf == NULL || dbuf->db_offset > offset) {
- if (dbuf)
- dmu_buf_rele(dbuf, FTAG);
- err = dmu_buf_hold(bpo->bpo_os,
- bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
- if (err)
+ ASSERT3P(bpo, !=, NULL);
+ ASSERT(MUTEX_HELD(&bpo->bpo_lock));
+ ASSERT(bpobj_is_open(bpo));
+
+ if (free)
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+ if (bpi->bpi_visited == B_FALSE) {
+ err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free);
+ bpi->bpi_visited = B_TRUE;
+ if (err != 0)
break;
}
+ /*
+ * We've finished with this bpo's directly-referenced BP's and
+ * it has no more unprocessed subobjs. We can free its
+ * bpobj_info_t (unless it is the topmost, initial_bpo).
+ * If we are freeing from disk, we can also do that.
+ */
+ if (bpi->bpi_unprocessed_subobjs == 0) {
+ /*
+ * If there are no entries, there should
+ * be no bytes.
+ */
+ if (bpobj_is_empty(bpo)) {
+ ASSERT0(bpo->bpo_phys->bpo_bytes);
+ ASSERT0(bpo->bpo_phys->bpo_comp);
+ ASSERT0(bpo->bpo_phys->bpo_uncomp);
+ }
- ASSERT3U(offset, >=, dbuf->db_offset);
- ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+ /* The initial_bpo has no parent and is not closed. */
+ if (bpi->bpi_parent != NULL) {
+ if (free) {
+ bpobj_t *p = bpi->bpi_parent->bpi_bpo;
+
+ ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
+ ASSERT3U(p->bpo_phys->bpo_num_subobjs,
+ >, 0);
+ ASSERT3U(bpi->bpi_index, ==,
+ p->bpo_phys->bpo_num_subobjs - 1);
+ ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
+ tx));
+
+ p->bpo_phys->bpo_num_subobjs--;
+
+ VERIFY0(dmu_free_range(p->bpo_os,
+ p->bpo_phys->bpo_subobjs,
+ bpi->bpi_index * sizeof (uint64_t),
+ sizeof (uint64_t), tx));
+
+ /* eliminate the empty subobj list */
+ if (bpo->bpo_havesubobj &&
+ bpo->bpo_phys->bpo_subobjs != 0) {
+ ASSERT0(bpo->bpo_phys->
+ bpo_num_subobjs);
+ err = dmu_object_free(
+ bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs,
+ tx);
+ if (err)
+ break;
+ bpo->bpo_phys->bpo_subobjs = 0;
+ }
+ err = dmu_object_free(p->bpo_os,
+ bpo->bpo_object, tx);
+ if (err)
+ break;
+ }
+
+ mutex_exit(&bpo->bpo_lock);
+ bpobj_close(bpo);
+ kmem_free(bpo, sizeof (bpobj_t));
+ } else {
+ mutex_exit(&bpo->bpo_lock);
+ }
- objarray = dbuf->db_data;
- err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
- if (err)
- break;
- if (free) {
- err = bpobj_space(&sublist,
- &used_before, &comp_before, &uncomp_before);
- if (err != 0) {
- bpobj_close(&sublist);
+ /*
+ * Finished processing this bpo. Unlock, and free
+ * our "stack" info.
+ */
+ list_remove_head(&stack);
+ kmem_free(bpi, sizeof (bpobj_info_t));
+ } else {
+ /*
+ * We have unprocessed subobjs. Process the next one.
+ */
+ ASSERT(bpo->bpo_havecomp);
+
+ /* Add the last subobj to stack. */
+ int64_t i = bpi->bpi_unprocessed_subobjs - 1;
+ uint64_t offset = i * sizeof (uint64_t);
+
+ uint64_t obj_from_sublist;
+ err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ offset, sizeof (uint64_t), &obj_from_sublist,
+ DMU_READ_PREFETCH);
+ if (err)
break;
- }
- }
- err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
- if (free) {
- VERIFY3U(0, ==, bpobj_space(&sublist,
- &used_after, &comp_after, &uncomp_after));
- bpo->bpo_phys->bpo_bytes -= used_before - used_after;
- ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
- bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
- bpo->bpo_phys->bpo_uncomp -=
- uncomp_before - uncomp_after;
- }
+ bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
+ KM_SLEEP);
- bpobj_close(&sublist);
- if (err)
- break;
- if (free) {
- err = dmu_object_free(bpo->bpo_os,
- objarray[blkoff], tx);
+ err = bpobj_open(sublist, bpo->bpo_os,
+ obj_from_sublist);
if (err)
break;
- bpo->bpo_phys->bpo_num_subobjs--;
- ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
+
+ list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
+ mutex_enter(&sublist->bpo_lock);
+ bpi->bpi_unprocessed_subobjs--;
}
}
- if (dbuf) {
- dmu_buf_rele(dbuf, FTAG);
- dbuf = NULL;
- }
- if (free) {
- VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
- bpo->bpo_phys->bpo_subobjs,
- (i + 1) * sizeof (uint64_t), DMU_OBJECT_END, tx));
- }
+ /*
+ * Cleanup anything left on the "stack" after we left the loop.
+ * Every bpo on the stack is locked so we must remember to undo
+ * that now (in LIFO order).
+ */
+ while ((bpi = list_remove_head(&stack)) != NULL) {
+ bpobj_t *bpo = bpi->bpi_bpo;
+ ASSERT(err != 0);
+ ASSERT3P(bpo, !=, NULL);
-out:
- /* If there are no entries, there should be no bytes. */
- if (bpobj_is_empty(bpo)) {
- ASSERT0(bpo->bpo_phys->bpo_bytes);
- ASSERT0(bpo->bpo_phys->bpo_comp);
- ASSERT0(bpo->bpo_phys->bpo_uncomp);
+ mutex_exit(&bpo->bpo_lock);
+
+ /* do not free the initial_bpo */
+ if (bpi->bpi_parent != NULL) {
+ bpobj_close(bpi->bpi_bpo);
+ kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
+ }
+ kmem_free(bpi, sizeof (bpobj_info_t));
}
- mutex_exit(&bpo->bpo_lock);
+ list_destroy(&stack);
+
return (err);
}
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 9f3c9bfd5..28ff5fc7e 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -2311,6 +2311,23 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
}
+boolean_t
+dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ mutex_enter(&db->db_mtx);
+ for (dbuf_dirty_record_t *dr = db->db_last_dirty;
+ dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
+ if (dr->dr_txg == tx->tx_txg) {
+ mutex_exit(&db->db_mtx);
+ return (B_TRUE);
+ }
+ }
+ mutex_exit(&db->db_mtx);
+ return (B_FALSE);
+}
+
void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
@@ -4564,6 +4581,7 @@ EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
+EXPORT_SYMBOL(dmu_buf_is_dirty);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
EXPORT_SYMBOL(dmu_buf_fill_done);