aboutsummaryrefslogtreecommitdiffstats
path: root/module/zfs/bptree.c
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs/bptree.c')
-rw-r--r--module/zfs/bptree.c109
1 files changed, 87 insertions, 22 deletions
diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c
index 83f365864..cbe8d1caa 100644
--- a/module/zfs/bptree.c
+++ b/module/zfs/bptree.c
@@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
return (dmu_object_free(os, obj, tx));
}
+boolean_t
+bptree_is_empty(objset_t *os, uint64_t obj)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+ boolean_t rv;
+
+ VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+ rv = (bt->bt_begin == bt->bt_end);
+ dmu_buf_rele(db, FTAG);
+ return (rv);
+}
+
void
bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
{
dmu_buf_t *db;
bptree_phys_t *bt;
- bptree_entry_phys_t bte;
+ bptree_entry_phys_t *bte;
/*
* bptree objects are in the pool mos, therefore they can only be
@@ -120,10 +134,11 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
bt = db->db_data;
- bte.be_birth_txg = birth_txg;
- bte.be_bp = *bp;
- bzero(&bte.be_zb, sizeof (bte.be_zb));
- dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
+ bte = kmem_zalloc(sizeof (*bte), KM_PUSHPAGE);
+ bte->be_birth_txg = birth_txg;
+ bte->be_bp = *bp;
+ dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
+ kmem_free(bte, sizeof (*bte));
dmu_buf_will_dirty(db, tx);
bt->bt_end++;
@@ -153,10 +168,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
return (err);
}
+/*
+ * If "free" is set:
+ * - It is assumed that "func" will be freeing the block pointers.
+ * - If "func" returns nonzero, the bookmark will be remembered and
+ * iteration will be restarted from this point on next invocation.
+ * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
+ * bptree_iterate will remember the bookmark, continue traversing
+ * any additional entries, and return 0.
+ *
+ * If "free" is not set, traversal will stop and return an error if
+ * an i/o error is encountered.
+ *
+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
+ * traverse_dataset_destroyed()).
+ */
int
bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
void *arg, dmu_tx_t *tx)
{
+ boolean_t ioerr = B_FALSE;
int err;
uint64_t i;
dmu_buf_t *db;
@@ -182,49 +214,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
bptree_entry_phys_t bte;
int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
- ASSERT(!free || i == ba.ba_phys->bt_begin);
-
err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
&bte, DMU_READ_NO_PREFETCH);
if (err != 0)
break;
- if (zfs_recover)
+ if (zfs_free_leak_on_eio)
flags |= TRAVERSE_HARD;
+ zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
+ "bookmark %lld/%lld/%lld/%lld",
+ i, (longlong_t)bte.be_birth_txg,
+ (longlong_t)bte.be_zb.zb_objset,
+ (longlong_t)bte.be_zb.zb_object,
+ (longlong_t)bte.be_zb.zb_level,
+ (longlong_t)bte.be_zb.zb_blkid);
err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
bte.be_birth_txg, &bte.be_zb, flags,
bptree_visit_cb, &ba);
if (free) {
- if (err == ERESTART) {
+ /*
+ * The callback has freed the visited block pointers.
+ * Record our traversal progress on disk, either by
+ * updating this record's bookmark, or by logically
+ * removing this record by advancing bt_begin.
+ */
+ if (err != 0) {
/* save bookmark for future resume */
ASSERT3U(bte.be_zb.zb_objset, ==,
ZB_DESTROYED_OBJSET);
ASSERT0(bte.be_zb.zb_level);
dmu_write(os, obj, i * sizeof (bte),
sizeof (bte), &bte, tx);
- break;
- }
- if (err != 0) {
+ if (err == EIO || err == ECKSUM ||
+ err == ENXIO) {
+ /*
+ * Skip the rest of this tree and
+ * continue on to the next entry.
+ */
+ err = 0;
+ ioerr = B_TRUE;
+ } else {
+ break;
+ }
+ } else if (ioerr) {
/*
- * We can not properly handle an i/o
- * error, because the traversal code
- * does not know how to resume from an
- * arbitrary bookmark.
+ * This entry is finished, but there were
+ * i/o errors on previous entries, so we
+ * can't adjust bt_begin. Set this entry's
+ * be_birth_txg such that it will be
+ * treated as a no-op in future traversals.
*/
- zfs_panic_recover("error %u from "
- "traverse_dataset_destroyed()", err);
+ bte.be_birth_txg = UINT64_MAX;
+ dmu_write(os, obj, i * sizeof (bte),
+ sizeof (bte), &bte, tx);
}
- ba.ba_phys->bt_begin++;
- (void) dmu_free_range(os, obj,
- i * sizeof (bte), sizeof (bte), tx);
+ if (!ioerr) {
+ ba.ba_phys->bt_begin++;
+ (void) dmu_free_range(os, obj,
+ i * sizeof (bte), sizeof (bte), tx);
+ }
+ } else if (err != 0) {
+ break;
}
}
- ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+ ASSERT(!free || err != 0 || ioerr ||
+ ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
/* if all blocks are free there should be no used space */
if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+ if (zfs_free_leak_on_eio) {
+ ba.ba_phys->bt_bytes = 0;
+ ba.ba_phys->bt_comp = 0;
+ ba.ba_phys->bt_uncomp = 0;
+ }
+
ASSERT0(ba.ba_phys->bt_bytes);
ASSERT0(ba.ba_phys->bt_comp);
ASSERT0(ba.ba_phys->bt_uncomp);