diff options
Diffstat (limited to 'module/zfs/bptree.c')
-rw-r--r-- | module/zfs/bptree.c | 109 |
1 files changed, 87 insertions, 22 deletions
diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c
index 83f365864..cbe8d1caa 100644
--- a/module/zfs/bptree.c
+++ b/module/zfs/bptree.c
@@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
 	return (dmu_object_free(os, obj, tx));
 }
 
+boolean_t
+bptree_is_empty(objset_t *os, uint64_t obj)
+{
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+	boolean_t rv;
+
+	VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
+	bt = db->db_data;
+	rv = (bt->bt_begin == bt->bt_end);
+	dmu_buf_rele(db, FTAG);
+	return (rv);
+}
+
 void
 bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
     uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 	bptree_phys_t *bt;
-	bptree_entry_phys_t bte;
+	bptree_entry_phys_t *bte;
 
 	/*
 	 * bptree objects are in the pool mos, therefore they can only be
@@ -120,10 +134,11 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
 	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
 	bt = db->db_data;
 
-	bte.be_birth_txg = birth_txg;
-	bte.be_bp = *bp;
-	bzero(&bte.be_zb, sizeof (bte.be_zb));
-	dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
+	bte = kmem_zalloc(sizeof (*bte), KM_PUSHPAGE);
+	bte->be_birth_txg = birth_txg;
+	bte->be_bp = *bp;
+	dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
+	kmem_free(bte, sizeof (*bte));
 
 	dmu_buf_will_dirty(db, tx);
 	bt->bt_end++;
@@ -153,10 +168,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	return (err);
 }
 
+/*
+ * If "free" is set:
+ *  - It is assumed that "func" will be freeing the block pointers.
+ *  - If "func" returns nonzero, the bookmark will be remembered and
+ *    iteration will be restarted from this point on next invocation.
+ *  - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
+ *    bptree_iterate will remember the bookmark, continue traversing
+ *    any additional entries, and return 0.
+ *
+ * If "free" is not set, traversal will stop and return an error if
+ * an i/o error is encountered.
+ *
+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
+ * traverse_dataset_destroyed()).
+ */
 int
 bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
     void *arg, dmu_tx_t *tx)
 {
+	boolean_t ioerr = B_FALSE;
 	int err;
 	uint64_t i;
 	dmu_buf_t *db;
@@ -182,49 +214,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
 		bptree_entry_phys_t bte;
 		int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
 
-		ASSERT(!free || i == ba.ba_phys->bt_begin);
-
 		err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
 		    &bte, DMU_READ_NO_PREFETCH);
 		if (err != 0)
 			break;
 
-		if (zfs_recover)
+		if (zfs_free_leak_on_eio)
 			flags |= TRAVERSE_HARD;
+		zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
+		    "bookmark %lld/%lld/%lld/%lld",
+		    i, (longlong_t)bte.be_birth_txg,
+		    (longlong_t)bte.be_zb.zb_objset,
+		    (longlong_t)bte.be_zb.zb_object,
+		    (longlong_t)bte.be_zb.zb_level,
+		    (longlong_t)bte.be_zb.zb_blkid);
 		err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
 		    bte.be_birth_txg, &bte.be_zb, flags,
 		    bptree_visit_cb, &ba);
 		if (free) {
-			if (err == ERESTART) {
+			/*
+			 * The callback has freed the visited block pointers.
+			 * Record our traversal progress on disk, either by
+			 * updating this record's bookmark, or by logically
+			 * removing this record by advancing bt_begin.
+			 */
+			if (err != 0) {
 				/* save bookmark for future resume */
 				ASSERT3U(bte.be_zb.zb_objset, ==,
 				    ZB_DESTROYED_OBJSET);
 				ASSERT0(bte.be_zb.zb_level);
 				dmu_write(os, obj, i * sizeof (bte),
 				    sizeof (bte), &bte, tx);
-				break;
-			}
-			if (err != 0) {
+				if (err == EIO || err == ECKSUM ||
+				    err == ENXIO) {
+					/*
+					 * Skip the rest of this tree and
+					 * continue on to the next entry.
+					 */
+					err = 0;
+					ioerr = B_TRUE;
+				} else {
+					break;
+				}
+			} else if (ioerr) {
 				/*
-				 * We can not properly handle an i/o
-				 * error, because the traversal code
-				 * does not know how to resume from an
-				 * arbitrary bookmark.
+				 * This entry is finished, but there were
+				 * i/o errors on previous entries, so we
+				 * can't adjust bt_begin. Set this entry's
+				 * be_birth_txg such that it will be
+				 * treated as a no-op in future traversals.
 				 */
-				zfs_panic_recover("error %u from "
-				    "traverse_dataset_destroyed()", err);
+				bte.be_birth_txg = UINT64_MAX;
+				dmu_write(os, obj, i * sizeof (bte),
+				    sizeof (bte), &bte, tx);
 			}
 
-			ba.ba_phys->bt_begin++;
-			(void) dmu_free_range(os, obj,
-			    i * sizeof (bte), sizeof (bte), tx);
+			if (!ioerr) {
+				ba.ba_phys->bt_begin++;
+				(void) dmu_free_range(os, obj,
+				    i * sizeof (bte), sizeof (bte), tx);
+			}
+		} else if (err != 0) {
+			break;
 		}
 	}
 
-	ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+	ASSERT(!free || err != 0 || ioerr ||
+	    ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
 
 	/* if all blocks are free there should be no used space */
 	if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+		if (zfs_free_leak_on_eio) {
+			ba.ba_phys->bt_bytes = 0;
+			ba.ba_phys->bt_comp = 0;
+			ba.ba_phys->bt_uncomp = 0;
+		}
+
 		ASSERT0(ba.ba_phys->bt_bytes);
 		ASSERT0(ba.ba_phys->bt_comp);
 		ASSERT0(ba.ba_phys->bt_uncomp);