diff options
author | Matthew Ahrens <[email protected]> | 2014-06-05 13:20:08 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2014-08-04 11:50:52 -0700 |
commit | fbeddd60b79690b6a6ececc9b00b6014d21405aa (patch) | |
tree | 67d3e5730537bc17cc5032d84864b3a9a10d3028 /module/zfs/bptree.c | |
parent | 9b67f605601c77c814037613d8129562db642a29 (diff) |
Illumos 4390 - I/O errors can corrupt space map when deleting fs/vol
4390 i/o errors when deleting filesystem/zvol can lead to space map corruption
Reviewed by: George Wilson <[email protected]>
Reviewed by: Christopher Siden <[email protected]>
Reviewed by: Adam Leventhal <[email protected]>
Reviewed by: Dan McDonald <[email protected]>
Reviewed by: Saso Kiselkov <[email protected]>
Approved by: Dan McDonald <[email protected]>
References:
https://www.illumos.org/issues/4390
https://github.com/illumos/illumos-gate/commit/7fd05ac
Porting notes:
Previous stack-reduction efforts in traverse_visitb() caused a fair
number of unmergeable pieces of code. This patch should reduce that
function's stack footprint a bit more.
The new local bptree_entry_phys_t in bptree_add() is dynamically-allocated
using kmem_zalloc() for the purpose of stack reduction.
The new global zfs_free_leak_on_eio has been defined as an integer
rather than a boolean_t, as was the case with the related zfs_recover
global. Also, zfs_free_leak_on_eio's definition has been inserted into
zfs_debug.c for consistency with the existing definition of zfs_recover.
Illumos placed it in spa_misc.c.
Ported by: Tim Chase <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #2545
Diffstat (limited to 'module/zfs/bptree.c')
-rw-r--r-- | module/zfs/bptree.c | 109 |
1 files changed, 87 insertions, 22 deletions
```diff
diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c
index 83f365864..cbe8d1caa 100644
--- a/module/zfs/bptree.c
+++ b/module/zfs/bptree.c
@@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
 	return (dmu_object_free(os, obj, tx));
 }
 
+boolean_t
+bptree_is_empty(objset_t *os, uint64_t obj)
+{
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+	boolean_t rv;
+
+	VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
+	bt = db->db_data;
+	rv = (bt->bt_begin == bt->bt_end);
+	dmu_buf_rele(db, FTAG);
+	return (rv);
+}
+
 void
 bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
     uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 	bptree_phys_t *bt;
-	bptree_entry_phys_t bte;
+	bptree_entry_phys_t *bte;
 
 	/*
 	 * bptree objects are in the pool mos, therefore they can only be
@@ -120,10 +134,11 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
 	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
 	bt = db->db_data;
 
-	bte.be_birth_txg = birth_txg;
-	bte.be_bp = *bp;
-	bzero(&bte.be_zb, sizeof (bte.be_zb));
-	dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
+	bte = kmem_zalloc(sizeof (*bte), KM_PUSHPAGE);
+	bte->be_birth_txg = birth_txg;
+	bte->be_bp = *bp;
+	dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
+	kmem_free(bte, sizeof (*bte));
 
 	dmu_buf_will_dirty(db, tx);
 	bt->bt_end++;
@@ -153,10 +168,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	return (err);
 }
 
+/*
+ * If "free" is set:
+ *  - It is assumed that "func" will be freeing the block pointers.
+ *  - If "func" returns nonzero, the bookmark will be remembered and
+ *    iteration will be restarted from this point on next invocation.
+ *  - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
+ *    bptree_iterate will remember the bookmark, continue traversing
+ *    any additional entries, and return 0.
+ *
+ * If "free" is not set, traversal will stop and return an error if
+ * an i/o error is encountered.
+ *
+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
+ * traverse_dataset_destroyed()).
+ */
 int
 bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
     void *arg, dmu_tx_t *tx)
 {
+	boolean_t ioerr = B_FALSE;
 	int err;
 	uint64_t i;
 	dmu_buf_t *db;
@@ -182,49 +214,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
 		bptree_entry_phys_t bte;
 		int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
 
-		ASSERT(!free || i == ba.ba_phys->bt_begin);
-
 		err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
 		    &bte, DMU_READ_NO_PREFETCH);
 		if (err != 0)
 			break;
 
-		if (zfs_recover)
+		if (zfs_free_leak_on_eio)
 			flags |= TRAVERSE_HARD;
+		zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
+		    "bookmark %lld/%lld/%lld/%lld",
+		    i, (longlong_t)bte.be_birth_txg,
+		    (longlong_t)bte.be_zb.zb_objset,
+		    (longlong_t)bte.be_zb.zb_object,
+		    (longlong_t)bte.be_zb.zb_level,
+		    (longlong_t)bte.be_zb.zb_blkid);
 		err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
 		    bte.be_birth_txg, &bte.be_zb, flags,
 		    bptree_visit_cb, &ba);
 		if (free) {
-			if (err == ERESTART) {
+			/*
+			 * The callback has freed the visited block pointers.
+			 * Record our traversal progress on disk, either by
+			 * updating this record's bookmark, or by logically
+			 * removing this record by advancing bt_begin.
+			 */
+			if (err != 0) {
 				/* save bookmark for future resume */
 				ASSERT3U(bte.be_zb.zb_objset, ==,
 				    ZB_DESTROYED_OBJSET);
 				ASSERT0(bte.be_zb.zb_level);
 				dmu_write(os, obj, i * sizeof (bte),
 				    sizeof (bte), &bte, tx);
-				break;
-			}
-			if (err != 0) {
+				if (err == EIO || err == ECKSUM ||
+				    err == ENXIO) {
+					/*
+					 * Skip the rest of this tree and
+					 * continue on to the next entry.
+					 */
+					err = 0;
+					ioerr = B_TRUE;
+				} else {
+					break;
+				}
+			} else if (ioerr) {
 				/*
-				 * We can not properly handle an i/o
-				 * error, because the traversal code
-				 * does not know how to resume from an
-				 * arbitrary bookmark.
+				 * This entry is finished, but there were
+				 * i/o errors on previous entries, so we
+				 * can't adjust bt_begin. Set this entry's
+				 * be_birth_txg such that it will be
+				 * treated as a no-op in future traversals.
 				 */
-				zfs_panic_recover("error %u from "
-				    "traverse_dataset_destroyed()", err);
+				bte.be_birth_txg = UINT64_MAX;
+				dmu_write(os, obj, i * sizeof (bte),
+				    sizeof (bte), &bte, tx);
 			}
 
-			ba.ba_phys->bt_begin++;
-			(void) dmu_free_range(os, obj,
-			    i * sizeof (bte), sizeof (bte), tx);
+			if (!ioerr) {
+				ba.ba_phys->bt_begin++;
+				(void) dmu_free_range(os, obj,
+				    i * sizeof (bte), sizeof (bte), tx);
+			}
+		} else if (err != 0) {
+			break;
 		}
 	}
 
-	ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+	ASSERT(!free || err != 0 || ioerr ||
+	    ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
 
 	/* if all blocks are free there should be no used space */
 	if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+		if (zfs_free_leak_on_eio) {
+			ba.ba_phys->bt_bytes = 0;
+			ba.ba_phys->bt_comp = 0;
+			ba.ba_phys->bt_uncomp = 0;
+		}
+
 		ASSERT0(ba.ba_phys->bt_bytes);
 		ASSERT0(ba.ba_phys->bt_comp);
 		ASSERT0(ba.ba_phys->bt_uncomp);
```