diff options
author | George Amanakis <[email protected]> | 2023-03-29 01:51:58 +0200 |
---|---|---|
committer | GitHub <[email protected]> | 2023-03-28 16:51:58 -0700 |
commit | 431083f75bdd3efaee992bdd672625ec7240d252 (patch) | |
tree | 9795844e36c83621d347ab2d930676b32ab7022e /module/zfs/spa_errlog.c | |
parent | 65d10bd87c408bfa13fa27bb6ad3ecc0e2e3775b (diff) |
Fixes in persistent error log
Address the following bugs in persistent error log:
1) Check nested clones, eg "fs->snap->clone->snap2->clone2".
2) When deleting files containing error blocks in those clones (from
"clone" the example above), do not break the check chain.
3) When deleting files in the originating fs before syncing the errlog
to disk, do not break the check chain. This happens because at the
time of introducing the error block in the error list, we do not have
its birth txg and the head filesystem. If the original file is
deleted before the error list is synced to the error log (which is
when we actually lookup the birth txg and the head filesystem), then
we do not have access to this info anymore and break the check chain.
The most prominent change is related to achieving (3). We expand the
spa_error_entry_t structure to accommodate the newly introduced
zbookmark_err_phys_t structure (containing the birth txg of the error
block).Due to compatibility reasons we cannot remove the
zbookmark_phys_t structure and we also need to place the new structure
after se_avl, so it is not accounted for in avl_find(). Then we modify
spa_log_error() to also provide the birth txg of the error block. With
these changes in place we simplify the previously introduced function
get_head_and_birth_txg() (now named get_head_ds()).
We chose not to follow the same approach for the head filesystem (thus
completely removing get_head_ds()) to avoid introducing new lock
contentions.
The stack sizes of nested functions (as measured by checkstack.pl in the
linux kernel) are:
check_filesystem [zfs]: 272 (was 912)
check_clones [zfs]: 64
We also introduced two new tests covering the above changes.
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: George Amanakis <[email protected]>
Closes #14633
Diffstat (limited to 'module/zfs/spa_errlog.c')
-rw-r--r-- | module/zfs/spa_errlog.c | 340 |
1 files changed, 185 insertions, 155 deletions
diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index c6d97eed2..41cb9d012 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -135,6 +135,10 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) } #ifdef _KERNEL +static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, + uint64_t *snap_obj_array, zbookmark_err_phys_t *zep, void* uaddr, + uint64_t *count); + static void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) { @@ -152,74 +156,22 @@ name_to_object(char *buf, uint64_t *obj) ASSERT(*buf == '\0'); } -static int -get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj, - uint64_t *head_dataset_id) +/* + * Retrieve the head filesystem. + */ +static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds) { - dsl_pool_t *dp = spa->spa_dsl_pool; dsl_dataset_t *ds; - objset_t *os; - - int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds); - if (error != 0) { - return (error); - } - ASSERT(head_dataset_id); - *head_dataset_id = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; - - error = dmu_objset_from_ds(ds, &os); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - /* - * If the key is not loaded dbuf_dnode_findbp() will error out with - * EACCES. However in that case dnode_hold() will eventually call - * dbuf_read()->zio_wait() which may call spa_log_error(). This will - * lead to a deadlock due to us holding the mutex spa_errlist_lock. - * Avoid this by checking here if the keys are loaded, if not return. - * If the keys are not loaded the head_errlog feature is meaningless - * as we cannot figure out the birth txg of the block pointer. - */ - if (dsl_dataset_get_keystatus(ds->ds_dir) == - ZFS_KEYSTATUS_UNAVAILABLE) { - zep->zb_birth = 0; - dsl_dataset_rele(ds, FTAG); - return (0); - } + int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, + dsobj, FTAG, &ds); - dnode_t *dn; - blkptr_t bp; - - error = dnode_hold(os, zep->zb_object, FTAG, &dn); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); + if (error != 0) return (error); - } - rw_enter(&dn->dn_struct_rwlock, RW_READER); - error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, - NULL); - if (error == 0 && BP_IS_HOLE(&bp)) - error = SET_ERROR(ENOENT); - - /* - * If the key is loaded but the encrypted filesystem is unmounted when - * a scrub is run, then dbuf_dnode_findbp() will still error out with - * EACCES (possibly due to the key mapping being removed upon - * unmounting). In that case the head_errlog feature is also - * meaningless as we cannot figure out the birth txg of the block - * pointer. - */ - if (error == EACCES) - error = 0; - else if (!error) - zep->zb_birth = bp.blk_birth; - - rw_exit(&dn->dn_struct_rwlock); - dnode_rele(dn, FTAG); + ASSERT(head_ds); + *head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; dsl_dataset_rele(ds, FTAG); + return (error); } @@ -229,7 +181,7 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj, * during spa_errlog_sync(). */ void -spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) +spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth) { spa_error_entry_t search; spa_error_entry_t *new; @@ -262,8 +214,26 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); new->se_bookmark = *zb; - avl_insert(tree, new, where); + /* + * If the head_errlog feature is enabled, store the birth txg now. In + * case the file is deleted before spa_errlog_sync() runs, we will not + * be able to retrieve the birth txg. + */ + if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + new->se_zep.zb_object = zb->zb_object; + new->se_zep.zb_level = zb->zb_level; + new->se_zep.zb_blkid = zb->zb_blkid; + + /* + * birth may end up being NULL, e.g. in zio_done(). We + * will handle this in process_error_block(). + */ + if (birth != NULL) + new->se_zep.zb_birth = *birth; + } + + avl_insert(tree, new, where); mutex_exit(&spa->spa_errlist_lock); } @@ -336,20 +306,28 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, error = find_birth_txg(ds, zep, &latest_txg); /* - * If we cannot figure out the current birth txg of the block pointer - * error out. If the filesystem is encrypted and the key is not loaded + * If the filesystem is encrypted and the key is not loaded * or the encrypted filesystem is not mounted the error will be EACCES. - * In that case do not return an error. + * In that case report an error in the head filesystem and return. */ if (error == EACCES) { dsl_dataset_rele(ds, FTAG); + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + error = copyout_entry(&zb, uaddr, count); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } return (0); } - if (error) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - if (zep->zb_birth == latest_txg) { + + /* + * If find_birth_txg() errors out otherwise, let txg_to_consider be + * equal to the spa's syncing txg: if check_filesystem() errors out + * then affected snapshots or clones will not be checked. + */ + if (error == 0 && zep->zb_birth == latest_txg) { /* Block neither free nor rewritten. */ zbookmark_phys_t zb; zep_to_zb(head_ds, zep, &zb); @@ -359,44 +337,55 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, return (error); } check_snapshot = B_FALSE; - } else { - ASSERT3U(zep->zb_birth, <, latest_txg); + } else if (error == 0) { txg_to_consider = latest_txg; } - /* How many snapshots reference this block. */ - uint64_t snap_count; - error = zap_count(spa->spa_meta_objset, - dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } + /* + * Retrieve the number of snapshots if the dataset is not a snapshot. + */ + uint64_t snap_count = 0; + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { - if (snap_count == 0) { - /* File system has no snapshot. */ - dsl_dataset_rele(ds, FTAG); - return (0); + error = zap_count(spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (snap_count == 0) { + /* Filesystem without snapshots. */ + dsl_dataset_rele(ds, FTAG); + return (0); + } } - uint64_t *snap_obj_array = kmem_alloc(snap_count * sizeof (uint64_t), + uint64_t *snap_obj_array = kmem_zalloc(snap_count * sizeof (uint64_t), KM_SLEEP); int aff_snap_count = 0; uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + uint64_t zap_clone = dsl_dir_phys(ds->ds_dir)->dd_clones; + + dsl_dataset_rele(ds, FTAG); /* Check only snapshots created from this file system. */ while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && snap_obj_txg <= txg_to_consider) { - dsl_dataset_rele(ds, FTAG); error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); if (error != 0) goto out; - if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) - break; + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) { + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsl_dataset_rele(ds, FTAG); + continue; + } boolean_t affected = B_TRUE; if (check_snapshot) { @@ -405,6 +394,7 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, affected = (error == 0 && zep->zb_birth == blk_txg); } + /* Report errors in snapshots. */ if (affected) { snap_obj_array[aff_snap_count] = snap_obj; aff_snap_count++; @@ -416,37 +406,77 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, dsl_dataset_rele(ds, FTAG); goto out; } - - /* - * Only clones whose origins were affected could also - * have affected snapshots. - */ - zap_cursor_t zc; - zap_attribute_t za; - for (zap_cursor_init(&zc, spa->spa_meta_objset, - dsl_dataset_phys(ds)->ds_next_clones_obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - error = check_filesystem(spa, - za.za_first_integer, zep, uaddr, count); - - if (error != 0) { - zap_cursor_fini(&zc); - goto out; - } - } - zap_cursor_fini(&zc); } - snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsl_dataset_rele(ds, FTAG); + } + + if (zap_clone != 0 && aff_snap_count > 0) { + error = check_clones(spa, zap_clone, snap_count, snap_obj_array, + zep, uaddr, count); } - dsl_dataset_rele(ds, FTAG); out: kmem_free(snap_obj_array, sizeof (*snap_obj_array)); return (error); } +/* + * Clone checking. + */ +static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, + uint64_t *snap_obj_array, zbookmark_err_phys_t *zep, void* uaddr, + uint64_t *count) +{ + int error = 0; + zap_cursor_t *zc; + zap_attribute_t *za; + + zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + for (zap_cursor_init(zc, spa->spa_meta_objset, zap_clone); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_dataset_t *clone; + error = dsl_dataset_hold_obj(dp, za->za_first_integer, + FTAG, &clone); + + if (error != 0) + break; + + /* + * Only clones whose origins were affected could also + * have affected snapshots. + */ + boolean_t found = B_FALSE; + for (int i = 0; i < snap_count; i++) { + if (dsl_dir_phys(clone->ds_dir)->dd_origin_obj + == snap_obj_array[i]) + found = B_TRUE; + } + dsl_dataset_rele(clone, FTAG); + + if (!found) + continue; + + error = check_filesystem(spa, za->za_first_integer, zep, + uaddr, count); + + if (error != 0) + break; + } + + kmem_free(za, sizeof (*za)); + kmem_free(zc, sizeof (*zc)); + zap_cursor_fini(zc); + + return (error); +} + static int find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, uint64_t *top_affected_fs) @@ -474,12 +504,13 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, void *uaddr, uint64_t *count) { /* - * If the zb_birth is 0 it means we failed to retrieve the birth txg - * of the block pointer. This happens when an encrypted filesystem is - * not mounted or when the key is not loaded. Do not proceed to + * If zb_birth == 0 or head_ds == 0 it means we failed to retrieve the + * birth txg or the head filesystem of the block pointer. This may + * happen e.g. when an encrypted filesystem is not mounted or when + * the key is not loaded. In this case do not proceed to * check_filesystem(), instead do the accounting here. */ - if (zep->zb_birth == 0) { + if (zep->zb_birth == 0 || head_ds == 0) { zbookmark_phys_t zb; zep_to_zb(head_ds, zep, &zb); int error = copyout_entry(&zb, uaddr, count); @@ -697,11 +728,10 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, zep.zb_birth = 0; /* - * We cannot use get_head_and_birth_txg() because it will - * acquire the pool config lock, which we already have. In case - * of an error we simply continue. + * In case of an error we should simply continue instead of + * returning prematurely. See the next comment. */ - uint64_t head_dataset_obj; + uint64_t head_ds; dsl_pool_t *dp = spa->spa_dsl_pool; dsl_dataset_t *ds; objset_t *os; @@ -710,8 +740,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, if (error != 0) continue; - head_dataset_obj = - dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; /* * The objset and the dnode are required for getting the block @@ -751,14 +780,14 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, uint64_t err_obj; error = zap_lookup_int_key(spa->spa_meta_objset, *newobj, - head_dataset_obj, &err_obj); + head_ds, &err_obj); if (error == ENOENT) { err_obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); (void) zap_update_int_key(spa->spa_meta_objset, - *newobj, head_dataset_obj, err_obj, tx); + *newobj, head_ds, err_obj, tx); } char buf[64]; @@ -875,20 +904,21 @@ process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count) } for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { - zbookmark_err_phys_t zep; - zep.zb_object = se->se_bookmark.zb_object; - zep.zb_level = se->se_bookmark.zb_level; - zep.zb_blkid = se->se_bookmark.zb_blkid; - zep.zb_birth = 0; + uint64_t head_ds = 0; + int error = get_head_ds(spa, se->se_bookmark.zb_objset, + &head_ds); - uint64_t head_ds_obj; - int error = get_head_and_birth_txg(spa, &zep, - se->se_bookmark.zb_objset, &head_ds_obj); + /* + * If get_head_ds() errors out, set the head filesystem + * to the filesystem stored in the bookmark of the + * error block. + */ + if (error != 0) + head_ds = se->se_bookmark.zb_objset; - if (!error) - error = process_error_block(spa, head_ds_obj, &zep, - uaddr, count); - if (error) + error = process_error_block(spa, head_ds, + &se->se_zep, uaddr, count); + if (error != 0) return (error); } return (0); @@ -914,8 +944,9 @@ spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count) #ifdef _KERNEL /* * The pool config lock is needed to hold a dataset_t via (among other - * places) process_error_list() -> get_head_and_birth_txg(), and lock - * ordering requires that we get it before the spa_errlog_lock. + * places) process_error_list() -> process_error_block()-> + * find_top_affected_fs(), and lock ordering requires that we get it + * before the spa_errlog_lock. */ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); mutex_enter(&spa->spa_errlog_lock); @@ -1011,34 +1042,33 @@ sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) } else { for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { zbookmark_err_phys_t zep; - zep.zb_object = se->se_bookmark.zb_object; - zep.zb_level = se->se_bookmark.zb_level; - zep.zb_blkid = se->se_bookmark.zb_blkid; - zep.zb_birth = 0; + zep.zb_object = se->se_zep.zb_object; + zep.zb_level = se->se_zep.zb_level; + zep.zb_blkid = se->se_zep.zb_blkid; + zep.zb_birth = se->se_zep.zb_birth; + + uint64_t head_ds = 0; + int error = get_head_ds(spa, se->se_bookmark.zb_objset, + &head_ds); /* - * If we cannot find out the head dataset and birth txg - * of the present error block, we simply continue. - * Reinserting that error block to the error lists, - * even if we are not syncing the final txg, results - * in duplicate posting of errors. + * If get_head_ds() errors out, set the head filesystem + * to the filesystem stored in the bookmark of the + * error block. */ - uint64_t head_dataset_obj; - int error = get_head_and_birth_txg(spa, &zep, - se->se_bookmark.zb_objset, &head_dataset_obj); - if (error) - continue; + if (error != 0) + head_ds = se->se_bookmark.zb_objset; uint64_t err_obj; error = zap_lookup_int_key(spa->spa_meta_objset, - *obj, head_dataset_obj, &err_obj); + *obj, head_ds, &err_obj); if (error == ENOENT) { err_obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); (void) zap_update_int_key(spa->spa_meta_objset, - *obj, head_dataset_obj, err_obj, tx); + *obj, head_ds, err_obj, tx); } errphys_to_name(&zep, buf, sizeof (buf)); @@ -1108,7 +1138,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) /* * The pool config lock is needed to hold a dataset_t via - * sync_error_list() -> get_head_and_birth_txg(), and lock ordering + * sync_error_list() -> get_head_ds(), and lock ordering * requires that we get it before the spa_errlog_lock. */ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); |