diff options
author | George Amanakis <[email protected]> | 2021-12-17 21:35:28 +0100 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2023-05-18 11:59:42 -0700 |
commit | 482eeef804f0f325faddb102f112c0f1ec86a1b6 (patch) | |
tree | f5b052e7fed06ad527285841ec6de2d7503d39ed /module/zfs | |
parent | e34e15ed6d1882d29e314321b7642305d99f1b78 (diff) |
Teach zpool scrub to scrub only blocks in error log
Added a flag '-e' in zpool scrub to scrub only blocks in error log. A
user can pause, resume and cancel the error scrub by passing additional
command line arguments -p -s just like a regular scrub. This involves
adding a new flag, creating new libzfs interfaces, a new ioctl, and the
actual iteration and read-issuing logic. Error scrubbing is executed in
multiple txg to make sure pool performance is not affected.
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Co-authored-by: TulsiJain [email protected]
Signed-off-by: George Amanakis <[email protected]>
Closes #8995
Closes #12355
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/dsl_scan.c | 696 | ||||
-rw-r--r-- | module/zfs/spa.c | 6 | ||||
-rw-r--r-- | module/zfs/spa_errlog.c | 82 | ||||
-rw-r--r-- | module/zfs/spa_misc.c | 25 | ||||
-rw-r--r-- | module/zfs/zfs_ioctl.c | 46 |
5 files changed, 820 insertions, 35 deletions
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index d398b6705..5e3559b25 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -54,6 +54,7 @@ #include <sys/zfeature.h> #include <sys/abd.h> #include <sys/range_tree.h> +#include <sys/dbuf.h> #ifdef _KERNEL #include <sys/zfs_vfsops.h> #endif @@ -129,6 +130,7 @@ static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); static uint64_t dsl_scan_count_data_disks(spa_t *spa); +static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb); extern uint_t zfs_vdev_async_write_active_min_dirty_percent; static int zfs_scan_blkstats = 0; @@ -231,6 +233,9 @@ static int zfs_resilver_disable_defer = B_FALSE; */ static int zfs_free_bpobj_enabled = 1; +/* Error blocks to be scrubbed in one txg. */ +unsigned long zfs_scrub_error_blocks_per_txg = 1 << 12; + /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, @@ -512,8 +517,16 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) &scn->scn_phys.scn_queue_obj); } else { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRORSCRUB, sizeof (uint64_t), + ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys); + + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys); + /* * Detect if the pool contains the signature of #2094. If it * does properly update the scn->scn_phys structure and notify @@ -664,12 +677,90 @@ dsl_scan_scrubbing(const dsl_pool_t *dp) } boolean_t +dsl_errorscrubbing(const dsl_pool_t *dp) +{ + dsl_errorscrub_phys_t *errorscrub_phys = &dp->dp_scan->errorscrub_phys; + + return (errorscrub_phys->dep_state == DSS_ERRORSCRUBBING && + errorscrub_phys->dep_func == POOL_SCAN_ERRORSCRUB); +} + +boolean_t +dsl_errorscrub_is_paused(const dsl_scan_t *scn) +{ + return (dsl_errorscrubbing(scn->scn_dp) && + scn->errorscrub_phys.dep_paused_flags); +} + +boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn) { return (dsl_scan_scrubbing(scn->scn_dp) && scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); } +static void +dsl_errorscrub_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) +{ + scn->errorscrub_phys.dep_cursor = + zap_cursor_serialize(&scn->errorscrub_cursor); + + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRORSCRUB, sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS, + &scn->errorscrub_phys, tx)); +} + +static void +dsl_errorscrub_setup_sync(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + pool_scan_func_t *funcp = arg; + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + ASSERT(!dsl_scan_is_running(scn)); + ASSERT(!dsl_errorscrubbing(scn->scn_dp)); + ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + + memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); + scn->errorscrub_phys.dep_func = *funcp; + scn->errorscrub_phys.dep_state = DSS_ERRORSCRUBBING; + scn->errorscrub_phys.dep_start_time = gethrestime_sec(); + scn->errorscrub_phys.dep_to_examine = spa_get_last_errlog_size(spa); + scn->errorscrub_phys.dep_examined = 0; + scn->errorscrub_phys.dep_errors = 0; + scn->errorscrub_phys.dep_cursor = 0; + zap_cursor_init_serialized(&scn->errorscrub_cursor, + spa->spa_meta_objset, spa->spa_errlog_last, + scn->errorscrub_phys.dep_cursor); + + vdev_config_dirty(spa->spa_root_vdev); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_START); + + dsl_errorscrub_sync_state(scn, tx); + + spa_history_log_internal(spa, "error scrub setup", tx, + "func=%u mintxg=%u maxtxg=%llu", + *funcp, 0, (u_longlong_t)tx->tx_txg); +} + +static int +dsl_errorscrub_setup_check(void *arg, dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + if (dsl_scan_is_running(scn) || (dsl_errorscrubbing(scn->scn_dp))) { + return (SET_ERROR(EBUSY)); + } + + if (spa_get_last_errlog_size(scn->scn_dp->dp_spa) == 0) { + return (ECANCELED); + } + return (0); +} + /* * Writes out a persistent dsl_scan_phys_t record to the pool directory. * Because we can be running in the block sorting algorithm, we do not always @@ -745,7 +836,8 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx) dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd)) + if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd) || + dsl_errorscrubbing(scn->scn_dp)) return (SET_ERROR(EBUSY)); return (0); @@ -754,6 +846,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx) void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { + (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; pool_scan_func_t *funcp = arg; dmu_object_type_t ot = 0; @@ -763,6 +856,14 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) ASSERT(!dsl_scan_is_running(scn)); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); memset(&scn->scn_phys, 0, sizeof (scn->scn_phys)); + + /* + * If we are starting a fresh scrub, we erase the error scrub + * information from disk. + */ + memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); + dsl_errorscrub_sync_state(scn, tx); + scn->scn_phys.scn_func = *funcp; scn->scn_phys.scn_state = DSS_SCANNING; scn->scn_phys.scn_min_txg = 0; @@ -856,8 +957,9 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) } /* - * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. - * Can also be called to resume a paused scrub. + * Called by ZFS_IOC_POOL_SCRUB and ZFS_IOC_POOL_SCAN ioctl to start a scrub, + * error scrub or resilver. Can also be called to resume a paused scrub or + * error scrub. */ int dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) @@ -883,6 +985,26 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) return (0); } + if (func == POOL_SCAN_ERRORSCRUB) { + if (dsl_errorscrub_is_paused(dp->dp_scan)) { + /* + * got error scrub start cmd, resume paused error scrub. + */ + int err = dsl_scrub_set_pause_resume(scn->scn_dp, + POOL_SCRUB_NORMAL); + if (err == 0) { + spa_event_notify(spa, NULL, NULL, + ESC_ZFS_ERRORSCRUB_RESUME); + return (ECANCELED); + } + return (SET_ERROR(err)); + } + + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_setup_check, dsl_errorscrub_setup_sync, + &func, 0, ZFS_SPACE_CHECK_RESERVED)); + } + if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { /* got scrub start cmd, resume paused scrub */ int err = dsl_scrub_set_pause_resume(scn->scn_dp, @@ -891,7 +1013,6 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); return (SET_ERROR(ECANCELED)); } - return (SET_ERROR(err)); } @@ -900,6 +1021,33 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) } static void +dsl_errorscrub_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + if (complete) { + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_FINISH); + spa_history_log_internal(spa, "error scrub done", tx, + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); + } else { + spa_history_log_internal(spa, "error scrub canceled", tx, + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); + } + + scn->errorscrub_phys.dep_state = complete ? DSS_FINISHED : DSS_CANCELED; + spa->spa_scrub_active = B_FALSE; + spa_errlog_rotate(spa); + scn->errorscrub_phys.dep_end_time = gethrestime_sec(); + zap_cursor_fini(&scn->errorscrub_cursor); + + if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) + spa->spa_errata = 0; + + ASSERT(!dsl_errorscrubbing(scn->scn_dp)); +} + +static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { static const char *old_names[] = { @@ -1046,6 +1194,92 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) } static int +dsl_errorscrub_pause_resume_check(void *arg, dmu_tx_t *tx) +{ + pool_scrub_cmd_t *cmd = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_scan_t *scn = dp->dp_scan; + + if (*cmd == POOL_SCRUB_PAUSE) { + /* + * can't pause a error scrub when there is no in-progress + * error scrub. + */ + if (!dsl_errorscrubbing(dp)) + return (SET_ERROR(ENOENT)); + + /* can't pause a paused error scrub */ + if (dsl_errorscrub_is_paused(scn)) + return (SET_ERROR(EBUSY)); + } else if (*cmd != POOL_SCRUB_NORMAL) { + return (SET_ERROR(ENOTSUP)); + } + + return (0); +} + +static void +dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx) +{ + pool_scrub_cmd_t *cmd = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + if (*cmd == POOL_SCRUB_PAUSE) { + spa->spa_scan_pass_errorscrub_pause = gethrestime_sec(); + scn->errorscrub_phys.dep_paused_flags = B_TRUE; + dsl_errorscrub_sync_state(scn, tx); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED); + } else { + ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); + if (dsl_errorscrub_is_paused(scn)) { + /* + * We need to keep track of how much time we spend + * paused per pass so that we can adjust the error scrub + * rate shown in the output of 'zpool status'. + */ + spa->spa_scan_pass_errorscrub_spent_paused += + gethrestime_sec() - + spa->spa_scan_pass_errorscrub_pause; + + spa->spa_scan_pass_errorscrub_pause = 0; + scn->errorscrub_phys.dep_paused_flags = B_FALSE; + + zap_cursor_init_serialized( + &scn->errorscrub_cursor, + spa->spa_meta_objset, spa->spa_errlog_last, + scn->errorscrub_phys.dep_cursor); + + dsl_errorscrub_sync_state(scn, tx); + } + } +} + +static int +dsl_errorscrub_cancel_check(void *arg, dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + /* can't cancel a error scrub when there is no one in-progress */ + if (!dsl_errorscrubbing(scn->scn_dp)) + return (SET_ERROR(ENOENT)); + return (0); +} + +static void +dsl_errorscrub_cancel_sync(void *arg, dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + dsl_errorscrub_done(scn, B_FALSE, tx); + dsl_errorscrub_sync_state(scn, tx); + spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, + ESC_ZFS_ERRORSCRUB_ABORT); +} + +static int dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { (void) arg; @@ -1070,6 +1304,11 @@ dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) int dsl_scan_cancel(dsl_pool_t *dp) { + if (dsl_errorscrubbing(dp)) { + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_cancel_check, dsl_errorscrub_cancel_sync, + NULL, 3, ZFS_SPACE_CHECK_RESERVED)); + } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } @@ -1136,6 +1375,12 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) int dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) { + if (dsl_errorscrubbing(dp)) { + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_pause_resume_check, + dsl_errorscrub_pause_resume_sync, &cmd, 3, + ZFS_SPACE_CHECK_RESERVED)); + } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, ZFS_SPACE_CHECK_RESERVED)); @@ -1422,6 +1667,42 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) return (B_FALSE); } +static boolean_t +dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) +{ + /* + * We suspend if: + * - we have scrubbed for at least the minimum time (default 1 sec + * for error scrub), someone is explicitly waiting for this txg + * to complete, or we have used up all of the time in the txg + * timeout (default 5 sec). + * or + * - the spa is shutting down because this pool is being exported + * or the machine is rebooting. + */ + uint64_t curr_time_ns = gethrtime(); + uint64_t error_scrub_time_ns = curr_time_ns - scn->scn_sync_start_time; + uint64_t sync_time_ns = curr_time_ns - + scn->scn_dp->dp_spa->spa_sync_starttime; + int mintime = zfs_scrub_min_time_ms; + + if ((NSEC2MSEC(error_scrub_time_ns) > mintime && + (txg_sync_waiting(scn->scn_dp) || + NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || + spa_shutting_down(scn->scn_dp->dp_spa)) { + if (zb) { + dprintf("error scrub suspending at bookmark " + "%llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + } + return (B_TRUE); + } + return (B_FALSE); +} + typedef struct zil_scan_arg { dsl_pool_t *zsa_dp; zil_header_t *zsa_zh; @@ -3352,6 +3633,19 @@ dsl_scan_active(dsl_scan_t *scn) return ((used != 0) || (clones_left)); } +boolean_t +dsl_errorscrub_active(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + if (spa->spa_load_state != SPA_LOAD_NONE) + return (B_FALSE); + if (spa_shutting_down(spa)) + return (B_FALSE); + if (dsl_errorscrubbing(scn->scn_dp)) + return (B_TRUE); + return (B_FALSE); +} + static boolean_t dsl_scan_check_deferred(vdev_t *vd) { @@ -3568,6 +3862,387 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) return (0); } +static void +name_to_bookmark(char *buf, zbookmark_phys_t *zb) +{ + zb->zb_objset = zfs_strtonum(buf, &buf); + ASSERT(*buf == ':'); + zb->zb_object = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_level = (int)zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_blkid = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == '\0'); +} + +static void +name_to_object(char *buf, uint64_t *obj) +{ + *obj = zfs_strtonum(buf, &buf); + ASSERT(*buf == '\0'); +} + +static void +read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb) +{ + dsl_pool_t *dp = scn->scn_dp; + dsl_dataset_t *ds; + objset_t *os; + if (dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds) != 0) + return; + + if (dmu_objset_from_ds(ds, &os) != 0) { + dsl_dataset_rele(ds, FTAG); + return; + } + + /* + * If the key is not loaded dbuf_dnode_findbp() will error out with + * EACCES. However in that case dnode_hold() will eventually call + * dbuf_read()->zio_wait() which may call spa_log_error(). This will + * lead to a deadlock due to us holding the mutex spa_errlist_lock. + * Avoid this by checking here if the keys are loaded, if not return. + * If the keys are not loaded the head_errlog feature is meaningless + * as we cannot figure out the birth txg of the block pointer. + */ + if (dsl_dataset_get_keystatus(ds->ds_dir) == + ZFS_KEYSTATUS_UNAVAILABLE) { + dsl_dataset_rele(ds, FTAG); + return; + } + + dnode_t *dn; + blkptr_t bp; + + if (dnode_hold(os, zb.zb_object, FTAG, &dn) != 0) { + dsl_dataset_rele(ds, FTAG); + return; + } + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + int error = dbuf_dnode_findbp(dn, zb.zb_level, zb.zb_blkid, &bp, NULL, + NULL); + + if (error) { + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + return; + } + + if (!error && BP_IS_HOLE(&bp)) { + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + return; + } + + int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | + ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB; + + /* If it's an intent log block, failure is expected. */ + if (zb.zb_level == ZB_ZIL_LEVEL) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + ASSERT(!BP_IS_EMBEDDED(&bp)); + scan_exec_io(dp, &bp, zio_flags, &zb, NULL); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); +} + +/* + * We keep track of the scrubbed error blocks in "count". This will be used + * when deciding whether we exceeded zfs_scrub_error_blocks_per_txg. This + * function is modelled after check_filesystem(). + */ +static int +scrub_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep, + int *count) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + int error = dsl_dataset_hold_obj(dp, fs, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t latest_txg; + uint64_t txg_to_consider = spa->spa_syncing_txg; + boolean_t check_snapshot = B_TRUE; + + error = find_birth_txg(ds, zep, &latest_txg); + + /* + * If find_birth_txg() errors out, then err on the side of caution and + * proceed. In worst case scenario scrub all objects. If zep->zb_birth + * is 0 (e.g. in case of encryption with unloaded keys) also proceed to + * scrub all objects. + */ + if (error == 0 && zep->zb_birth == latest_txg) { + /* Block neither free nor re written. */ + zbookmark_phys_t zb; + zep_to_zb(fs, zep, &zb); + scn->scn_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + /* We have already acquired the config lock for spa */ + read_by_block_level(scn, zb); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined++; + scn->errorscrub_phys.dep_to_examine--; + (*count)++; + if ((*count) == zfs_scrub_error_blocks_per_txg || + dsl_error_scrub_check_suspend(scn, &zb)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EFAULT)); + } + + check_snapshot = B_FALSE; + } else if (error == 0) { + txg_to_consider = latest_txg; + } + + /* + * Retrieve the number of snapshots if the dataset is not a snapshot. + */ + uint64_t snap_count = 0; + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { + + error = zap_count(spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + } + + if (snap_count == 0) { + /* Filesystem without snapshots. */ + dsl_dataset_rele(ds, FTAG); + return (0); + } + + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + dsl_dataset_rele(ds, FTAG); + + /* Check only snapshots created from this file system. */ + while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && + snap_obj_txg <= txg_to_consider) { + + error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); + if (error != 0) + return (error); + + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) { + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsl_dataset_rele(ds, FTAG); + continue; + } + + boolean_t affected = B_TRUE; + if (check_snapshot) { + uint64_t blk_txg; + error = find_birth_txg(ds, zep, &blk_txg); + + /* + * Scrub the snapshot also when zb_birth == 0 or when + * find_birth_txg() returns an error. + */ + affected = (error == 0 && zep->zb_birth == blk_txg) || + (error != 0) || (zep->zb_birth == 0); + } + + /* Scrub snapshots. */ + if (affected) { + zbookmark_phys_t zb; + zep_to_zb(snap_obj, zep, &zb); + scn->scn_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + /* We have already acquired the config lock for spa */ + read_by_block_level(scn, zb); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined++; + scn->errorscrub_phys.dep_to_examine--; + (*count)++; + if ((*count) == zfs_scrub_error_blocks_per_txg || + dsl_error_scrub_check_suspend(scn, &zb)) { + dsl_dataset_rele(ds, FTAG); + return (EFAULT); + } + } + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsl_dataset_rele(ds, FTAG); + } + return (0); +} + +void +dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + /* + * Only process scans in sync pass 1. + */ + + if (spa_sync_pass(spa) > 1) + return; + + /* + * If the spa is shutting down, then stop scanning. This will + * ensure that the scan does not dirty any new data during the + * shutdown phase. + */ + if (spa_shutting_down(spa)) + return; + + if (!dsl_errorscrub_active(scn) || dsl_errorscrub_is_paused(scn)) { + return; + } + + if (dsl_scan_resilvering(scn->scn_dp)) { + /* cancel the error scrub if resilver started */ + dsl_scan_cancel(scn->scn_dp); + return; + } + + spa->spa_scrub_active = B_TRUE; + scn->scn_sync_start_time = gethrtime(); + + /* + * zfs_scan_suspend_progress can be set to disable scrub progress. + * See more detailed comment in dsl_scan_sync(). + */ + if (zfs_scan_suspend_progress) { + uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time; + int mintime = zfs_scrub_min_time_ms; + + while (zfs_scan_suspend_progress && + !txg_sync_waiting(scn->scn_dp) && + !spa_shutting_down(scn->scn_dp->dp_spa) && + NSEC2MSEC(scan_time_ns) < mintime) { + delay(hz); + scan_time_ns = gethrtime() - scn->scn_sync_start_time; + } + return; + } + + int i = 0; + zap_attribute_t *za; + zbookmark_phys_t *zb; + boolean_t limit_exceeded = B_FALSE; + + za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + zb = kmem_zalloc(sizeof (zbookmark_phys_t), KM_SLEEP); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0; + zap_cursor_advance(&scn->errorscrub_cursor)) { + name_to_bookmark(za->za_name, zb); + + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + dsl_pool_config_enter(dp, FTAG); + read_by_block_level(scn, *zb); + dsl_pool_config_exit(dp, FTAG); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined += 1; + scn->errorscrub_phys.dep_to_examine -= 1; + i++; + if (i == zfs_scrub_error_blocks_per_txg || + dsl_error_scrub_check_suspend(scn, zb)) { + limit_exceeded = B_TRUE; + break; + } + } + + if (!limit_exceeded) + dsl_errorscrub_done(scn, B_TRUE, tx); + + dsl_errorscrub_sync_state(scn, tx); + kmem_free(za, sizeof (*za)); + kmem_free(zb, sizeof (*zb)); + return; + } + + int error = 0; + for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0; + zap_cursor_advance(&scn->errorscrub_cursor)) { + + zap_cursor_t *head_ds_cursor; + zap_attribute_t *head_ds_attr; + zbookmark_err_phys_t head_ds_block; + + head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + uint64_t head_ds_err_obj = za->za_first_integer; + uint64_t head_ds; + name_to_object(za->za_name, &head_ds); + boolean_t config_held = B_FALSE; + uint64_t top_affected_fs; + + for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset, + head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor, + head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) { + + name_to_errphys(head_ds_attr->za_name, &head_ds_block); + + /* + * In case we are called from spa_sync the pool + * config is already held. + */ + if (!dsl_pool_config_held(dp)) { + dsl_pool_config_enter(dp, FTAG); + config_held = B_TRUE; + } + + error = find_top_affected_fs(spa, + head_ds, &head_ds_block, &top_affected_fs); + if (error) + break; + + error = scrub_filesystem(spa, top_affected_fs, + &head_ds_block, &i); + + if (error == SET_ERROR(EFAULT)) { + limit_exceeded = B_TRUE; + break; + } + } + + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); + + if (config_held) + dsl_pool_config_exit(dp, FTAG); + } + + kmem_free(za, sizeof (*za)); + kmem_free(zb, sizeof (*zb)); + if (!limit_exceeded) + dsl_errorscrub_done(scn, B_TRUE, tx); + + dsl_errorscrub_sync_state(scn, tx); +} + /* * This is the primary entry point for scans that is called from syncing * context. Scans must happen entirely during syncing context so that we @@ -4109,7 +4784,14 @@ dsl_scan_scrub_done(zio_t *zio) if (zio->io_error && (zio->io_error != ECKSUM || !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { - atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors); + if (dsl_errorscrubbing(spa->spa_dsl_pool) && + !dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) { + atomic_inc_64(&spa->spa_dsl_pool->dp_scan + ->errorscrub_phys.dep_errors); + } else { + atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys + .scn_errors); + } } } @@ -4559,3 +5241,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW, "Process all resilvers immediately"); + +ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, U64, ZMOD_RW, + "Error blocks to be scrubbed in one txg"); +/* END CSTYLED */ diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 51d6de910..1fc2c5e8c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -8173,6 +8173,7 @@ spa_scan_stop(spa_t *spa) ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); + return (dsl_scan_cancel(spa->spa_dsl_pool)); } @@ -8198,6 +8199,10 @@ spa_scan(spa_t *spa, pool_scan_func_t func) return (0); } + if (func == POOL_SCAN_ERRORSCRUB && + !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) + return (SET_ERROR(ENOTSUP)); + return (dsl_scan(spa->spa_dsl_pool, func)); } @@ -9249,6 +9254,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) brt_sync(spa, txg); ddt_sync(spa, txg); dsl_scan_sync(dp, tx); + dsl_errorscrub_sync(dp, tx); svr_sync(spa, tx); spa_sync_upgrades(spa, tx); diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 5fe352786..2e5c22c11 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -110,7 +110,7 @@ errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len) /* * Convert a string to a err_phys. */ -static void +void name_to_errphys(char *buf, zbookmark_err_phys_t *zep) { zep->zb_object = zfs_strtonum(buf, &buf); @@ -139,8 +139,7 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) ASSERT(*buf == '\0'); } -#ifdef _KERNEL -static void +void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) { zb->zb_objset = dataset; @@ -148,7 +147,6 @@ zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) zb->zb_level = zep->zb_level; zb->zb_blkid = zep->zb_blkid; } -#endif static void name_to_object(char *buf, uint64_t *obj) @@ -238,8 +236,7 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth) mutex_exit(&spa->spa_errlist_lock); } -#ifdef _KERNEL -static int +int find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, uint64_t *birth_txg) { @@ -268,6 +265,34 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, } /* + * This function finds the oldest affected filesystem containing an error + * block. + */ +int +find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + uint64_t *top_affected_fs) +{ + uint64_t oldest_dsobj; + int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth, + &oldest_dsobj); + if (error != 0) + return (error); + + dsl_dataset_t *ds; + error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (error != 0) + return (error); + + *top_affected_fs = + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + return (0); +} + + +#ifdef _KERNEL +/* * Copy the bookmark to the end of the user-space buffer which starts at * uaddr and has *count unused entries, and decrement *count by 1. */ @@ -288,7 +313,8 @@ copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count) * Each time the error block is referenced by a snapshot or clone, add a * zbookmark_phys_t entry to the userspace array at uaddr. The array is * filled from the back and the in-out parameter *count is modified to be the - * number of unused entries at the beginning of the array. + * number of unused entries at the beginning of the array. The function + * scrub_filesystem() is modelled after this one. */ static int check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, @@ -450,28 +476,6 @@ out: } static int -find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, - uint64_t *top_affected_fs) -{ - uint64_t oldest_dsobj; - int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth, - &oldest_dsobj); - if (error != 0) - return (error); - - dsl_dataset_t *ds; - error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj, - DS_HOLD_FLAG_DECRYPT, FTAG, &ds); - if (error != 0) - return (error); - - *top_affected_fs = - dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; - dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); - return (0); -} - -static int process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, void *uaddr, uint64_t *count) { @@ -536,6 +540,21 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, } #endif +/* Return the number of errors in the error log */ +uint64_t +spa_get_last_errlog_size(spa_t *spa) +{ + uint64_t total = 0, count; + mutex_enter(&spa->spa_errlog_lock); + + if (spa->spa_errlog_last != 0 && + zap_count(spa->spa_meta_objset, spa->spa_errlog_last, + &count) == 0) + total += count; + mutex_exit(&spa->spa_errlog_lock); + return (total); +} + /* * If a healed bookmark matches an entry in the error log we stash it in a tree * so that we can later remove the related log entries in sync context. @@ -1447,6 +1466,7 @@ spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, /* error handling */ EXPORT_SYMBOL(spa_log_error); EXPORT_SYMBOL(spa_approx_errlog_size); +EXPORT_SYMBOL(spa_get_last_errlog_size); EXPORT_SYMBOL(spa_get_errlog); EXPORT_SYMBOL(spa_errlog_rotate); EXPORT_SYMBOL(spa_errlog_drain); @@ -1456,6 +1476,10 @@ EXPORT_SYMBOL(spa_delete_dataset_errlog); EXPORT_SYMBOL(spa_swap_errlog); EXPORT_SYMBOL(sync_error_list); EXPORT_SYMBOL(spa_upgrade_errlog); +EXPORT_SYMBOL(find_top_affected_fs); +EXPORT_SYMBOL(find_birth_txg); +EXPORT_SYMBOL(zep_to_zb); +EXPORT_SYMBOL(name_to_errphys); #endif /* BEGIN CSTYLED */ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 54a0eeccf..89e1ce716 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2579,9 +2579,18 @@ spa_scan_stat_init(spa_t *spa) spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_scrub_pause = 0; + + if (dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) + spa->spa_scan_pass_errorscrub_pause = spa->spa_scan_pass_start; + else + spa->spa_scan_pass_errorscrub_pause = 0; + spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; spa->spa_scan_pass_issued = 0; + + // error scrub stats + spa->spa_scan_pass_errorscrub_spent_paused = 0; } /* @@ -2592,8 +2601,10 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) { dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; - if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) + if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE && + scn->errorscrub_phys.dep_func == POOL_SCAN_NONE)) return (SET_ERROR(ENOENT)); + memset(ps, 0, sizeof (pool_scan_stat_t)); /* data stored on disk */ @@ -2616,6 +2627,18 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) ps->pss_issued = scn->scn_issued_before_pass + spa->spa_scan_pass_issued; + /* error scrub data stored on disk */ + ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func; + ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state; + ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time; + ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time; + ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined; + ps->pss_error_scrub_to_be_examined = + scn->errorscrub_phys.dep_to_examine; + + /* error scrub data not stored on disk */ + ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause; + return (0); } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index efaf6f9b3..f91a2f3bb 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1685,6 +1685,47 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * poolname name of the pool + * scan_type scan func (pool_scan_func_t) + * scan_command scrub pause/resume flag (pool_scrub_cmd_t) + */ +static const zfs_ioc_key_t zfs_keys_pool_scrub[] = { + {"scan_type", DATA_TYPE_UINT64, 0}, + {"scan_command", DATA_TYPE_UINT64, 0}, +}; + +static int +zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + uint64_t scan_type, scan_cmd; + + if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0) + return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(innvl, "scan_command", &scan_cmd) != 0) + return (SET_ERROR(EINVAL)); + + if (scan_cmd >= POOL_SCRUB_FLAGS_END) + return (SET_ERROR(EINVAL)); + + if ((error = spa_open(poolname, &spa, FTAG)) != 0) + return (error); + + if (scan_cmd == POOL_SCRUB_PAUSE) { + error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); + } else if (scan_type == POOL_SCAN_NONE) { + error = spa_scan_stop(spa); + } else { + error = spa_scan(spa, scan_type); + } + + spa_close(spa, FTAG); + return (error); +} + static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { @@ -7218,6 +7259,11 @@ zfs_ioctl_init(void) POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_vdev_set_props, ARRAY_SIZE(zfs_keys_vdev_set_props)); + zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB, + zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_NONE, B_TRUE, B_TRUE, + zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, |