Diffstat (limited to 'module')
30 files changed, 2328 insertions, 461 deletions
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index ea1bccf50..b010c8843 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. @@ -210,6 +210,11 @@ zpool_feature_init(void) hole_birth_deps); } + zfeature_register(SPA_FEATURE_POOL_CHECKPOINT, + "com.delphix:zpool_checkpoint", "zpool_checkpoint", + "Pool state can be checkpointed, allowing rewind later.", + ZFEATURE_FLAG_READONLY_COMPAT, NULL); + zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET, "com.delphix:extensible_dataset", "extensible_dataset", "Enhanced dataset functionality, used by other features.", diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index bc38eca7d..dc0bb59bc 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #include <sys/zio.h> @@ -79,6 +79,8 @@ zpool_prop_init(void) ZFS_TYPE_POOL, "<size>", "FREE"); zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "FREEING"); + zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0, + PROP_READONLY, ZFS_TYPE_POOL, "<size>", "CKPOINT"); zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "LEAKED"); zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 1c2187c56..d8d1e3a23 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -68,6 +68,7 @@ $(MODULE)-objs += sha256.o $(MODULE)-objs += skein_zfs.o $(MODULE)-objs += spa.o $(MODULE)-objs += spa_boot.o +$(MODULE)-objs += spa_checkpoint.o $(MODULE)-objs += spa_config.o $(MODULE)-objs += spa_errlog.o $(MODULE)-objs += spa_history.o diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index a5f468ac8..f0b535618 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -81,8 +81,8 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) if (BP_IS_HOLE(bp)) return (0); - if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) - return (0); + if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa)) + return (-1); SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); @@ -121,20 +121,17 @@ static void traverse_zil(traverse_data_t *td, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; - zilog_t *zilog; /* * We only want to visit blocks that have been claimed but not yet - * replayed; plus, in read-only mode, blocks that are already stable. + * replayed; plus blocks that are already stable in read-only mode. 
*/ if (claim_txg == 0 && spa_writeable(td->td_spa)) return; - zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); - + zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT)); - zil_free(zilog); } diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index ab687f7cc..fddad607d 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1284,6 +1284,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, (spa_is_root(os->os_spa) && spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); + ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE)); + if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT || object == DMU_PROJECTUSED_OBJECT) { if (object == DMU_USERUSED_OBJECT) @@ -1519,7 +1521,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, mutex_exit(&dn->dn_mtx); dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); - return (SET_ERROR(type == DMU_OT_NONE ? ENOENT : EEXIST)); + return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ? + ENOENT : EEXIST)); } if (refcount_add(&dn->dn_holds, tag) == 1) diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 96e7bccc9..044031e4f 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -644,7 +644,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dn->dn_maxblkid == 0 || list_head(list) != NULL || dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == dnp->dn_datablkszsec || - range_tree_space(dn->dn_free_ranges[txgoff]) != 0); + !range_tree_is_empty(dn->dn_free_ranges[txgoff])); dnp->dn_datablkszsec = dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; dn->dn_next_blksz[txgoff] = 0; diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 9db6d1e0b..bb9b4a1c7 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -46,6 +46,7 @@ #include <sys/zfs_context.h> #include <sys/zfs_ioctl.h> #include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/vdev.h> #include <sys/zfs_znode.h> #include <sys/zfs_onexit.h> @@ -208,7 +209,9 @@ int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async) { - int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + int used = bp_get_dsize_sync(spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); @@ -3821,7 +3824,8 @@ dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, ddsqra.ddsqra_value = refquota; return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check, - dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); + dsl_dataset_set_refquota_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static int @@ -3936,8 +3940,8 @@ dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, ddsqra.ddsqra_value = refreservation; return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check, - dsl_dataset_set_refreservation_sync, &ddsqra, - 0, ZFS_SPACE_CHECK_NONE)); + dsl_dataset_set_refreservation_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); } /* diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index b3296ceee..b450073cc 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -1036,7 +1036,7 @@ dsl_destroy_head(const char *name) error = dsl_sync_task(name, dsl_destroy_head_check, dsl_destroy_head_begin_sync, &ddha, - 0, ZFS_SPACE_CHECK_NONE); + 0, ZFS_SPACE_CHECK_DESTROY); if 
(error != 0) return (error); @@ -1062,7 +1062,7 @@ dsl_destroy_head(const char *name) } return (dsl_sync_task(name, dsl_destroy_head_check, - dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE)); + dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY)); } /* diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 36abfe024..519c94b64 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2014 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -942,14 +942,14 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); if (pds) { - VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, + VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, name, sizeof (uint64_t), 1, &ddobj, tx)); } else { /* it's the root dir */ - VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); } - VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); + VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); ddphys = dbuf->db_data; @@ -988,6 +988,12 @@ dsl_dir_get_used(dsl_dir_t *dd) } uint64_t +dsl_dir_get_compressed(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_compressed_bytes); +} + +uint64_t dsl_dir_get_quota(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_quota); @@ -1215,7 +1221,8 @@ dsl_dir_space_available(dsl_dir_t *dd, used += dsl_dir_space_towrite(dd); if (dd->dd_parent == NULL) { - uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); + uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, + ZFS_SPACE_CHECK_NORMAL); quota = MIN(quota, poolsize); } @@ -1326,11 +1333,12 @@ top_of_function: */ uint64_t deferred = 0; if (dd->dd_parent == NULL) { - spa_t *spa = dd->dd_pool->dp_spa; - uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); - deferred = metaslab_class_get_deferred(spa_normal_class(spa)); - if (poolsize - deferred < quota) { - quota = poolsize - deferred; + uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool, + (netfree) ? 
+ ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL); + + if (avail < quota) { + quota = avail; retval = ENOSPC; } } @@ -1684,7 +1692,8 @@ dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) ddsqra.ddsqra_value = quota; return (dsl_sync_task(ddname, dsl_dir_set_quota_check, - dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); + dsl_dir_set_quota_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); } int @@ -1727,7 +1736,8 @@ dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) avail = dsl_dir_space_available(dd->dd_parent, NULL, 0, FALSE); } else { - avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; + avail = dsl_pool_adjustedsize(dd->dd_pool, + ZFS_SPACE_CHECK_NORMAL) - used; } if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { @@ -1805,7 +1815,8 @@ dsl_dir_set_reservation(const char *ddname, zprop_source_t source, ddsqra.ddsqra_value = reservation; return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, - dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); + dsl_dir_set_reservation_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static dsl_dir_t * diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 1bb49c13a..e8f519b18 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -43,6 +43,8 @@ #include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/vdev_impl.h> +#include <sys/metaslab_impl.h> #include <sys/bptree.h> #include <sys/zfeature.h> #include <sys/zil_impl.h> @@ -201,6 +203,8 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) offsetof(dsl_dir_t, dd_dirty_link)); txg_list_create(&dp->dp_sync_tasks, spa, offsetof(dsl_sync_task_t, dst_node)); + txg_list_create(&dp->dp_early_sync_tasks, spa, + offsetof(dsl_sync_task_t, dst_node)); dp->dp_sync_taskq = taskq_create("dp_sync_taskq", zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, @@ -385,6 +389,7 @@ dsl_pool_close(dsl_pool_t *dp) txg_list_destroy(&dp->dp_dirty_datasets); txg_list_destroy(&dp->dp_dirty_zilogs); txg_list_destroy(&dp->dp_sync_tasks); + txg_list_destroy(&dp->dp_early_sync_tasks); txg_list_destroy(&dp->dp_dirty_dirs); taskq_destroy(dp->dp_zil_clean_taskq); @@ -574,6 +579,29 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) cv_signal(&dp->dp_spaceavail_cv); } +#ifdef ZFS_DEBUG +static boolean_t +dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) +{ + spa_t *spa = dp->dp_spa; + vdev_t *rvd = spa->spa_root_vdev; + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + txg_list_t *tl = &vd->vdev_ms_list; + metaslab_t *ms; + + for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms; + ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) { + VERIFY(range_tree_is_empty(ms->ms_freeing)); + VERIFY(range_tree_is_empty(ms->ms_checkpointing)); + } + } + + return (B_TRUE); +} +#endif + void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { @@ -590,6 +618,23 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) tx = dmu_tx_create_assigned(dp, txg); /* + * Run all early sync tasks before writing out any dirty blocks. + * For more info on early sync tasks see block comment in + * dsl_early_sync_task(). 
+ */ + if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) { + dsl_sync_task_t *dst; + + ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); + while ((dst = + txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) { + ASSERT(dsl_early_sync_task_verify(dp, txg)); + dsl_sync_task_sync(dst, tx); + } + ASSERT(dsl_early_sync_task_verify(dp, txg)); + } + + /* * Write out all dirty blocks of dirty datasets. */ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); @@ -744,22 +789,66 @@ dsl_pool_sync_context(dsl_pool_t *dp) taskq_member(dp->dp_sync_taskq, curthread)); } +/* + * This function returns the amount of allocatable space in the pool + * minus whatever space is currently reserved by ZFS for specific + * purposes. Specifically: + * + * 1] Any reserved SLOP space + * 2] Any space used by the checkpoint + * 3] Any space used for deferred frees + * + * The latter 2 are especially important because they are needed to + * rectify the SPA's and DMU's different understanding of how much space + * is used. Now the DMU is aware of that extra space tracked by the SPA + * without having to maintain a separate special dir (e.g similar to + * $MOS, $FREEING, and $LEAKED). + * + * Note: By deferred frees here, we mean the frees that were deferred + * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the + * segments placed in ms_defer trees during metaslab_sync_done(). + */ uint64_t -dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) +dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy) { - uint64_t space, resv; - - /* - * If we're trying to assess whether it's OK to do a free, - * cut the reservation in half to allow forward progress - * (e.g. make it possible to rm(1) files from a full pool). - */ - space = spa_get_dspace(dp->dp_spa); - resv = spa_get_slop_space(dp->dp_spa); - if (netfree) + spa_t *spa = dp->dp_spa; + uint64_t space, resv, adjustedsize; + uint64_t spa_deferred_frees = + spa->spa_deferred_bpobj.bpo_phys->bpo_bytes; + + space = spa_get_dspace(spa) + - spa_get_checkpoint_space(spa) - spa_deferred_frees; + resv = spa_get_slop_space(spa); + + switch (slop_policy) { + case ZFS_SPACE_CHECK_NORMAL: + break; + case ZFS_SPACE_CHECK_RESERVED: resv >>= 1; + break; + case ZFS_SPACE_CHECK_EXTRA_RESERVED: + resv >>= 2; + break; + case ZFS_SPACE_CHECK_NONE: + resv = 0; + break; + default: + panic("invalid slop policy value: %d", slop_policy); + break; + } + adjustedsize = (space >= resv) ? (space - resv) : 0; - return (space - resv); + return (adjustedsize); +} + +uint64_t +dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy) +{ + uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy); + uint64_t deferred = + metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); + uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0; + return (quota); } boolean_t diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 2c3494746..986dccdea 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -733,7 +733,7 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) } return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, - dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE)); + dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } /* ARGSUSED */ @@ -810,13 +810,23 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) * If the scrub/resilver completed, update all DTLs to * reflect this. Whether it succeeded or not, vacate * all temporary scrub DTLs. 
+ * + * As the scrub does not currently support traversing + * data that have been freed but are part of a checkpoint, + * we don't mark the scrub as done in the DTLs as faults + * may still exist in those vdevs. */ - vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, - complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); - if (complete) { + if (complete && + !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, + scn->scn_phys.scn_max_txg, B_TRUE); + spa_event_notify(spa, NULL, NULL, scn->scn_phys.scn_min_txg ? ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + } else { + vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, + 0, B_TRUE); } spa_errlog_rotate(spa); @@ -1217,7 +1227,7 @@ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) * (on-disk) even if it hasn't been claimed (even though for * scrub there's nothing to do to it). */ - if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) + if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], @@ -1268,11 +1278,13 @@ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) zil_scan_arg_t zsa = { dp, zh }; zilog_t *zilog; + ASSERT(spa_writeable(dp->dp_spa)); + /* * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). */ - if (claim_txg == 0 && spa_writeable(dp->dp_spa)) + if (claim_txg == 0) return; zilog = zil_alloc(dp->dp_meta_objset, zh); @@ -3004,79 +3016,16 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, return (B_TRUE); } -/* - * This is the primary entry point for scans that is called from syncing - * context. Scans must happen entirely during syncing context so that we - * cna guarantee that blocks we are currently scanning will not change out - * from under us. While a scan is active, this function controls how quickly - * transaction groups proceed, instead of the normal handling provided by - * txg_sync_thread(). - */ -void -dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) +static int +dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) { - int err = 0; dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; - state_sync_type_t sync_type = SYNC_OPTIONAL; - - /* - * Check for scn_restart_txg before checking spa_load_state, so - * that we can restart an old-style scan while the pool is being - * imported (see dsl_scan_init). - */ - if (dsl_scan_restarting(scn, tx)) { - pool_scan_func_t func = POOL_SCAN_SCRUB; - dsl_scan_done(scn, B_FALSE, tx); - if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) - func = POOL_SCAN_RESILVER; - zfs_dbgmsg("restarting scan func=%u txg=%llu", - func, (longlong_t)tx->tx_txg); - dsl_scan_setup_sync(&func, tx); - } - - /* - * Only process scans in sync pass 1. - */ - if (spa_sync_pass(spa) > 1) - return; - - /* - * If the spa is shutting down, then stop scanning. This will - * ensure that the scan does not dirty any new data during the - * shutdown phase. - */ - if (spa_shutting_down(spa)) - return; - - /* - * If the scan is inactive due to a stalled async destroy, try again. 
- */ - if (!scn->scn_async_stalled && !dsl_scan_active(scn)) - return; + int err = 0; - /* reset scan statistics */ - scn->scn_visited_this_txg = 0; - scn->scn_holes_this_txg = 0; - scn->scn_lt_min_this_txg = 0; - scn->scn_gt_max_this_txg = 0; - scn->scn_ddt_contained_this_txg = 0; - scn->scn_objsets_visited_this_txg = 0; - scn->scn_avg_seg_size_this_txg = 0; - scn->scn_segs_this_txg = 0; - scn->scn_avg_zio_size_this_txg = 0; - scn->scn_zios_this_txg = 0; - scn->scn_suspending = B_FALSE; - scn->scn_sync_start_time = gethrtime(); - spa->spa_scrub_active = B_TRUE; + if (spa_suspend_async_destroy(spa)) + return (0); - /* - * First process the async destroys. If we suspend, don't do - * any scrubbing or resilvering. This ensures that there are no - * async destroys while we are scanning, so the scan code doesn't - * have to worry about traversing it. It is also faster to free the - * blocks than to scrub them. - */ if (zfs_free_bpobj_enabled && spa_version(spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; @@ -3152,7 +3101,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) ddt_sync(spa, tx->tx_txg); } if (err != 0) - return; + return (err); if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && zfs_free_leak_on_eio && (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || @@ -3205,6 +3154,85 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (bpobj_is_empty(&dp->dp_obsolete_bpobj)) dsl_pool_destroy_obsolete_bpobj(dp, tx); } + return (0); +} + +/* + * This is the primary entry point for scans that is called from syncing + * context. Scans must happen entirely during syncing context so that we + * cna guarantee that blocks we are currently scanning will not change out + * from under us. While a scan is active, this function controls how quickly + * transaction groups proceed, instead of the normal handling provided by + * txg_sync_thread(). + */ +void +dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + int err = 0; + dsl_scan_t *scn = dp->dp_scan; + spa_t *spa = dp->dp_spa; + state_sync_type_t sync_type = SYNC_OPTIONAL; + + /* + * Check for scn_restart_txg before checking spa_load_state, so + * that we can restart an old-style scan while the pool is being + * imported (see dsl_scan_init). + */ + if (dsl_scan_restarting(scn, tx)) { + pool_scan_func_t func = POOL_SCAN_SCRUB; + dsl_scan_done(scn, B_FALSE, tx); + if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + func = POOL_SCAN_RESILVER; + zfs_dbgmsg("restarting scan func=%u txg=%llu", + func, (longlong_t)tx->tx_txg); + dsl_scan_setup_sync(&func, tx); + } + + /* + * Only process scans in sync pass 1. + */ + if (spa_sync_pass(spa) > 1) + return; + + /* + * If the spa is shutting down, then stop scanning. This will + * ensure that the scan does not dirty any new data during the + * shutdown phase. + */ + if (spa_shutting_down(spa)) + return; + + /* + * If the scan is inactive due to a stalled async destroy, try again. + */ + if (!scn->scn_async_stalled && !dsl_scan_active(scn)) + return; + + /* reset scan statistics */ + scn->scn_visited_this_txg = 0; + scn->scn_holes_this_txg = 0; + scn->scn_lt_min_this_txg = 0; + scn->scn_gt_max_this_txg = 0; + scn->scn_ddt_contained_this_txg = 0; + scn->scn_objsets_visited_this_txg = 0; + scn->scn_avg_seg_size_this_txg = 0; + scn->scn_segs_this_txg = 0; + scn->scn_avg_zio_size_this_txg = 0; + scn->scn_zios_this_txg = 0; + scn->scn_suspending = B_FALSE; + scn->scn_sync_start_time = gethrtime(); + spa->spa_scrub_active = B_TRUE; + + /* + * First process the async destroys. 
If we suspend, don't do + * any scrubbing or resilvering. This ensures that there are no + * async destroys while we are scanning, so the scan code doesn't + * have to worry about traversing it. It is also faster to free the + * blocks than to scrub them. + */ + err = dsl_process_async_destroys(dp, tx); + if (err != 0) + return; if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) return; diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c index d8eb10d37..b63ce5cad 100644 --- a/module/zfs/dsl_synctask.c +++ b/module/zfs/dsl_synctask.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #include <sys/dmu.h> @@ -39,33 +39,10 @@ dsl_null_checkfunc(void *arg, dmu_tx_t *tx) return (0); } -/* - * Called from open context to perform a callback in syncing context. Waits - * for the operation to complete. - * - * The checkfunc will be called from open context as a preliminary check - * which can quickly fail. If it succeeds, it will be called again from - * syncing context. The checkfunc should generally be designed to work - * properly in either context, but if necessary it can check - * dmu_tx_is_syncing(tx). - * - * The synctask infrastructure enforces proper locking strategy with respect - * to the dp_config_rwlock -- the lock will always be held when the callbacks - * are called. It will be held for read during the open-context (preliminary) - * call to the checkfunc, and then held for write from syncing context during - * the calls to the check and sync funcs. - * - * A dataset or pool name can be passed as the first argument. Typically, - * the check func will hold, check the return value of the hold, and then - * release the dataset. The sync func will VERIFYO(hold()) the dataset. - * This is safe because no changes can be made between the check and sync funcs, - * and the sync func will only be called if the check func successfully opened - * the dataset. - */ -int -dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, +static int +dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check) + int blocks_modified, zfs_space_check_t space_check, boolean_t early) { spa_t *spa; dmu_tx_t *tx; @@ -102,7 +79,9 @@ top: return (err); } - VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, &dst, dst.dst_txg)); + txg_list_t *task_list = (early) ? + &dp->dp_early_sync_tasks : &dp->dp_sync_tasks; + VERIFY(txg_list_add_tail(task_list, &dst, dst.dst_txg)); dmu_tx_commit(tx); @@ -117,9 +96,64 @@ top: return (dst.dst_error); } -void -dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) +/* + * Called from open context to perform a callback in syncing context. Waits + * for the operation to complete. + * + * The checkfunc will be called from open context as a preliminary check + * which can quickly fail. If it succeeds, it will be called again from + * syncing context. The checkfunc should generally be designed to work + * properly in either context, but if necessary it can check + * dmu_tx_is_syncing(tx). + * + * The synctask infrastructure enforces proper locking strategy with respect + * to the dp_config_rwlock -- the lock will always be held when the callbacks + * are called. 
It will be held for read during the open-context (preliminary) + * call to the checkfunc, and then held for write from syncing context during + * the calls to the check and sync funcs. + * + * A dataset or pool name can be passed as the first argument. Typically, + * the check func will hold, check the return value of the hold, and then + * release the dataset. The sync func will VERIFYO(hold()) the dataset. + * This is safe because no changes can be made between the check and sync funcs, + * and the sync func will only be called if the check func successfully opened + * the dataset. + */ +int +dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, + dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check) +{ + return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg, + blocks_modified, space_check, B_FALSE)); +} + +/* + * An early synctask works exactly as a standard synctask with one important + * difference on the way it is handled during syncing context. Standard + * synctasks run after we've written out all the dirty blocks of dirty + * datasets. Early synctasks are executed before writing out any dirty data, + * and thus before standard synctasks. + * + * For that reason, early synctasks can affect the process of writing dirty + * changes to disk for the txg that they run and should be used with caution. + * In addition, early synctasks should not dirty any metaslabs as this would + * invalidate the precodition/invariant for subsequent early synctasks. + * [see dsl_pool_sync() and dsl_early_sync_task_verify()] + */ +int +dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, + dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check) +{ + return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg, + blocks_modified, space_check, B_TRUE)); +} + +static void +dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx, + boolean_t early) { dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP); @@ -133,7 +167,25 @@ dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, dst->dst_error = 0; dst->dst_nowaiter = B_TRUE; - VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, dst, dst->dst_txg)); + txg_list_t *task_list = (early) ? + &dp->dp_early_sync_tasks : &dp->dp_sync_tasks; + VERIFY(txg_list_add_tail(task_list, dst, dst->dst_txg)); +} + +void +dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) +{ + dsl_sync_task_nowait_common(dp, syncfunc, arg, + blocks_modified, space_check, tx, B_FALSE); +} + +void +dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) +{ + dsl_sync_task_nowait_common(dp, syncfunc, arg, + blocks_modified, space_check, tx, B_TRUE); } /* @@ -160,12 +212,12 @@ dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx) * (arc_tempreserve, dsl_pool_tempreserve). */ if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) { - uint64_t quota = dsl_pool_adjustedsize(dp, - dst->dst_space_check == ZFS_SPACE_CHECK_RESERVED) - - metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); + uint64_t quota = dsl_pool_unreserved_space(dp, + dst->dst_space_check); uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; + /* MOS space is triple-dittoed, so we multiply by 3. 
*/ - if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) { + if (used + dst->dst_space * 3 > quota) { dst->dst_error = SET_ERROR(ENOSPC); if (dst->dst_nowaiter) kmem_free(dst, sizeof (*dst)); diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c index b5a684f0b..c80b35d48 100644 --- a/module/zfs/dsl_userhold.c +++ b/module/zfs/dsl_userhold.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. */ @@ -604,7 +604,8 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, KM_SLEEP)); error = dsl_sync_task(pool, dsl_dataset_user_release_check, - dsl_dataset_user_release_sync, &ddura, 0, ZFS_SPACE_CHECK_NONE); + dsl_dataset_user_release_sync, &ddura, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED); fnvlist_free(ddura.ddura_todelete); fnvlist_free(ddura.ddura_chkholds); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index c11e459e0..76fa99e8b 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -34,6 +34,7 @@ #include <sys/spa_impl.h> #include <sys/zfeature.h> #include <sys/vdev_indirect_mapping.h> +#include <sys/zap.h> #define WITH_DF_BLOCK_ALLOCATOR @@ -54,6 +55,14 @@ unsigned long metaslab_aliquot = 512 << 10; unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* + * Since we can touch multiple metaslabs (and their respective space maps) + * with each transaction group, we benefit from having a smaller space map + * block size since it allows us to issue more I/O operations scattered + * around the disk. + */ +int zfs_metaslab_sm_blksz = (1 << 12); + +/* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. 
@@ -211,7 +220,7 @@ uint64_t metaslab_trace_max_entries = 5000; static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); -static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t); +static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); #ifdef _METASLAB_TRACING @@ -484,11 +493,11 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg) */ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { allocated += - range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]); + range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } - msp_free_space = range_tree_space(msp->ms_tree) + allocated + - msp->ms_deferspace + range_tree_space(msp->ms_freedtree); + msp_free_space = range_tree_space(msp->ms_allocatable) + allocated + + msp->ms_deferspace + range_tree_space(msp->ms_freed); VERIFY3U(sm_free_space, ==, msp_free_space); } @@ -1021,7 +1030,7 @@ metaslab_rangesize_compare(const void *x1, const void *x2) uint64_t metaslab_block_maxsize(metaslab_t *msp) { - avl_tree_t *t = &msp->ms_size_tree; + avl_tree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; if (t == NULL || (rs = avl_last(t)) == NULL) @@ -1101,7 +1110,7 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size) */ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - avl_tree_t *t = &msp->ms_tree->rt_root; + avl_tree_t *t = &msp->ms_allocatable->rt_root; return (metaslab_block_picker(t, cursor, size, align)); } @@ -1134,13 +1143,14 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) */ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - range_tree_t *rt = msp->ms_tree; + range_tree_t *rt = msp->ms_allocatable; avl_tree_t *t = &rt->rt_root; uint64_t max_size = metaslab_block_maxsize(msp); int free_pct = range_tree_space(rt) * 100 / msp->ms_size; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); + ASSERT3U(avl_numnodes(t), ==, + avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < size) return (-1ULL); @@ -1151,7 +1161,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) */ if (max_size < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { - t = &msp->ms_size_tree; + t = &msp->ms_allocatable_by_size; *cursor = 0; } @@ -1178,8 +1188,8 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { - range_tree_t *rt = msp->ms_tree; - avl_tree_t *t = &msp->ms_size_tree; + range_tree_t *rt = msp->ms_allocatable; + avl_tree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; @@ -1192,7 +1202,7 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) if ((*cursor + size) > *cursor_end) { range_seg_t *rs; - rs = avl_last(&msp->ms_size_tree); + rs = avl_last(&msp->ms_allocatable_by_size); if (rs == NULL || (rs->rs_end - rs->rs_start) < size) return (-1ULL); @@ -1232,7 +1242,7 @@ uint64_t metaslab_ndf_clump_shift = 4; static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { - avl_tree_t *t = &msp->ms_tree->rt_root; + avl_tree_t *t = &msp->ms_allocatable->rt_root; avl_index_t where; range_seg_t *rs, rsearch; uint64_t hbit = highbit64(size); @@ -1240,7 +1250,8 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) uint64_t max_size = metaslab_block_maxsize(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); - 
ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); + ASSERT3U(avl_numnodes(t), ==, + avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < size) return (-1ULL); @@ -1250,7 +1261,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) rs = avl_find(t, &rsearch, &where); if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { - t = &msp->ms_size_tree; + t = &msp->ms_allocatable_by_size; rsearch.rs_start = 0; rsearch.rs_end = MIN(max_size, @@ -1316,13 +1327,15 @@ metaslab_load(metaslab_t *msp) /* * If the space map has not been allocated yet, then treat - * all the space in the metaslab as free and add it to the - * ms_tree. + * all the space in the metaslab as free and add it to ms_allocatable. */ - if (msp->ms_sm != NULL) - error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE); - else - range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); + if (msp->ms_sm != NULL) { + error = space_map_load(msp->ms_sm, msp->ms_allocatable, + SM_FREE); + } else { + range_tree_add(msp->ms_allocatable, + msp->ms_start, msp->ms_size); + } success = (error == 0); @@ -1333,9 +1346,16 @@ metaslab_load(metaslab_t *msp) ASSERT3P(msp->ms_group, !=, NULL); msp->ms_loaded = B_TRUE; - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defertree[t], - range_tree_remove, msp->ms_tree); + /* + * If the metaslab already has a spacemap, then we need to + * remove all segments from the defer tree; otherwise, the + * metaslab is completely empty and we can skip this. + */ + if (msp->ms_sm != NULL) { + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_walk(msp->ms_defer[t], + range_tree_remove, msp->ms_allocatable); + } } msp->ms_max_size = metaslab_block_maxsize(msp); } @@ -1347,7 +1367,7 @@ void metaslab_unload(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); - range_tree_vacate(msp->ms_tree, NULL, NULL); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; msp->ms_max_size = 0; @@ -1393,8 +1413,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, * addition of new space; and for debugging, it ensures that we'd * data fault on any attempt to use this metaslab before it's ready. 
*/ - ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree, - metaslab_rangesize_compare, 0); + ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, + &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms); @@ -1446,20 +1466,21 @@ metaslab_fini(metaslab_t *msp) space_map_close(msp->ms_sm); metaslab_unload(msp); - range_tree_destroy(msp->ms_tree); - range_tree_destroy(msp->ms_freeingtree); - range_tree_destroy(msp->ms_freedtree); + range_tree_destroy(msp->ms_allocatable); + range_tree_destroy(msp->ms_freeing); + range_tree_destroy(msp->ms_freed); for (int t = 0; t < TXG_SIZE; t++) { - range_tree_destroy(msp->ms_alloctree[t]); + range_tree_destroy(msp->ms_allocating[t]); } for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_destroy(msp->ms_defertree[t]); + range_tree_destroy(msp->ms_defer[t]); } - ASSERT0(msp->ms_deferspace); + range_tree_destroy(msp->ms_checkpointing); + mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); mutex_destroy(&msp->ms_lock); @@ -1679,7 +1700,7 @@ metaslab_weight_from_range_tree(metaslab_t *msp) int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; segments <<= 1; - segments += msp->ms_tree->rt_histogram[i]; + segments += msp->ms_allocatable->rt_histogram[i]; /* * The range tree provides more precision than the space map @@ -1895,7 +1916,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight) */ ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || size >= SPA_MINBLOCKSIZE || - range_tree_space(msp->ms_tree) == 0); + range_tree_space(msp->ms_allocatable) == 0); ASSERT0(weight & METASLAB_ACTIVE_MASK); msp->ms_activation_weight = 0; @@ -2028,18 +2049,37 @@ metaslab_should_condense(metaslab_t *msp) range_seg_t *rs; uint64_t size, entries, segsz, object_size, optimal_size, record_size; dmu_object_info_t doi; - uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift; + vdev_t *vd = msp->ms_group->mg_vd; + uint64_t vdev_blocksize = 1 << vd->vdev_ashift; + uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); /* - * Use the ms_size_tree range tree, which is ordered by size, to - * obtain the largest segment in the free tree. We always condense - * metaslabs that are empty and metaslabs for which a condense - * request has been made. + * Allocations and frees in early passes are generally more space + * efficient (in terms of blocks described in space map entries) + * than the ones in later passes (e.g. we don't compress after + * sync pass 5) and condensing a metaslab multiple times in a txg + * could degrade performance. + * + * Thus we prefer condensing each metaslab at most once every txg at + * the earliest sync pass possible. If a metaslab is eligible for + * condensing again after being considered for condensing within the + * same txg, it will hopefully be dirty in the next txg where it will + * be condensed at an earlier pass. + */ + if (msp->ms_condense_checked_txg == current_txg) + return (B_FALSE); + msp->ms_condense_checked_txg = current_txg; + + /* + * Use the ms_allocatable_by_size range tree, which is ordered by + * size, to obtain the largest segment in the free tree. We always + * condense metaslabs that are empty and metaslabs for which a + * condense request has been made. 
*/ - rs = avl_last(&msp->ms_size_tree); + rs = avl_last(&msp->ms_allocatable_by_size); if (rs == NULL || msp->ms_condense_wanted) return (B_TRUE); @@ -2053,7 +2093,8 @@ metaslab_should_condense(metaslab_t *msp) entries = size / (MIN(size, SM_RUN_MAX)); segsz = entries * sizeof (uint64_t); - optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); + optimal_size = + sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root); object_size = space_map_length(msp->ms_sm); dmu_object_info_from_db(sm->sm_dbuf, &doi); @@ -2076,7 +2117,6 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) space_map_t *sm = msp->ms_sm; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(spa_sync_pass(msp->ms_group->mg_vd->vdev_spa), ==, 1); ASSERT(msp->ms_loaded); @@ -2084,7 +2124,8 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, msp->ms_group->mg_vd->vdev_spa->spa_name, - space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root), + space_map_length(msp->ms_sm), + avl_numnodes(&msp->ms_allocatable->rt_root), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; @@ -2099,20 +2140,16 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) condense_tree = range_tree_create(NULL, NULL); range_tree_add(condense_tree, msp->ms_start, msp->ms_size); - /* - * Remove what's been freed in this txg from the condense_tree. - * Since we're in sync_pass 1, we know that all the frees from - * this txg are in the freeingtree. - */ - range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree); + range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); + range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defertree[t], + range_tree_walk(msp->ms_defer[t], range_tree_remove, condense_tree); } for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { - range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK], + range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], range_tree_remove, condense_tree); } @@ -2122,13 +2159,13 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) * metaslab's ms_condensing flag to ensure that * allocations on this metaslab do not occur while we're * in the middle of committing it to disk. This is only critical - * for the ms_tree as all other range trees use per txg + * for ms_allocatable as all other range trees use per txg * views of their content. 
*/ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); - space_map_truncate(sm, tx); + space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); /* * While we would ideally like to create a space map representation @@ -2144,7 +2181,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - space_map_write(sm, msp->ms_tree, SM_FREE, tx); + space_map_write(sm, msp->ms_allocatable, SM_FREE, tx); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; } @@ -2159,7 +2196,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); - range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK]; + range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; uint64_t object = space_map_object(msp->ms_sm); @@ -2168,23 +2205,24 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * This metaslab has just been added so there's no work to do now. */ - if (msp->ms_freeingtree == NULL) { + if (msp->ms_freeing == NULL) { ASSERT3P(alloctree, ==, NULL); return; } ASSERT3P(alloctree, !=, NULL); - ASSERT3P(msp->ms_freeingtree, !=, NULL); - ASSERT3P(msp->ms_freedtree, !=, NULL); + ASSERT3P(msp->ms_freeing, !=, NULL); + ASSERT3P(msp->ms_freed, !=, NULL); + ASSERT3P(msp->ms_checkpointing, !=, NULL); /* - * Normally, we don't want to process a metaslab if there - * are no allocations or frees to perform. However, if the metaslab - * is being forced to condense and it's loaded, we need to let it - * through. + * Normally, we don't want to process a metaslab if there are no + * allocations or frees to perform. However, if the metaslab is being + * forced to condense and it's loaded, we need to let it through. */ - if (range_tree_space(alloctree) == 0 && - range_tree_space(msp->ms_freeingtree) == 0 && + if (range_tree_is_empty(alloctree) && + range_tree_is_empty(msp->ms_freeing) && + range_tree_is_empty(msp->ms_checkpointing) && !(msp->ms_loaded && msp->ms_condense_wanted)) return; @@ -2193,10 +2231,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * The only state that can actually be changing concurrently with - * metaslab_sync() is the metaslab's ms_tree. No other thread can - * be modifying this txg's alloctree, freeingtree, freedtree, or - * space_map_phys_t. We drop ms_lock whenever we could call - * into the DMU, because the DMU can call down to us + * metaslab_sync() is the metaslab's ms_allocatable. No other + * thread can be modifying this txg's alloc, freeing, + * freed, or space_map_phys_t. We drop ms_lock whenever we + * could call into the DMU, because the DMU can call down to us * (e.g. via zio_free()) at any time. * * The spa_vdev_remove_thread() can be reading metaslab state @@ -2204,13 +2242,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * that the ms_lock is insufficient for this, because it is dropped * by space_map_write(). 
*/ - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); if (msp->ms_sm == NULL) { uint64_t new_object; - new_object = space_map_alloc(mos, tx); + new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, @@ -2218,6 +2255,28 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) ASSERT(msp->ms_sm != NULL); } + if (!range_tree_is_empty(msp->ms_checkpointing) && + vd->vdev_checkpoint_sm == NULL) { + ASSERT(spa_has_checkpoint(spa)); + + uint64_t new_object = space_map_alloc(mos, + vdev_standard_sm_blksz, tx); + VERIFY3U(new_object, !=, 0); + + VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, + mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); + ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); + + /* + * We save the space map object as an entry in vdev_top_zap + * so it can be retrieved when the pool is reopened after an + * export or through zdb. + */ + VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, + sizeof (new_object), 1, &new_object, tx)); + } + mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); @@ -2230,16 +2289,40 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); - if (msp->ms_loaded && spa_sync_pass(spa) == 1 && - metaslab_should_condense(msp)) { + if (msp->ms_loaded && metaslab_should_condense(msp)) { metaslab_condense(msp, txg, tx); } else { mutex_exit(&msp->ms_lock); space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); - space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx); + space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx); mutex_enter(&msp->ms_lock); } + if (!range_tree_is_empty(msp->ms_checkpointing)) { + ASSERT(spa_has_checkpoint(spa)); + ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); + + /* + * Since we are doing writes to disk and the ms_checkpointing + * tree won't be changing during that time, we drop the + * ms_lock while writing to the checkpoint space map. + */ + mutex_exit(&msp->ms_lock); + space_map_write(vd->vdev_checkpoint_sm, + msp->ms_checkpointing, SM_FREE, tx); + mutex_enter(&msp->ms_lock); + space_map_update(vd->vdev_checkpoint_sm); + + spa->spa_checkpoint_info.sci_dspace += + range_tree_space(msp->ms_checkpointing); + vd->vdev_stat.vs_checkpoint_space += + range_tree_space(msp->ms_checkpointing); + ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, + -vd->vdev_checkpoint_sm->sm_alloc); + + range_tree_vacate(msp->ms_checkpointing, NULL, NULL); + } + if (msp->ms_loaded) { /* * When the space map is loaded, we have an accurate @@ -2248,7 +2331,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * it first before updating it. */ space_map_histogram_clear(msp->ms_sm); - space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); + space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); /* * Since we've cleared the histogram we need to add back @@ -2257,7 +2340,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * to accurately reflect all free space even if some space * is not yet available for allocation (i.e. deferred). 
*/ - space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx); + space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); /* * Add back any deferred free space that has not been @@ -2268,7 +2351,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, - msp->ms_defertree[t], tx); + msp->ms_defer[t], tx); } } @@ -2279,7 +2362,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * then we will lose some accuracy but will correct it the next * time we load the space map. */ - space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx); + space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); @@ -2287,21 +2370,23 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * For sync pass 1, we avoid traversing this txg's free range tree - * and instead will just swap the pointers for freeingtree and - * freedtree. We can safely do this since the freed_tree is + * and instead will just swap the pointers for freeing and + * freed. We can safely do this since the freed_tree is * guaranteed to be empty on the initial pass. */ if (spa_sync_pass(spa) == 1) { - range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree); + range_tree_swap(&msp->ms_freeing, &msp->ms_freed); } else { - range_tree_vacate(msp->ms_freeingtree, - range_tree_add, msp->ms_freedtree); + range_tree_vacate(msp->ms_freeing, + range_tree_add, msp->ms_freed); } range_tree_vacate(alloctree, NULL, NULL); - ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_freeingtree)); + ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) + & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); @@ -2336,29 +2421,34 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * If this metaslab is just becoming available, initialize its * range trees and add its capacity to the vdev. 
*/ - if (msp->ms_freedtree == NULL) { + if (msp->ms_freed == NULL) { for (int t = 0; t < TXG_SIZE; t++) { - ASSERT(msp->ms_alloctree[t] == NULL); + ASSERT(msp->ms_allocating[t] == NULL); - msp->ms_alloctree[t] = range_tree_create(NULL, NULL); + msp->ms_allocating[t] = range_tree_create(NULL, NULL); } - ASSERT3P(msp->ms_freeingtree, ==, NULL); - msp->ms_freeingtree = range_tree_create(NULL, NULL); + ASSERT3P(msp->ms_freeing, ==, NULL); + msp->ms_freeing = range_tree_create(NULL, NULL); - ASSERT3P(msp->ms_freedtree, ==, NULL); - msp->ms_freedtree = range_tree_create(NULL, NULL); + ASSERT3P(msp->ms_freed, ==, NULL); + msp->ms_freed = range_tree_create(NULL, NULL); for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ASSERT(msp->ms_defertree[t] == NULL); + ASSERT(msp->ms_defer[t] == NULL); - msp->ms_defertree[t] = range_tree_create(NULL, NULL); + msp->ms_defer[t] = range_tree_create(NULL, NULL); } + ASSERT3P(msp->ms_checkpointing, ==, NULL); + msp->ms_checkpointing = range_tree_create(NULL, NULL); + vdev_space_update(vd, 0, 0, msp->ms_size); } + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); - defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; + defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); @@ -2369,7 +2459,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) defer_delta = 0; alloc_delta = space_map_alloc_delta(msp->ms_sm); if (defer_allowed) { - defer_delta = range_tree_space(msp->ms_freedtree) - + defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); } else { defer_delta -= range_tree_space(*defer_tree); @@ -2385,19 +2475,19 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * Move the frees from the defer_tree back to the free - * range tree (if it's loaded). Swap the freed_tree and the - * defer_tree -- this is safe to do because we've just emptied out - * the defer_tree. + * range tree (if it's loaded). Swap the freed_tree and + * the defer_tree -- this is safe to do because we've + * just emptied out the defer_tree. */ range_tree_vacate(*defer_tree, - msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); + msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); if (defer_allowed) { - range_tree_swap(&msp->ms_freedtree, defer_tree); + range_tree_swap(&msp->ms_freed, defer_tree); } else { - range_tree_vacate(msp->ms_freedtree, - msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); + range_tree_vacate(msp->ms_freed, + msp->ms_loaded ? 
range_tree_add : NULL, + msp->ms_allocatable); } - space_map_update(msp->ms_sm); msp->ms_deferspace += defer_delta; @@ -2426,16 +2516,17 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(range_tree_space( - msp->ms_alloctree[(txg + t) & TXG_MASK])); + msp->ms_allocating[(txg + t) & TXG_MASK])); } if (!metaslab_debug_unload) metaslab_unload(msp); } - ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_freeingtree)); - ASSERT0(range_tree_space(msp->ms_freedtree)); + ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_freed)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); } @@ -2666,7 +2757,7 @@ static uint64_t metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) { uint64_t start; - range_tree_t *rt = msp->ms_tree; + range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; VERIFY(!msp->ms_condensing); @@ -2681,10 +2772,10 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); range_tree_remove(rt, start, size); - if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) + if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size); + range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); /* Track the last successful allocation */ msp->ms_alloc_txg = txg; @@ -3183,12 +3274,11 @@ next: void metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, - uint64_t txg) + boolean_t checkpoint) { metaslab_t *msp; - ASSERTV(spa_t *spa = vd->vdev_spa); + spa_t *spa = vd->vdev_spa; - ASSERT3U(txg, ==, spa->spa_syncing_txg); ASSERT(vdev_is_concrete(vd)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); @@ -3202,11 +3292,19 @@ metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); metaslab_check_free_impl(vd, offset, asize); + mutex_enter(&msp->ms_lock); - if (range_tree_space(msp->ms_freeingtree) == 0) { - vdev_dirty(vd, VDD_METASLAB, msp, txg); + if (range_tree_is_empty(msp->ms_freeing) && + range_tree_is_empty(msp->ms_checkpointing)) { + vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); + } + + if (checkpoint) { + ASSERT(spa_has_checkpoint(spa)); + range_tree_add(msp->ms_checkpointing, offset, asize); + } else { + range_tree_add(msp->ms_freeing, offset, asize); } - range_tree_add(msp->ms_freeingtree, offset, asize); mutex_exit(&msp->ms_lock); } @@ -3215,23 +3313,25 @@ void metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { - uint64_t *txgp = arg; + boolean_t *checkpoint = arg; + + ASSERT3P(checkpoint, !=, NULL); if (vd->vdev_ops->vdev_op_remap != NULL) - vdev_indirect_mark_obsolete(vd, offset, size, *txgp); + vdev_indirect_mark_obsolete(vd, offset, size); else - metaslab_free_impl(vd, offset, size, *txgp); + metaslab_free_impl(vd, offset, size, *checkpoint); } static void metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, - uint64_t txg) + boolean_t checkpoint) { spa_t *spa = vd->vdev_spa; ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - if (txg > spa_freeze_txg(spa)) + if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) 
return; if (spa->spa_vdev_removal != NULL && @@ -3243,13 +3343,13 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, * an indirect vdev (in open context), and then (in syncing * context) clear spa_vdev_removal. */ - free_from_removing_vdev(vd, offset, size, txg); + free_from_removing_vdev(vd, offset, size); } else if (vd->vdev_ops->vdev_op_remap != NULL) { - vdev_indirect_mark_obsolete(vd, offset, size, txg); + vdev_indirect_mark_obsolete(vd, offset, size); vd->vdev_ops->vdev_op_remap(vd, offset, size, - metaslab_free_impl_cb, &txg); + metaslab_free_impl_cb, &checkpoint); } else { - metaslab_free_concrete(vd, offset, size, txg); + metaslab_free_concrete(vd, offset, size, checkpoint); } } @@ -3426,26 +3526,25 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); - range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], + range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); - VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, + VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - range_tree_add(msp->ms_tree, offset, size); + range_tree_add(msp->ms_allocatable, offset, size); mutex_exit(&msp->ms_lock); } /* - * Free the block represented by DVA in the context of the specified - * transaction group. + * Free the block represented by the given DVA. */ void -metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); @@ -3459,7 +3558,7 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); } - metaslab_free_impl(vd, offset, size, txg); + metaslab_free_impl(vd, offset, size, checkpoint); } /* @@ -3529,7 +3628,8 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); - if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) + if (error == 0 && + !range_tree_contains(msp->ms_allocatable, offset, size)) error = SET_ERROR(ENOENT); if (error || txg == 0) { /* txg == 0 indicates dry run */ @@ -3540,13 +3640,15 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, VERIFY(!msp->ms_condensing); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); - range_tree_remove(msp->ms_tree, offset, size); + VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, + msp->ms_size); + range_tree_remove(msp->ms_allocatable, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ - if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) + if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(vd, VDD_METASLAB, msp, txg); - range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); + range_tree_add(msp->ms_allocating[txg & TXG_MASK], + offset, size); } mutex_exit(&msp->ms_lock); @@ -3691,13 +3793,41 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) ASSERT(!BP_IS_HOLE(bp)); ASSERT(!now || bp->blk_birth >= 
spa_syncing_txg(spa)); + /* + * If we have a checkpoint for the pool we need to make sure that + * the blocks that we free that are part of the checkpoint won't be + * reused until the checkpoint is discarded or we revert to it. + * + * The checkpoint flag is passed down the metaslab_free code path + * and is set whenever we want to add a block to the checkpoint's + * accounting. That is, we "checkpoint" blocks that existed at the + * time the checkpoint was created and are therefore referenced by + * the checkpointed uberblock. + * + * Note that, we don't checkpoint any blocks if the current + * syncing txg <= spa_checkpoint_txg. We want these frees to sync + * normally as they will be referenced by the checkpointed uberblock. + */ + boolean_t checkpoint = B_FALSE; + if (bp->blk_birth <= spa->spa_checkpoint_txg && + spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { + /* + * At this point, if the block is part of the checkpoint + * there is no way it was created in the current txg. + */ + ASSERT(!now); + ASSERT3U(spa_syncing_txg(spa), ==, txg); + checkpoint = B_TRUE; + } + spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { if (now) { metaslab_unalloc_dva(spa, &dva[d], txg); } else { - metaslab_free_dva(spa, &dva[d], txg); + ASSERT3U(txg, ==, spa_syncing_txg(spa)); + metaslab_free_dva(spa, &dva[d], checkpoint); } } @@ -3818,12 +3948,13 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) mutex_enter(&msp->ms_lock); if (msp->ms_loaded) - range_tree_verify(msp->ms_tree, offset, size); + range_tree_verify(msp->ms_allocatable, offset, size); - range_tree_verify(msp->ms_freeingtree, offset, size); - range_tree_verify(msp->ms_freedtree, offset, size); + range_tree_verify(msp->ms_freeing, offset, size); + range_tree_verify(msp->ms_checkpointing, offset, size); + range_tree_verify(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) - range_tree_verify(msp->ms_defertree[j], offset, size); + range_tree_verify(msp->ms_defer[j], offset, size); mutex_exit(&msp->ms_lock); } diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 448d00c1e..2181a92df 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> diff --git a/module/zfs/spa.c b/module/zfs/spa.c index cdc03e66c..8ab7c3428 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -153,8 +153,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); -static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, - boolean_t reloading); +static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ @@ -216,6 +215,7 @@ unsigned long zfs_max_missing_tvds = 0; * and we get a chance to retrieve the trusted config. 
*/ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; + /* * In the case where config was assembled by scanning device paths (/dev/dsks * by default) we are less tolerant since all the existing devices should have @@ -224,6 +224,11 @@ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; uint64_t zfs_max_missing_tvds_scan = 0; /* + * Debugging aid that pauses spa_sync() towards the end. + */ +boolean_t zfs_pause_spa_sync = B_FALSE; + +/* * ========================================================================== * SPA properties routines * ========================================================================== @@ -274,6 +279,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, + spa->spa_checkpoint_info.sci_dspace, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); @@ -811,6 +818,12 @@ spa_change_guid_check(void *arg, dmu_tx_t *tx) vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + int error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (SET_ERROR(error)); + } + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; spa_config_exit(spa, SCL_STATE, FTAG); @@ -1452,6 +1465,12 @@ spa_unload(spa_t *spa) spa->spa_condense_zthr = NULL; } + if (spa->spa_checkpoint_discard_zthr != NULL) { + ASSERT(!zthr_isrunning(spa->spa_checkpoint_discard_zthr)); + zthr_destroy(spa->spa_checkpoint_discard_zthr); + spa->spa_checkpoint_discard_zthr = NULL; + } + spa_condense_fini(spa); bpobj_close(&spa->spa_deferred_bpobj); @@ -1535,6 +1554,18 @@ spa_load_spares(spa_t *spa) int i; vdev_t *vd, *tvd; +#ifndef _KERNEL + /* + * zdb opens both the current state of the pool and the + * checkpointed state (if present), with a different spa_t. + * + * As spare vdevs are shared among open pools, we skip loading + * them when we load the checkpointed state of the pool. + */ + if (!spa_writeable(spa)) + return; +#endif + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* @@ -1654,6 +1685,19 @@ spa_load_l2cache(spa_t *spa) vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; +#ifndef _KERNEL + /* + * zdb opens both the current state of the pool and the + * checkpointed state (if present), with a different spa_t. + * + * As L2 caches are part of the ARC which is shared among open + * pools, we skip loading them when we load the checkpointed + * state of the pool. 
+ */ + if (!spa_writeable(spa)) + return; +#endif + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); oldvdevs = sav->sav_vdevs; @@ -2206,6 +2250,11 @@ spa_spawn_aux_threads(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_start_indirect_condensing_thread(spa); + + ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); + spa->spa_checkpoint_discard_zthr = + zthr_create(spa_checkpoint_discard_thread_check, + spa_checkpoint_discard_thread, spa); } /* @@ -2299,7 +2348,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) spa->spa_load_state = state; gethrestime(&spa->spa_loaded_ts); - error = spa_load_impl(spa, type, &ereport, B_FALSE); + error = spa_load_impl(spa, type, &ereport); /* * Don't count references from objsets that are already closed @@ -2606,8 +2655,25 @@ spa_ld_parse_config(spa_t *spa, spa_import_type_t type) return (SET_ERROR(EINVAL)); } - if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == - SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { + /* + * If we are doing an import, ensure that the pool is not already + * imported by checking if its pool guid already exists in the + * spa namespace. + * + * The only case that we allow an already imported pool to be + * imported again, is when the pool is checkpointed and we want to + * look at its checkpointed state from userland tools like zdb. + */ +#ifdef _KERNEL + if ((spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0)) { +#else + if ((spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0) && + !spa_importing_readonly_checkpoint(spa)) { +#endif spa_load_failed(spa, "a pool with guid %llu is already open", (u_longlong_t)pool_guid); return (SET_ERROR(EEXIST)); @@ -2766,6 +2832,19 @@ spa_ld_validate_vdevs(spa_t *spa) return (0); } +static void +spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) +{ + spa->spa_state = POOL_STATE_ACTIVE; + spa->spa_ubsync = spa->spa_uberblock; + spa->spa_verify_min_txg = spa->spa_extreme_rewind ? + TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; + spa->spa_first_txg = spa->spa_last_ubsync_txg ? + spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; + spa->spa_claim_max_txg = spa->spa_first_txg; + spa->spa_prev_software_version = ub->ub_software_version; +} + static int spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) { @@ -2775,6 +2854,29 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) boolean_t activity_check = B_FALSE; /* + * If we are opening the checkpointed state of the pool by + * rewinding to it, at this point we will have written the + * checkpointed uberblock to the vdev labels, so searching + * the labels will find the right uberblock. However, if + * we are opening the checkpointed state read-only, we have + * not modified the labels. Therefore, we must ignore the + * labels and continue using the spa_uberblock that was set + * by spa_ld_checkpoint_rewind. + * + * Note that it would be fine to ignore the labels when + * rewinding (opening writeable) as well. However, if we + * crash just after writing the labels, we will end up + * searching the labels. Doing so in the common case means + * that this code path gets exercised normally, rather than + * just in the edge case. 
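For reference, the read-only checkpoint test used by the check below is the small helper added to spa_misc.c later in this change:

        boolean_t
        spa_importing_readonly_checkpoint(spa_t *spa)
        {
                return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) &&
                    spa->spa_mode == FREAD);
        }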
+ */ + if (ub->ub_checkpoint_txg != 0 && + spa_importing_readonly_checkpoint(spa)) { + spa_ld_select_uberblock_done(spa, ub); + return (0); + } + + /* * Find the best uberblock. */ vdev_uberblock_load(rvd, ub, &label); @@ -2905,14 +3007,7 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) /* * Initialize internal SPA structures. */ - spa->spa_state = POOL_STATE_ACTIVE; - spa->spa_ubsync = spa->spa_uberblock; - spa->spa_verify_min_txg = spa->spa_extreme_rewind ? - TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; - spa->spa_first_txg = spa->spa_last_ubsync_txg ? - spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; - spa->spa_claim_max_txg = spa->spa_first_txg; - spa->spa_prev_software_version = ub->ub_software_version; + spa_ld_select_uberblock_done(spa, ub); return (0); } @@ -2935,7 +3030,7 @@ spa_ld_open_rootbp(spa_t *spa) } static int -spa_ld_load_trusted_config(spa_t *spa, spa_import_type_t type, +spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t reloading) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; @@ -3609,7 +3704,7 @@ spa_ld_claim_log_blocks(spa_t *spa) static void spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, - boolean_t reloading) + boolean_t update_config_cache) { vdev_t *rvd = spa->spa_root_vdev; int need_update = B_FALSE; @@ -3621,7 +3716,7 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ - if (reloading || config_cache_txg != spa->spa_config_txg || + if (update_config_cache || config_cache_txg != spa->spa_config_txg || spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) @@ -3657,18 +3752,38 @@ spa_ld_prepare_for_reload(spa_t *spa) spa->spa_async_suspended = async_suspended; } -/* - * Load an existing storage pool, using the config provided. This config - * describes which vdevs are part of the pool and is later validated against - * partial configs present in each vdev's label and an entire copy of the - * config stored in the MOS. - */ static int -spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, - boolean_t reloading) +spa_ld_read_checkpoint_txg(spa_t *spa) +{ + uberblock_t checkpoint; + int error = 0; + + ASSERT0(spa->spa_checkpoint_txg); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), + sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); + + if (error == ENOENT) + return (0); + + if (error != 0) + return (error); + + ASSERT3U(checkpoint.ub_txg, !=, 0); + ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); + ASSERT3U(checkpoint.ub_timestamp, !=, 0); + spa->spa_checkpoint_txg = checkpoint.ub_txg; + spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; + + return (0); +} + +static int +spa_ld_mos_init(spa_t *spa, spa_import_type_t type) { int error = 0; - boolean_t missing_feat_write = B_FALSE; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); @@ -3684,11 +3799,6 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, if (type != SPA_IMPORT_ASSEMBLE) spa->spa_trust_config = B_FALSE; - if (reloading) - spa_load_note(spa, "RELOADING"); - else - spa_load_note(spa, "LOADING"); - /* * Parse the config provided to create a vdev tree. 
*/ @@ -3721,11 +3831,11 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, } /* - * Read vdev labels to find the best uberblock (i.e. latest, unless - * spa_load_max_txg is set) and store it in spa_uberblock. We get the - * list of features required to read blkptrs in the MOS from the vdev - * label with the best uberblock and verify that our version of zfs - * supports them all. + * Read all vdev labels to find the best uberblock (i.e. latest, + * unless spa_load_max_txg is set) and store it in spa_uberblock. We + * get the list of features required to read blkptrs in the MOS from + * the vdev label with the best uberblock and verify that our version + * of zfs supports them all. */ error = spa_ld_select_uberblock(spa, type); if (error != 0) @@ -3740,23 +3850,211 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, if (error != 0) return (error); + return (0); +} + +static int +spa_ld_checkpoint_rewind(spa_t *spa) +{ + uberblock_t checkpoint; + int error = 0; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), + sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); + + if (error != 0) { + spa_load_failed(spa, "unable to retrieve checkpointed " + "uberblock from the MOS config [error=%d]", error); + + if (error == ENOENT) + error = ZFS_ERR_NO_CHECKPOINT; + + return (error); + } + + ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); + ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); + + /* + * We need to update the txg and timestamp of the checkpointed + * uberblock to be higher than the latest one. This ensures that + * the checkpointed uberblock is selected if we were to close and + * reopen the pool right after we've written it in the vdev labels. + * (also see block comment in vdev_uberblock_compare) + */ + checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; + checkpoint.ub_timestamp = gethrestime_sec(); + + /* + * Set current uberblock to be the checkpointed uberblock. + */ + spa->spa_uberblock = checkpoint; + + /* + * If we are doing a normal rewind, then the pool is open for + * writing and we sync the "updated" checkpointed uberblock to + * disk. Once this is done, we've basically rewound the whole + * pool and there is no way back. + * + * There are cases when we don't want to attempt and sync the + * checkpointed uberblock to disk because we are opening a + * pool as read-only. Specifically, verifying the checkpointed + * state with zdb, and importing the checkpointed state to get + * a "preview" of its content. 
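To make the ordering concrete (the txg numbers below are hypothetical, not taken from this change): if the labels' best uberblock has ub_txg 1000 and the checkpoint was taken at txg 850, the rewind path rewrites the checkpointed uberblock roughly as

        checkpoint.ub_txg = 1000 + 1;                 /* newer than anything in the labels */
        checkpoint.ub_timestamp = gethrestime_sec();
        /* checkpoint.ub_checkpoint_txg remains 850, the original checkpoint txg */

so that vdev_uberblock_compare() selects it on any later label scan.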
+ */ + if (spa_writeable(spa)) { + vdev_t *rvd = spa->spa_root_vdev; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; + int svdcount = 0; + int children = rvd->vdev_children; + int c0 = spa_get_random(children); + + for (int c = 0; c < children; c++) { + vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; + + /* Stop when revisiting the first vdev */ + if (c > 0 && svd[0] == vd) + break; + + if (vd->vdev_ms_array == 0 || vd->vdev_islog || + !vdev_is_concrete(vd)) + continue; + + svd[svdcount++] = vd; + if (svdcount == SPA_SYNC_MIN_VDEVS) + break; + } + error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); + if (error == 0) + spa->spa_last_synced_guid = rvd->vdev_guid; + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) { + spa_load_failed(spa, "failed to write checkpointed " + "uberblock to the vdev labels [error=%d]", error); + return (error); + } + } + + return (0); +} + +static int +spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, + boolean_t *update_config_cache) +{ + int error; + + /* + * Parse the config for pool, open and validate vdevs, + * select an uberblock, and use that uberblock to open + * the MOS. + */ + error = spa_ld_mos_init(spa, type); + if (error != 0) + return (error); + /* * Retrieve the trusted config stored in the MOS and use it to create * a new, exact version of the vdev tree, then reopen all vdevs. */ - error = spa_ld_load_trusted_config(spa, type, reloading); + error = spa_ld_trusted_config(spa, type, B_FALSE); if (error == EAGAIN) { - VERIFY(!reloading); + if (update_config_cache != NULL) + *update_config_cache = B_TRUE; + /* * Redo the loading process with the trusted config if it is * too different from the untrusted config. */ spa_ld_prepare_for_reload(spa); - return (spa_load_impl(spa, type, ereport, B_TRUE)); + spa_load_note(spa, "RELOADING"); + error = spa_ld_mos_init(spa, type); + if (error != 0) + return (error); + + error = spa_ld_trusted_config(spa, type, B_TRUE); + if (error != 0) + return (error); + } else if (error != 0) { return (error); } + return (0); +} + +/* + * Load an existing storage pool, using the config provided. This config + * describes which vdevs are part of the pool and is later validated against + * partial configs present in each vdev's label and an entire copy of the + * config stored in the MOS. + */ +static int +spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) +{ + int error = 0; + boolean_t missing_feat_write = B_FALSE; + boolean_t checkpoint_rewind = + (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); + boolean_t update_config_cache = B_FALSE; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); + + spa_load_note(spa, "LOADING"); + + error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); + if (error != 0) + return (error); + + /* + * If we are rewinding to the checkpoint then we need to repeat + * everything we've done so far in this function but this time + * selecting the checkpointed uberblock and using that to open + * the MOS. + */ + if (checkpoint_rewind) { + /* + * If we are rewinding to the checkpoint update config cache + * anyway. + */ + update_config_cache = B_TRUE; + + /* + * Extract the checkpointed uberblock from the current MOS + * and use this as the pool's uberblock from now on. If the + * pool is imported as writeable we also write the checkpoint + * uberblock to the labels, making the rewind permanent. 
+ */ + error = spa_ld_checkpoint_rewind(spa); + if (error != 0) + return (error); + + /* + * Redo the loading process again with the + * checkpointed uberblock. + */ + spa_ld_prepare_for_reload(spa); + spa_load_note(spa, "LOADING checkpointed uberblock"); + error = spa_ld_mos_with_trusted_config(spa, type, NULL); + if (error != 0) + return (error); + } + + /* + * Retrieve the checkpoint txg if the pool has a checkpoint. + */ + error = spa_ld_read_checkpoint_txg(spa); + if (error != 0) + return (error); + /* * Retrieve the mapping of indirect vdevs. Those vdevs were removed * from the pool and their contents were re-mapped to other vdevs. Note @@ -3860,6 +4158,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* + * In case of a checkpoint rewind, log the original txg + * of the checkpointed uberblock. + */ + if (checkpoint_rewind) { + spa_history_log_internal(spa, "checkpoint rewind", + NULL, "rewound state to txg=%llu", + (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); + } + + /* * Traverse the ZIL and claim all blocks. */ spa_ld_claim_log_blocks(spa); @@ -3886,7 +4194,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, * and the cachefile (by default /etc/zfs/zpool.cache). */ spa_ld_check_for_config_update(spa, config_cache_txg, - reloading); + update_config_cache); /* * Check all DTLs to see if anything needs resilvering. @@ -3970,6 +4278,15 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); if (load_error == 0) return (0); + if (load_error == ZFS_ERR_NO_CHECKPOINT) { + /* + * When attempting checkpoint-rewind on a pool with no + * checkpoint, we should not attempt to load uberblocks + * from previous txgs when spa_load fails. + */ + ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); + return (load_error); + } + if (spa->spa_root_vdev != NULL) config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); @@ -5564,6 +5881,13 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (spa_vdev_exit(spa, NULL, txg, error)); + } + if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); @@ -5776,6 +6100,27 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) vd = spa_lookup_by_guid(spa, guid, B_FALSE); + /* + * Besides being called directly from userland through the + * ioctl interface, spa_vdev_detach() can be potentially called + * at the end of spa_vdev_resilver_done(). + * + * In the regular case, when we have a checkpoint this shouldn't + * happen as we never empty the DTLs of a vdev during the scrub + * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done() + * should never get here when we have a checkpoint. + * + * That said, even in the case when we checkpoint the pool exactly + * as spa_vdev_resilver_done() calls this function, everything + * should be fine as the resilver will return right away. + */ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (spa_vdev_exit(spa, NULL, txg, error)); + } + if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -6014,6 +6359,13 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, txg = spa_vdev_enter(spa); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (spa_vdev_exit(spa, NULL, txg, error)); + } + /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); @@ -6665,6 +7017,10 @@ spa_async_suspend(spa_t *spa) zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL && zthr_isrunning(condense_thread)) VERIFY0(zthr_cancel(condense_thread)); + + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; + if (discard_thread != NULL && zthr_isrunning(discard_thread)) + VERIFY0(zthr_cancel(discard_thread)); } void @@ -6679,6 +7035,10 @@ spa_async_resume(spa_t *spa) zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL && !zthr_isrunning(condense_thread)) zthr_resume(condense_thread); + + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; + if (discard_thread != NULL && !zthr_isrunning(discard_thread)) + zthr_resume(discard_thread); } static boolean_t @@ -7454,6 +7814,8 @@ spa_sync(spa_t *spa, uint64_t txg) txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); + ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, + txg)); break; } spa_sync_deferred_frees(spa, tx); @@ -7499,16 +7861,22 @@ spa_sync(spa_t *spa, uint64_t txg) spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { - vdev_t *svd[SPA_SYNC_MIN_VDEVS]; + vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; + + /* Stop when revisiting the first vdev */ + if (c > 0 && svd[0] == vd) + break; + if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; + svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; @@ -7572,6 +7940,9 @@ spa_sync(spa_t *spa, uint64_t txg) ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); + while (zfs_pause_spa_sync) + delay(1); + spa->spa_sync_pass = 0; /* diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c new file mode 100644 index 000000000..544658821 --- /dev/null +++ b/module/zfs/spa_checkpoint.c @@ -0,0 +1,638 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ + +/* + * Storage Pool Checkpoint + * + * A storage pool checkpoint can be thought of as a pool-wide snapshot or + * a stable version of extreme rewind that guarantees no blocks from the + * checkpointed state will have been overwritten. It remembers the entire + * state of the storage pool (e.g. snapshots, dataset names, etc.) from the + * point at which it was taken, and the user can rewind back to that point even if + * they applied destructive operations on their datasets or even enabled new + * zpool on-disk features. If a pool has a checkpoint that is no longer + * needed, the user can discard it. + * + * == On disk data structures used == + * + * - The pool has a new feature flag and a new entry in the MOS. The feature + * flag is set to active when we create the checkpoint and remains active + * until the checkpoint is fully discarded. The entry in the MOS config + * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that + * references the state of the pool when we take the checkpoint. The entry + * remains populated until we start discarding the checkpoint or we rewind + * back to it. + * + * - Each vdev contains a vdev-wide space map while the pool has a checkpoint, + * which persists until the checkpoint is fully discarded. The space map + * contains entries that have been freed in the current state of the pool + * but we want to keep around in case we decide to rewind to the checkpoint. + * [see vdev_checkpoint_sm] + * + * - Each metaslab's ms_sm space map behaves the same as without the + * checkpoint, with the only exception being the scenario when we free + * blocks that belong to the checkpoint. In this case, these blocks remain + * ALLOCATED in the metaslab's space map and they are added as FREE in the + * vdev's checkpoint space map. + * + * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg at which + * the uberblock was checkpointed. For normal uberblocks this field is 0. + * + * == Overview of operations == + * + * - To create a checkpoint, we first wait for the current TXG to be synced, + * so we can use the most recently synced uberblock (spa_ubsync) as the + * checkpointed uberblock. Then we use an early synctask to place that + * uberblock in the MOS config, increment the feature flag for the checkpoint + * (marking it active), and set spa_checkpoint_txg (see its use below) + * to the TXG of the checkpointed uberblock. We use an early synctask for + * the aforementioned operations to ensure that no blocks were dirtied + * between the current TXG and the TXG of the checkpointed uberblock + * (i.e. the previous txg). + * + * - When a checkpoint exists, we need to ensure that the blocks that + * belong to the checkpoint are freed but never reused. This means that + * these blocks should never end up in the ms_allocatable or the ms_freeing + * trees of a metaslab. Therefore, whenever there is a checkpoint the new + * ms_checkpointing tree is used in addition to the aforementioned ones. + * + * Whenever a block is freed and we find out that it is referenced by the + * checkpoint (we find out by comparing its birth txg to spa_checkpoint_txg), + * we place it in the ms_checkpointing tree instead of the ms_freeing tree.
+ * This way, we divide the blocks that are being freed into checkpointed + * and not-checkpointed blocks. + * + * In order to persist these frees, we write the extents from the + * ms_freeing tree to the ms_sm as usual, and the extents from the + * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these + * checkpointed extents will remain allocated in the metaslab's ms_sm space + * map, and therefore won't be reused [see metaslab_sync()]. In addition, + * when we discard the checkpoint, we can find the entries that have + * actually been freed in vdev_checkpoint_sm. + * [see spa_checkpoint_discard_thread_sync()] + * + * - To discard the checkpoint we use an early synctask to delete the + * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0, + * and wake up the discarding zthr (an open-context async thread). + * We use an early synctask to ensure that the operation happens before any + * new data end up in the checkpoint's data structures. + * + * Once the synctask is done and the discarding zthr is awake, we discard + * the checkpointed data over multiple TXGs by having the zthr prefetch + * entries from vdev_checkpoint_sm and then start a synctask that places + * them as free blocks into their respective ms_allocatable and ms_sm + * structures. + * [see spa_checkpoint_discard_thread()] + * + * When there are no entries left in the vdev_checkpoint_sm of all + * top-level vdevs, a final synctask runs that decrements the feature flag. + * + * - To rewind to the checkpoint, we first use the current uberblock and + * open the MOS so we can access the checkpointed uberblock from the MOS + * config. After we retrieve the checkpointed uberblock, we use it as the + * current uberblock for the pool by writing it to disk with an updated + * TXG, opening its version of the MOS, and moving on as usual from there. + * [see spa_ld_checkpoint_rewind()] + * + * An important note on rewinding to the checkpoint has to do with how we + * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL + * blocks that have not been claimed by the time we took the checkpoint + * as they should no longer be valid. + * [see comment in zil_claim()] + * + * == Miscellaneous information == + * + * - In the hypothetical event that we take a checkpoint, remove a vdev, + * and attempt to rewind, the rewind would fail as the checkpointed + * uberblock would reference data in the removed device. For this reason + * and others of similar nature, we disallow the following operations that + * can change the config: + * vdev removal and attach/detach, mirror splitting, and pool reguid. + * + * - As most of the checkpoint logic is implemented in the SPA and doesn't + * distinguish datasets when it comes to space accounting, having a + * checkpoint can potentially break the boundaries set by dataset + * reservations. + */ + +#include <sys/dmu_tx.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_synctask.h> +#include <sys/metaslab_impl.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/spa_checkpoint.h> +#include <sys/vdev_impl.h> +#include <sys/zap.h> +#include <sys/zfeature.h> + +/* + * The following parameter limits the amount of memory used for + * prefetching the checkpoint space map on each vdev while + * discarding the checkpoint.
+ * + * The reason it exists is because top-level vdevs with long checkpoint + * space maps can potentially take up a lot of memory depending on the + * amount of checkpointed data that has been freed within them while + * the pool had a checkpoint. + */ +unsigned long zfs_spa_discard_memory_limit = 16 * 1024 * 1024; + +int +spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); + + bzero(pcs, sizeof (pool_checkpoint_stat_t)); + + int error = zap_contains(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT); + ASSERT(error == 0 || error == ENOENT); + + if (error == ENOENT) + pcs->pcs_state = CS_CHECKPOINT_DISCARDING; + else + pcs->pcs_state = CS_CHECKPOINT_EXISTS; + + pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace; + pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp; + + return (0); +} + +static void +spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + + spa->spa_checkpoint_info.sci_timestamp = 0; + + spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); + + spa_history_log_internal(spa, "spa discard checkpoint", tx, + "finished discarding checkpointed state from the pool"); +} + +typedef struct spa_checkpoint_discard_sync_callback_arg { + vdev_t *sdc_vd; + uint64_t sdc_txg; + uint64_t sdc_entry_limit; +} spa_checkpoint_discard_sync_callback_arg_t; + +static int +spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset, + uint64_t size, void *arg) +{ + spa_checkpoint_discard_sync_callback_arg_t *sdc = arg; + vdev_t *vd = sdc->sdc_vd; + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + uint64_t end = offset + size; + + if (sdc->sdc_entry_limit == 0) + return (EINTR); + + /* + * Since the space map is not condensed, we know that + * none of its entries is crossing the boundaries of + * its respective metaslab. + * + * That said, there is no fundamental requirement that + * the checkpoint's space map entries should not cross + * metaslab boundaries. So if needed we could add code + * that handles metaslab-crossing segments in the future. + */ + VERIFY3U(type, ==, SM_FREE); + VERIFY3U(offset, >=, ms->ms_start); + VERIFY3U(end, <=, ms->ms_start + ms->ms_size); + + /* + * At this point we should not be processing any + * other frees concurrently, so the lock is technically + * unnecessary. We use the lock anyway though to + * potentially save ourselves from future headaches. 
+ */ + mutex_enter(&ms->ms_lock); + if (range_tree_is_empty(ms->ms_freeing)) + vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg); + range_tree_add(ms->ms_freeing, offset, size); + mutex_exit(&ms->ms_lock); + + ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, size); + ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, size); + + vd->vdev_spa->spa_checkpoint_info.sci_dspace -= size; + vd->vdev_stat.vs_checkpoint_space -= size; + sdc->sdc_entry_limit--; + + return (0); +} + +#ifdef ZFS_DEBUG +static void +spa_checkpoint_accounting_verify(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t ckpoint_sm_space_sum = 0; + uint64_t vs_ckpoint_space_sum = 0; + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + if (vd->vdev_checkpoint_sm != NULL) { + ckpoint_sm_space_sum += + -vd->vdev_checkpoint_sm->sm_alloc; + vs_ckpoint_space_sum += + vd->vdev_stat.vs_checkpoint_space; + ASSERT3U(ckpoint_sm_space_sum, ==, + vs_ckpoint_space_sum); + } else { + ASSERT0(vd->vdev_stat.vs_checkpoint_space); + } + } + ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum); +} +#endif + +static void +spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *vd = arg; + int error; + + /* + * The space map callback is applied only to non-debug entries. + * Because the number of debug entries is less or equal to the + * number of non-debug entries, we want to ensure that we only + * read what we prefetched from open-context. + * + * Thus, we set the maximum entries that the space map callback + * will be applied to be half the entries that could fit in the + * imposed memory limit. + */ + uint64_t max_entry_limit = + (zfs_spa_discard_memory_limit / sizeof (uint64_t)) >> 1; + + uint64_t entries_in_sm = + space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); + + /* + * Iterate from the end of the space map towards the beginning, + * placing its entries on ms_freeing and removing them from the + * space map. The iteration stops if one of the following + * conditions is true: + * + * 1] We reached the beginning of the space map. At this point + * the space map should be completely empty and + * space_map_incremental_destroy should have returned 0. + * The next step would be to free and close the space map + * and remove its entry from its vdev's top zap. This allows + * spa_checkpoint_discard_thread() to move on to the next vdev. + * + * 2] We reached the memory limit (amount of memory used to hold + * space map entries in memory) and space_map_incremental_destroy + * returned EINTR. This means that there are entries remaining + * in the space map that will be cleared in a future invocation + * of this function by spa_checkpoint_discard_thread(). 
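With the default zfs_spa_discard_memory_limit of 16 MiB, the sizing described above works out as follows (a rough illustration, not code from this change):

        uint64_t limit = 16 * 1024 * 1024;                 /* bytes prefetched per pass */
        uint64_t prefetched = limit / sizeof (uint64_t);   /* 2097152 raw entries */
        uint64_t per_pass = prefetched >> 1;               /* 1048576 callback entries */

so a checkpoint space map holding more entries than that is destroyed over several invocations of this synctask.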
+ */ + spa_checkpoint_discard_sync_callback_arg_t sdc; + sdc.sdc_vd = vd; + sdc.sdc_txg = tx->tx_txg; + sdc.sdc_entry_limit = MIN(entries_in_sm, max_entry_limit); + + uint64_t entries_before = entries_in_sm; + + error = space_map_incremental_destroy(vd->vdev_checkpoint_sm, + spa_checkpoint_discard_sync_callback, &sdc, tx); + + uint64_t entries_after = + space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); + +#ifdef ZFS_DEBUG + spa_checkpoint_accounting_verify(vd->vdev_spa); +#endif + + zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, " + "deleted %llu entries - %llu entries are left", + tx->tx_txg, vd->vdev_id, (entries_before - entries_after), + entries_after); + + if (error != EINTR) { + if (error != 0) { + zfs_panic_recover("zfs: error %d was returned " + "while incrementally destroying the checkpoint " + "space map of vdev %llu\n", + error, vd->vdev_id); + } + ASSERT0(entries_after); + ASSERT0(vd->vdev_checkpoint_sm->sm_alloc); + ASSERT0(vd->vdev_checkpoint_sm->sm_length); + + space_map_free(vd->vdev_checkpoint_sm, tx); + space_map_close(vd->vdev_checkpoint_sm); + vd->vdev_checkpoint_sm = NULL; + + VERIFY0(zap_remove(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx)); + } +} + +static boolean_t +spa_checkpoint_discard_is_done(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(!spa_has_checkpoint(spa)); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)); + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL) + return (B_FALSE); + ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space); + } + + return (B_TRUE); +} + +/* ARGSUSED */ +boolean_t +spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + + if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (B_FALSE); + + if (spa_has_checkpoint(spa)) + return (B_FALSE); + + return (B_TRUE); +} + +int +spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + vdev_t *rvd = spa->spa_root_vdev; + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + while (vd->vdev_checkpoint_sm != NULL) { + space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm; + int numbufs; + dmu_buf_t **dbp; + + if (zthr_iscancelled(zthr)) + return (0); + + ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); + + uint64_t size = MIN(space_map_length(checkpoint_sm), + zfs_spa_discard_memory_limit); + uint64_t offset = + space_map_length(checkpoint_sm) - size; + + /* + * Ensure that the part of the space map that will + * be destroyed by the synctask, is prefetched in + * memory before the synctask runs. 
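For context, the discard zthr that runs this function is wired up elsewhere in this change; the relevant calls, gathered here for reference, are:

        /* created once per pool in spa_spawn_aux_threads() */
        spa->spa_checkpoint_discard_zthr =
            zthr_create(spa_checkpoint_discard_thread_check,
            spa_checkpoint_discard_thread, spa);

        /* kicked by the discard early synctask */
        zthr_wakeup(spa->spa_checkpoint_discard_zthr);

        /* cancelled (when running) and resumed around spa_async_suspend()/resume() */
        VERIFY0(zthr_cancel(spa->spa_checkpoint_discard_zthr));
        zthr_resume(spa->spa_checkpoint_discard_zthr);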
+ */ + int error = dmu_buf_hold_array_by_bonus( + checkpoint_sm->sm_dbuf, offset, size, + B_TRUE, FTAG, &numbufs, &dbp); + if (error != 0) { + zfs_panic_recover("zfs: error %d was returned " + "while prefetching checkpoint space map " + "entries of vdev %llu\n", + error, vd->vdev_id); + } + + VERIFY0(dsl_sync_task(spa->spa_name, NULL, + spa_checkpoint_discard_thread_sync, vd, + 0, ZFS_SPACE_CHECK_NONE)); + + dmu_buf_rele_array(dbp, numbufs, FTAG); + } + } + + VERIFY(spa_checkpoint_discard_is_done(spa)); + VERIFY0(spa->spa_checkpoint_info.sci_dspace); + VERIFY0(dsl_sync_task(spa->spa_name, NULL, + spa_checkpoint_discard_complete_sync, spa, + 0, ZFS_SPACE_CHECK_NONE)); + + return (0); +} + + +/* ARGSUSED */ +static int +spa_checkpoint_check(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ENOTSUP)); + + if (!spa_top_vdevs_spacemap_addressable(spa)) + return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG)); + + if (spa->spa_vdev_removal != NULL) + return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); + + if (spa->spa_checkpoint_txg != 0) + return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); + + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); + + return (0); +} + +/* ARGSUSED */ +static void +spa_checkpoint_sync(void *arg, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + uberblock_t checkpoint = spa->spa_ubsync; + + /* + * At this point, there should not be a checkpoint in the MOS. + */ + ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT); + + ASSERT0(spa->spa_checkpoint_info.sci_timestamp); + ASSERT0(spa->spa_checkpoint_info.sci_dspace); + + /* + * Since the checkpointed uberblock is the one that just got synced + * (we use spa_ubsync), its txg must be equal to the txg number of + * the txg we are syncing, minus 1. + */ + ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1); + + /* + * Once the checkpoint is in place, we need to ensure that none of + * its blocks will be marked for reuse after it has been freed. + * When there is a checkpoint and a block is freed, we compare its + * birth txg to the txg of the checkpointed uberblock to see if the + * block is part of the checkpoint or not. Therefore, we have to set + * spa_checkpoint_txg before any frees happen in this txg (which is + * why this is done as an early_synctask as explained in the comment + * in spa_checkpoint()). + */ + spa->spa_checkpoint_txg = checkpoint.ub_txg; + spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; + + checkpoint.ub_checkpoint_txg = checkpoint.ub_txg; + VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, + sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), + &checkpoint, tx)); + + /* + * Increment the feature refcount and thus activate the feature. + * Note that the feature will be deactivated when we've + * completely discarded all checkpointed state (both vdev + * space maps and uberblock). + */ + spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); + + spa_history_log_internal(spa, "spa checkpoint", tx, + "checkpointed uberblock txg=%llu", checkpoint.ub_txg); +} + +/* + * Create a checkpoint for the pool. 
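As a worked example of the txg bookkeeping above (hypothetical numbers): if the checkpoint request arrives while txg 120 is still syncing, txg_wait_synced() in spa_checkpoint() below returns once txg 120 is on disk, so spa_ubsync.ub_txg == 120; the early synctask then runs as part of txg 121, and the ASSERT in spa_checkpoint_sync() holds because

        checkpoint.ub_txg == spa->spa_syncing_txg - 1 == 121 - 1 == 120

with spa_checkpoint_txg set to 120 before any frees of txg 121 are processed.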
+ */ +int +spa_checkpoint(const char *pool) +{ + int error; + spa_t *spa; + + error = spa_open(pool, &spa, FTAG); + if (error != 0) + return (error); + + mutex_enter(&spa->spa_vdev_top_lock); + + /* + * Wait for the current syncing txg to finish so the latest synced + * uberblock (spa_ubsync) has all the changes that we expect + * to see if we were to revert later to the checkpoint. In other + * words, we want the checkpointed uberblock to include/reference + * all the changes that were pending at the time that we issued + * the checkpoint command. + */ + txg_wait_synced(spa_get_dsl(spa), 0); + + /* + * As the checkpointed uberblock references blocks from the previous + * txg (spa_ubsync), we want to ensure that we are not freeing any of + * these blocks in the same txg that the following synctask will + * run. Thus, we run it as an early synctask, so the dirty changes + * that are synced to disk afterwards during zios and other synctasks + * do not reuse checkpointed blocks. + */ + error = dsl_early_sync_task(pool, spa_checkpoint_check, + spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL); + + mutex_exit(&spa->spa_vdev_top_lock); + + spa_close(spa, FTAG); + return (error); +} + +/* ARGSUSED */ +static int +spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); + + if (spa->spa_checkpoint_txg == 0) + return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); + + VERIFY0(zap_contains(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT)); + + return (0); +} + +/* ARGSUSED */ +static void +spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT, tx)); + + spa->spa_checkpoint_txg = 0; + + zthr_wakeup(spa->spa_checkpoint_discard_zthr); + + spa_history_log_internal(spa, "spa discard checkpoint", tx, + "started discarding checkpointed state from the pool"); +} + +/* + * Discard the checkpoint from a pool. + */ +int +spa_checkpoint_discard(const char *pool) +{ + /* + * Similarly to spa_checkpoint(), we want our synctask to run + * before any pending dirty data are written to disk so they + * won't end up in the checkpoint's data structures (e.g. + * ms_checkpointing and vdev_checkpoint_sm) and re-create any + * space maps that the discarding open-context thread has + * deleted. + * [see spa_checkpoint_discard_sync and spa_checkpoint_discard_thread] + */ + return (dsl_early_sync_task(pool, spa_checkpoint_discard_check, + spa_checkpoint_discard_sync, NULL, 0, + ZFS_SPACE_CHECK_DISCARD_CHECKPOINT)); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(spa_checkpoint_get_stats); +EXPORT_SYMBOL(spa_checkpoint_discard_thread); +EXPORT_SYMBOL(spa_checkpoint_discard_thread_check); + +/* BEGIN CSTYLED */ +module_param(zfs_spa_discard_memory_limit, ulong, 0644); +MODULE_PARM_DESC(zfs_spa_discard_memory_limit, + "Maximum memory for prefetching checkpoint space " + "map per top-level vdev while discarding checkpoint"); +/* END CSTYLED */ +#endif diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 6c7e2f55c..9410fab07 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -357,12 +357,15 @@ int spa_asize_inflation = 24; * These are the operations that call dsl_pool_adjustedsize() with the netfree * argument set to TRUE.
* + * Operations that are almost guaranteed to free up space in the absence of + * a pool checkpoint can use up to three quarters of the slop space + * (e.g zfs destroy). + * * A very restricted set of operations are always permitted, regardless of * the amount of free space. These are the operations that call - * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy". If these - * operations result in a net increase in the amount of space used, - * it is possible to run the pool completely out of space, causing it to - * be permanently read-only. + * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net + * increase in the amount of space used, it is possible to run the pool + * completely out of space, causing it to be permanently read-only. * * Note that on very small pools, the slop space will be larger than * 3.2%, in an effort to have it be at least spa_min_slop (128MB), @@ -1718,6 +1721,12 @@ spa_get_dspace(spa_t *spa) return (spa->spa_dspace); } +uint64_t +spa_get_checkpoint_space(spa_t *spa) +{ + return (spa->spa_checkpoint_info.sci_dspace); +} + void spa_update_dspace(spa_t *spa) { @@ -2065,7 +2074,8 @@ spa_writeable(spa_t *spa) boolean_t spa_has_pending_synctask(spa_t *spa) { - return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks)); + return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) || + !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); } int @@ -2293,6 +2303,63 @@ spa_state_to_name(spa_t *spa) return ("UNKNOWN"); } +boolean_t +spa_top_vdevs_spacemap_addressable(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + if (!vdev_is_spacemap_addressable(rvd->vdev_child[c])) + return (B_FALSE); + } + return (B_TRUE); +} + +boolean_t +spa_has_checkpoint(spa_t *spa) +{ + return (spa->spa_checkpoint_txg != 0); +} + +boolean_t +spa_importing_readonly_checkpoint(spa_t *spa) +{ + return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && + spa->spa_mode == FREAD); +} + +uint64_t +spa_min_claim_txg(spa_t *spa) +{ + uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg; + + if (checkpoint_txg != 0) + return (checkpoint_txg + 1); + + return (spa->spa_first_txg); +} + +/* + * If there is a checkpoint, async destroys may consume more space from + * the pool instead of freeing it. In an attempt to save the pool from + * getting suspended when it is about to run out of space, we stop + * processing async destroys. + */ +boolean_t +spa_suspend_async_destroy(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + + uint64_t unreserved = dsl_pool_unreserved_space(dp, + ZFS_SPACE_CHECK_EXTRA_RESERVED); + uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; + uint64_t avail = (unreserved > used) ? 
(unreserved - used) : 0; + + if (spa_has_checkpoint(spa) && avail == 0) + return (B_TRUE); + + return (B_FALSE); +} + #if defined(_KERNEL) #include <linux/mod_compat.h> @@ -2446,6 +2513,11 @@ EXPORT_SYMBOL(spa_trust_config); EXPORT_SYMBOL(spa_missing_tvds_allowed); EXPORT_SYMBOL(spa_set_missing_tvds); EXPORT_SYMBOL(spa_state_to_name); +EXPORT_SYMBOL(spa_importing_readonly_checkpoint); +EXPORT_SYMBOL(spa_min_claim_txg); +EXPORT_SYMBOL(spa_suspend_async_destroy); +EXPORT_SYMBOL(spa_has_checkpoint); +EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable); /* BEGIN CSTYLED */ module_param(zfs_flags, uint, 0644); diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index d84dd7583..0e5a4b976 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -38,12 +38,13 @@ #include <sys/zfeature.h> /* + * Note on space map block size: + * * The data for a given space map can be kept on blocks of any size. * Larger blocks entail fewer i/o operations, but they also cause the * DMU to keep more data in-core, and also to waste more i/o bandwidth * when only a few blocks have changed since the last transaction group. */ -int space_map_blksz = (1 << 12); /* * Iterate through the space map, invoking the callback on each (non-debug) @@ -105,6 +106,137 @@ space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg) return (error); } +/* + * Note: This function performs destructive actions - specifically + * it deletes entries from the end of the space map. Thus, callers + * should ensure that they are holding the appropriate locks for + * the space map that they provide. + */ +int +space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, + dmu_tx_t *tx) +{ + uint64_t bufsize, len; + uint64_t *entry_map; + int error = 0; + + len = space_map_length(sm); + bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); + entry_map = zio_buf_alloc(bufsize); + + dmu_buf_will_dirty(sm->sm_dbuf, tx); + + /* + * Since we can't move the starting offset of the space map + * (e.g. there are on-disk references pointing to it), we destroy + * its entries incrementally starting from the end. + * + * The logic that follows is basically the same as the one used + * in space_map_iterate() but it traverses the space map + * backwards: + * + * 1] We figure out the size of the buffer that we want to use + * to read the on-disk space map entries. + * 2] We figure out the offset at the end of the space map where + * we will start reading entries into our buffer. + * 3] We read the on-disk entries into the buffer. + * 4] We iterate over the entries from end to beginning, calling + * the callback function on each one. As we move from entry + * to entry we decrease the size of the space map, effectively + * deleting each entry. + * 5] If there are no more entries in the space map or the + * callback returns a value other than 0, we stop iterating + * over the space map. If there are entries remaining and + * the callback returned zero we go back to step [1].
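A callback passed to space_map_incremental_destroy() has the same shape as the checkpoint-discard callback earlier in this change; a minimal, hypothetical example that merely tallies freed space might look like:

        static int
        count_freed_cb(maptype_t type, uint64_t offset, uint64_t size, void *arg)
        {
                uint64_t *freed = arg;

                if (type == SM_FREE)
                        *freed += size;
                return (0);        /* a nonzero return stops the iteration */
        }

The offset and size handed to the callback are already translated into the map's address space (sm_start added, sm_shift applied), as the VERIFYs in the loop below show.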
+ */ + uint64_t offset = 0, size = 0; + while (len > 0 && error == 0) { + size = MIN(bufsize, len); + + VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); + VERIFY3U(size, >, 0); + ASSERT3U(sm->sm_blksz, !=, 0); + + offset = len - size; + + IMPLY(bufsize > len, offset == 0); + IMPLY(bufsize == len, offset == 0); + IMPLY(bufsize < len, offset > 0); + + + EQUIV(size == len, offset == 0); + IMPLY(size < len, bufsize < len); + + dprintf("object=%llu offset=%llx size=%llx\n", + space_map_object(sm), offset, size); + + error = dmu_read(sm->sm_os, space_map_object(sm), + offset, size, entry_map, DMU_READ_PREFETCH); + if (error != 0) + break; + + uint64_t num_entries = size / sizeof (uint64_t); + + ASSERT3U(num_entries, >, 0); + + while (num_entries > 0) { + uint64_t e, entry_offset, entry_size; + maptype_t type; + + e = entry_map[num_entries - 1]; + + ASSERT3U(num_entries, >, 0); + ASSERT0(error); + + if (SM_DEBUG_DECODE(e)) { + sm->sm_phys->smp_objsize -= sizeof (uint64_t); + space_map_update(sm); + len -= sizeof (uint64_t); + num_entries--; + continue; + } + + type = SM_TYPE_DECODE(e); + entry_offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) + + sm->sm_start; + entry_size = SM_RUN_DECODE(e) << sm->sm_shift; + + VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); + VERIFY0(P2PHASE(entry_size, 1ULL << sm->sm_shift)); + VERIFY3U(entry_offset, >=, sm->sm_start); + VERIFY3U(entry_offset + entry_size, <=, + sm->sm_start + sm->sm_size); + + error = callback(type, entry_offset, entry_size, arg); + if (error != 0) + break; + + if (type == SM_ALLOC) + sm->sm_phys->smp_alloc -= entry_size; + else + sm->sm_phys->smp_alloc += entry_size; + + sm->sm_phys->smp_objsize -= sizeof (uint64_t); + space_map_update(sm); + len -= sizeof (uint64_t); + num_entries--; + } + IMPLY(error == 0, num_entries == 0); + EQUIV(offset == 0 && error == 0, len == 0 && num_entries == 0); + } + + if (len == 0) { + ASSERT0(error); + ASSERT0(offset); + ASSERT0(sm->sm_length); + ASSERT0(sm->sm_phys->smp_objsize); + ASSERT0(sm->sm_alloc); + } + + zio_buf_free(entry_map, bufsize); + return (error); +} + typedef struct space_map_load_arg { space_map_t *smla_sm; range_tree_t *smla_rt; @@ -279,7 +411,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, */ sm->sm_phys->smp_object = sm->sm_object; - if (range_tree_space(rt) == 0) { + if (range_tree_is_empty(rt)) { VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object); return; } @@ -418,7 +550,7 @@ space_map_close(space_map_t *sm) } void -space_map_truncate(space_map_t *sm, dmu_tx_t *tx) +space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) { objset_t *os = sm->sm_os; spa_t *spa = dmu_objset_spa(os); @@ -440,7 +572,7 @@ space_map_truncate(space_map_t *sm, dmu_tx_t *tx) */ if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && doi.doi_bonus_size != sizeof (space_map_phys_t)) || - doi.doi_data_block_size != space_map_blksz) { + doi.doi_data_block_size != blocksize) { zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating " "object[%llu]: old bonus %u, old blocksz %u", dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object, @@ -449,7 +581,7 @@ space_map_truncate(space_map_t *sm, dmu_tx_t *tx) space_map_free(sm, tx); dmu_buf_rele(sm->sm_dbuf, sm); - sm->sm_object = space_map_alloc(sm->sm_os, tx); + sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx); VERIFY0(space_map_open_impl(sm)); } else { VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx)); @@ -482,7 +614,7 @@ space_map_update(space_map_t *sm) } uint64_t -space_map_alloc(objset_t *os, 
dmu_tx_t *tx) +space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(os); uint64_t object; @@ -496,8 +628,7 @@ space_map_alloc(objset_t *os, dmu_tx_t *tx) bonuslen = SPACE_MAP_SIZE_V0; } - object = dmu_object_alloc(os, - DMU_OT_SPACE_MAP, space_map_blksz, + object = dmu_object_alloc(os, DMU_OT_SPACE_MAP, blocksize, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); return (object); diff --git a/module/zfs/uberblock.c b/module/zfs/uberblock.c index c1e85bdce..3b8526076 100644 --- a/module/zfs/uberblock.c +++ b/module/zfs/uberblock.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -60,6 +60,7 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay) ub->ub_mmp_magic = MMP_MAGIC; ub->ub_mmp_delay = spa_multihost(rvd->vdev_spa) ? mmp_delay : 0; ub->ub_mmp_seq = 0; + ub->ub_checkpoint_txg = 0; return (ub->ub_rootbp.blk_birth == txg); } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 37bb5a0c5..cf1bf2837 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -52,11 +52,22 @@ #include <sys/zvol.h> #include <sys/zfs_ratelimit.h> +/* maximum number of metaslabs per top-level vdev */ +int vdev_max_ms_count = 200; + +/* minimum amount of metaslabs per top-level vdev */ +int vdev_min_ms_count = 16; + +/* see comment in vdev_metaslab_set_size() */ +int vdev_default_ms_shift = 29; + +int vdev_validate_skip = B_FALSE; + /* - * When a vdev is added, it will be divided into approximately (but no - * more than) this number of metaslabs. + * Since the DTL space map of a vdev is not expected to have a lot of + * entries, we default its block size to 4K. */ -int metaslabs_per_vdev = 200; +int vdev_dtl_sm_blksz = (1 << 12); /* * Rate limit delay events to this many IO delays per second. @@ -74,7 +85,12 @@ unsigned int zfs_checksums_per_second = 20; */ int zfs_scan_ignore_errors = 0; -int vdev_validate_skip = B_FALSE; +/* + * vdev-wide space maps that have lots of entries written to them at + * the end of each transaction can benefit from a higher I/O bandwidth + * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. + */ +int vdev_standard_sm_blksz = (1 << 17); /*PRINTFLIKE2*/ void @@ -926,6 +942,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; + tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; + svd->vdev_checkpoint_sm = NULL; + tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; @@ -1169,6 +1188,21 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) void vdev_metaslab_fini(vdev_t *vd) { + if (vd->vdev_checkpoint_sm != NULL) { + ASSERT(spa_feature_is_active(vd->vdev_spa, + SPA_FEATURE_POOL_CHECKPOINT)); + space_map_close(vd->vdev_checkpoint_sm); + /* + * Even though we close the space map, we need to set its + * pointer to NULL. The reason is that vdev_metaslab_fini() + * may be called multiple times for certain operations + * (i.e. when destroying a pool) so we need to ensure that + * this clause never executes twice. This logic is similar + * to the one used for the vdev_ms clause below. 
+ */ + vd->vdev_checkpoint_sm = NULL; + } + if (vd->vdev_ms != NULL) { uint64_t count = vd->vdev_ms_count; @@ -2095,11 +2129,39 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) void vdev_metaslab_set_size(vdev_t *vd) { + uint64_t asize = vd->vdev_asize; + uint64_t ms_shift = 0; + /* - * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev. + * For vdevs that are bigger than 8G the metaslab size varies in + * a way that the number of metaslabs increases in powers of two, + * linearly in terms of vdev_asize, starting from 16 metaslabs. + * So for vdev_asize of 8G we get 16 metaslabs, for 16G, we get 32, + * and so on, until we hit the maximum metaslab count limit + * [vdev_max_ms_count] from which point the metaslab count stays + * the same. */ - vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev); - vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); + ms_shift = vdev_default_ms_shift; + + if ((asize >> ms_shift) < vdev_min_ms_count) { + /* + * For devices that are less than 8G we want to have + * exactly 16 metaslabs. We don't want less as integer + * division rounds down, so less metaslabs mean more + * wasted space. We don't want more as these vdevs are + * small and in the likely event that we are running + * out of space, the SPA will have a hard time finding + * space due to fragmentation. + */ + ms_shift = highbit64(asize / vdev_min_ms_count); + ms_shift = MAX(ms_shift, SPA_MAXBLOCKSHIFT); + + } else if ((asize >> ms_shift) > vdev_max_ms_count) { + ms_shift = highbit64(asize / vdev_max_ms_count); + } + + vd->vdev_ms_shift = ms_shift; + ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } void @@ -2204,7 +2266,7 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) return (B_FALSE); mutex_enter(&vd->vdev_dtl_lock); - if (range_tree_space(rt) != 0) + if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); @@ -2218,7 +2280,7 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); - empty = (range_tree_space(rt) == 0); + empty = range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); @@ -2292,7 +2354,7 @@ vdev_dtl_should_excise(vdev_t *vd) return (B_FALSE); if (vd->vdev_resilver_txg == 0 || - range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0) + range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); /* @@ -2396,8 +2458,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) * the top level so that we persist the change. 
*/ if (vd->vdev_resilver_txg != 0 && - range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 && - range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) { + range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && + range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } @@ -2557,7 +2619,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; - new_object = space_map_alloc(mos, tx); + new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, @@ -2571,7 +2633,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); - space_map_truncate(vd->vdev_dtl_sm, tx); + space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); range_tree_vacate(rtsync, NULL, NULL); @@ -2642,7 +2704,7 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); - if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 && + if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); @@ -2670,6 +2732,28 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) return (needed); } +/* + * Gets the checkpoint space map object from the vdev's ZAP. + * Returns the spacemap object, or 0 if it wasn't in the ZAP + * or the ZAP doesn't exist yet. + */ +int +vdev_checkpoint_sm_object(vdev_t *vd) +{ + ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + if (vd->vdev_top_zap == 0) { + return (0); + } + + uint64_t sm_obj = 0; + int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj); + + VERIFY(err == 0 || err == ENOENT); + + return (sm_obj); +} + int vdev_load(vdev_t *vd) { @@ -2705,6 +2789,35 @@ vdev_load(vdev_t *vd) VDEV_AUX_CORRUPT_DATA); return (error); } + + uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd); + if (checkpoint_sm_obj != 0) { + objset_t *mos = spa_meta_objset(vd->vdev_spa); + ASSERT(vd->vdev_asize != 0); + ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); + + if ((error = space_map_open(&vd->vdev_checkpoint_sm, + mos, checkpoint_sm_obj, 0, vd->vdev_asize, + vd->vdev_ashift))) { + vdev_dbgmsg(vd, "vdev_load: space_map_open " + "failed for checkpoint spacemap (obj %llu) " + "[error=%d]", + (u_longlong_t)checkpoint_sm_obj, error); + return (error); + } + ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); + space_map_update(vd->vdev_checkpoint_sm); + + /* + * Since the checkpoint_sm contains free entries + * exclusively we can use sm_alloc to indicate the + * culmulative checkpointed space that has been freed. 
+ */ + vd->vdev_stat.vs_checkpoint_space = + -vd->vdev_checkpoint_sm->sm_alloc; + vd->vdev_spa->spa_checkpoint_info.sci_dspace += + vd->vdev_stat.vs_checkpoint_space; + } } /* @@ -2722,7 +2835,7 @@ vdev_load(vdev_t *vd) if (obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); - ASSERT(vd->vdev_obsolete_sm == NULL); + ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { @@ -2848,6 +2961,12 @@ vdev_remove_empty(vdev_t *vd, uint64_t txg) mutex_exit(&msp->ms_lock); } + if (vd->vdev_checkpoint_sm != NULL) { + ASSERT(spa_has_checkpoint(spa)); + space_map_close(vd->vdev_checkpoint_sm); + vd->vdev_checkpoint_sm = NULL; + } + metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) @@ -3181,6 +3300,17 @@ top: error = spa_reset_logs(spa); + /* + * If the log device was successfully reset but has + * checkpointed data, do not offline it. + */ + if (error == 0 && + tvd->vdev_checkpoint_sm != NULL) { + ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc, + !=, 0); + error = ZFS_ERR_CHECKPOINT_EXISTS; + } + spa_vdev_state_enter(spa, SCL_ALLOC); /* @@ -3419,6 +3549,23 @@ vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) } +boolean_t +vdev_is_spacemap_addressable(vdev_t *vd) +{ + /* + * Assuming 47 bits of the space map entry dedicated for the entry's + * offset (see description in space_map.h), we calculate the maximum + * address that can be described by a space map entry for the given + * device. + */ + uint64_t shift = vd->vdev_ashift + 47; + + if (shift >= 63) /* detect potential overflow */ + return (B_TRUE); + + return (vd->vdev_asize < (1ULL << shift)); +} + /* * Get statistics for the given vdev. */ @@ -4243,11 +4390,15 @@ EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); /* BEGIN CSTYLED */ -module_param(metaslabs_per_vdev, int, 0644); -MODULE_PARM_DESC(metaslabs_per_vdev, +module_param(vdev_max_ms_count, int, 0644); +MODULE_PARM_DESC(vdev_max_ms_count, "Divide added vdev into approximately (but no more than) this number " "of metaslabs"); +module_param(vdev_min_ms_count, int, 0644); +MODULE_PARM_DESC(vdev_min_ms_count, + "Minimum number of metaslabs per top-level vdev"); + module_param(zfs_delays_per_second, uint, 0644); MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many " "IO delays per second"); diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index a93e41258..b14b153b2 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -298,14 +298,13 @@ static const zio_vsd_ops_t vdev_indirect_vsd_ops = { }; /* - * Mark the given offset and size as being obsolete in the given txg. + * Mark the given offset and size as being obsolete. 
*/ void -vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size, - uint64_t txg) +vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) { spa_t *spa = vd->vdev_spa; - ASSERT3U(spa_syncing_txg(spa), ==, txg); + ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(size > 0); @@ -316,7 +315,7 @@ vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size, mutex_enter(&vd->vdev_obsolete_lock); range_tree_add(vd->vdev_obsolete_segments, offset, size); mutex_exit(&vd->vdev_obsolete_lock); - vdev_dirty(vd, 0, NULL, txg); + vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); } } @@ -334,7 +333,7 @@ spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, /* The DMU can only remap indirect vdevs. */ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx)); + vdev_indirect_mark_obsolete(vd, offset, size); } static spa_condensing_indirect_t * @@ -727,7 +726,8 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr) return (0); VERIFY0(dsl_sync_task(spa_name(spa), NULL, - spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_NONE)); + spa_condense_indirect_complete_sync, sci, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); return (0); } @@ -804,7 +804,8 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) if (vdev_obsolete_sm_object(vd) == 0) { uint64_t obsolete_sm_object = - space_map_alloc(spa->spa_meta_objset, tx); + space_map_alloc(spa->spa_meta_objset, + vdev_standard_sm_blksz, tx); ASSERT(vd->vdev_top_zap != 0); VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 7ea8da1e6..29d7d651b 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* @@ -352,6 +352,37 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) kmem_free(vsx, sizeof (*vsx)); } +static void +root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) +{ + spa_t *spa = vd->vdev_spa; + + if (vd != spa->spa_root_vdev) + return; + + /* provide either current or previous scan information */ + pool_scan_stat_t ps; + if (spa_scan_get_stats(spa, &ps) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, + sizeof (pool_scan_stat_t) / sizeof (uint64_t)); + } + + pool_removal_stat_t prs; + if (spa_removal_get_stats(spa, &prs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, + sizeof (prs) / sizeof (uint64_t)); + } + + pool_checkpoint_stat_t pcs; + if (spa_checkpoint_get_stats(spa, &pcs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, + sizeof (pcs) / sizeof (uint64_t)); + } +} + /* * Generate the nvlist representing this vdev's config. 
*/ @@ -474,20 +505,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (getstats) { vdev_config_generate_stats(vd, nv); - /* provide either current or previous scan information */ - pool_scan_stat_t ps; - if (spa_scan_get_stats(spa, &ps) == 0) { - fnvlist_add_uint64_array(nv, - ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, - sizeof (pool_scan_stat_t) / sizeof (uint64_t)); - } - - pool_removal_stat_t prs; - if (spa_removal_get_stats(spa, &prs) == 0) { - fnvlist_add_uint64_array(nv, - ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, - sizeof (prs) / sizeof (uint64_t)); - } + root_vdev_actions_getprogress(vd, nv); /* * Note: this can be called from open context @@ -1525,11 +1543,10 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) { spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; - vdev_t *vd; - zio_t *zio; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + ASSERT(svdcount != 0); retry: /* * Normally, we don't want to try too hard to write every label and @@ -1571,9 +1588,10 @@ retry: * written in this txg will be committed to stable storage * before any uberblock that references them. */ - zio = zio_root(spa, NULL, NULL, flags); + zio_t *zio = zio_root(spa, NULL, NULL, flags); - for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd; + for (vdev_t *vd = + txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL; vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) zio_flush(zio, vd); @@ -1588,8 +1606,14 @@ retry: * the new labels to disk to ensure that all even-label updates * are committed to stable storage before the uberblock update. */ - if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) + if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_label_sync_list() returned error %d " + "for pool '%s' when syncing out the even labels " + "of dirty vdevs", error, spa_name(spa)); + } goto retry; + } /* * Sync the uberblocks to all vdevs in svd[]. @@ -1606,8 +1630,13 @@ retry: * been successfully committed) will be valid with respect * to the new uberblocks. */ - if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) + if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_uberblock_sync_list() returned error " + "%d for pool '%s'", error, spa_name(spa)); + } goto retry; + } if (spa_multihost(spa)) mmp_update_uberblock(spa, ub); @@ -1622,8 +1651,14 @@ retry: * to disk to ensure that all odd-label updates are committed to * stable storage before the next transaction group begins. */ - if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) + if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_label_sync_list() returned error %d " + "for pool '%s' when syncing out the odd labels of " + "dirty vdevs", error, spa_name(spa)); + } goto retry; + } return (0); } diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index f9084e8cf..f2bdd6389 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -117,6 +117,12 @@ int zfs_remove_max_segment = SPA_MAXBLOCKSIZE; */ int vdev_removal_max_span = 32 * 1024; +/* + * This is used by the test suite so that it can ensure that certain + * actions happen while in the middle of a removal. 
+ */ +unsigned long zfs_remove_max_bytes_pause = -1UL; + #define VDEV_REMOVAL_ZAP_OBJS "lzap" static void spa_vdev_remove_thread(void *arg); @@ -286,11 +292,11 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) * be copied. */ spa->spa_removing_phys.sr_to_copy -= - range_tree_space(ms->ms_freeingtree); + range_tree_space(ms->ms_freeing); - ASSERT0(range_tree_space(ms->ms_freedtree)); + ASSERT0(range_tree_space(ms->ms_freed)); for (int t = 0; t < TXG_SIZE; t++) - ASSERT0(range_tree_space(ms->ms_alloctree[t])); + ASSERT0(range_tree_space(ms->ms_allocating[t])); } /* @@ -467,19 +473,18 @@ spa_restart_removal(spa_t *spa) * and we correctly free already-copied data. */ void -free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size, - uint64_t txg) +free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) { spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t txg = spa_syncing_txg(spa); uint64_t max_offset_yet = 0; ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, vdev_indirect_mapping_object(vim)); ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); - ASSERT3U(spa_syncing_txg(spa), ==, txg); mutex_enter(&svr->svr_lock); @@ -494,8 +499,13 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size, * held, so that the remove_thread can not load this metaslab and then * visit this offset between the time that we metaslab_free_concrete() * and when we check to see if it has been visited. + * + * Note: The checkpoint flag is set to false as having/taking + * a checkpoint and removing a device can't happen at the same + * time. */ - metaslab_free_concrete(vd, offset, size, txg); + ASSERT(!spa_has_checkpoint(spa)); + metaslab_free_concrete(vd, offset, size, B_FALSE); uint64_t synced_size = 0; uint64_t synced_offset = 0; @@ -627,16 +637,17 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size, * of this free. */ if (synced_size > 0) { - vdev_indirect_mark_obsolete(vd, synced_offset, synced_size, - txg); + vdev_indirect_mark_obsolete(vd, synced_offset, synced_size); + /* * Note: this can only be called from syncing context, * and the vdev_indirect_mapping is only changed from the * sync thread, so we don't need svr_lock while doing * metaslab_free_impl_cb. */ + boolean_t checkpoint = B_FALSE; vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, - metaslab_free_impl_cb, &txg); + metaslab_free_impl_cb, &checkpoint); } } @@ -684,10 +695,10 @@ static void free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) { vdev_t *vd = arg; - vdev_indirect_mark_obsolete(vd, offset, size, - vd->vdev_spa->spa_syncing_txg); + vdev_indirect_mark_obsolete(vd, offset, size); + boolean_t checkpoint = B_FALSE; vdev_indirect_ops.vdev_op_remap(vd, offset, size, - metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg); + metaslab_free_impl_cb, &checkpoint); } /* @@ -1363,7 +1374,7 @@ spa_vdev_remove_thread(void *arg) * Assert nothing in flight -- ms_*tree is empty. 
*/ for (int i = 0; i < TXG_SIZE; i++) { - ASSERT0(range_tree_space(msp->ms_alloctree[i])); + ASSERT0(range_tree_space(msp->ms_allocating[i])); } /* @@ -1393,7 +1404,7 @@ spa_vdev_remove_thread(void *arg) SM_ALLOC)); space_map_close(sm); - range_tree_walk(msp->ms_freeingtree, + range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); /* @@ -1412,7 +1423,7 @@ spa_vdev_remove_thread(void *arg) msp->ms_id); while (!svr->svr_thread_exit && - range_tree_space(svr->svr_allocd_segs) != 0) { + !range_tree_is_empty(svr->svr_allocd_segs)) { mutex_exit(&svr->svr_lock); @@ -1427,6 +1438,19 @@ spa_vdev_remove_thread(void *arg) */ spa_config_exit(spa, SCL_CONFIG, FTAG); + /* + * This delay will pause the removal around the point + * specified by zfs_remove_max_bytes_pause. We do this + * solely from the test suite or during debugging. + */ + uint64_t bytes_copied = + spa->spa_removing_phys.sr_copied; + for (int i = 0; i < TXG_SIZE; i++) + bytes_copied += svr->svr_bytes_done[i]; + while (zfs_remove_max_bytes_pause <= bytes_copied && + !svr->svr_thread_exit) + delay(hz); + mutex_enter(&vca.vca_lock); while (vca.vca_outstanding_bytes > zfs_remove_max_copy_bytes) { @@ -1567,10 +1591,10 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) * Assert nothing in flight -- ms_*tree is empty. */ for (int i = 0; i < TXG_SIZE; i++) - ASSERT0(range_tree_space(msp->ms_alloctree[i])); + ASSERT0(range_tree_space(msp->ms_allocating[i])); for (int i = 0; i < TXG_DEFER_SIZE; i++) - ASSERT0(range_tree_space(msp->ms_defertree[i])); - ASSERT0(range_tree_space(msp->ms_freedtree)); + ASSERT0(range_tree_space(msp->ms_defer[i])); + ASSERT0(range_tree_space(msp->ms_freed)); if (msp->ms_sm != NULL) { /* @@ -1586,7 +1610,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) mutex_enter(&svr->svr_lock); VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); - range_tree_walk(msp->ms_freeingtree, + range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); /* @@ -1662,7 +1686,8 @@ spa_vdev_remove_cancel(spa_t *spa) uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, - spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE); + spa_vdev_remove_cancel_sync, NULL, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED); if (error == 0) { spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); @@ -1999,6 +2024,17 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) if (!locked) txg = spa_vdev_enter(spa); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? 
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + + if (!locked) + return (spa_vdev_exit(spa, NULL, txg, error)); + + return (error); + } + vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (spa->spa_spares.sav_vdevs != NULL && @@ -2111,6 +2147,13 @@ module_param(vdev_removal_max_span, int, 0644); MODULE_PARM_DESC(vdev_removal_max_span, "Largest span of free chunks a remap segment can span"); +/* BEGIN CSTYLED */ +module_param(zfs_remove_max_bytes_pause, ulong, 0644); +MODULE_PARM_DESC(zfs_remove_max_bytes_pause, + "Pause device removal after this many bytes are copied " + "(debug use only - causes removal to hang)"); +/* END CSTYLED */ + EXPORT_SYMBOL(free_from_removing_vdev); EXPORT_SYMBOL(spa_removal_get_stats); EXPORT_SYMBOL(spa_remove_init); diff --git a/module/zfs/zcp.c b/module/zfs/zcp.c index dad09da50..475138621 100644 --- a/module/zfs/zcp.c +++ b/module/zfs/zcp.c @@ -1142,7 +1142,7 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync, if (sync) { err = dsl_sync_task(poolname, NULL, - zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_NONE); + zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_ZCP_EVAL); if (err != 0) zcp_pool_error(&evalargs, poolname); } else { diff --git a/module/zfs/zcp_synctask.c b/module/zfs/zcp_synctask.c index 196a3d4b7..e089666f2 100644 --- a/module/zfs/zcp_synctask.c +++ b/module/zfs/zcp_synctask.c @@ -110,7 +110,7 @@ static zcp_synctask_info_t zcp_synctask_destroy_info = { {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN}, {NULL, 0} }, - .space_check = ZFS_SPACE_CHECK_NONE, + .space_check = ZFS_SPACE_CHECK_DESTROY, .blocks_modified = 0 }; @@ -303,10 +303,9 @@ zcp_synctask_wrapper(lua_State *state) zcp_parse_args(state, info->name, info->pargs, info->kwargs); err = 0; - if (info->space_check != ZFS_SPACE_CHECK_NONE && funcspace > 0) { - uint64_t quota = dsl_pool_adjustedsize(dp, - info->space_check == ZFS_SPACE_CHECK_RESERVED) - - metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); + if (info->space_check != ZFS_SPACE_CHECK_NONE) { + uint64_t quota = dsl_pool_unreserved_space(dp, + info->space_check); uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes + ri->zri_space_used; diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index f95b77db7..e70207aa5 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3731,6 +3731,29 @@ zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, } /* + * innvl: unused + * outnvl: empty + */ +/* ARGSUSED */ +static int +zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + return (spa_checkpoint(poolname)); +} + +/* + * innvl: unused + * outnvl: empty + */ +/* ARGSUSED */ +static int +zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, + nvlist_t *outnvl) +{ + return (spa_checkpoint_discard(poolname)); +} + +/* * inputs: * zc_name name of dataset to destroy * zc_objset_type type of objset @@ -6422,6 +6445,15 @@ zfs_ioctl_init(void) POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, + zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("zpool_discard_checkpoint", + ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, + zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, 
zfs_ioc_pool_freeze, diff --git a/module/zfs/zil.c b/module/zfs/zil.c index e8adc6d99..d0b1c1d14 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -29,6 +29,7 @@ #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/dmu.h> #include <sys/zap.h> #include <sys/arc.h> @@ -430,6 +431,35 @@ done: return (error); } +/* ARGSUSED */ +static int +zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) +{ + ASSERT(!BP_IS_HOLE(bp)); + + /* + * As we call this function from the context of a rewind to a + * checkpoint, each ZIL block whose txg is later than the txg + * that we rewind to is invalid. Thus, we return -1 so + * zil_parse() doesn't attempt to read it. + */ + if (bp->blk_birth >= first_txg) + return (-1); + + if (zil_bp_tree_add(zilog, bp) != 0) + return (0); + + zio_free(zilog->zl_spa, first_txg, bp); + return (0); +} + +/* ARGSUSED */ +static int +zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) +{ + return (0); +} + static int zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) { @@ -476,7 +506,7 @@ zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) static int zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) { - zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp); + zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } @@ -662,7 +692,7 @@ zil_create(zilog_t *zilog) txg = dmu_tx_get_txg(tx); if (!BP_IS_HOLE(&blk)) { - zio_free_zil(zilog->zl_spa, txg, &blk); + zio_free(zilog->zl_spa, txg, &blk); BP_ZERO(&blk); } @@ -767,8 +797,8 @@ int zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) { dmu_tx_t *tx = txarg; - uint64_t first_txg = dmu_tx_get_txg(tx); zilog_t *zilog; + uint64_t first_txg; zil_header_t *zh; objset_t *os; int error; @@ -790,10 +820,43 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); + ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); + first_txg = spa_min_claim_txg(zilog->zl_spa); - if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) { - if (!BP_IS_HOLE(&zh->zh_log)) - zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log); + /* + * If the spa_log_state is not set to be cleared, check whether + * the current uberblock is a checkpoint one and if the current + * header has been claimed before moving on. + * + * If the current uberblock is a checkpointed uberblock then + * one of the following scenarios took place: + * + * 1] We are currently rewinding to the checkpoint of the pool. + * 2] We crashed in the middle of a checkpoint rewind but we + * did manage to write the checkpointed uberblock to the + * vdev labels, so when we tried to import the pool again + * the checkpointed uberblock was selected from the import + * procedure. + * + * In both cases we want to zero out all the ZIL blocks, except + * the ones that have been claimed at the time of the checkpoint + * (their zh_claim_txg != 0). The reason is that these blocks + * may be corrupted since we may have reused their locations on + * disk after we took the checkpoint. + * + * We could try to set spa_log_state to SPA_LOG_CLEAR earlier + * when we first figure out whether the current uberblock is + * checkpointed or not. Unfortunately, that would discard all + * the logs, including the ones that are claimed, and we would + * leak space. 
+ */ + if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || + (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && + zh->zh_claim_txg == 0)) { + if (!BP_IS_HOLE(&zh->zh_log)) { + (void) zil_parse(zilog, zil_clear_log_block, + zil_noop_log_record, tx, first_txg, B_FALSE); + } BP_ZERO(&zh->zh_log); if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; @@ -803,6 +866,12 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) } /* + * If we are not rewinding and opening the pool normally, then + * the min_claim_txg should be equal to the first txg of the pool. + */ + ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); + + /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can * read only part of the log now (e.g. due to a missing device), @@ -855,16 +924,17 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) zilog = dmu_objset_zil(os); bp = (blkptr_t *)&zilog->zl_header->zh_log; - /* - * Check the first block and determine if it's on a log device - * which may have been removed or faulted prior to loading this - * pool. If so, there's no point in checking the rest of the log - * as its content should have already been synced to the pool. - */ if (!BP_IS_HOLE(bp)) { vdev_t *vd; boolean_t valid = B_TRUE; + /* + * Check the first block and determine if it's on a log device + * which may have been removed or faulted prior to loading this + * pool. If so, there's no point in checking the rest of the + * log as its content should have already been synced to the + * pool. + */ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); if (vd->vdev_islog && vdev_is_dead(vd)) @@ -873,6 +943,18 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) if (!valid) return (0); + + /* + * Check whether the current uberblock is checkpointed (e.g. + * we are rewinding) and whether the current header has been + * claimed or not. If it hasn't then skip verifying it. We + * do this because its ZIL blocks may be part of the pool's + * state before the rewind, which is no longer valid. + */ + zil_header_t *zh = zil_header_in_syncing_context(zilog); + if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && + zh->zh_claim_txg == 0) + return (0); } /* @@ -883,8 +965,8 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) * which will update spa_max_claim_txg. See spa_load() for details. */ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, - zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa), - B_FALSE); + zilog->zl_header->zh_claim_txg ? -1ULL : + spa_min_claim_txg(os->os_spa), B_FALSE); return ((error == ECKSUM || error == ENOENT) ? 0 : error); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 8a495988b..9a98d4fc0 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1147,8 +1147,9 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. 
*/ - ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); - ASSERT(txg == spa_first_txg(spa) || txg == 0); + ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, + spa_min_claim_txg(spa)); + ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), @@ -3458,18 +3459,6 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, } /* - * Free an intent log block. - */ -void -zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) -{ - ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); - ASSERT(!BP_IS_GANG(bp)); - - zio_free(spa, txg, bp); -} - -/* * ========================================================================== * Read and write to physical devices * ========================================================================== diff --git a/module/zfs/zthr.c b/module/zfs/zthr.c index dc0f6d983..1c4a8e02c 100644 --- a/module/zfs/zthr.c +++ b/module/zfs/zthr.c @@ -235,8 +235,6 @@ zthr_destroy(zthr_t *t) void zthr_wakeup(zthr_t *t) { - ASSERT3P(t->zthr_thread, !=, NULL); - mutex_enter(&t->zthr_lock); cv_broadcast(&t->zthr_cv); mutex_exit(&t->zthr_lock); |
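The spa_suspend_async_destroy() helper above stops async destroy processing once a checkpoint exists and the space available to destroy-class operations (unreserved space minus the root dir's used bytes, clamped at zero) reaches zero. A toy model of that decision follows, with made-up sizes and a plain function standing in for the SPA and DSL pool structures; suspend_async_destroy() here is illustrative, not the real signature:

#include <stdint.h>
#include <stdio.h>

/*
 * Toy model of spa_suspend_async_destroy(): async destroys stop once the
 * space that destroy-class operations may consume is exhausted while a
 * checkpoint is holding freed blocks alive. All sizes are made up.
 */
static int
suspend_async_destroy(uint64_t unreserved, uint64_t used, int has_checkpoint)
{
        uint64_t avail = (unreserved > used) ? (unreserved - used) : 0;

        return (has_checkpoint && avail == 0);
}

int
main(void)
{
        /* plenty of headroom: keep destroying even with a checkpoint */
        printf("%d\n", suspend_async_destroy(90ULL << 30, 40ULL << 30, 1));
        /* out of unreserved space while checkpointed: suspend */
        printf("%d\n", suspend_async_destroy(90ULL << 30, 95ULL << 30, 1));
        /* same shortfall without a checkpoint: keep going */
        printf("%d\n", suspend_async_destroy(90ULL << 30, 95ULL << 30, 0));
        return (0);
}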
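space_map_incremental_destroy() above walks the space map from its tail so the object's starting offset never has to move: it reads a tail-sized chunk, applies the callback to each entry from last to first while shrinking the recorded length, and loops until the map is empty or the callback fails. The sketch below shows only the shape of that loop; the on-disk SM_*_DECODE entry format is replaced by a plain array of opaque values purely for illustration:

#include <stdint.h>
#include <stdio.h>

/*
 * Simplified shape of the tail-first destroy loop: "entries" stands in for
 * the on-disk space map and "len" for smp_objsize in entries; real entries
 * would be decoded with SM_*_DECODE before invoking the callback.
 */
typedef int (*destroy_cb_t)(uint64_t entry, void *arg);

static int
incremental_destroy(uint64_t *entries, uint64_t nentries, uint64_t bufents,
    destroy_cb_t cb, void *arg)
{
        uint64_t len = nentries;
        int error = 0;

        while (len > 0 && error == 0) {
                uint64_t num = (bufents < len) ? bufents : len;
                uint64_t off = len - num;       /* read the tail chunk */

                while (num > 0) {
                        error = cb(entries[off + num - 1], arg);
                        if (error != 0)
                                break;
                        len--;          /* entry is now gone from the map */
                        num--;
                }
        }
        return (error);
}

static int
print_cb(uint64_t e, void *arg)
{
        (void) arg;
        printf("destroying entry %llu\n", (unsigned long long)e);
        return (0);
}

int
main(void)
{
        uint64_t map[] = { 1, 2, 3, 4, 5, 6, 7 };

        /* processed as 7,6,5,4,3,2,1 using a 3-entry buffer */
        return (incremental_destroy(map, 7, 3, print_cb, NULL));
}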
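The rewritten vdev_metaslab_set_size() above replaces the old fixed metaslabs-per-vdev heuristic with three tunables: a default 2^29-byte metaslab (vdev_default_ms_shift), a floor of vdev_min_ms_count metaslabs for small vdevs, and a cap of roughly vdev_max_ms_count for large ones. A standalone sketch of the arithmetic, with highbit64() and MAX() reimplemented locally and the vdev sizes in main() chosen arbitrarily:

#include <stdint.h>
#include <stdio.h>

#define SPA_MAXBLOCKSHIFT       24      /* 16MB, as in spa.h */

static int vdev_max_ms_count = 200;
static int vdev_min_ms_count = 16;
static int vdev_default_ms_shift = 29;  /* 512MB metaslabs by default */

/* position of the highest set bit, counting from 1, like ZFS's highbit64() */
static int
highbit64(uint64_t x)
{
        int h = 0;

        while (x != 0) {
                h++;
                x >>= 1;
        }
        return (h);
}

#define MAX(a, b)       ((a) > (b) ? (a) : (b))

static uint64_t
ms_shift_for(uint64_t asize)
{
        uint64_t ms_shift = vdev_default_ms_shift;

        if ((asize >> ms_shift) < vdev_min_ms_count) {
                /* small vdev: pin the metaslab count near vdev_min_ms_count */
                ms_shift = highbit64(asize / vdev_min_ms_count);
                ms_shift = MAX(ms_shift, SPA_MAXBLOCKSHIFT);
        } else if ((asize >> ms_shift) > vdev_max_ms_count) {
                /* large vdev: grow metaslabs to cap the count */
                ms_shift = highbit64(asize / vdev_max_ms_count);
        }
        return (ms_shift);
}

int
main(void)
{
        uint64_t sizes[] = { 2ULL << 30, 8ULL << 30, 64ULL << 30, 1ULL << 40 };

        for (int i = 0; i < 4; i++) {
                uint64_t shift = ms_shift_for(sizes[i]);

                printf("asize %llu GB -> ms_shift %llu (%llu metaslabs)\n",
                    (unsigned long long)(sizes[i] >> 30),
                    (unsigned long long)shift,
                    (unsigned long long)(sizes[i] >> shift));
        }
        return (0);
}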
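spa_top_vdevs_spacemap_addressable() above simply applies vdev_is_spacemap_addressable() to every top-level vdev; the latter checks whether the vdev's size can be described by a space map entry whose offset field, per the comment, gets 47 bits in units of 1 << ashift. A small worked sketch of that bound; SM_OFFSET_BITS is a local name used here for illustration, not a macro taken from space_map.h:

#include <stdint.h>
#include <stdio.h>

/* bits of a space map entry that hold the offset, per the comment above */
#define SM_OFFSET_BITS  47

static int
spacemap_addressable(uint64_t asize, uint64_t ashift)
{
        uint64_t shift = ashift + SM_OFFSET_BITS;

        if (shift >= 63)        /* guard against shifting too far */
                return (1);

        return (asize < (1ULL << shift));
}

int
main(void)
{
        /* maximum addressable vdev size for 512B and 4K sectors */
        printf("ashift=9:  limit = %llu TiB\n",
            (unsigned long long)((1ULL << (9 + SM_OFFSET_BITS)) >> 40));
        printf("ashift=12: limit = %llu TiB\n",
            (unsigned long long)((1ULL << (12 + SM_OFFSET_BITS)) >> 40));

        printf("100 TiB vdev, ashift=9: %s\n",
            spacemap_addressable(100ULL << 40, 9) ? "ok" : "too big");
        return (0);
}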
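In vdev_load() above the checkpoint space map records only frees, so its sm_alloc runs negative and its negation is the amount of space on that vdev still held by the checkpoint; vdev_load() accumulates that into spa_checkpoint_info.sci_dspace, which spa_get_checkpoint_space() then returns for the whole pool. A minimal sketch of the sign convention and accumulation, with the structures trimmed down to invented stand-ins:

#include <stdint.h>
#include <stdio.h>

/* illustrative stand-ins for the fields used in vdev_load() above */
typedef struct {
        int64_t sm_alloc;       /* goes negative: only frees are recorded */
} ckpt_sm_t;

typedef struct {
        ckpt_sm_t checkpoint_sm;
        int64_t vs_checkpoint_space;
} model_vdev_t;

int
main(void)
{
        model_vdev_t vdevs[2] = {
                { { -(4LL << 30) }, 0 },        /* 4 GiB freed since checkpoint */
                { { -(1LL << 30) }, 0 },        /* 1 GiB freed since checkpoint */
        };
        int64_t sci_dspace = 0;

        for (int i = 0; i < 2; i++) {
                vdevs[i].vs_checkpoint_space =
                    -vdevs[i].checkpoint_sm.sm_alloc;
                sci_dspace += vdevs[i].vs_checkpoint_space;
        }
        /* pool-wide checkpointed space, as reported by the SPA */
        printf("checkpoint space: %lld GiB\n", (long long)(sci_dspace >> 30));
        return (0);
}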
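The zil.c and zio.c changes above hinge on two simple rules: spa_min_claim_txg() moves the claim floor to just past the checkpoint txg when a checkpointed uberblock is in use, and zil_claim() clears rather than claims any ZIL whose header was not already claimed at checkpoint time, since its blocks may have been reused after the checkpoint was taken. A compact model of both decisions; model_spa_t and the helper names are invented for the sketch:

#include <stdint.h>
#include <stdio.h>

/*
 * Minimal model of the claim-txg rule added above: with a checkpointed
 * uberblock, claiming starts right after the checkpoint txg, and any ZIL
 * header that was not already claimed at checkpoint time gets cleared.
 */
typedef struct {
        uint64_t ub_checkpoint_txg;     /* 0 when no checkpoint is in use */
        uint64_t spa_first_txg;
} model_spa_t;

static uint64_t
min_claim_txg(const model_spa_t *spa)
{
        if (spa->ub_checkpoint_txg != 0)
                return (spa->ub_checkpoint_txg + 1);
        return (spa->spa_first_txg);
}

static const char *
zil_action(const model_spa_t *spa, uint64_t zh_claim_txg)
{
        if (spa->ub_checkpoint_txg != 0 && zh_claim_txg == 0)
                return ("clear unclaimed ZIL blocks");
        return ("claim normally");
}

int
main(void)
{
        model_spa_t normal = { 0, 1000 };
        model_spa_t rewind = { 500, 1000 };

        printf("normal import: claim from txg %llu, %s\n",
            (unsigned long long)min_claim_txg(&normal),
            zil_action(&normal, 0));
        printf("checkpoint rewind: claim from txg %llu, %s\n",
            (unsigned long long)min_claim_txg(&rewind),
            zil_action(&rewind, 0));
        return (0);
}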